Coverage for src / local_deep_research / database / models / research.py: 94%

127 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Core research models for tasks, queries, and results. 

3""" 

4 

5import enum 

6 

7from sqlalchemy import ( 

8 JSON, 

9 Column, 

10 Enum, 

11 Float, 

12 ForeignKey, 

13 Index, 

14 Integer, 

15 String, 

16 Text, 

17) 

18from sqlalchemy.orm import relationship 

19from sqlalchemy_utc import UtcDateTime, utcnow 

20 

21from ...constants import ResearchStatus 

22from .base import Base 

23 

24 

class ResearchTask(Base):
    """
    Top-level research task created by a user.

    A task owns its search queries, search results and reports; the
    relationships below cascade deletes (including orphans) to them.
    """

    __tablename__ = "research_tasks"

    id = Column(Integer, primary_key=True)
    title = Column(String(500), nullable=False)
    description = Column(Text)
    # Lifecycle state: pending, in_progress, completed, failed
    status = Column(String(50), default="pending")
    # Higher number = higher priority
    priority = Column(Integer, default=0)
    # List of tags for categorization
    tags = Column(JSON)
    # Flexible field for additional data
    research_metadata = Column(JSON)

    # Timestamps. utcnow() builds a SQL expression, so the value is
    # computed when the INSERT/UPDATE statement runs.
    created_at = Column(UtcDateTime, default=utcnow())
    updated_at = Column(
        UtcDateTime,
        default=utcnow(),
        onupdate=utcnow(),
    )
    started_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # Relationships (children are removed together with the task)
    searches = relationship(
        "SearchQuery",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    results = relationship(
        "SearchResult",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    reports = relationship(
        "Report",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation with title and status."""
        title, status = self.title, self.status
        return f"<ResearchTask(title='{title}', status='{status}')>"

66 

67 

class SearchQuery(Base):
    """
    Individual search query executed as part of a research task.

    Records what was searched, with which engine and parameters, and the
    execution status and timestamps of the query.
    """

    __tablename__ = "search_queries"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer, ForeignKey("research_tasks.id", ondelete="CASCADE")
    )
    query = Column(Text, nullable=False)
    search_engine = Column(String(50))  # google, bing, duckduckgo, etc.
    search_type = Column(String(50))  # web, academic, news, etc.
    parameters = Column(JSON)  # Additional search parameters

    # Status tracking
    status = Column(
        String(50), default="pending"
    )  # pending, executing, completed, failed
    error_message = Column(Text)
    retry_count = Column(Integer, default=0)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow())
    executed_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # Relationships
    research_task = relationship("ResearchTask", back_populates="searches")
    results = relationship(
        "SearchResult",
        back_populates="search_query",
        cascade="all, delete-orphan",
    )

    # Indexes for performance
    __table_args__ = (
        Index("idx_research_task_status", "research_task_id", "status"),
        Index("idx_search_engine", "search_engine", "status"),
    )

    def __repr__(self):
        """
        Return a short debug representation.

        The query text is cut to its first 50 characters. An instance
        whose ``query`` has not been assigned yet is shown as 'No query'
        instead of raising ``TypeError`` while slicing ``None``.
        """
        # "is not None" (not truthiness) keeps the output for an empty
        # query string identical to the previous behaviour.
        query = self.query[:50] if self.query is not None else "No query"
        return f"<SearchQuery(query='{query}...', status='{self.status}')>"

113 

114 

class SearchResult(Base):
    """
    A single result returned by a search query.

    Holds the data reported by the search engine (title, URL, snippet)
    together with any content fetched afterwards.
    """

    __tablename__ = "search_results"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer,
        ForeignKey("research_tasks.id", ondelete="CASCADE"),
    )
    search_query_id = Column(
        Integer,
        ForeignKey("search_queries.id", ondelete="CASCADE"),
    )

    # Basic result information
    title = Column(String(500))
    # Indexed for deduplication
    url = Column(Text, index=True)
    snippet = Column(Text)

    # Extended content
    # Full content if fetched
    content = Column(Text)
    # html, pdf, text, etc.
    content_type = Column(String(50))
    # For deduplication
    content_hash = Column(String(64))

    # Metadata
    # Calculated relevance
    relevance_score = Column(Float)
    # Position in search results
    position = Column(Integer)
    domain = Column(String(255), index=True)
    language = Column(String(10))
    published_date = Column(UtcDateTime)
    author = Column(String(255))

    # Status tracking
    # pending, fetched, failed, skipped
    fetch_status = Column(String(50))
    fetch_error = Column(Text)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow())
    fetched_at = Column(UtcDateTime)

    # Relationships
    research_task = relationship("ResearchTask", back_populates="results")
    search_query = relationship("SearchQuery", back_populates="results")

    # Indexes for performance
    __table_args__ = (
        Index("idx_task_relevance", "research_task_id", "relevance_score"),
        Index("idx_content_hash", "content_hash"),
        Index("idx_domain_task", "domain", "research_task_id"),
    )

    def __repr__(self):
        """Return a short debug representation with title and score."""
        # A missing or empty title is rendered as 'No title'.
        if self.title:
            shown_title = self.title[:50]
        else:
            shown_title = "No title"
        return f"<SearchResult(title='{shown_title}...', score={self.relevance_score})>"

170 

171 

class ResearchMode(enum.Enum):
    """Enumeration of the available research modes."""

    # Each member's value is its lowercase string form.
    QUICK = "quick"
    DETAILED = "detailed"

177 

178 

class ResearchResource(Base):
    """Resource (e.g. a source URL) attached to a research history entry."""

    __tablename__ = "research_resources"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References research_history.id (a 36-character string key);
    # rows are removed when the parent row is deleted.
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=False,
    )
    title = Column(Text)
    url = Column(Text)
    content_preview = Column(Text)
    source_type = Column(Text)
    # Stored in the database column "metadata"; the Python attribute has
    # a different name because "metadata" is reserved on declarative
    # SQLAlchemy classes.
    resource_metadata = Column("metadata", JSON)
    # Kept as a string column rather than a datetime type.
    created_at = Column(String, nullable=False)

    # Relationship
    research = relationship(
        "ResearchHistory",
        back_populates="resources",
    )

    def __repr__(self):
        """Return a short debug representation with title and URL."""
        title, url = self.title, self.url
        return f"<ResearchResource(title='{title}', url='{url}')>"

202 

203 

class ResearchHistory(Base):
    """
    Research history table.

    Tracks research sessions and their progress, including the generated
    report and the resources attached to the session.
    """

    __tablename__ = "research_history"

    # UUID as primary key
    id = Column(String(36), primary_key=True)
    # The search query.
    query = Column(Text, nullable=False)
    # The mode of research (e.g., 'quick_summary', 'detailed_report').
    mode = Column(Text, nullable=False)
    # Current status of the research.
    status = Column(Text, nullable=False)
    # The timestamp when the research started (stored as text).
    created_at = Column(Text, nullable=False)
    # The timestamp when the research was completed (stored as text).
    completed_at = Column(Text)
    # Duration of the research in seconds.
    duration_seconds = Column(Integer)
    # Path to the generated report.
    report_path = Column(Text)
    # Report content stored in database
    report_content = Column(Text)
    # Additional metadata about the research.
    research_meta = Column(JSON)
    # Latest progress log message.
    progress_log = Column(JSON)
    # Current progress of the research (as a percentage).
    progress = Column(Integer)
    # Title of the research report.
    title = Column(Text)

    # Relationships
    resources = relationship(
        "ResearchResource",
        back_populates="research",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """
        Return a short debug representation.

        The query text is cut to its first 50 characters. An instance
        whose ``query`` has not been assigned yet is shown as 'No query'
        instead of raising ``TypeError`` while slicing ``None``.
        """
        # "is not None" (not truthiness) keeps the output for an empty
        # query string identical to the previous behaviour.
        query = self.query[:50] if self.query is not None else "No query"
        return f"<ResearchHistory(query='{query}...', status={self.status})>"

248 

249 

class Research(Base):
    """
    Modern research tracking with better type safety.

    Status and mode are stored as enum columns (``ResearchStatus`` and
    ``ResearchMode``) instead of free-form text.
    """

    __tablename__ = "research"

    id = Column(Integer, primary_key=True, index=True)
    query = Column(String, nullable=False)
    status = Column(
        Enum(ResearchStatus), default=ResearchStatus.PENDING, nullable=False
    )
    mode = Column(
        Enum(ResearchMode), default=ResearchMode.QUICK, nullable=False
    )
    # Timestamps are filled in by the database (server-side default).
    created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, server_default=utcnow(), onupdate=utcnow(), nullable=False
    )
    progress = Column(Float, default=0.0, nullable=False)
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    # Relationship (one-to-one: uselist=False)
    strategy = relationship(
        "ResearchStrategy", back_populates="research", uselist=False
    )

    def __repr__(self):
        """
        Return a short debug representation.

        The query text is cut to its first 50 characters. The method is
        safe to call on an instance that has not been flushed yet: a
        missing query is shown as 'No query' and a missing status as
        ``None`` instead of raising.
        """
        # "is not None" (not truthiness) keeps the output for an empty
        # query string identical to the previous behaviour.
        query = self.query[:50] if self.query is not None else "No query"
        # The column default for status is only applied at INSERT time,
        # so status can still be None here; fall back to the raw value
        # when it has no ".value" attribute.
        status = getattr(self.status, "value", self.status)
        return f"<Research(query='{query}...', status={status})>"

281 

282 

class ResearchStrategy(Base):
    """
    Records the search strategy used for a research entry.

    ``research_id`` is unique, so each research row has at most one
    strategy row.
    """

    __tablename__ = "research_strategies"

    id = Column(Integer, primary_key=True, index=True)
    research_id = Column(
        Integer,
        ForeignKey("research.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
        index=True,
    )
    strategy_name = Column(
        String(100),
        nullable=False,
        index=True,
    )
    # Filled in by the database (server-side default).
    created_at = Column(
        UtcDateTime,
        server_default=utcnow(),
        nullable=False,
    )

    # Relationship
    research = relationship("Research", back_populates="strategy")

    def __repr__(self):
        """Return a short debug representation with research id and strategy."""
        research_id, strategy = self.research_id, self.strategy_name
        return f"<ResearchStrategy(research_id={research_id}, strategy={strategy})>"