Coverage for src/local_deep_research/database/models/research.py: 95%

135 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2Core research models for tasks, queries, and results. 

3""" 

4 

5import enum 

6 

7from sqlalchemy import ( 

8 JSON, 

9 Column, 

10 Enum, 

11 Float, 

12 ForeignKey, 

13 Index, 

14 Integer, 

15 String, 

16 Text, 

17 text, 

18) 

19from sqlalchemy.orm import relationship 

20 

21from sqlalchemy_utc import UtcDateTime, utcnow 

22 

23from ...constants import ResearchStatus 

24from .base import Base 

25 

26 

27class ResearchTask(Base): 

28 """ 

29 Main research tasks that users create. 

30 This is the top-level object that contains all research activities. 

31 """ 

32 

33 __tablename__ = "research_tasks" 

34 

35 id = Column(Integer, primary_key=True) 

36 title = Column(String(500), nullable=False) 

37 description = Column(Text) 

38 status = Column( 

39 String(50), default="pending" 

40 ) # pending, in_progress, completed, failed 

41 priority = Column(Integer, default=0) # Higher number = higher priority 

42 tags = Column(JSON) # List of tags for categorization 

43 research_metadata = Column(JSON) # Flexible field for additional data 

44 

45 # Timestamps 

46 created_at = Column(UtcDateTime, default=utcnow()) 

47 updated_at = Column(UtcDateTime, default=utcnow(), onupdate=utcnow()) 

48 started_at = Column(UtcDateTime) 

49 completed_at = Column(UtcDateTime) 

50 

51 # Relationships 

52 searches = relationship( 

53 "SearchQuery", 

54 back_populates="research_task", 

55 cascade="all, delete-orphan", 

56 ) 

57 results = relationship( 

58 "SearchResult", 

59 back_populates="research_task", 

60 cascade="all, delete-orphan", 

61 ) 

62 reports = relationship( 

63 "Report", back_populates="research_task", cascade="all, delete-orphan" 

64 ) 

65 

66 def __repr__(self): 

67 return f"<ResearchTask(title='{self.title}', status='{self.status}')>" 

68 

69 

70class SearchQuery(Base): 

71 """ 

72 Individual search queries executed as part of research tasks. 

73 Tracks what was searched and when. 

74 """ 

75 

76 __tablename__ = "search_queries" 

77 

78 id = Column(Integer, primary_key=True) 

79 research_task_id = Column( 

80 Integer, ForeignKey("research_tasks.id", ondelete="CASCADE") 

81 ) 

82 query = Column(Text, nullable=False) 

83 search_engine = Column(String(50)) # google, bing, duckduckgo, etc. 

84 search_type = Column(String(50)) # web, academic, news, etc. 

85 parameters = Column(JSON) # Additional search parameters 

86 

87 # Status tracking 

88 status = Column( 

89 String(50), default="pending" 

90 ) # pending, executing, completed, failed 

91 error_message = Column(Text) 

92 retry_count = Column(Integer, default=0) 

93 

94 # Timestamps 

95 created_at = Column(UtcDateTime, default=utcnow()) 

96 executed_at = Column(UtcDateTime) 

97 completed_at = Column(UtcDateTime) 

98 

99 # Relationships 

100 research_task = relationship("ResearchTask", back_populates="searches") 

101 results = relationship( 

102 "SearchResult", 

103 back_populates="search_query", 

104 cascade="all, delete-orphan", 

105 ) 

106 

107 # Indexes for performance 

108 __table_args__ = ( 

109 Index("idx_research_task_status", "research_task_id", "status"), 

110 Index("idx_search_engine", "search_engine", "status"), 

111 ) 

112 

113 def __repr__(self): 

114 return f"<SearchQuery(query='{self.query[:50]}...', status='{self.status}')>" 

115 

116 

117class SearchResult(Base): 

118 """ 

119 Individual search results from queries. 

120 Stores both the initial result and any fetched content. 

121 """ 

122 

123 __tablename__ = "search_results" 

124 

125 id = Column(Integer, primary_key=True) 

126 research_task_id = Column( 

127 Integer, ForeignKey("research_tasks.id", ondelete="CASCADE") 

128 ) 

129 search_query_id = Column( 

130 Integer, ForeignKey("search_queries.id", ondelete="CASCADE") 

131 ) 

132 

133 # Basic result information 

134 title = Column(String(500)) 

135 url = Column(Text, index=True) # Indexed for deduplication 

136 snippet = Column(Text) 

137 

138 # Extended content 

139 content = Column(Text) # Full content if fetched 

140 content_type = Column(String(50)) # html, pdf, text, etc. 

141 content_hash = Column(String(64)) # For deduplication 

142 

143 # Metadata 

144 relevance_score = Column(Float) # Calculated relevance 

145 position = Column(Integer) # Position in search results 

146 domain = Column(String(255), index=True) 

147 language = Column(String(10)) 

148 published_date = Column(UtcDateTime) 

149 author = Column(String(255)) 

150 

151 # Status tracking 

152 fetch_status = Column(String(50)) # pending, fetched, failed, skipped 

153 fetch_error = Column(Text) 

154 

155 # Timestamps 

156 created_at = Column(UtcDateTime, default=utcnow()) 

157 fetched_at = Column(UtcDateTime) 

158 

159 # Relationships 

160 research_task = relationship("ResearchTask", back_populates="results") 

161 search_query = relationship("SearchQuery", back_populates="results") 

162 

163 # Indexes for performance 

164 __table_args__ = ( 

165 Index("idx_task_relevance", "research_task_id", "relevance_score"), 

166 Index("idx_content_hash", "content_hash"), 

167 Index("idx_domain_task", "domain", "research_task_id"), 

168 ) 

169 

170 def __repr__(self): 

171 return f"<SearchResult(title='{self.title[:50] if self.title else 'No title'}...', score={self.relevance_score})>" 

172 

173 

174class ResearchMode(enum.Enum): 

175 """Research modes available.""" 

176 

177 QUICK = "quick" 

178 DETAILED = "detailed" 

179 

180 

181class ResearchResource(Base): 

182 """Resources associated with research projects.""" 

183 

184 __tablename__ = "research_resources" 

185 

186 id = Column(Integer, primary_key=True, autoincrement=True) 

187 # Named Index() in __table_args__ below (NOT index=True on this 

188 # column) so both the create_all path (fresh installs, test fixtures) 

189 # and the migration path (0006:286-289) produce an identically- 

190 # named index: ix_research_resources_research_id. Using index=True 

191 # would give create_all an auto-named index that diverges from the 

192 # migration's named one; that asymmetry previously meant fresh 

193 # installs had no research_id index and ran full-table scans on 

194 # 20+ call sites. 

195 research_id = Column( 

196 String(36), 

197 ForeignKey("research_history.id", ondelete="CASCADE"), 

198 nullable=False, 

199 ) 

200 title = Column(Text) 

201 url = Column(Text) 

202 content_preview = Column(Text) 

203 source_type = Column(Text) 

204 resource_metadata = Column("metadata", JSON) 

205 created_at = Column(String, nullable=False) 

206 document_id = Column( 

207 String(36), 

208 ForeignKey("documents.id", ondelete="SET NULL"), 

209 nullable=True, 

210 index=True, 

211 ) 

212 

213 # Relationships 

214 research = relationship("ResearchHistory", back_populates="resources") 

215 document = relationship("Document", foreign_keys=[document_id]) 

216 paper_appearance = relationship( 

217 "PaperAppearance", 

218 back_populates="resource", 

219 uselist=False, 

220 cascade="all, delete-orphan", 

221 ) 

222 

223 __table_args__ = ( 

224 Index("ix_research_resources_research_id", "research_id"), 

225 ) 

226 

227 def __repr__(self): 

228 return f"<ResearchResource(title='{self.title}', url='{self.url}')>" 

229 

230 

231class ResearchHistory(Base): 

232 """ 

233 Research history table. 

234 Tracks research sessions and their progress. 

235 """ 

236 

237 __tablename__ = "research_history" 

238 

239 # UUID as primary key 

240 id = Column(String(36), primary_key=True) 

241 # The search query. 

242 query = Column(Text, nullable=False) 

243 # The mode of research (e.g., 'quick_summary', 'detailed_report'). 

244 mode = Column(Text, nullable=False) 

245 # Current status of the research. 

246 status = Column(Text, nullable=False) 

247 # The timestamp when the research started. 

248 created_at = Column(Text, nullable=False) 

249 # The timestamp when the research was completed. 

250 completed_at = Column(Text) 

251 # Duration of the research in seconds. 

252 duration_seconds = Column(Integer) 

253 # Path to the generated report. 

254 report_path = Column(Text) 

255 # Report content stored in database 

256 report_content = Column(Text) 

257 # Additional metadata about the research. 

258 research_meta = Column(JSON) 

259 # Latest progress log message. 

260 progress_log = Column(JSON) 

261 # Current progress of the research (as a percentage). 

262 progress = Column(Integer) 

263 # Title of the research report. 

264 title = Column(Text) 

265 

266 # Optional link to chat session that triggered this research. 

267 # Named Index() in __table_args__ below (NOT index=True on this 

268 # column) so both the create_all path (fresh installs, test 

269 # fixtures) and the migration path (in migration 0010) produce an 

270 # identically-named index: ix_research_history_chat_session_id. 

271 # The matching ResearchResource pattern above documents the same 

272 # divergence risk in detail. 

273 chat_session_id = Column( 

274 String(36), 

275 ForeignKey("chat_sessions.id", ondelete="SET NULL"), 

276 nullable=True, 

277 ) 

278 

279 # Atomic counter for ChatService.add_progress_step's per-research 

280 # sequence_number allocation (mirrors ChatSession.message_count). 

281 step_count = Column(Integer, nullable=False, default=0, server_default="0") 

282 

283 # Relationships 

284 resources = relationship( 

285 "ResearchResource", 

286 back_populates="research", 

287 cascade="all, delete-orphan", 

288 ) 

289 chat_session = relationship( 

290 "ChatSession", 

291 foreign_keys=[chat_session_id], 

292 back_populates="researches", 

293 ) 

294 chat_messages = relationship( 

295 "ChatMessage", 

296 back_populates="research", 

297 passive_deletes=True, 

298 ) 

299 progress_steps = relationship( 

300 "ChatProgressStep", 

301 back_populates="research", 

302 passive_deletes=True, 

303 ) 

304 

305 __table_args__ = ( 

306 Index("ix_research_history_chat_session_id", "chat_session_id"), 

307 # Partial unique index closing the SELECT-then-INSERT race in 

308 # chat/routes.py::send_message — only one in-progress research 

309 # per chat session, NULL chat_session_id rows unconstrained. 

310 # Migration 0010 mirrors this index exactly. 

311 Index( 

312 "ux_research_history_chat_session_in_progress", 

313 "chat_session_id", 

314 unique=True, 

315 sqlite_where=text( 

316 "status = 'in_progress' AND chat_session_id IS NOT NULL" 

317 ), 

318 postgresql_where=text( 

319 "status = 'in_progress' AND chat_session_id IS NOT NULL" 

320 ), 

321 ), 

322 ) 

323 

324 def __repr__(self): 

325 return f"<ResearchHistory(query='{self.query[:50]}...', status={self.status})>" 

326 

327 

328class Research(Base): 

329 """ 

330 Modern research tracking with better type safety. 

331 """ 

332 

333 __tablename__ = "research" 

334 

335 id = Column(Integer, primary_key=True) 

336 query = Column(String, nullable=False) 

337 status = Column( 

338 Enum(ResearchStatus), default=ResearchStatus.PENDING, nullable=False 

339 ) 

340 mode = Column( 

341 Enum(ResearchMode), default=ResearchMode.QUICK, nullable=False 

342 ) 

343 created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False) 

344 updated_at = Column( 

345 UtcDateTime, server_default=utcnow(), onupdate=utcnow(), nullable=False 

346 ) 

347 progress = Column(Float, default=0.0, nullable=False) 

348 start_time = Column(UtcDateTime, nullable=True) 

349 end_time = Column(UtcDateTime, nullable=True) 

350 error_message = Column(Text, nullable=True) 

351 

352 def __repr__(self): 

353 return f"<Research(query='{self.query[:50]}...', status={self.status.value})>" 

354 

355 

356class ResearchStrategy(Base): 

357 """ 

358 Track which search strategy was used for each research. 

359 """ 

360 

361 __tablename__ = "research_strategies" 

362 

363 id = Column(Integer, primary_key=True) 

364 # FK targets research_history.id (the live UUID-keyed table) rather than 

365 # the dormant `research` table. save_research_strategy passes the 

366 # research_history UUID, so the prior Integer FK to research.id raised 

367 # FOREIGN KEY constraint failed on every commit once v1.6.0 turned on 

368 # PRAGMA foreign_keys. 

369 research_id = Column( 

370 String(36), 

371 ForeignKey("research_history.id", ondelete="CASCADE"), 

372 nullable=False, 

373 unique=True, 

374 index=True, 

375 ) 

376 strategy_name = Column(String(100), nullable=False, index=True) 

377 created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False) 

378 

379 def __repr__(self): 

380 return f"<ResearchStrategy(research_id={self.research_id}, strategy={self.strategy_name})>"