Coverage for src / local_deep_research / database / models / research.py: 95%

129 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Core research models for tasks, queries, and results. 

3""" 

4 

5import enum 

6 

7from sqlalchemy import ( 

8 JSON, 

9 Column, 

10 Enum, 

11 Float, 

12 ForeignKey, 

13 Index, 

14 Integer, 

15 String, 

16 Text, 

17) 

18from sqlalchemy.orm import relationship 

19 

20from sqlalchemy_utc import UtcDateTime, utcnow 

21 

22from ...constants import ResearchStatus 

23from .base import Base 

24 

25 

class ResearchTask(Base):
    """
    Top-level research task created by a user.

    A task owns its search queries, search results and reports: each of
    those relationships is declared with ``cascade="all, delete-orphan"``,
    so child rows follow the task's lifecycle in the ORM session.
    """

    __tablename__ = "research_tasks"

    id = Column(Integer, primary_key=True)
    title = Column(String(500), nullable=False)
    description = Column(Text)
    # One of: pending, in_progress, completed, failed
    status = Column(String(50), default="pending")
    # Higher number = higher priority
    priority = Column(Integer, default=0)
    # List of tags for categorization
    tags = Column(JSON)
    # Flexible field for additional data
    research_metadata = Column(JSON)

    # Timestamps (created_at / updated_at are filled from ``utcnow()``)
    created_at = Column(UtcDateTime, default=utcnow())
    updated_at = Column(UtcDateTime, default=utcnow(), onupdate=utcnow())
    started_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # Relationships (children are owned by the task)
    searches = relationship(
        "SearchQuery",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    results = relationship(
        "SearchResult",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    reports = relationship(
        "Report",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation showing title and status."""
        title, status = self.title, self.status
        return f"<ResearchTask(title='{title}', status='{status}')>"

67 

68 

class SearchQuery(Base):
    """
    Individual search query executed as part of a research task.

    Records what was searched, with which engine and parameters, and how
    the execution went (status, error message, retry count, timestamps).
    """

    __tablename__ = "search_queries"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer, ForeignKey("research_tasks.id", ondelete="CASCADE")
    )
    query = Column(Text, nullable=False)
    search_engine = Column(String(50))  # google, bing, duckduckgo, etc.
    search_type = Column(String(50))  # web, academic, news, etc.
    parameters = Column(JSON)  # Additional search parameters

    # Status tracking
    status = Column(
        String(50), default="pending"
    )  # pending, executing, completed, failed
    error_message = Column(Text)
    retry_count = Column(Integer, default=0)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow())
    executed_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # Relationships
    research_task = relationship("ResearchTask", back_populates="searches")
    results = relationship(
        "SearchResult",
        back_populates="search_query",
        cascade="all, delete-orphan",
    )

    # Indexes for performance
    __table_args__ = (
        Index("idx_research_task_status", "research_task_id", "status"),
        Index("idx_search_engine", "search_engine", "status"),
    )

    def __repr__(self):
        """Return a short debug representation; never raises on unset fields."""
        # ``query`` is None on an instance that has not been populated yet;
        # slicing None would raise TypeError, and a repr must not raise.
        preview = (self.query or "")[:50]
        return f"<SearchQuery(query='{preview}...', status='{self.status}')>"

114 

115 

class SearchResult(Base):
    """
    Single result returned by a search query.

    Holds the initial hit (title, url, snippet) together with any content
    fetched afterwards and the bookkeeping around that fetch.
    """

    __tablename__ = "search_results"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer,
        ForeignKey("research_tasks.id", ondelete="CASCADE"),
    )
    search_query_id = Column(
        Integer,
        ForeignKey("search_queries.id", ondelete="CASCADE"),
    )

    # Basic result information
    title = Column(String(500))
    # Indexed for deduplication
    url = Column(Text, index=True)
    snippet = Column(Text)

    # Extended content
    # Full content if fetched
    content = Column(Text)
    # html, pdf, text, etc.
    content_type = Column(String(50))
    # For deduplication
    content_hash = Column(String(64))

    # Metadata
    # Calculated relevance
    relevance_score = Column(Float)
    # Position in search results
    position = Column(Integer)
    domain = Column(String(255), index=True)
    language = Column(String(10))
    published_date = Column(UtcDateTime)
    author = Column(String(255))

    # Status tracking
    # pending, fetched, failed, skipped
    fetch_status = Column(String(50))
    fetch_error = Column(Text)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow())
    fetched_at = Column(UtcDateTime)

    # Relationships
    research_task = relationship("ResearchTask", back_populates="results")
    search_query = relationship("SearchQuery", back_populates="results")

    # Indexes for performance
    __table_args__ = (
        Index("idx_task_relevance", "research_task_id", "relevance_score"),
        Index("idx_content_hash", "content_hash"),
        Index("idx_domain_task", "domain", "research_task_id"),
    )

    def __repr__(self):
        """Return a short debug representation with a truncated title."""
        # Falls back to a placeholder when the title is missing or empty.
        if self.title:
            shown_title = self.title[:50]
        else:
            shown_title = "No title"
        return f"<SearchResult(title='{shown_title}...', score={self.relevance_score})>"

171 

172 

class ResearchMode(enum.Enum):
    """Enumeration of the research modes a ``Research`` row can use."""

    QUICK = "quick"
    DETAILED = "detailed"

178 

179 

class ResearchResource(Base):
    """
    Resource (e.g. a source URL) attached to a research history entry.

    Each row belongs to exactly one ``research_history`` row and may
    optionally point at a row in ``documents``.
    """

    __tablename__ = "research_resources"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # Owning research session; the row is removed when that session is deleted.
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=False,
    )
    title = Column(Text)
    url = Column(Text)
    content_preview = Column(Text)
    source_type = Column(Text)
    # Stored in the "metadata" column; the attribute uses a different name.
    resource_metadata = Column("metadata", JSON)
    # Kept as a string column, not UtcDateTime.
    # NOTE(review): the expected timestamp format is not visible here.
    created_at = Column(String, nullable=False)
    # Optional link to a document; reset to NULL if the document is deleted.
    document_id = Column(
        String(36),
        ForeignKey("documents.id", ondelete="SET NULL"),
        nullable=True,
        index=True,
    )

    # Relationships
    research = relationship("ResearchHistory", back_populates="resources")
    document = relationship("Document", foreign_keys=[document_id])

    def __repr__(self):
        """Return a short debug representation showing title and url."""
        title, url = self.title, self.url
        return f"<ResearchResource(title='{title}', url='{url}')>"

210 

211 

class ResearchHistory(Base):
    """
    Research history table.

    Tracks research sessions and their progress. Rows are keyed by a
    36-character string id, and the timestamps are stored as text columns.
    """

    __tablename__ = "research_history"

    # UUID as primary key
    id = Column(String(36), primary_key=True)
    # The search query.
    query = Column(Text, nullable=False)
    # The mode of research (e.g., 'quick_summary', 'detailed_report').
    mode = Column(Text, nullable=False)
    # Current status of the research.
    status = Column(Text, nullable=False)
    # The timestamp when the research started.
    created_at = Column(Text, nullable=False)
    # The timestamp when the research was completed.
    completed_at = Column(Text)
    # Duration of the research in seconds.
    duration_seconds = Column(Integer)
    # Path to the generated report.
    report_path = Column(Text)
    # Report content stored in database
    report_content = Column(Text)
    # Additional metadata about the research.
    research_meta = Column(JSON)
    # Latest progress log message.
    progress_log = Column(JSON)
    # Current progress of the research (as a percentage).
    progress = Column(Integer)
    # Title of the research report.
    title = Column(Text)

    # Relationships
    resources = relationship(
        "ResearchResource",
        back_populates="research",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug representation; never raises on unset fields."""
        # ``query`` is None on an instance that has not been populated yet;
        # slicing None would raise TypeError, and a repr must not raise.
        preview = (self.query or "")[:50]
        return f"<ResearchHistory(query='{preview}...', status={self.status})>"

256 

257 

class Research(Base):
    """
    Modern research tracking with better type safety.

    ``status`` and ``mode`` are stored as enum columns (``ResearchStatus``
    and ``ResearchMode``) rather than free-form text.
    """

    __tablename__ = "research"

    id = Column(Integer, primary_key=True, index=True)
    query = Column(String, nullable=False)
    status = Column(
        Enum(ResearchStatus), default=ResearchStatus.PENDING, nullable=False
    )
    mode = Column(
        Enum(ResearchMode), default=ResearchMode.QUICK, nullable=False
    )
    created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, server_default=utcnow(), onupdate=utcnow(), nullable=False
    )
    progress = Column(Float, default=0.0, nullable=False)
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    # Relationship (uselist=False: at most one strategy per research)
    strategy = relationship(
        "ResearchStrategy", back_populates="research", uselist=False
    )

    def __repr__(self):
        """Return a short debug representation; never raises on unset fields."""
        # ``query`` is None until assigned; slicing None would raise TypeError.
        preview = (self.query or "")[:50]
        # ``status`` stays None until the column default is applied on flush,
        # so fall back to the raw attribute when it has no ``.value``.
        status = getattr(self.status, "value", self.status)
        return f"<Research(query='{preview}...', status={status})>"

289 

290 

class ResearchStrategy(Base):
    """
    Records which search strategy was used for a research.

    ``research_id`` is declared unique, so each research has at most one
    strategy row.
    """

    __tablename__ = "research_strategies"

    id = Column(Integer, primary_key=True, index=True)
    # Owning research; the row is removed when that research is deleted.
    research_id = Column(
        Integer,
        ForeignKey("research.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
        index=True,
    )
    strategy_name = Column(String(100), nullable=False, index=True)
    created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)

    # Relationship
    research = relationship("Research", back_populates="strategy")

    def __repr__(self):
        """Return a short debug representation showing research id and strategy."""
        research_id, name = self.research_id, self.strategy_name
        return f"<ResearchStrategy(research_id={research_id}, strategy={name})>"