Coverage for src / local_deep_research / database / models / research.py: 95%

133 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Core research models for tasks, queries, and results. 

3""" 

4 

5import enum 

6 

7from sqlalchemy import ( 

8 JSON, 

9 Column, 

10 Enum, 

11 Float, 

12 ForeignKey, 

13 Index, 

14 Integer, 

15 String, 

16 Text, 

17) 

18from sqlalchemy.orm import relationship 

19from sqlalchemy_utc import UtcDateTime, utcnow 

20 

21from .base import Base 

22 

23 

class ResearchTask(Base):
    """
    A research task created by a user.

    This is the top-level record of the module: search queries, search
    results and reports are attached to a task through the relationships
    declared below.
    """

    __tablename__ = "research_tasks"

    id = Column(Integer, primary_key=True)
    title = Column(String(500), nullable=False)
    description = Column(Text)
    # Lifecycle state: pending, in_progress, completed, failed
    status = Column(String(50), default="pending")
    # Higher number = higher priority
    priority = Column(Integer, default=0)
    # List of tags for categorization
    tags = Column(JSON)
    # Flexible field for additional data
    research_metadata = Column(JSON)

    # -- Timestamps ---------------------------------------------------------
    created_at = Column(UtcDateTime, default=utcnow())
    updated_at = Column(UtcDateTime, default=utcnow(), onupdate=utcnow())
    started_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # -- Relationships ------------------------------------------------------
    # Each child collection is configured with "all, delete-orphan" cascade.
    searches = relationship(
        "SearchQuery",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    results = relationship(
        "SearchResult",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )
    reports = relationship(
        "Report",
        back_populates="research_task",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """Return a short debug string containing the title and status."""
        title = self.title
        status = self.status
        return f"<ResearchTask(title='{title}', status='{status}')>"

65 

66 

class SearchQuery(Base):
    """
    Individual search queries executed as part of research tasks.

    Tracks what was searched (query text, engine, type, parameters), the
    execution status of the query and when it was created, executed and
    completed.
    """

    __tablename__ = "search_queries"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer, ForeignKey("research_tasks.id", ondelete="CASCADE")
    )
    query = Column(Text, nullable=False)
    search_engine = Column(String(50))  # google, bing, duckduckgo, etc.
    search_type = Column(String(50))  # web, academic, news, etc.
    parameters = Column(JSON)  # Additional search parameters

    # Status tracking
    status = Column(
        String(50), default="pending"
    )  # pending, executing, completed, failed
    error_message = Column(Text)
    retry_count = Column(Integer, default=0)

    # Timestamps
    created_at = Column(UtcDateTime, default=utcnow())
    executed_at = Column(UtcDateTime)
    completed_at = Column(UtcDateTime)

    # Relationships
    research_task = relationship("ResearchTask", back_populates="searches")
    results = relationship(
        "SearchResult",
        back_populates="search_query",
        cascade="all, delete-orphan",
    )

    # Indexes for performance
    __table_args__ = (
        Index("idx_research_task_status", "research_task_id", "status"),
        Index("idx_search_engine", "search_engine", "status"),
    )

    def __repr__(self):
        """
        Return a short debug string with the first 50 characters of the
        query and the current status.

        The query may still be None on an instance that has not been
        populated yet; it is treated as an empty string so that repr()
        never raises.
        """
        query_preview = (self.query or "")[:50]
        return f"<SearchQuery(query='{query_preview}...', status='{self.status}')>"

112 

113 

class SearchResult(Base):
    """
    One result returned by a search query.

    Holds the initial result data (title, URL, snippet) together with any
    content that was fetched for it afterwards.
    """

    __tablename__ = "search_results"

    id = Column(Integer, primary_key=True)
    research_task_id = Column(
        Integer,
        ForeignKey("research_tasks.id", ondelete="CASCADE"),
    )
    search_query_id = Column(
        Integer,
        ForeignKey("search_queries.id", ondelete="CASCADE"),
    )

    # -- Basic result information -------------------------------------------
    title = Column(String(500))
    # Indexed for deduplication
    url = Column(Text, index=True)
    snippet = Column(Text)

    # -- Extended content ---------------------------------------------------
    # Full content if fetched
    content = Column(Text)
    # html, pdf, text, etc.
    content_type = Column(String(50))
    # For deduplication
    content_hash = Column(String(64))

    # -- Metadata -----------------------------------------------------------
    # Calculated relevance
    relevance_score = Column(Float)
    # Position in search results
    position = Column(Integer)
    domain = Column(String(255), index=True)
    language = Column(String(10))
    published_date = Column(UtcDateTime)
    author = Column(String(255))

    # -- Status tracking ----------------------------------------------------
    # pending, fetched, failed, skipped
    fetch_status = Column(String(50))
    fetch_error = Column(Text)

    # -- Timestamps ---------------------------------------------------------
    created_at = Column(UtcDateTime, default=utcnow())
    fetched_at = Column(UtcDateTime)

    # -- Relationships ------------------------------------------------------
    research_task = relationship("ResearchTask", back_populates="results")
    search_query = relationship("SearchQuery", back_populates="results")

    # -- Indexes for performance --------------------------------------------
    __table_args__ = (
        Index("idx_task_relevance", "research_task_id", "relevance_score"),
        Index("idx_content_hash", "content_hash"),
        Index("idx_domain_task", "domain", "research_task_id"),
    )

    def __repr__(self):
        """
        Return a short debug string with the first 50 characters of the
        title (or 'No title' when the title is empty or None) and the
        relevance score.
        """
        if self.title:
            shown_title = self.title[:50]
        else:
            shown_title = "No title"
        return f"<SearchResult(title='{shown_title}...', score={self.relevance_score})>"

169 

170 

class ResearchMode(enum.Enum):
    """Enumeration of the available research modes."""

    # Member values are the lowercase string form of the member name.
    QUICK = "quick"
    DETAILED = "detailed"

176 

177 

class ResearchStatus(enum.Enum):
    """Enumeration of the states a research operation can be in."""

    # Member values are the lowercase string form of the member name.
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"
    CANCELLED = "cancelled"
    SUSPENDED = "suspended"

187 

188 

class ResearchResource(Base):
    """A resource (title, URL, preview, ...) attached to a research history entry."""

    __tablename__ = "research_resources"

    id = Column(Integer, primary_key=True, autoincrement=True)
    # References the string primary key of research_history
    research_id = Column(
        String(36),
        ForeignKey("research_history.id", ondelete="CASCADE"),
        nullable=False,
    )
    title = Column(Text)
    url = Column(Text)
    content_preview = Column(Text)
    source_type = Column(Text)
    # The database column is named "metadata"; the Python attribute uses a
    # different name (presumably because `metadata` is reserved on
    # declarative classes -- confirm against SQLAlchemy docs).
    resource_metadata = Column("metadata", JSON)
    # Stored as a string column; the timestamp format is not defined here.
    created_at = Column(String, nullable=False)

    # -- Relationship -------------------------------------------------------
    research = relationship("ResearchHistory", back_populates="resources")

    def __repr__(self):
        """Return a short debug string containing the title and URL."""
        title = self.title
        url = self.url
        return f"<ResearchResource(title='{title}', url='{url}')>"

212 

213 

class ResearchHistory(Base):
    """
    Research history table.

    Tracks research sessions and their progress. Rows are keyed by a
    36-character string id, and timestamps are stored in text columns.
    """

    __tablename__ = "research_history"

    # UUID as primary key
    id = Column(String(36), primary_key=True)
    # The search query.
    query = Column(Text, nullable=False)
    # The mode of research (e.g., 'quick_summary', 'detailed_report').
    mode = Column(Text, nullable=False)
    # Current status of the research.
    status = Column(Text, nullable=False)
    # The timestamp when the research started.
    created_at = Column(Text, nullable=False)
    # The timestamp when the research was completed.
    completed_at = Column(Text)
    # Duration of the research in seconds.
    duration_seconds = Column(Integer)
    # Path to the generated report.
    report_path = Column(Text)
    # Report content stored in database
    report_content = Column(Text)
    # Additional metadata about the research.
    research_meta = Column(JSON)
    # Latest progress log message.
    progress_log = Column(JSON)
    # Current progress of the research (as a percentage).
    progress = Column(Integer)
    # Title of the research report.
    title = Column(Text)

    # Relationships
    resources = relationship(
        "ResearchResource",
        back_populates="research",
        cascade="all, delete-orphan",
    )

    def __repr__(self):
        """
        Return a short debug string with the first 50 characters of the
        query and the current status.

        The query may still be None on an instance that has not been
        populated yet; it is treated as an empty string so that repr()
        never raises.
        """
        query_preview = (self.query or "")[:50]
        return f"<ResearchHistory(query='{query_preview}...', status={self.status})>"

258 

259 

class Research(Base):
    """
    Modern research tracking with better type safety.

    Status and mode are stored as enum columns (ResearchStatus and
    ResearchMode) and timestamps as UtcDateTime columns.
    """

    __tablename__ = "research"

    id = Column(Integer, primary_key=True, index=True)
    query = Column(String, nullable=False)
    status = Column(
        Enum(ResearchStatus), default=ResearchStatus.PENDING, nullable=False
    )
    mode = Column(
        Enum(ResearchMode), default=ResearchMode.QUICK, nullable=False
    )
    created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)
    updated_at = Column(
        UtcDateTime, server_default=utcnow(), onupdate=utcnow(), nullable=False
    )
    progress = Column(Float, default=0.0, nullable=False)
    start_time = Column(UtcDateTime, nullable=True)
    end_time = Column(UtcDateTime, nullable=True)
    error_message = Column(Text, nullable=True)

    # Relationship (one-to-one: uselist=False)
    # NOTE(review): unlike the parent-side relationships on ResearchTask,
    # SearchQuery and ResearchHistory, no ORM cascade is configured here --
    # confirm how deleting a Research that has a strategy should behave.
    strategy = relationship(
        "ResearchStrategy", back_populates="research", uselist=False
    )

    def __repr__(self):
        """
        Return a short debug string with the first 50 characters of the
        query and the status value.

        Column defaults are only applied when the row is inserted, so on a
        newly constructed instance `status` (and `query`) can still be
        None. Both cases are handled so that repr() never raises.
        """
        query_preview = (self.query or "")[:50]
        # Enum members expose `.value`; None (or any other object without a
        # `.value` attribute) is shown as-is instead of raising AttributeError.
        status = getattr(self.status, "value", self.status)
        return f"<Research(query='{query_preview}...', status={status})>"

291 

292 

class ResearchStrategy(Base):
    """
    Records the search strategy used for a research.

    `research_id` is declared unique, so each research row has at most one
    strategy row.
    """

    __tablename__ = "research_strategies"

    id = Column(Integer, primary_key=True, index=True)
    research_id = Column(
        Integer,
        ForeignKey("research.id", ondelete="CASCADE"),
        nullable=False,
        unique=True,
        index=True,
    )
    strategy_name = Column(String(100), nullable=False, index=True)
    created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)

    # -- Relationship -------------------------------------------------------
    research = relationship("Research", back_populates="strategy")

    def __repr__(self):
        """Return a short debug string with the research id and strategy name."""
        research_id = self.research_id
        name = self.strategy_name
        return f"<ResearchStrategy(research_id={research_id}, strategy={name})>"