Coverage for src/local_deep_research/database/models/research.py: 95%
135 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
2Core research models for tasks, queries, and results.
3"""
5import enum
7from sqlalchemy import (
8 JSON,
9 Column,
10 Enum,
11 Float,
12 ForeignKey,
13 Index,
14 Integer,
15 String,
16 Text,
17 text,
18)
19from sqlalchemy.orm import relationship
21from sqlalchemy_utc import UtcDateTime, utcnow
23from ...constants import ResearchStatus
24from .base import Base
27class ResearchTask(Base):
28 """
29 Main research tasks that users create.
30 This is the top-level object that contains all research activities.
31 """
33 __tablename__ = "research_tasks"
35 id = Column(Integer, primary_key=True)
36 title = Column(String(500), nullable=False)
37 description = Column(Text)
38 status = Column(
39 String(50), default="pending"
40 ) # pending, in_progress, completed, failed
41 priority = Column(Integer, default=0) # Higher number = higher priority
42 tags = Column(JSON) # List of tags for categorization
43 research_metadata = Column(JSON) # Flexible field for additional data
45 # Timestamps
46 created_at = Column(UtcDateTime, default=utcnow())
47 updated_at = Column(UtcDateTime, default=utcnow(), onupdate=utcnow())
48 started_at = Column(UtcDateTime)
49 completed_at = Column(UtcDateTime)
51 # Relationships
52 searches = relationship(
53 "SearchQuery",
54 back_populates="research_task",
55 cascade="all, delete-orphan",
56 )
57 results = relationship(
58 "SearchResult",
59 back_populates="research_task",
60 cascade="all, delete-orphan",
61 )
62 reports = relationship(
63 "Report", back_populates="research_task", cascade="all, delete-orphan"
64 )
66 def __repr__(self):
67 return f"<ResearchTask(title='{self.title}', status='{self.status}')>"
70class SearchQuery(Base):
71 """
72 Individual search queries executed as part of research tasks.
73 Tracks what was searched and when.
74 """
76 __tablename__ = "search_queries"
78 id = Column(Integer, primary_key=True)
79 research_task_id = Column(
80 Integer, ForeignKey("research_tasks.id", ondelete="CASCADE")
81 )
82 query = Column(Text, nullable=False)
83 search_engine = Column(String(50)) # google, bing, duckduckgo, etc.
84 search_type = Column(String(50)) # web, academic, news, etc.
85 parameters = Column(JSON) # Additional search parameters
87 # Status tracking
88 status = Column(
89 String(50), default="pending"
90 ) # pending, executing, completed, failed
91 error_message = Column(Text)
92 retry_count = Column(Integer, default=0)
94 # Timestamps
95 created_at = Column(UtcDateTime, default=utcnow())
96 executed_at = Column(UtcDateTime)
97 completed_at = Column(UtcDateTime)
99 # Relationships
100 research_task = relationship("ResearchTask", back_populates="searches")
101 results = relationship(
102 "SearchResult",
103 back_populates="search_query",
104 cascade="all, delete-orphan",
105 )
107 # Indexes for performance
108 __table_args__ = (
109 Index("idx_research_task_status", "research_task_id", "status"),
110 Index("idx_search_engine", "search_engine", "status"),
111 )
113 def __repr__(self):
114 return f"<SearchQuery(query='{self.query[:50]}...', status='{self.status}')>"
117class SearchResult(Base):
118 """
119 Individual search results from queries.
120 Stores both the initial result and any fetched content.
121 """
123 __tablename__ = "search_results"
125 id = Column(Integer, primary_key=True)
126 research_task_id = Column(
127 Integer, ForeignKey("research_tasks.id", ondelete="CASCADE")
128 )
129 search_query_id = Column(
130 Integer, ForeignKey("search_queries.id", ondelete="CASCADE")
131 )
133 # Basic result information
134 title = Column(String(500))
135 url = Column(Text, index=True) # Indexed for deduplication
136 snippet = Column(Text)
138 # Extended content
139 content = Column(Text) # Full content if fetched
140 content_type = Column(String(50)) # html, pdf, text, etc.
141 content_hash = Column(String(64)) # For deduplication
143 # Metadata
144 relevance_score = Column(Float) # Calculated relevance
145 position = Column(Integer) # Position in search results
146 domain = Column(String(255), index=True)
147 language = Column(String(10))
148 published_date = Column(UtcDateTime)
149 author = Column(String(255))
151 # Status tracking
152 fetch_status = Column(String(50)) # pending, fetched, failed, skipped
153 fetch_error = Column(Text)
155 # Timestamps
156 created_at = Column(UtcDateTime, default=utcnow())
157 fetched_at = Column(UtcDateTime)
159 # Relationships
160 research_task = relationship("ResearchTask", back_populates="results")
161 search_query = relationship("SearchQuery", back_populates="results")
163 # Indexes for performance
164 __table_args__ = (
165 Index("idx_task_relevance", "research_task_id", "relevance_score"),
166 Index("idx_content_hash", "content_hash"),
167 Index("idx_domain_task", "domain", "research_task_id"),
168 )
170 def __repr__(self):
171 return f"<SearchResult(title='{self.title[:50] if self.title else 'No title'}...', score={self.relevance_score})>"
174class ResearchMode(enum.Enum):
175 """Research modes available."""
177 QUICK = "quick"
178 DETAILED = "detailed"
181class ResearchResource(Base):
182 """Resources associated with research projects."""
184 __tablename__ = "research_resources"
186 id = Column(Integer, primary_key=True, autoincrement=True)
187 # Named Index() in __table_args__ below (NOT index=True on this
188 # column) so both the create_all path (fresh installs, test fixtures)
189 # and the migration path (0006:286-289) produce an identically-
190 # named index: ix_research_resources_research_id. Using index=True
191 # would give create_all an auto-named index that diverges from the
192 # migration's named one; that asymmetry previously meant fresh
193 # installs had no research_id index and ran full-table scans on
194 # 20+ call sites.
195 research_id = Column(
196 String(36),
197 ForeignKey("research_history.id", ondelete="CASCADE"),
198 nullable=False,
199 )
200 title = Column(Text)
201 url = Column(Text)
202 content_preview = Column(Text)
203 source_type = Column(Text)
204 resource_metadata = Column("metadata", JSON)
205 created_at = Column(String, nullable=False)
206 document_id = Column(
207 String(36),
208 ForeignKey("documents.id", ondelete="SET NULL"),
209 nullable=True,
210 index=True,
211 )
213 # Relationships
214 research = relationship("ResearchHistory", back_populates="resources")
215 document = relationship("Document", foreign_keys=[document_id])
216 paper_appearance = relationship(
217 "PaperAppearance",
218 back_populates="resource",
219 uselist=False,
220 cascade="all, delete-orphan",
221 )
223 __table_args__ = (
224 Index("ix_research_resources_research_id", "research_id"),
225 )
227 def __repr__(self):
228 return f"<ResearchResource(title='{self.title}', url='{self.url}')>"
231class ResearchHistory(Base):
232 """
233 Research history table.
234 Tracks research sessions and their progress.
235 """
237 __tablename__ = "research_history"
239 # UUID as primary key
240 id = Column(String(36), primary_key=True)
241 # The search query.
242 query = Column(Text, nullable=False)
243 # The mode of research (e.g., 'quick_summary', 'detailed_report').
244 mode = Column(Text, nullable=False)
245 # Current status of the research.
246 status = Column(Text, nullable=False)
247 # The timestamp when the research started.
248 created_at = Column(Text, nullable=False)
249 # The timestamp when the research was completed.
250 completed_at = Column(Text)
251 # Duration of the research in seconds.
252 duration_seconds = Column(Integer)
253 # Path to the generated report.
254 report_path = Column(Text)
255 # Report content stored in database
256 report_content = Column(Text)
257 # Additional metadata about the research.
258 research_meta = Column(JSON)
259 # Latest progress log message.
260 progress_log = Column(JSON)
261 # Current progress of the research (as a percentage).
262 progress = Column(Integer)
263 # Title of the research report.
264 title = Column(Text)
266 # Optional link to chat session that triggered this research.
267 # Named Index() in __table_args__ below (NOT index=True on this
268 # column) so both the create_all path (fresh installs, test
269 # fixtures) and the migration path (in migration 0010) produce an
270 # identically-named index: ix_research_history_chat_session_id.
271 # The matching ResearchResource pattern above documents the same
272 # divergence risk in detail.
273 chat_session_id = Column(
274 String(36),
275 ForeignKey("chat_sessions.id", ondelete="SET NULL"),
276 nullable=True,
277 )
279 # Atomic counter for ChatService.add_progress_step's per-research
280 # sequence_number allocation (mirrors ChatSession.message_count).
281 step_count = Column(Integer, nullable=False, default=0, server_default="0")
283 # Relationships
284 resources = relationship(
285 "ResearchResource",
286 back_populates="research",
287 cascade="all, delete-orphan",
288 )
289 chat_session = relationship(
290 "ChatSession",
291 foreign_keys=[chat_session_id],
292 back_populates="researches",
293 )
294 chat_messages = relationship(
295 "ChatMessage",
296 back_populates="research",
297 passive_deletes=True,
298 )
299 progress_steps = relationship(
300 "ChatProgressStep",
301 back_populates="research",
302 passive_deletes=True,
303 )
305 __table_args__ = (
306 Index("ix_research_history_chat_session_id", "chat_session_id"),
307 # Partial unique index closing the SELECT-then-INSERT race in
308 # chat/routes.py::send_message — only one in-progress research
309 # per chat session, NULL chat_session_id rows unconstrained.
310 # Migration 0010 mirrors this index exactly.
311 Index(
312 "ux_research_history_chat_session_in_progress",
313 "chat_session_id",
314 unique=True,
315 sqlite_where=text(
316 "status = 'in_progress' AND chat_session_id IS NOT NULL"
317 ),
318 postgresql_where=text(
319 "status = 'in_progress' AND chat_session_id IS NOT NULL"
320 ),
321 ),
322 )
324 def __repr__(self):
325 return f"<ResearchHistory(query='{self.query[:50]}...', status={self.status})>"
328class Research(Base):
329 """
330 Modern research tracking with better type safety.
331 """
333 __tablename__ = "research"
335 id = Column(Integer, primary_key=True)
336 query = Column(String, nullable=False)
337 status = Column(
338 Enum(ResearchStatus), default=ResearchStatus.PENDING, nullable=False
339 )
340 mode = Column(
341 Enum(ResearchMode), default=ResearchMode.QUICK, nullable=False
342 )
343 created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)
344 updated_at = Column(
345 UtcDateTime, server_default=utcnow(), onupdate=utcnow(), nullable=False
346 )
347 progress = Column(Float, default=0.0, nullable=False)
348 start_time = Column(UtcDateTime, nullable=True)
349 end_time = Column(UtcDateTime, nullable=True)
350 error_message = Column(Text, nullable=True)
352 def __repr__(self):
353 return f"<Research(query='{self.query[:50]}...', status={self.status.value})>"
356class ResearchStrategy(Base):
357 """
358 Track which search strategy was used for each research.
359 """
361 __tablename__ = "research_strategies"
363 id = Column(Integer, primary_key=True)
364 # FK targets research_history.id (the live UUID-keyed table) rather than
365 # the dormant `research` table. save_research_strategy passes the
366 # research_history UUID, so the prior Integer FK to research.id raised
367 # FOREIGN KEY constraint failed on every commit once v1.6.0 turned on
368 # PRAGMA foreign_keys.
369 research_id = Column(
370 String(36),
371 ForeignKey("research_history.id", ondelete="CASCADE"),
372 nullable=False,
373 unique=True,
374 index=True,
375 )
376 strategy_name = Column(String(100), nullable=False, index=True)
377 created_at = Column(UtcDateTime, server_default=utcnow(), nullable=False)
379 def __repr__(self):
380 return f"<ResearchStrategy(research_id={self.research_id}, strategy={self.strategy_name})>"