Coverage for src/local_deep_research/research_library/search/services/research_history

1"""

2Research History Indexer Service

4Enables semantic search over research history by:

5- Converting ResearchHistory reports into indexable Documents

6- Linking documents to the Research History collection

7- Triggering RAG indexing via LibraryRAGService

8"""

10import hashlib

11import uuid

12from datetime import datetime, UTC

13from typing import Any, Dict, Optional

15from loguru import logger

16from sqlalchemy.exc import IntegrityError

18from ....constants import ResearchStatus

19from ....database.library_init import ensure_research_history_collection

20from ....database.models.library import (

21 Document,

22 DocumentStatus,

23 SourceType,

24)

25from ....database.models.research import ResearchHistory

26from ....database.session_context import get_user_db_session

27from ...utils import ensure_in_collection

class ResearchHistoryIndexer:
    """
    Service for indexing research history into a searchable collection.

    Converts research reports into Documents that can be indexed for
    semantic search using the existing RAG infrastructure.
    """

    # Source type names used in the database
    SOURCE_TYPE_REPORT = "research_report"
    COLLECTION_TYPE = "research_history"

    # convert_all_research pages through candidates this many rows at a time.
    # report_content is a large Text column, so loading every completed
    # report body at once can exhaust memory on a big history (#4560). This
    # caps how many bodies are resident at any moment.
    CONVERT_BATCH_SIZE = 50

    def __init__(self, username: str, db_password: Optional[str] = None):
        """
        Initialize the indexer for a user.

        Args:
            username: Username for database access
            db_password: Optional database password for encrypted DB access
        """
        self.username = username
        self.db_password = db_password

    def get_or_create_collection(self) -> str:
        """
        Get or create the Research History collection for this user.

        Returns:
            UUID of the Research History collection
        """
        return ensure_research_history_collection(
            self.username, self.db_password
        )

    def index_research(
        self,
        research_id: str,
        collection_id: Optional[str] = None,
    ) -> Dict[str, Any]:
        """
        Convert a single research entry into a Document and add it to a
        collection. Idempotent — safe to call multiple times.

        Args:
            research_id: UUID of the research to index
            collection_id: Target collection UUID (defaults to Research History)

        Returns:
            Dict with ``status`` set to ``"success"`` or ``"error"``. On
            success it also carries ``research_id``, ``collection_id`` and
            ``documents_added``; on error it carries an ``error`` message.
        """
        if collection_id is None:
            collection_id = self.get_or_create_collection()

        with get_user_db_session(self.username, self.db_password) as session:
            research = (
                session.query(ResearchHistory)
                .filter(ResearchHistory.id == research_id)
                .first()
            )

            if not research:
                return {"status": "error", "error": "Research not found"}

            if research.status != ResearchStatus.COMPLETED:
                return {
                    "status": "error",
                    "error": "Research is not yet completed",
                }

            if not research.report_content:
                return {
                    "status": "error",
                    "error": "Research has no report content",
                }

            try:
                report_doc = self._create_document_from_report(
                    research, collection_id, session
                )
                if report_doc is None:
                    return {
                        "status": "error",
                        "error": "SourceType 'research_report' not found. "
                        "Run library initialization.",
                    }
                logger.info(
                    f"Created/found document for research: {research_id[:8]}"
                )
            except Exception:
                logger.exception("Error creating report document")
                # Discard whatever the helper added/flushed so the session is
                # not left in a failed transaction when the context manager
                # exits (mirrors the per-entry rollback in
                # convert_all_research).
                session.rollback()
                return {
                    "status": "error",
                    "error": "Failed to create report document",
                }

            try:
                session.commit()
            except IntegrityError:
                # Treated as "someone else inserted the same link first":
                # roll back and still report success.
                session.rollback()
                logger.info(
                    f"DocumentCollection already exists for research "
                    f"{research_id[:8]} (concurrent insert)"
                )

            return {
                "status": "success",
                "research_id": research_id,
                "collection_id": collection_id,
                "documents_added": 1,
            }

    def convert_all_research(self, force: bool = False) -> Dict[str, Any]:
        """
        Convert all completed research entries into Documents (without RAG indexing).

        Single-session implementation that calls private helpers directly to
        avoid the nested-session issues that arise on SQLite when
        index_research opens its own session inside a loop.

        Args:
            force: If True, process all entries even if already converted.
                If False (default), skip entries that already have a report
                Document.

        Returns:
            Dict with:
            - converted: Number of research entries successfully converted
            - skipped: Number of entries skipped (already converted)
            - failed: Number of entries that raised an exception
            - collection_id: UUID of the Research History collection

        Note: the "already converted" filter checks ``Document.research_id``.
        When two research entries produce identical ``report_content``,
        ``_create_document_from_report`` reuses the existing Document (its
        ``research_id`` stays as the first creator's), so the duplicate
        research keeps appearing in the candidate set on every call. Calling
        this from a hot path (request handler, polling loop) will repeatedly
        re-attempt those entries. Call only from explicit user actions
        (e.g. the manual ``/convert-all`` endpoint or ``auto_convert_research``
        on research completion).

        Only report Documents are created; source documents are not indexed.
        """
        collection_id = self.get_or_create_collection()

        with get_user_db_session(self.username, self.db_password) as session:
            # Resolve the report SourceType — required to create report Documents
            report_type = (
                session.query(SourceType)
                .filter_by(name=self.SOURCE_TYPE_REPORT)
                .first()
            )
            if report_type is None:
                logger.warning(
                    f"SourceType '{self.SOURCE_TYPE_REPORT}' not found. "
                    "Run library initialization to seed source types before "
                    "converting research history."
                )
                return {
                    "converted": 0,
                    "skipped": 0,
                    "failed": 0,
                    "collection_id": collection_id,
                }

            # A research entry is eligible when it is completed and has a
            # non-empty report body. Shared by the count and the ID query so
            # the two can never drift apart.
            eligible = (
                ResearchHistory.status == ResearchStatus.COMPLETED,
                ResearchHistory.report_content.isnot(None),
                ResearchHistory.report_content != "",
            )

            # Fetch candidate IDs only — optionally excluding already-converted
            # entries. We must NOT materialize every full ResearchHistory row
            # here: report_content is a large Text column, and loading every
            # completed report body at once can exhaust memory on a big history
            # (#4560). IDs are tiny, so the full candidate list is cheap; we
            # then load the full rows one bounded batch at a time below.
            id_query = (
                session.query(ResearchHistory.id)
                .filter(*eligible)
                .order_by(ResearchHistory.created_at.desc())
            )

            total_eligible = 0
            if not force:
                # The total is only needed to derive the skipped count, so
                # the COUNT query is not issued at all when force=True.
                total_eligible = (
                    session.query(ResearchHistory).filter(*eligible).count()
                )

                # Research IDs that already have a report Document
                already_converted_subquery = (
                    session.query(Document.research_id)
                    .filter(Document.source_type_id == report_type.id)
                    .filter(Document.research_id.isnot(None))
                    .distinct()
                    .subquery()
                )
                id_query = id_query.filter(
                    ResearchHistory.id.notin_(
                        already_converted_subquery.select()
                    )
                )

            research_ids = [row.id for row in id_query.all()]

            converted = 0
            skipped = 0 if force else total_eligible - len(research_ids)
            failed = 0

            for start in range(0, len(research_ids), self.CONVERT_BATCH_SIZE):
                batch_ids = research_ids[
                    start : start + self.CONVERT_BATCH_SIZE
                ]
                # Load one batch of full rows (report bodies) at a time so peak
                # memory stays bounded regardless of total history size.
                batch = (
                    session.query(ResearchHistory)
                    .filter(ResearchHistory.id.in_(batch_ids))
                    .order_by(ResearchHistory.created_at.desc())
                    .all()
                )

                for research in batch:
                    try:
                        # Create (or reuse) report Document
                        report_doc = self._create_document_from_report(
                            research,
                            collection_id,
                            session,
                            report_type_id=report_type.id,
                        )
                        if report_doc is None:
                            # SourceType missing inside helper (already warned)
                            failed += 1
                            continue

                        # Commit each entry individually so a rollback on
                        # failure only loses the failing entry, not the batch.
                        session.commit()
                        converted += 1

                    except Exception:
                        logger.exception(
                            f"Error converting research {research.id}"
                        )
                        session.rollback()
                        failed += 1

            logger.info(
                f"convert_all_research complete — converted={converted}, "
                f"skipped={skipped}, failed={failed}"
            )
            return {
                "converted": converted,
                "skipped": skipped,
                "failed": failed,
                "collection_id": collection_id,
            }

    def _create_document_from_report(
        self,
        research: ResearchHistory,
        collection_id: str,
        session,
        report_type_id: Optional[str] = None,
    ) -> Optional[Document]:
        """
        Create (or reuse) the report Document for a research entry and make
        sure it is linked to the target collection.

        Lookup order: a report Document already owned by this research, then
        any Document with the same content hash, and only then a new row.
        The new row is flushed but not committed; committing is left to the
        caller.

        Args:
            research: ResearchHistory entry
            collection_id: Target collection UUID
            session: Database session
            report_type_id: Pre-resolved SourceType ID (avoids N+1 queries
                when called in a loop from convert_all_research)

        Returns:
            The existing or newly created Document, or None when
            ``report_type_id`` was not supplied and the report SourceType
            cannot be found.
        """
        # Resolve report SourceType (cached ID avoids per-entry query)
        if report_type_id is None:
            report_type = (
                session.query(SourceType)
                .filter_by(name=self.SOURCE_TYPE_REPORT)
                .first()
            )
            if report_type is None:
                logger.warning(
                    f"SourceType '{self.SOURCE_TYPE_REPORT}' not found for research "
                    f"{research.id}. Cannot create document — run library initialization "
                    "to seed source types."
                )
                return None
            report_type_id = report_type.id

        existing_doc = (
            session.query(Document)
            .filter(Document.research_id == research.id)
            .filter(Document.source_type_id == report_type_id)
            .first()
        )

        if existing_doc:
            # Ensure it's in the collection
            ensure_in_collection(session, existing_doc.id, collection_id)
            return existing_doc

        # Create document or reuse existing one with same content hash
        # (document_hash has a unique constraint, so identical content
        # must share a Document row — research_id points to the first creator)
        content = research.report_content
        # Encode once; the same bytes feed both the hash and the byte size.
        content_bytes = content.encode("utf-8")
        doc_hash = hashlib.sha256(content_bytes).hexdigest()

        document = (
            session.query(Document)
            .filter(Document.document_hash == doc_hash)
            .first()
        )

        if document is None:
            doc_id = str(uuid.uuid4())
            document = Document(
                id=doc_id,
                source_type_id=report_type_id,
                research_id=research.id,
                document_hash=doc_hash,
                file_size=len(content_bytes),
                file_type="markdown",
                mime_type="text/markdown",
                title=research.title
                or (research.query[:100] if research.query else "Untitled"),
                text_content=content,
                status=DocumentStatus.COMPLETED,
                processed_at=datetime.now(UTC),
                character_count=len(content),
                word_count=len(content.split()),
            )
            session.add(document)
            # Flush so the row exists before the collection link is created.
            session.flush()

        ensure_in_collection(session, document.id, collection_id)
        return document

377

378

def auto_convert_research(
    username: str, research_id: str, db_password: str | None = None
) -> None:
    """Auto-convert a completed research entry to a document in the History collection.

    Builds a ResearchHistoryIndexer for the user, calls ``index_research``
    and logs the resulting status.

    Safe to call from any context — exceptions are caught and logged
    (with traceback) instead of being propagated.

    Args:
        username: Username whose database holds the research entry
        research_id: UUID of the research entry to convert
        db_password: Optional database password for encrypted DB access
    """
    try:
        indexer = ResearchHistoryIndexer(username, db_password=db_password)
        result = indexer.index_research(research_id)
        logger.info(
            f"Auto-converted research {research_id} for user {username}: "
            f"{result.get('status')}"
        )
    except Exception:
        # Best-effort: never let a conversion failure reach the caller.
        # opt(exception=True) keeps the WARNING level but attaches the
        # active traceback so the cause of the failure is not lost.
        logger.opt(exception=True).warning(
            f"Failed to auto-convert research {research_id} for user {username}"
        )

Coverage for src/local_deep_research/research_library/search/services/research_history_indexer.py: 99%

107 statements