Coverage for src / local_deep_research / web_search_engines / engines / search_engine_paperless.py: 90%

301 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Paperless-ngx search engine implementation for Local Deep Research. 

3 

4This module provides a proper search engine implementation that connects to a Paperless-ngx 

5instance, allowing LDR to search and retrieve documents from your personal 

6document management system. 

7""" 

8 

9import re 

10from typing import Any, Dict, List, Optional 

11import requests 

12from urllib.parse import urljoin 

13 

14from langchain_core.language_models import BaseLLM 

15from loguru import logger 

16 

17from ..search_engine_base import BaseSearchEngine 

18from ...security import safe_get 

19 

20 

class PaperlessSearchEngine(BaseSearchEngine):
    """Paperless-ngx search engine implementation with full LDR integration."""

    # Paperless uses TF-IDF keyword (lexical) search, so an LLM relevance
    # filter is useful to weed out keyword-only matches.
    is_lexical = True
    needs_llm_relevance_filter = True

    # Class constants for magic numbers
    MAX_SNIPPET_LENGTH = 3000  # Reasonable limit to avoid context window issues
    SNIPPET_CONTEXT_BEFORE = 500  # Characters before matched term in snippet
    SNIPPET_CONTEXT_AFTER = 2500  # Characters after matched term in snippet
    # NOTE(review): SNIPPET_CONTEXT_BEFORE/AFTER are not referenced anywhere in
    # this module — snippet extraction uses hard-coded sizes instead. Confirm
    # whether they should be wired in or removed.

    def __init__(
        self,
        api_url: str | None = None,
        api_key: str | None = None,
        api_token: str
        | None = None,  # Support both for backwards compatibility
        max_results: int = 10,
        timeout: int = 30,
        verify_ssl: bool = True,
        include_content: bool = True,
        llm: Optional[BaseLLM] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Paperless-ngx search engine.

        Args:
            api_url: Base URL of Paperless-ngx instance (e.g., "http://localhost:8000")
                If not provided, will look for PAPERLESS_API_URL env var
            api_key: API token for authentication (preferred parameter name)
            api_token: API token for authentication (backwards compatibility)
                If not provided, will look for PAPERLESS_API_TOKEN env var
            max_results: Maximum number of search results
            timeout: Request timeout in seconds
            verify_ssl: Whether to verify SSL certificates
            include_content: Whether to include document content in results
            llm: Language model for relevance filtering (optional)
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters passed to parent
        """
        # NOTE(review): the docstring mentions PAPERLESS_* env vars, but this
        # code only consults settings_snapshot — confirm env handling lives in
        # the engine factory or update the docstring.
        super().__init__(
            max_results=max_results,
            llm=llm,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Use provided configuration or get from settings
        self.api_url = api_url
        # Support both api_key and api_token for compatibility
        # (explicit api_key wins when both are passed)
        self.api_token = api_key or api_token

        # If no API URL provided, try to get from settings_snapshot
        if not self.api_url and settings_snapshot:
            self.api_url = settings_snapshot.get(
                "search.engine.web.paperless.default_params.api_url",
                "http://localhost:8000",
            )

        # If no API token provided, try to get from settings_snapshot
        if not self.api_token and settings_snapshot:
            self.api_token = settings_snapshot.get(
                "search.engine.web.paperless.api_key", ""
            )

        # Fix AttributeError: Check if api_url is None before calling rstrip
        if self.api_url:
            # Remove trailing slash from API URL
            self.api_url = self.api_url.rstrip("/")
        else:
            # Default to localhost if nothing provided
            self.api_url = "http://localhost:8000"
            logger.warning(
                "No Paperless API URL provided, using default: http://localhost:8000"
            )

        self.timeout = timeout
        self.verify_ssl = verify_ssl
        self.include_content = include_content

        # Set up headers for authentication
        self.headers: Dict[str, str] = {}
        if self.api_token:
            self.headers["Authorization"] = f"Token {self.api_token}"

        logger.info(
            f"Initialized Paperless-ngx search engine for {self.api_url}"
        )

    def _make_request(
        self, endpoint: str, params: Optional[Dict] = None
    ) -> Dict[str, Any]:
        """
        Make a request to the Paperless-ngx API.

        Args:
            endpoint: API endpoint path (absolute, e.g. "/api/documents/")
            params: Query parameters

        Returns:
            JSON response from the API, or an empty dict on request failure.
        """
        # Endpoints are absolute paths, so urljoin resolves them against the
        # instance root regardless of api_url's trailing slash.
        url = urljoin(self.api_url or "", endpoint)

        logger.debug(f"Making request to: {url}")
        logger.debug(f"Request params: {params}")
        # Log header names only — never log the token value itself.
        logger.debug(
            f"Headers: {self.headers.keys() if self.headers else 'None'}"
        )

        try:
            # Paperless is typically a local/private network service
            response = safe_get(
                url,
                params=params,
                headers=self.headers,
                timeout=self.timeout,
                verify=self.verify_ssl,
                allow_private_ips=True,
                allow_localhost=True,
            )
            response.raise_for_status()
            result = response.json()

            # Log response details
            if isinstance(result, dict):
                if "results" in result:
                    logger.info(
                        f"API returned {len(result.get('results', []))} results, total count: {result.get('count', 'unknown')}"
                    )
                    # Log first result details if available
                    if result.get("results"):
                        first = result["results"][0]
                        logger.debug(
                            f"First result: id={first.get('id')}, title='{first.get('title', 'No title')[:50]}...'"
                        )
                        if "__search_hit__" in first:
                            logger.debug(
                                f"Has search hit data with score={first['__search_hit__'].get('score')}"
                            )
                else:
                    logger.debug(f"API response keys: {result.keys()}")

            return result  # type: ignore[no-any-return]
        except requests.exceptions.RequestException:
            # Network/HTTP errors are soft failures: callers treat {} as
            # "no results" rather than aborting the whole search.
            logger.exception("Error making request to Paperless-ngx")
            logger.debug(f"Failed URL: {url}, params: {params}")
            return {}

171 

172 def _expand_query_with_llm(self, query: str) -> str: 

173 """ 

174 Use LLM to expand query with relevant keywords and synonyms. 

175 

176 Args: 

177 query: Original search query 

178 

179 Returns: 

180 Expanded query with keywords 

181 """ 

182 if not self.llm: 

183 logger.info( 

184 f"No LLM available for query expansion, using original: '{query}'" 

185 ) 

186 return query 

187 

188 try: 

189 prompt = f"""Paperless-ngx uses TF-IDF keyword search, not semantic search. 

190Convert this query into keywords that would appear in documents. 

191 

192Query: "{query}" 

193 

194Output format: keyword1 OR keyword2 OR "multi word phrase" OR keyword3 

195Include synonyms, plural forms, and technical terms. 

196 

197IMPORTANT: Output ONLY the search query. No explanations, no additional text.""" 

198 

199 logger.debug( 

200 f"Sending query expansion prompt to LLM for: '{query}'" 

201 ) 

202 response = self.llm.invoke(prompt) 

203 expanded = ( 

204 str(response.content) 

205 if hasattr(response, "content") 

206 else str(response) 

207 ).strip() 

208 

209 logger.debug( 

210 f"Raw LLM response (first 500 chars): {expanded[:500]}" 

211 ) 

212 

213 # Clean up the response - remove any explanatory text 

214 if "\n" in expanded: 214 ↛ 215line 214 didn't jump to line 215 because the condition on line 214 was never true

215 expanded = expanded.split("\n")[0] 

216 logger.debug("Took first line of LLM response") 

217 

218 # Always trust the LLM's expansion - it knows better than hard-coded rules 

219 logger.info( 

220 f"LLM expanded query from '{query}' to {len(expanded)} chars with {expanded.count('OR')} ORs" 

221 ) 

222 logger.debug( 

223 f"Expanded query preview (first 200 chars): {expanded[:200]}..." 

224 ) 

225 return expanded 

226 

227 except Exception: 

228 logger.exception("Failed to expand query with LLM") 

229 return query 

230 

    def _multi_pass_search(self, query: str) -> List[Dict[str, Any]]:
        """
        Perform multiple search passes with different strategies.

        Pass 1 queries the API with the user's original text. Pass 2 runs
        only when an LLM is available and produced a different expanded
        query. Results are deduplicated by document id, then ranked by the
        server-provided search score.

        Args:
            query: Original search query

        Returns:
            Combined and deduplicated results (at most self.max_results)
        """
        logger.info(f"Starting multi-pass search for query: '{query}'")
        all_results = {}  # Use dict to deduplicate by doc_id

        # Pass 1: Original query
        params = {
            "query": query,
            "page_size": self.max_results,
            "ordering": "-score",
        }

        logger.info(
            f"Pass 1 - Original query: '{query}' (max_results={self.max_results})"
        )
        response = self._make_request("/api/documents/", params=params)

        if response and "results" in response:
            pass1_count = len(response["results"])
            logger.info(f"Pass 1 returned {pass1_count} documents")
            for doc in response["results"]:
                doc_id = doc.get("id")
                if doc_id and doc_id not in all_results:
                    all_results[doc_id] = doc
                    logger.debug(
                        f"Added doc {doc_id}: {doc.get('title', 'No title')}"
                    )
        else:
            logger.warning(
                f"Pass 1 returned no results or invalid response: {response}"
            )

        # Pass 2: LLM-expanded keywords (if LLM available)
        if self.llm:
            expanded_query = self._expand_query_with_llm(query)
            if expanded_query != query:
                # Reuses the pass-1 params dict, swapping in the expanded
                # query and doubling the page size.
                params["query"] = expanded_query
                params["page_size"] = self.max_results * 2  # Get more results

                logger.info(
                    f"Pass 2 - Using expanded query with {expanded_query.count('OR')} ORs"
                )
                logger.debug(
                    f"Pass 2 - Full expanded query (first 500 chars): '{expanded_query[:500]}...'"
                )
                logger.info(
                    f"Pass 2 - Max results set to: {params['page_size']}"
                )
                response = self._make_request("/api/documents/", params=params)

                if response and "results" in response:
                    pass2_new = 0
                    for doc in response["results"]:
                        doc_id = doc.get("id")
                        # Pass-1 hits win on duplicates.
                        if doc_id and doc_id not in all_results:
                            all_results[doc_id] = doc
                            pass2_new += 1
                            logger.debug(
                                f"Pass 2 added new doc {doc_id}: {doc.get('title', 'No title')}"
                            )
                    logger.info(
                        f"Pass 2 found {len(response['results'])} docs, added {pass2_new} new"
                    )
                else:
                    logger.warning("Pass 2 returned no results")
            else:
                logger.info("Pass 2 skipped - expanded query same as original")
        else:
            logger.info("Pass 2 skipped - no LLM available")

        # Sort by relevance score if available
        logger.info(f"Total unique documents collected: {len(all_results)}")
        sorted_results = sorted(
            all_results.values(),
            key=lambda x: x.get("__search_hit__", {}).get("score", 0),
            reverse=True,
        )

        final_results = sorted_results[: self.max_results]
        logger.info(
            f"Returning top {len(final_results)} documents after sorting by score"
        )

        # Log titles and scores of final results
        for i, doc in enumerate(final_results[:5], 1):  # Log first 5
            score = doc.get("__search_hit__", {}).get("score", 0)
            logger.debug(
                f"Result {i}: '{doc.get('title', 'No title')}' (score={score})"
            )

        return final_results

330 

331 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

332 """ 

333 Get preview results from Paperless-ngx using multi-pass strategy. 

334 

335 Args: 

336 query: Search query 

337 

338 Returns: 

339 List of preview dictionaries 

340 """ 

341 try: 

342 # Use multi-pass search strategy 

343 results = self._multi_pass_search(query) 

344 

345 if not results: 

346 return [] 

347 

348 # Convert documents to preview format 

349 # Note: Each document may return multiple previews (one per highlight) 

350 previews = [] 

351 for doc_data in results: 

352 doc_previews = self._convert_document_to_preview( 

353 doc_data, query 

354 ) 

355 # Handle both single preview and list of previews 

356 if isinstance(doc_previews, list): 356 ↛ 357line 356 didn't jump to line 357 because the condition on line 356 was never true

357 previews.extend(doc_previews) 

358 else: 

359 previews.append(doc_previews) 

360 

361 logger.info( 

362 f"Found {len(previews)} documents in Paperless-ngx for query: {query}" 

363 ) 

364 return previews 

365 

366 except Exception: 

367 logger.exception("Error getting previews from Paperless-ngx") 

368 return [] 

369 

    def _convert_document_to_preview(
        self, doc_data: Dict[str, Any], query: str = ""
    ) -> Dict[str, Any] | List[Dict[str, Any]]:
        """
        Convert a Paperless-ngx document to LDR preview format.

        Args:
            doc_data: Document data from the API
            query: Original search query (for context)

        Returns:
            Preview dictionary in LDR format, or a list of preview dicts
            when the search hit carries multiple highlights (one preview
            per highlight so each excerpt can be cited separately).
        """
        # Extract title
        title = doc_data.get("title", f"Document {doc_data.get('id')}")
        doc_id = doc_data.get("id")

        logger.info(
            f"Converting document {doc_id}: '{title}' to preview format"
        )

        # Build URL - use the web interface URL for user access
        url = f"{self.api_url}/documents/{doc_id}/details"
        logger.debug(f"Generated URL for doc {doc_id}: {url}")

        # Extract snippet - prefer highlighted content from search
        snippet = ""
        search_score = 0.0
        search_rank = None
        all_highlights = []  # Initialize empty highlights list

        if "__search_hit__" in doc_data:
            search_hit = doc_data["__search_hit__"]
            logger.debug(
                f"Found __search_hit__ data for doc {doc_id}: score={search_hit.get('score')}, rank={search_hit.get('rank')}"
            )

            # Get highlights - this is the search snippet with matched terms
            if search_hit.get("highlights"):
                # Highlights can be a string or list
                highlights = search_hit.get("highlights")
                logger.info(
                    f"Found highlights for doc {doc_id}: type={type(highlights).__name__}, length={len(str(highlights))}"
                )

                if isinstance(highlights, list):
                    logger.debug(
                        f"Highlights is list with {len(highlights)} items"
                    )
                    # IMPORTANT: Store highlights list for processing later
                    # Each highlight will become a separate search result for proper citation
                    all_highlights = highlights
                    # Use first highlight for the default snippet
                    snippet = highlights[0] if highlights else ""
                    logger.info(
                        f"Will create {len(highlights)} separate results from highlights"
                    )
                else:
                    all_highlights = [
                        str(highlights)
                    ]  # Single highlight as list
                    snippet = str(highlights)

                logger.debug(
                    f"Raw snippet before cleaning (first 200 chars): {snippet[:200]}"
                )

                # Clean HTML tags but preserve the matched text:
                # <span> match markers become markdown bold, all other tags
                # are stripped.
                snippet = re.sub(r"<span[^>]*>", "**", snippet)
                snippet = re.sub(r"</span>", "**", snippet)
                snippet = re.sub(r"<[^>]+>", "", snippet)

                logger.debug(
                    f"Cleaned snippet (first 200 chars): {snippet[:200]}"
                )

                # Limit snippet length to avoid context window issues
                if (
                    self.MAX_SNIPPET_LENGTH
                    and len(snippet) > self.MAX_SNIPPET_LENGTH
                ):
                    # Cut at word boundary to avoid mid-word truncation
                    snippet = (
                        snippet[: self.MAX_SNIPPET_LENGTH].rsplit(" ", 1)[0]
                        + "..."
                    )
                    logger.debug(
                        f"Truncated snippet to {self.MAX_SNIPPET_LENGTH} chars"
                    )

            # Get search relevance metadata
            search_score = search_hit.get("score", 0.0)
            search_rank = search_hit.get("rank")
            logger.info(
                f"Search metadata for doc {doc_id}: score={search_score}, rank={search_rank}"
            )
        else:
            logger.warning(
                f"No __search_hit__ data for doc {doc_id}, will use content fallback"
            )

        if not snippet:
            logger.info(
                f"No snippet from highlights for doc {doc_id}, using content fallback"
            )
            # Fallback to content preview if no highlights available
            content = doc_data.get("content", "")
            if content:
                logger.debug(f"Document has content of length {len(content)}")
                # Try to find context around query terms if possible
                if query:
                    query_terms = query.lower().split()
                    content_lower = content.lower()
                    logger.debug(
                        f"Searching for query terms in content: {query_terms}"
                    )

                    # Find first occurrence of any query term
                    best_pos = -1
                    for term in query_terms:
                        pos = content_lower.find(term)
                        if pos != -1 and (best_pos == -1 or pos < best_pos):
                            best_pos = pos
                            logger.debug(
                                f"Found term '{term}' at position {pos}"
                            )

                    if best_pos != -1:
                        # Extract context around the found term - much larger context for research
                        # NOTE(review): these hard-coded sizes (2000 before /
                        # 8000 after) bypass the SNIPPET_CONTEXT_BEFORE/AFTER
                        # class constants, and this fallback path is not capped
                        # by MAX_SNIPPET_LENGTH — confirm intent.
                        start = max(0, best_pos - 2000)
                        end = min(len(content), best_pos + 8000)
                        snippet = "..." + content[start:end] + "..."
                        logger.info(
                            f"Extracted snippet around query term at position {best_pos}"
                        )
                    else:
                        # Just take the beginning - use 10000 chars for research
                        snippet = content[:10000]
                        logger.info(
                            "No query terms found, using first 10000 chars of content"
                        )
                else:
                    snippet = content[:10000]
                    logger.info(
                        "No query provided, using first 10000 chars of content"
                    )

                if len(content) > 10000:
                    snippet += "..."
            else:
                logger.warning(f"No content available for doc {doc_id}")

        logger.info(f"Final snippet for doc {doc_id} has length {len(snippet)}")

        # Build metadata
        metadata = {
            "doc_id": str(doc_id),
            "correspondent": doc_data.get("correspondent_name", ""),
            "document_type": doc_data.get("document_type_name", ""),
            "created": doc_data.get("created", ""),
            "modified": doc_data.get("modified", ""),
            "archive_serial_number": doc_data.get("archive_serial_number"),
            "search_score": search_score,
            "search_rank": search_rank,
        }

        # Add tags if present
        tags = doc_data.get("tags_list", [])
        if isinstance(tags, list) and tags:
            metadata["tags"] = ", ".join(str(tag) for tag in tags)

        # Build enhanced title with available metadata for better citations
        title_parts = []

        # Add correspondent/author if available
        correspondent = doc_data.get("correspondent_name", "")
        if correspondent:
            title_parts.append(f"{correspondent}.")
            logger.debug(f"Added correspondent to title: {correspondent}")

        # Add the document title
        title_parts.append(title)

        # Add document type if it's meaningful (not just generic types)
        doc_type = doc_data.get("document_type_name", "")
        if doc_type and doc_type not in ["Letter", "Other", "Document", ""]:
            title_parts.append(f"({doc_type})")
            logger.debug(f"Added document type to title: {doc_type}")

        # Add year from created date if available (assumes ISO-style date
        # strings starting with the year — TODO confirm against the API).
        created_date = doc_data.get("created", "")
        if created_date and len(created_date) >= 4:
            year = created_date[:4]
            title_parts.append(year)
            logger.debug(f"Added year to title: {year}")

        # Format the enhanced title for display in sources list
        if title_parts:
            enhanced_title = " ".join(title_parts)
        else:
            enhanced_title = title

        logger.info(f"Enhanced title for doc {doc_id}: '{enhanced_title}'")

        # Build the preview
        preview = {
            "title": enhanced_title,  # Use enhanced title with bibliographic info
            "url": url,
            "link": url,  # Add 'link' key for compatibility with search utilities
            "snippet": snippet,
            "author": doc_data.get("correspondent_name", ""),
            "date": doc_data.get("created", ""),
            "source": "Paperless",  # Keep source as the system name like other engines
            "metadata": metadata,
            "_raw_data": doc_data,  # Store raw data for full content retrieval
        }

        logger.info(
            f"Built preview for doc {doc_id}: URL={url}, snippet_len={len(snippet)}, has_author={bool(preview['author'])}, has_date={bool(preview['date'])}"
        )

        # Check if we have multiple highlights to return as separate results
        if len(all_highlights) > 1:
            # Create multiple previews, one for each highlight
            previews = []
            for i, highlight in enumerate(all_highlights):
                # Clean each highlight (same tag handling as above; these
                # snippets are NOT length-capped — TODO confirm intent)
                clean_snippet = re.sub(r"<span[^>]*>", "**", str(highlight))
                clean_snippet = re.sub(r"</span>", "**", clean_snippet)
                clean_snippet = re.sub(r"<[^>]+>", "", clean_snippet)

                # Create a preview for this highlight
                highlight_preview = {
                    "title": f"{enhanced_title} (excerpt {i + 1})",  # Differentiate each excerpt
                    "url": url,
                    "link": url,
                    "snippet": clean_snippet,
                    "author": doc_data.get("correspondent_name", ""),
                    "date": doc_data.get("created", ""),
                    "source": "Paperless",
                    "metadata": {
                        **metadata,
                        "excerpt_number": i + 1,
                        "total_excerpts": len(all_highlights),
                    },
                    "_raw_data": doc_data,
                }
                previews.append(highlight_preview)

            logger.info(
                f"Created {len(previews)} separate previews from highlights for doc {doc_id}"
            )
            return previews
        # Single preview (original behavior)
        return preview

625 

626 def _get_full_content( 

627 self, relevant_items: List[Dict[str, Any]] 

628 ) -> List[Dict[str, Any]]: 

629 """ 

630 Get full content for relevant documents. 

631 

632 Args: 

633 relevant_items: List of relevant preview dictionaries 

634 

635 Returns: 

636 List of dictionaries with full content 

637 """ 

638 if not self.include_content: 

639 # If content inclusion is disabled, just return previews 

640 return relevant_items 

641 

642 logger.info(f"Getting full content for {len(relevant_items)} documents") 

643 results = [] 

644 for idx, item in enumerate(relevant_items): 

645 try: 

646 logger.info( 

647 f"Processing document {idx + 1}: title='{item.get('title', 'No title')[:50]}...', url={item.get('url', 'No URL')}" 

648 ) 

649 logger.debug(f"Document {idx + 1} keys: {item.keys()}") 

650 logger.debug( 

651 f"Document {idx + 1} has snippet of length: {len(item.get('snippet', ''))}" 

652 ) 

653 

654 # Get the full document content if we have the raw data 

655 if "_raw_data" in item: 

656 doc_data = item["_raw_data"] 

657 full_content = doc_data.get("content", "") 

658 

659 if not full_content: 

660 # Try to fetch the document details 

661 doc_id = item["metadata"].get("doc_id") 

662 if doc_id: 662 ↛ 671line 662 didn't jump to line 671 because the condition on line 662 was always true

663 detail_response = self._make_request( 

664 f"/api/documents/{doc_id}/" 

665 ) 

666 if detail_response: 

667 full_content = detail_response.get( 

668 "content", "" 

669 ) 

670 

671 item["full_content"] = full_content or item["snippet"] 

672 logger.info( 

673 f"Document {idx + 1} full content length: {len(item['full_content'])}" 

674 ) 

675 else: 

676 # Fallback to snippet if no raw data 

677 item["full_content"] = item["snippet"] 

678 logger.info( 

679 f"Document {idx + 1} using snippet as full content (no raw data)" 

680 ) 

681 

682 # Log the final document structure for debugging citation issues 

683 logger.info( 

684 f"Document {idx + 1} final structure: title='{item.get('title', '')[:50]}...', has_link={bool(item.get('link'))}, has_url={bool(item.get('url'))}, source='{item.get('source', 'Unknown')}'" 

685 ) 

686 

687 # Remove the raw data from the result 

688 item.pop("_raw_data", None) 

689 results.append(item) 

690 

691 except Exception: 

692 logger.exception("Error getting full content for document") 

693 item["full_content"] = item["snippet"] 

694 item.pop("_raw_data", None) 

695 results.append(item) 

696 

697 return results 

698 

699 def run( 

700 self, query: str, research_context: Dict[str, Any] | None = None 

701 ) -> List[Dict[str, Any]]: 

702 """ 

703 Execute search on Paperless-ngx. 

704 

705 Args: 

706 query: Search query 

707 research_context: Context from previous research 

708 

709 Returns: 

710 List of search results in LDR format 

711 """ 

712 try: 

713 # Get previews 

714 previews = self._get_previews(query) 

715 

716 if not previews: 

717 return [] 

718 

719 # Apply LLM relevance filtering if enabled by the factory 

720 enable_llm_filter = getattr( 

721 self, "enable_llm_relevance_filter", False 

722 ) 

723 if enable_llm_filter and self.llm: 723 ↛ 724line 723 didn't jump to line 724 because the condition on line 723 was never true

724 filtered_previews = self._filter_for_relevance(previews, query) 

725 if not filtered_previews: 

726 logger.info( 

727 f"LLM relevance filter returned no results " 

728 f"from {len(previews)} previews for query: {query}" 

729 ) 

730 else: 

731 filtered_previews = previews 

732 

733 # Get full content for relevant items 

734 results = self._get_full_content(filtered_previews) 

735 

736 logger.info( 

737 f"Search completed successfully, returning {len(results)} results" 

738 ) 

739 # Enhanced logging to track document structure for citation debugging 

740 for i, r in enumerate(results[:3], 1): 

741 logger.info( 

742 f"Result {i}: title='{r.get('title', '')[:50]}...', " 

743 f"has_full_content={bool(r.get('full_content'))}, " 

744 f"full_content_len={len(r.get('full_content', ''))}, " 

745 f"snippet_len={len(r.get('snippet', ''))}, " 

746 f"url={r.get('url', '')[:50]}" 

747 ) 

748 

749 return results 

750 

751 except Exception: 

752 logger.exception("Error in Paperless-ngx search") 

753 return [] 

754 

755 async def arun(self, query: str) -> List[Dict[str, Any]]: 

756 """ 

757 Async version of search. 

758 

759 Currently falls back to sync version. 

760 """ 

761 return self.run(query) 

762 

763 def test_connection(self) -> bool: 

764 """ 

765 Test the connection to Paperless-ngx. 

766 

767 Returns: 

768 True if connection successful, False otherwise 

769 """ 

770 try: 

771 response = self._make_request("/api/") 

772 return bool(response) 

773 except Exception: 

774 logger.exception("Failed to connect to Paperless-ngx") 

775 return False 

776 

777 def get_document_count(self) -> int: 

778 """ 

779 Get the total number of documents in Paperless-ngx. 

780 

781 Returns: 

782 Number of documents, or -1 if error 

783 """ 

784 try: 

785 response = self._make_request( 

786 "/api/documents/", params={"page_size": 1} 

787 ) 

788 return int(response.get("count", -1)) 

789 except Exception: 

790 logger.debug("Failed to fetch document count", exc_info=True) 

791 return -1