Coverage for src/local_deep_research/web_search_engines/engines/search_engine_paperless.py: 81%

295 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Paperless-ngx search engine implementation for Local Deep Research. 

3 

4This module provides a proper search engine implementation that connects to a Paperless-ngx 

5instance, allowing LDR to search and retrieve documents from your personal 

6document management system. 

7""" 

8 

9import re 

10from typing import Any, Dict, List, Optional 

11import requests 

12from urllib.parse import urljoin 

13 

14from langchain_core.language_models import BaseLLM 

15from loguru import logger 

16 

17from ..search_engine_base import BaseSearchEngine 

18from ...security import safe_get 

19 

20 

21class PaperlessSearchEngine(BaseSearchEngine): 

22 """Paperless-ngx search engine implementation with full LDR integration.""" 

23 

24 # Class constants replacing inline magic numbers 

25 MAX_SNIPPET_LENGTH = 3000 # Reasonable limit to avoid context window issues 

26 SNIPPET_CONTEXT_BEFORE = 500 # Characters before matched term in snippet 

27 SNIPPET_CONTEXT_AFTER = 2500 # Characters after matched term in snippet 

28 

29 def __init__( 

30 self, 

31 api_url: Optional[str] = None, 

32 api_key: Optional[str] = None, 

33 api_token: Optional[str] = None, # Support both for backwards compatibility 

34 max_results: int = 10, 

35 timeout: int = 30, 

36 verify_ssl: bool = True, 

37 include_content: bool = True, 

38 llm: Optional[BaseLLM] = None, 

39 settings_snapshot: Optional[Dict[str, Any]] = None, 

40 **kwargs, 

41 ): 

42 """ 

43 Initialize the Paperless-ngx search engine. 

44 

45 Args: 

46 api_url: Base URL of Paperless-ngx instance (e.g., "http://localhost:8000") 

47 If not provided, falls back to the settings snapshot, then to http://localhost:8000 

48 api_key: API token for authentication (preferred parameter name) 

49 api_token: API token for authentication (backwards compatibility) 

50 If not provided, falls back to the settings snapshot 

51 max_results: Maximum number of search results 

52 timeout: Request timeout in seconds 

53 verify_ssl: Whether to verify SSL certificates 

54 include_content: Whether to include document content in results 

55 llm: Language model for relevance filtering (optional) 

56 settings_snapshot: Settings snapshot for thread context 

57 **kwargs: Additional parameters passed to parent 

58 """ 

59 super().__init__( 

60 max_results=max_results, 

61 llm=llm, 

62 settings_snapshot=settings_snapshot, 

63 **kwargs, 

64 ) 

65 

66 # Use provided configuration or get from settings 

67 self.api_url = api_url 

68 # Support both api_key and api_token for compatibility 

69 self.api_token = api_key or api_token 

70 

71 # If no API URL provided, try to get from settings_snapshot 

72 if not self.api_url and settings_snapshot: 

73 self.api_url = settings_snapshot.get( 

74 "search.engine.web.paperless.default_params.api_url", 

75 "http://localhost:8000", 

76 ) 

77 

78 # If no API token provided, try to get from settings_snapshot 

79 if not self.api_token and settings_snapshot: 

80 self.api_token = settings_snapshot.get( 

81 "search.engine.web.paperless.api_key", "" 

82 ) 

83 

84 # Guard against AttributeError: api_url may be None, so check before calling rstrip 

85 if self.api_url: 

86 # Remove trailing slash from API URL 

87 self.api_url = self.api_url.rstrip("/") 

88 else: 

89 # Default to localhost if nothing provided 

90 self.api_url = "http://localhost:8000" 

91 logger.warning( 

92 "No Paperless API URL provided, using default: http://localhost:8000" 

93 ) 

94 

95 self.timeout = timeout 

96 self.verify_ssl = verify_ssl 

97 self.include_content = include_content 

98 

99 # Set up headers for authentication 

100 self.headers = {} 

101 if self.api_token: 

102 self.headers["Authorization"] = f"Token {self.api_token}" 

103 

104 logger.info( 

105 f"Initialized Paperless-ngx search engine for {self.api_url}" 

106 ) 

107 
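
A minimal construction sketch for the engine above. The import path follows this module's package layout as shown in the report header; the URL and token are placeholders, not working credentials:

    from local_deep_research.web_search_engines.engines.search_engine_paperless import (
        PaperlessSearchEngine,
    )

    # Placeholder values; point these at a real Paperless-ngx instance.
    engine = PaperlessSearchEngine(
        api_url="http://localhost:8000",
        api_key="example-token",  # sent as "Authorization: Token <api_key>"
        max_results=5,
        verify_ssl=False,  # e.g. for a self-signed local instance
    )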

108 def _make_request( 

109 self, endpoint: str, params: Optional[Dict] = None 

110 ) -> Dict[str, Any]: 

111 """ 

112 Make a request to the Paperless-ngx API. 

113 

114 Args: 

115 endpoint: API endpoint path 

116 params: Query parameters 

117 

118 Returns: 

119 JSON response from the API 

120 """ 

121 url = urljoin(self.api_url, endpoint) 

122 

123 logger.debug(f"Making request to: {url}") 

124 logger.debug(f"Request params: {params}") 

125 logger.debug( 

126 f"Headers: {self.headers.keys() if self.headers else 'None'}" 

127 ) 

128 

129 try: 

130 # Paperless is typically a local/private network service 

131 response = safe_get( 

132 url, 

133 params=params, 

134 headers=self.headers, 

135 timeout=self.timeout, 

136 verify=self.verify_ssl, 

137 allow_private_ips=True, 

138 allow_localhost=True, 

139 ) 

140 response.raise_for_status() 

141 result = response.json() 

142 

143 # Log response details 

144 if isinstance(result, dict):  # 144 ↛ 162: didn't jump to line 162 because the condition was always true

145 if "results" in result: 

146 logger.info( 

147 f"API returned {len(result.get('results', []))} results, total count: {result.get('count', 'unknown')}" 

148 ) 

149 # Log first result details if available 

150 if result.get("results"): 

151 first = result["results"][0] 

152 logger.debug( 

153 f"First result: id={first.get('id')}, title='{first.get('title', 'No title')[:50]}...'" 

154 ) 

155 if "__search_hit__" in first: 155 ↛ 162line 155 didn't jump to line 162 because the condition on line 155 was always true

156 logger.debug( 

157 f"Has search hit data with score={first['__search_hit__'].get('score')}" 

158 ) 

159 else: 

160 logger.debug(f"API response keys: {result.keys()}") 

161 

162 return result 

163 except requests.exceptions.RequestException: 

164 logger.exception("Error making request to Paperless-ngx") 

165 logger.debug(f"Failed URL: {url}, params: {params}") 

166 return {} 

167 
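
For reference, the request this helper issues is equivalent in shape to the plain requests call sketched below; safe_get layers SSRF protections (explicit private-IP and localhost allowances) on top. The endpoint, params, and header format are the ones used elsewhere in this file; the token is a placeholder:

    import requests

    response = requests.get(
        "http://localhost:8000/api/documents/",
        params={"query": "invoice", "page_size": 10, "ordering": "-score"},
        headers={"Authorization": "Token example-token"},  # placeholder token
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    print(data.get("count"), "matching documents")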

168 def _expand_query_with_llm(self, query: str) -> str: 

169 """ 

170 Use LLM to expand query with relevant keywords and synonyms. 

171 

172 Args: 

173 query: Original search query 

174 

175 Returns: 

176 Expanded query with keywords 

177 """ 

178 if not self.llm: 

179 logger.info( 

180 f"No LLM available for query expansion, using original: '{query}'" 

181 ) 

182 return query 

183 

184 try: 

185 prompt = f"""Paperless-ngx uses TF-IDF keyword search, not semantic search. 

186Convert this query into keywords that would appear in documents. 

187 

188Query: "{query}" 

189 

190Output format: keyword1 OR keyword2 OR "multi word phrase" OR keyword3 

191Include synonyms, plural forms, and technical terms. 

192 

193IMPORTANT: Output ONLY the search query. No explanations, no additional text.""" 

194 

195 logger.debug( 

196 f"Sending query expansion prompt to LLM for: '{query}'" 

197 ) 

198 response = self.llm.invoke(prompt) 

199 expanded = response.content.strip() 

200 

201 logger.debug( 

202 f"Raw LLM response (first 500 chars): {expanded[:500]}" 

203 ) 

204 

205 # Clean up the response - remove any explanatory text 

206 if "\n" in expanded: 206 ↛ 207line 206 didn't jump to line 207 because the condition on line 206 was never true

207 expanded = expanded.split("\n")[0] 

208 logger.debug("Took first line of LLM response") 

209 

210 # Always trust the LLM's expansion - it knows better than hard-coded rules 

211 logger.info( 

212 f"LLM expanded query from '{query}' to {len(expanded)} chars with {expanded.count('OR')} ORs" 

213 ) 

214 logger.debug( 

215 f"Expanded query preview (first 200 chars): {expanded[:200]}..." 

216 ) 

217 return expanded 

218 

219 except Exception: 

220 logger.exception("Failed to expand query with LLM") 

221 return query 

222 
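
To make the prompt's output format concrete, here is a purely hypothetical expansion; the real output depends entirely on the model:

    original = "solar panel installation costs"
    # A plausible single-line LLM response in the requested format:
    expanded = (
        'solar OR photovoltaic OR PV OR "solar panels" '
        'OR installation OR install OR cost OR costs OR pricing'
    )
    # The expanded string is passed verbatim as the Paperless "query"
    # parameter, where OR-joined terms widen the TF-IDF keyword match.
    print(expanded.count("OR"), "OR operators")  # mirrors the log line above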

223 def _multi_pass_search(self, query: str) -> List[Dict[str, Any]]: 

224 """ 

225 Perform multiple search passes with different strategies. 

226 

227 Args: 

228 query: Original search query 

229 

230 Returns: 

231 Combined and deduplicated results 

232 """ 

233 logger.info(f"Starting multi-pass search for query: '{query}'") 

234 all_results = {} # Use dict to deduplicate by doc_id 

235 

236 # Pass 1: Original query 

237 params = { 

238 "query": query, 

239 "page_size": self.max_results, 

240 "ordering": "-score", 

241 } 

242 

243 logger.info( 

244 f"Pass 1 - Original query: '{query}' (max_results={self.max_results})" 

245 ) 

246 response = self._make_request("/api/documents/", params=params) 

247 

248 if response and "results" in response: 

249 pass1_count = len(response["results"]) 

250 logger.info(f"Pass 1 returned {pass1_count} documents") 

251 for doc in response["results"]: 

252 doc_id = doc.get("id") 

253 if doc_id and doc_id not in all_results:  # 253 ↛ 251: didn't jump to line 251 because the condition was always true

254 all_results[doc_id] = doc 

255 logger.debug( 

256 f"Added doc {doc_id}: {doc.get('title', 'No title')}" 

257 ) 

258 else: 

259 logger.warning( 

260 f"Pass 1 returned no results or invalid response: {response}" 

261 ) 

262 

263 # Pass 2: LLM-expanded keywords (if LLM available) 

264 if self.llm:  # 264 ↛ 265: didn't jump to line 265 because the condition was never true

265 expanded_query = self._expand_query_with_llm(query) 

266 if expanded_query != query: 

267 params["query"] = expanded_query 

268 params["page_size"] = self.max_results * 2 # Get more results 

269 

270 logger.info( 

271 f"Pass 2 - Using expanded query with {expanded_query.count('OR')} ORs" 

272 ) 

273 logger.debug( 

274 f"Pass 2 - Full expanded query (first 500 chars): '{expanded_query[:500]}...'" 

275 ) 

276 logger.info( 

277 f"Pass 2 - Max results set to: {params['page_size']}" 

278 ) 

279 response = self._make_request("/api/documents/", params=params) 

280 

281 if response and "results" in response: 

282 pass2_new = 0 

283 for doc in response["results"]: 

284 doc_id = doc.get("id") 

285 if doc_id and doc_id not in all_results: 

286 all_results[doc_id] = doc 

287 pass2_new += 1 

288 logger.debug( 

289 f"Pass 2 added new doc {doc_id}: {doc.get('title', 'No title')}" 

290 ) 

291 logger.info( 

292 f"Pass 2 found {len(response['results'])} docs, added {pass2_new} new" 

293 ) 

294 else: 

295 logger.warning("Pass 2 returned no results") 

296 else: 

297 logger.info("Pass 2 skipped - expanded query same as original") 

298 else: 

299 logger.info("Pass 2 skipped - no LLM available") 

300 

301 # Sort by relevance score if available 

302 logger.info(f"Total unique documents collected: {len(all_results)}") 

303 sorted_results = sorted( 

304 all_results.values(), 

305 key=lambda x: x.get("__search_hit__", {}).get("score", 0), 

306 reverse=True, 

307 ) 

308 

309 final_results = sorted_results[: self.max_results] 

310 logger.info( 

311 f"Returning top {len(final_results)} documents after sorting by score" 

312 ) 

313 

314 # Log titles and scores of final results 

315 for i, doc in enumerate(final_results[:5], 1): # Log first 5 

316 score = doc.get("__search_hit__", {}).get("score", 0) 

317 logger.debug( 

318 f"Result {i}: '{doc.get('title', 'No title')}' (score={score})" 

319 ) 

320 

321 return final_results 

322 
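
The dedup-and-rank step at the end of _multi_pass_search, shown in isolation with fabricated documents shaped like Paperless API results:

    # Fabricated sample documents; scores live under "__search_hit__".
    docs = [
        {"id": 1, "title": "Invoice", "__search_hit__": {"score": 0.42}},
        {"id": 2, "title": "Contract", "__search_hit__": {"score": 0.91}},
        {"id": 1, "title": "Invoice", "__search_hit__": {"score": 0.42}},  # dup
    ]

    # Deduplicate by id; later passes never overwrite earlier results.
    unique = {}
    for doc in docs:
        unique.setdefault(doc["id"], doc)

    # Sort by score, highest first, treating a missing score as 0.
    ranked = sorted(
        unique.values(),
        key=lambda d: d.get("__search_hit__", {}).get("score", 0),
        reverse=True,
    )
    print([d["id"] for d in ranked])  # [2, 1]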

323 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

324 """ 

325 Get preview results from Paperless-ngx using multi-pass strategy. 

326 

327 Args: 

328 query: Search query 

329 

330 Returns: 

331 List of preview dictionaries 

332 """ 

333 try: 

334 # Use multi-pass search strategy 

335 results = self._multi_pass_search(query) 

336 

337 if not results: 

338 return [] 

339 

340 # Convert documents to preview format 

341 # Note: Each document may return multiple previews (one per highlight) 

342 previews = [] 

343 for doc_data in results: 

344 doc_previews = self._convert_document_to_preview( 

345 doc_data, query 

346 ) 

347 # Handle both single preview and list of previews 

348 if isinstance(doc_previews, list):  # 348 ↛ 349: didn't jump to line 349 because the condition was never true

349 previews.extend(doc_previews) 

350 else: 

351 previews.append(doc_previews) 

352 

353 logger.info( 

354 f"Found {len(previews)} documents in Paperless-ngx for query: {query}" 

355 ) 

356 return previews 

357 

358 except Exception: 

359 logger.exception("Error getting previews from Paperless-ngx") 

360 return [] 

361 

362 def _convert_document_to_preview( 

363 self, doc_data: Dict[str, Any], query: str = "" 

364 ) -> Dict[str, Any] | List[Dict[str, Any]]: 

365 """ 

366 Convert a Paperless-ngx document to LDR preview format. 

367 

368 Args: 

369 doc_data: Document data from the API 

370 query: Original search query (for context) 

371 

372 Returns: 

373 Preview dictionary in LDR format, or a list of preview dictionaries (one per search highlight) 

374 """ 

375 # Extract title 

376 title = doc_data.get("title", f"Document {doc_data.get('id')}") 

377 doc_id = doc_data.get("id") 

378 

379 logger.info( 

380 f"Converting document {doc_id}: '{title}' to preview format" 

381 ) 

382 

383 # Build URL - use the web interface URL for user access 

384 url = f"{self.api_url}/documents/{doc_id}/details" 

385 logger.debug(f"Generated URL for doc {doc_id}: {url}") 

386 

387 # Extract snippet - prefer highlighted content from search 

388 snippet = "" 

389 search_score = 0.0 

390 search_rank = None 

391 all_highlights = [] # Initialize empty highlights list 

392 

393 if "__search_hit__" in doc_data: 

394 search_hit = doc_data["__search_hit__"] 

395 logger.debug( 

396 f"Found __search_hit__ data for doc {doc_id}: score={search_hit.get('score')}, rank={search_hit.get('rank')}" 

397 ) 

398 

399 # Get highlights - this is the search snippet with matched terms 

400 if search_hit.get("highlights"):  # 400 ↛ 453: didn't jump to line 453 because the condition was always true

401 # Highlights can be a string or list 

402 highlights = search_hit.get("highlights") 

403 logger.info( 

404 f"Found highlights for doc {doc_id}: type={type(highlights).__name__}, length={len(str(highlights))}" 

405 ) 

406 

407 if isinstance(highlights, list): 

408 logger.debug( 

409 f"Highlights is list with {len(highlights)} items" 

410 ) 

411 # IMPORTANT: Store highlights list for processing later 

412 # Each highlight will become a separate search result for proper citation 

413 all_highlights = highlights 

414 # Use first highlight for the default snippet 

415 snippet = highlights[0] if highlights else "" 

416 logger.info( 

417 f"Will create {len(highlights)} separate results from highlights" 

418 ) 

419 else: 

420 all_highlights = [ 

421 str(highlights) 

422 ] # Single highlight as list 

423 snippet = str(highlights) 

424 

425 logger.debug( 

426 f"Raw snippet before cleaning (first 200 chars): {snippet[:200]}" 

427 ) 

428 

429 # Clean HTML tags but preserve the matched text 

430 snippet = re.sub(r"<span[^>]*>", "**", snippet) 

431 snippet = re.sub(r"</span>", "**", snippet) 

432 snippet = re.sub(r"<[^>]+>", "", snippet) 

433 

434 logger.debug( 

435 f"Cleaned snippet (first 200 chars): {snippet[:200]}" 

436 ) 

437 

438 # Limit snippet length to avoid context window issues 

439 if (  # 439 ↛ 444: didn't jump to line 444 because the condition was never true

440 self.MAX_SNIPPET_LENGTH 

441 and len(snippet) > self.MAX_SNIPPET_LENGTH 

442 ): 

443 # Cut at word boundary to avoid mid-word truncation 

444 snippet = ( 

445 snippet[: self.MAX_SNIPPET_LENGTH].rsplit(" ", 1)[0] 

446 + "..." 

447 ) 

448 logger.debug( 

449 f"Truncated snippet to {self.MAX_SNIPPET_LENGTH} chars" 

450 ) 

451 

452 # Get search relevance metadata 

453 search_score = search_hit.get("score", 0.0) 

454 search_rank = search_hit.get("rank") 

455 logger.info( 

456 f"Search metadata for doc {doc_id}: score={search_score}, rank={search_rank}" 

457 ) 

458 else: 

459 logger.warning( 

460 f"No __search_hit__ data for doc {doc_id}, will use content fallback" 

461 ) 

462 

463 if not snippet: 

464 logger.info( 

465 f"No snippet from highlights for doc {doc_id}, using content fallback" 

466 ) 

467 # Fallback to content preview if no highlights available 

468 content = doc_data.get("content", "") 

469 if content:  # 469 ↛ 512: didn't jump to line 512 because the condition was always true

470 logger.debug(f"Document has content of length {len(content)}") 

471 # Try to find context around query terms if possible 

472 if query:  # 472 ↛ 504: didn't jump to line 504 because the condition was always true

473 query_terms = query.lower().split() 

474 content_lower = content.lower() 

475 logger.debug( 

476 f"Searching for query terms in content: {query_terms}" 

477 ) 

478 

479 # Find first occurrence of any query term 

480 best_pos = -1 

481 for term in query_terms: 

482 pos = content_lower.find(term) 

483 if pos != -1 and (best_pos == -1 or pos < best_pos):  # 483 ↛ 481: didn't jump to line 481 because the condition was always true

484 best_pos = pos 

485 logger.debug( 

486 f"Found term '{term}' at position {pos}" 

487 ) 

488 

489 if best_pos != -1:  # 489 ↛ 499: didn't jump to line 499 because the condition was always true

490 # Extract context around the found term - much larger context for research 

491 start = max(0, best_pos - 2000) 

492 end = min(len(content), best_pos + 8000) 

493 snippet = "..." + content[start:end] + "..." 

494 logger.info( 

495 f"Extracted snippet around query term at position {best_pos}" 

496 ) 

497 else: 

498 # Just take the beginning - use 10000 chars for research 

499 snippet = content[:10000] 

500 logger.info( 

501 "No query terms found, using first 10000 chars of content" 

502 ) 

503 else: 

504 snippet = content[:10000] 

505 logger.info( 

506 "No query provided, using first 10000 chars of content" 

507 ) 

508 

509 if len(content) > 10000:  # 509 ↛ 510: didn't jump to line 510 because the condition was never true

510 snippet += "..." 

511 else: 

512 logger.warning(f"No content available for doc {doc_id}") 

513 

514 logger.info(f"Final snippet for doc {doc_id} has length {len(snippet)}") 

515 

516 # Build metadata 

517 metadata = { 

518 "doc_id": str(doc_id), 

519 "correspondent": doc_data.get("correspondent_name", ""), 

520 "document_type": doc_data.get("document_type_name", ""), 

521 "created": doc_data.get("created", ""), 

522 "modified": doc_data.get("modified", ""), 

523 "archive_serial_number": doc_data.get("archive_serial_number"), 

524 "search_score": search_score, 

525 "search_rank": search_rank, 

526 } 

527 

528 # Add tags if present 

529 tags = doc_data.get("tags_list", []) 

530 if isinstance(tags, list) and tags:  # 530 ↛ 531: didn't jump to line 531 because the condition was never true

531 metadata["tags"] = ", ".join(str(tag) for tag in tags) 

532 

533 # Build enhanced title with available metadata for better citations 

534 title_parts = [] 

535 

536 # Add correspondent/author if available 

537 correspondent = doc_data.get("correspondent_name", "") 

538 if correspondent: 

539 title_parts.append(f"{correspondent}.") 

540 logger.debug(f"Added correspondent to title: {correspondent}") 

541 

542 # Add the document title 

543 title_parts.append(title) 

544 

545 # Add document type if it's meaningful (not just generic types) 

546 doc_type = doc_data.get("document_type_name", "") 

547 if doc_type and doc_type not in ["Letter", "Other", "Document", ""]: 

548 title_parts.append(f"({doc_type})") 

549 logger.debug(f"Added document type to title: {doc_type}") 

550 

551 # Add year from created date if available 

552 created_date = doc_data.get("created", "") 

553 if created_date and len(created_date) >= 4: 

554 year = created_date[:4] 

555 title_parts.append(year) 

556 logger.debug(f"Added year to title: {year}") 

557 

558 # Format the enhanced title for display in sources list 

559 if title_parts:  # 559 ↛ 562: didn't jump to line 562 because the condition was always true

560 enhanced_title = " ".join(title_parts) 

561 else: 

562 enhanced_title = title 

563 

564 logger.info(f"Enhanced title for doc {doc_id}: '{enhanced_title}'") 

565 

566 # Build the preview 

567 preview = { 

568 "title": enhanced_title, # Use enhanced title with bibliographic info 

569 "url": url, 

570 "link": url, # Add 'link' key for compatibility with search utilities 

571 "snippet": snippet, 

572 "author": doc_data.get("correspondent_name", ""), 

573 "date": doc_data.get("created", ""), 

574 "source": "Paperless", # Keep source as the system name like other engines 

575 "metadata": metadata, 

576 "_raw_data": doc_data, # Store raw data for full content retrieval 

577 } 

578 

579 logger.info( 

580 f"Built preview for doc {doc_id}: URL={url}, snippet_len={len(snippet)}, has_author={bool(preview['author'])}, has_date={bool(preview['date'])}" 

581 ) 

582 

583 # Check if we have multiple highlights to return as separate results 

584 if len(all_highlights) > 1: 

585 # Create multiple previews, one for each highlight 

586 previews = [] 

587 for i, highlight in enumerate(all_highlights): 

588 # Clean each highlight 

589 clean_snippet = re.sub(r"<span[^>]*>", "**", str(highlight)) 

590 clean_snippet = re.sub(r"</span>", "**", clean_snippet) 

591 clean_snippet = re.sub(r"<[^>]+>", "", clean_snippet) 

592 

593 # Create a preview for this highlight 

594 highlight_preview = { 

595 "title": f"{enhanced_title} (excerpt {i + 1})", # Differentiate each excerpt 

596 "url": url, 

597 "link": url, 

598 "snippet": clean_snippet, 

599 "author": doc_data.get("correspondent_name", ""), 

600 "date": doc_data.get("created", ""), 

601 "source": "Paperless", 

602 "metadata": { 

603 **metadata, 

604 "excerpt_number": i + 1, 

605 "total_excerpts": len(all_highlights), 

606 }, 

607 "_raw_data": doc_data, 

608 } 

609 previews.append(highlight_preview) 

610 

611 logger.info( 

612 f"Created {len(previews)} separate previews from highlights for doc {doc_id}" 

613 ) 

614 return previews 

615 else: 

616 # Single preview (original behavior) 

617 return preview 

618 
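
The three regex substitutions used above turn Paperless's <span>-highlighted HTML into markdown-style bold while stripping other tags; a standalone check on a made-up fragment:

    import re

    raw = 'Paid to <span class="match">ACME Corp</span> on <b>2023-01-15</b>.'

    snippet = re.sub(r"<span[^>]*>", "**", raw)  # opening spans -> bold marker
    snippet = re.sub(r"</span>", "**", snippet)  # closing spans -> bold marker
    snippet = re.sub(r"<[^>]+>", "", snippet)  # drop any remaining tags

    print(snippet)  # Paid to **ACME Corp** on 2023-01-15.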

619 def _get_full_content( 

620 self, relevant_items: List[Dict[str, Any]] 

621 ) -> List[Dict[str, Any]]: 

622 """ 

623 Get full content for relevant documents. 

624 

625 Args: 

626 relevant_items: List of relevant preview dictionaries 

627 

628 Returns: 

629 List of dictionaries with full content 

630 """ 

631 if not self.include_content: 

632 # If content inclusion is disabled, just return previews 

633 return relevant_items 

634 

635 logger.info(f"Getting full content for {len(relevant_items)} documents") 

636 results = [] 

637 for idx, item in enumerate(relevant_items): 

638 try: 

639 logger.info( 

640 f"Processing document {idx + 1}: title='{item.get('title', 'No title')[:50]}...', url={item.get('url', 'No URL')}" 

641 ) 

642 logger.debug(f"Document {idx + 1} keys: {item.keys()}") 

643 logger.debug( 

644 f"Document {idx + 1} has snippet of length: {len(item.get('snippet', ''))}" 

645 ) 

646 

647 # Get the full document content if we have the raw data 

648 if "_raw_data" in item: 

649 doc_data = item["_raw_data"] 

650 full_content = doc_data.get("content", "") 

651 

652 if not full_content: 

653 # Try to fetch the document details 

654 doc_id = item["metadata"].get("doc_id") 

655 if doc_id:  # 655 ↛ 664: didn't jump to line 664 because the condition was always true

656 detail_response = self._make_request( 

657 f"/api/documents/{doc_id}/" 

658 ) 

659 if detail_response:  # 659 ↛ 664: didn't jump to line 664 because the condition was always true

660 full_content = detail_response.get( 

661 "content", "" 

662 ) 

663 

664 item["full_content"] = full_content or item["snippet"] 

665 logger.info( 

666 f"Document {idx + 1} full content length: {len(item['full_content'])}" 

667 ) 

668 else: 

669 # Fallback to snippet if no raw data 

670 item["full_content"] = item["snippet"] 

671 logger.info( 

672 f"Document {idx + 1} using snippet as full content (no raw data)" 

673 ) 

674 

675 # Log the final document structure for debugging citation issues 

676 logger.info( 

677 f"Document {idx + 1} final structure: title='{item.get('title', '')[:50]}...', has_link={bool(item.get('link'))}, has_url={bool(item.get('url'))}, source='{item.get('source', 'Unknown')}'" 

678 ) 

679 

680 # Remove the raw data from the result 

681 item.pop("_raw_data", None) 

682 results.append(item) 

683 

684 except Exception: 

685 logger.exception("Error getting full content for document") 

686 item["full_content"] = item["snippet"] 

687 item.pop("_raw_data", None) 

688 results.append(item) 

689 

690 return results 

691 

692 def run( 

693 self, query: str, research_context: Dict[str, Any] | None = None 

694 ) -> List[Dict[str, Any]]: 

695 """ 

696 Execute search on Paperless-ngx. 

697 

698 Args: 

699 query: Search query 

700 research_context: Context from previous research 

701 

702 Returns: 

703 List of search results in LDR format 

704 """ 

705 try: 

706 # Get previews 

707 previews = self._get_previews(query) 

708 

709 if not previews: 

710 return [] 

711 

712 # Apply LLM filtering if available 

713 if (  # 713 ↛ 718: didn't jump to line 718 because the condition was never true

714 self.llm 

715 and hasattr(self, "_content_filters") 

716 and self._content_filters 

717 ): 

718 filtered_previews = self._apply_content_filters(previews, query) 

719 else: 

720 filtered_previews = previews 

721 

722 # Get full content for relevant items 

723 results = self._get_full_content(filtered_previews) 

724 

725 logger.info( 

726 f"Search completed successfully, returning {len(results)} results" 

727 ) 

728 # Enhanced logging to track document structure for citation debugging 

729 for i, r in enumerate(results[:3], 1): 

730 logger.info( 

731 f"Result {i}: title='{r.get('title', '')[:50]}...', " 

732 f"has_full_content={bool(r.get('full_content'))}, " 

733 f"full_content_len={len(r.get('full_content', ''))}, " 

734 f"snippet_len={len(r.get('snippet', ''))}, " 

735 f"url={r.get('url', '')[:50]}" 

736 ) 

737 

738 return results 

739 

740 except Exception: 

741 logger.exception("Error in Paperless-ngx search") 

742 return [] 

743 

744 async def arun(self, query: str) -> List[Dict[str, Any]]: 

745 """ 

746 Async version of search. 

747 

748 Currently falls back to sync version. 

749 """ 

750 return self.run(query) 

751 

752 def test_connection(self) -> bool: 

753 """ 

754 Test the connection to Paperless-ngx. 

755 

756 Returns: 

757 True if connection successful, False otherwise 

758 """ 

759 try: 

760 response = self._make_request("/api/") 

761 return bool(response) 

762 except Exception: 

763 logger.exception("Failed to connect to Paperless-ngx") 

764 return False 

765 

766 def get_document_count(self) -> int: 

767 """ 

768 Get the total number of documents in Paperless-ngx. 

769 

770 Returns: 

771 Number of documents, or -1 if error 

772 """ 

773 try: 

774 response = self._make_request( 

775 "/api/documents/", params={"page_size": 1} 

776 ) 

777 return response.get("count", -1) 

778 except Exception: 

779 return -1
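
Taken together, an end-to-end use of the engine might look like this hedged sketch; the URL and token are placeholders, and the result fields (title, url, snippet) match the previews built above:

    from local_deep_research.web_search_engines.engines.search_engine_paperless import (
        PaperlessSearchEngine,
    )

    engine = PaperlessSearchEngine(
        api_url="http://localhost:8000",  # placeholder
        api_key="example-token",  # placeholder
        max_results=5,
    )

    if engine.test_connection():
        print("Documents in archive:", engine.get_document_count())
        for result in engine.run("insurance policy"):
            print(result["title"], "->", result["url"])
            print(result["snippet"][:120], "...")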