Coverage for src/local_deep_research/web_search_engines/engines/search_engine_paperless.py: 90%
301 statements
« prev ^ index » next    coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Paperless-ngx search engine implementation for Local Deep Research.
4This module provides a proper search engine implementation that connects to a Paperless-ngx
5instance, allowing LDR to search and retrieve documents from your personal
6document management system.
7"""
9import re
10from typing import Any, Dict, List, Optional
11import requests
12from urllib.parse import urljoin
14from langchain_core.language_models import BaseLLM
15from loguru import logger
17from ..search_engine_base import BaseSearchEngine
18from ...security import safe_get
class PaperlessSearchEngine(BaseSearchEngine):
    """Search engine backed by a Paperless-ngx document management instance.

    Connects LDR to a personal Paperless-ngx server so research can draw on
    locally stored documents.
    """

    # Paperless uses TF-IDF (lexical) search, so an LLM relevance pass helps.
    is_lexical = True
    needs_llm_relevance_filter = True

    # Snippet sizing constants — keep excerpts within LLM context limits.
    MAX_SNIPPET_LENGTH = 3000  # Hard cap on a single snippet
    SNIPPET_CONTEXT_BEFORE = 500  # Characters kept before a matched term
    SNIPPET_CONTEXT_AFTER = 2500  # Characters kept after a matched term
32 def __init__(
33 self,
34 api_url: str | None = None,
35 api_key: str | None = None,
36 api_token: str
37 | None = None, # Support both for backwards compatibility
38 max_results: int = 10,
39 timeout: int = 30,
40 verify_ssl: bool = True,
41 include_content: bool = True,
42 llm: Optional[BaseLLM] = None,
43 settings_snapshot: Optional[Dict[str, Any]] = None,
44 **kwargs,
45 ):
46 """
47 Initialize the Paperless-ngx search engine.
49 Args:
50 api_url: Base URL of Paperless-ngx instance (e.g., "http://localhost:8000")
51 If not provided, will look for PAPERLESS_API_URL env var
52 api_key: API token for authentication (preferred parameter name)
53 api_token: API token for authentication (backwards compatibility)
54 If not provided, will look for PAPERLESS_API_TOKEN env var
55 max_results: Maximum number of search results
56 timeout: Request timeout in seconds
57 verify_ssl: Whether to verify SSL certificates
58 include_content: Whether to include document content in results
59 llm: Language model for relevance filtering (optional)
60 settings_snapshot: Settings snapshot for thread context
61 **kwargs: Additional parameters passed to parent
62 """
63 super().__init__(
64 max_results=max_results,
65 llm=llm,
66 settings_snapshot=settings_snapshot,
67 **kwargs,
68 )
70 # Use provided configuration or get from settings
71 self.api_url = api_url
72 # Support both api_key and api_token for compatibility
73 self.api_token = api_key or api_token
75 # If no API URL provided, try to get from settings_snapshot
76 if not self.api_url and settings_snapshot:
77 self.api_url = settings_snapshot.get(
78 "search.engine.web.paperless.default_params.api_url",
79 "http://localhost:8000",
80 )
82 # If no API token provided, try to get from settings_snapshot
83 if not self.api_token and settings_snapshot:
84 self.api_token = settings_snapshot.get(
85 "search.engine.web.paperless.api_key", ""
86 )
88 # Fix AttributeError: Check if api_url is None before calling rstrip
89 if self.api_url:
90 # Remove trailing slash from API URL
91 self.api_url = self.api_url.rstrip("/")
92 else:
93 # Default to localhost if nothing provided
94 self.api_url = "http://localhost:8000"
95 logger.warning(
96 "No Paperless API URL provided, using default: http://localhost:8000"
97 )
99 self.timeout = timeout
100 self.verify_ssl = verify_ssl
101 self.include_content = include_content
103 # Set up headers for authentication
104 self.headers = {}
105 if self.api_token:
106 self.headers["Authorization"] = f"Token {self.api_token}"
108 logger.info(
109 f"Initialized Paperless-ngx search engine for {self.api_url}"
110 )
112 def _make_request(
113 self, endpoint: str, params: Optional[Dict] = None
114 ) -> Dict[str, Any]:
115 """
116 Make a request to the Paperless-ngx API.
118 Args:
119 endpoint: API endpoint path
120 params: Query parameters
122 Returns:
123 JSON response from the API
124 """
125 url = urljoin(self.api_url or "", endpoint)
127 logger.debug(f"Making request to: {url}")
128 logger.debug(f"Request params: {params}")
129 logger.debug(
130 f"Headers: {self.headers.keys() if self.headers else 'None'}"
131 )
133 try:
134 # Paperless is typically a local/private network service
135 response = safe_get(
136 url,
137 params=params,
138 headers=self.headers,
139 timeout=self.timeout,
140 verify=self.verify_ssl,
141 allow_private_ips=True,
142 allow_localhost=True,
143 )
144 response.raise_for_status()
145 result = response.json()
147 # Log response details
148 if isinstance(result, dict): 148 ↛ 166line 148 didn't jump to line 166 because the condition on line 148 was always true
149 if "results" in result:
150 logger.info(
151 f"API returned {len(result.get('results', []))} results, total count: {result.get('count', 'unknown')}"
152 )
153 # Log first result details if available
154 if result.get("results"):
155 first = result["results"][0]
156 logger.debug(
157 f"First result: id={first.get('id')}, title='{first.get('title', 'No title')[:50]}...'"
158 )
159 if "__search_hit__" in first:
160 logger.debug(
161 f"Has search hit data with score={first['__search_hit__'].get('score')}"
162 )
163 else:
164 logger.debug(f"API response keys: {result.keys()}")
166 return result # type: ignore[no-any-return]
167 except requests.exceptions.RequestException:
168 logger.exception("Error making request to Paperless-ngx")
169 logger.debug(f"Failed URL: {url}, params: {params}")
170 return {}
172 def _expand_query_with_llm(self, query: str) -> str:
173 """
174 Use LLM to expand query with relevant keywords and synonyms.
176 Args:
177 query: Original search query
179 Returns:
180 Expanded query with keywords
181 """
182 if not self.llm:
183 logger.info(
184 f"No LLM available for query expansion, using original: '{query}'"
185 )
186 return query
188 try:
189 prompt = f"""Paperless-ngx uses TF-IDF keyword search, not semantic search.
190Convert this query into keywords that would appear in documents.
192Query: "{query}"
194Output format: keyword1 OR keyword2 OR "multi word phrase" OR keyword3
195Include synonyms, plural forms, and technical terms.
197IMPORTANT: Output ONLY the search query. No explanations, no additional text."""
199 logger.debug(
200 f"Sending query expansion prompt to LLM for: '{query}'"
201 )
202 response = self.llm.invoke(prompt)
203 expanded = (
204 str(response.content)
205 if hasattr(response, "content")
206 else str(response)
207 ).strip()
209 logger.debug(
210 f"Raw LLM response (first 500 chars): {expanded[:500]}"
211 )
213 # Clean up the response - remove any explanatory text
214 if "\n" in expanded: 214 ↛ 215line 214 didn't jump to line 215 because the condition on line 214 was never true
215 expanded = expanded.split("\n")[0]
216 logger.debug("Took first line of LLM response")
218 # Always trust the LLM's expansion - it knows better than hard-coded rules
219 logger.info(
220 f"LLM expanded query from '{query}' to {len(expanded)} chars with {expanded.count('OR')} ORs"
221 )
222 logger.debug(
223 f"Expanded query preview (first 200 chars): {expanded[:200]}..."
224 )
225 return expanded
227 except Exception:
228 logger.exception("Failed to expand query with LLM")
229 return query
231 def _multi_pass_search(self, query: str) -> List[Dict[str, Any]]:
232 """
233 Perform multiple search passes with different strategies.
235 Args:
236 query: Original search query
238 Returns:
239 Combined and deduplicated results
240 """
241 logger.info(f"Starting multi-pass search for query: '{query}'")
242 all_results = {} # Use dict to deduplicate by doc_id
244 # Pass 1: Original query
245 params = {
246 "query": query,
247 "page_size": self.max_results,
248 "ordering": "-score",
249 }
251 logger.info(
252 f"Pass 1 - Original query: '{query}' (max_results={self.max_results})"
253 )
254 response = self._make_request("/api/documents/", params=params)
256 if response and "results" in response:
257 pass1_count = len(response["results"])
258 logger.info(f"Pass 1 returned {pass1_count} documents")
259 for doc in response["results"]:
260 doc_id = doc.get("id")
261 if doc_id and doc_id not in all_results: 261 ↛ 259line 261 didn't jump to line 259 because the condition on line 261 was always true
262 all_results[doc_id] = doc
263 logger.debug(
264 f"Added doc {doc_id}: {doc.get('title', 'No title')}"
265 )
266 else:
267 logger.warning(
268 f"Pass 1 returned no results or invalid response: {response}"
269 )
271 # Pass 2: LLM-expanded keywords (if LLM available)
272 if self.llm:
273 expanded_query = self._expand_query_with_llm(query)
274 if expanded_query != query: 274 ↛ 305line 274 didn't jump to line 305 because the condition on line 274 was always true
275 params["query"] = expanded_query
276 params["page_size"] = self.max_results * 2 # Get more results
278 logger.info(
279 f"Pass 2 - Using expanded query with {expanded_query.count('OR')} ORs"
280 )
281 logger.debug(
282 f"Pass 2 - Full expanded query (first 500 chars): '{expanded_query[:500]}...'"
283 )
284 logger.info(
285 f"Pass 2 - Max results set to: {params['page_size']}"
286 )
287 response = self._make_request("/api/documents/", params=params)
289 if response and "results" in response: 289 ↛ 303line 289 didn't jump to line 303 because the condition on line 289 was always true
290 pass2_new = 0
291 for doc in response["results"]:
292 doc_id = doc.get("id")
293 if doc_id and doc_id not in all_results:
294 all_results[doc_id] = doc
295 pass2_new += 1
296 logger.debug(
297 f"Pass 2 added new doc {doc_id}: {doc.get('title', 'No title')}"
298 )
299 logger.info(
300 f"Pass 2 found {len(response['results'])} docs, added {pass2_new} new"
301 )
302 else:
303 logger.warning("Pass 2 returned no results")
304 else:
305 logger.info("Pass 2 skipped - expanded query same as original")
306 else:
307 logger.info("Pass 2 skipped - no LLM available")
309 # Sort by relevance score if available
310 logger.info(f"Total unique documents collected: {len(all_results)}")
311 sorted_results = sorted(
312 all_results.values(),
313 key=lambda x: x.get("__search_hit__", {}).get("score", 0),
314 reverse=True,
315 )
317 final_results = sorted_results[: self.max_results]
318 logger.info(
319 f"Returning top {len(final_results)} documents after sorting by score"
320 )
322 # Log titles and scores of final results
323 for i, doc in enumerate(final_results[:5], 1): # Log first 5
324 score = doc.get("__search_hit__", {}).get("score", 0)
325 logger.debug(
326 f"Result {i}: '{doc.get('title', 'No title')}' (score={score})"
327 )
329 return final_results
331 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
332 """
333 Get preview results from Paperless-ngx using multi-pass strategy.
335 Args:
336 query: Search query
338 Returns:
339 List of preview dictionaries
340 """
341 try:
342 # Use multi-pass search strategy
343 results = self._multi_pass_search(query)
345 if not results:
346 return []
348 # Convert documents to preview format
349 # Note: Each document may return multiple previews (one per highlight)
350 previews = []
351 for doc_data in results:
352 doc_previews = self._convert_document_to_preview(
353 doc_data, query
354 )
355 # Handle both single preview and list of previews
356 if isinstance(doc_previews, list): 356 ↛ 357line 356 didn't jump to line 357 because the condition on line 356 was never true
357 previews.extend(doc_previews)
358 else:
359 previews.append(doc_previews)
361 logger.info(
362 f"Found {len(previews)} documents in Paperless-ngx for query: {query}"
363 )
364 return previews
366 except Exception:
367 logger.exception("Error getting previews from Paperless-ngx")
368 return []
370 def _convert_document_to_preview(
371 self, doc_data: Dict[str, Any], query: str = ""
372 ) -> Dict[str, Any] | List[Dict[str, Any]]:
373 """
374 Convert a Paperless-ngx document to LDR preview format.
376 Args:
377 doc_data: Document data from the API
378 query: Original search query (for context)
380 Returns:
381 Preview dictionary in LDR format
382 """
383 # Extract title
384 title = doc_data.get("title", f"Document {doc_data.get('id')}")
385 doc_id = doc_data.get("id")
387 logger.info(
388 f"Converting document {doc_id}: '{title}' to preview format"
389 )
391 # Build URL - use the web interface URL for user access
392 url = f"{self.api_url}/documents/{doc_id}/details"
393 logger.debug(f"Generated URL for doc {doc_id}: {url}")
395 # Extract snippet - prefer highlighted content from search
396 snippet = ""
397 search_score = 0.0
398 search_rank = None
399 all_highlights = [] # Initialize empty highlights list
401 if "__search_hit__" in doc_data:
402 search_hit = doc_data["__search_hit__"]
403 logger.debug(
404 f"Found __search_hit__ data for doc {doc_id}: score={search_hit.get('score')}, rank={search_hit.get('rank')}"
405 )
407 # Get highlights - this is the search snippet with matched terms
408 if search_hit.get("highlights"): 408 ↛ 461line 408 didn't jump to line 461 because the condition on line 408 was always true
409 # Highlights can be a string or list
410 highlights = search_hit.get("highlights")
411 logger.info(
412 f"Found highlights for doc {doc_id}: type={type(highlights).__name__}, length={len(str(highlights))}"
413 )
415 if isinstance(highlights, list):
416 logger.debug(
417 f"Highlights is list with {len(highlights)} items"
418 )
419 # IMPORTANT: Store highlights list for processing later
420 # Each highlight will become a separate search result for proper citation
421 all_highlights = highlights
422 # Use first highlight for the default snippet
423 snippet = highlights[0] if highlights else ""
424 logger.info(
425 f"Will create {len(highlights)} separate results from highlights"
426 )
427 else:
428 all_highlights = [
429 str(highlights)
430 ] # Single highlight as list
431 snippet = str(highlights)
433 logger.debug(
434 f"Raw snippet before cleaning (first 200 chars): {snippet[:200]}"
435 )
437 # Clean HTML tags but preserve the matched text
438 snippet = re.sub(r"<span[^>]*>", "**", snippet)
439 snippet = re.sub(r"</span>", "**", snippet)
440 snippet = re.sub(r"<[^>]+>", "", snippet)
442 logger.debug(
443 f"Cleaned snippet (first 200 chars): {snippet[:200]}"
444 )
446 # Limit snippet length to avoid context window issues
447 if ( 447 ↛ 452line 447 didn't jump to line 452 because the condition on line 447 was never true
448 self.MAX_SNIPPET_LENGTH
449 and len(snippet) > self.MAX_SNIPPET_LENGTH
450 ):
451 # Cut at word boundary to avoid mid-word truncation
452 snippet = (
453 snippet[: self.MAX_SNIPPET_LENGTH].rsplit(" ", 1)[0]
454 + "..."
455 )
456 logger.debug(
457 f"Truncated snippet to {self.MAX_SNIPPET_LENGTH} chars"
458 )
460 # Get search relevance metadata
461 search_score = search_hit.get("score", 0.0)
462 search_rank = search_hit.get("rank")
463 logger.info(
464 f"Search metadata for doc {doc_id}: score={search_score}, rank={search_rank}"
465 )
466 else:
467 logger.warning(
468 f"No __search_hit__ data for doc {doc_id}, will use content fallback"
469 )
471 if not snippet:
472 logger.info(
473 f"No snippet from highlights for doc {doc_id}, using content fallback"
474 )
475 # Fallback to content preview if no highlights available
476 content = doc_data.get("content", "")
477 if content: 477 ↛ 520line 477 didn't jump to line 520 because the condition on line 477 was always true
478 logger.debug(f"Document has content of length {len(content)}")
479 # Try to find context around query terms if possible
480 if query: 480 ↛ 512line 480 didn't jump to line 512 because the condition on line 480 was always true
481 query_terms = query.lower().split()
482 content_lower = content.lower()
483 logger.debug(
484 f"Searching for query terms in content: {query_terms}"
485 )
487 # Find first occurrence of any query term
488 best_pos = -1
489 for term in query_terms:
490 pos = content_lower.find(term)
491 if pos != -1 and (best_pos == -1 or pos < best_pos): 491 ↛ 489line 491 didn't jump to line 489 because the condition on line 491 was always true
492 best_pos = pos
493 logger.debug(
494 f"Found term '{term}' at position {pos}"
495 )
497 if best_pos != -1: 497 ↛ 507line 497 didn't jump to line 507 because the condition on line 497 was always true
498 # Extract context around the found term - much larger context for research
499 start = max(0, best_pos - 2000)
500 end = min(len(content), best_pos + 8000)
501 snippet = "..." + content[start:end] + "..."
502 logger.info(
503 f"Extracted snippet around query term at position {best_pos}"
504 )
505 else:
506 # Just take the beginning - use 10000 chars for research
507 snippet = content[:10000]
508 logger.info(
509 "No query terms found, using first 10000 chars of content"
510 )
511 else:
512 snippet = content[:10000]
513 logger.info(
514 "No query provided, using first 10000 chars of content"
515 )
517 if len(content) > 10000: 517 ↛ 518line 517 didn't jump to line 518 because the condition on line 517 was never true
518 snippet += "..."
519 else:
520 logger.warning(f"No content available for doc {doc_id}")
522 logger.info(f"Final snippet for doc {doc_id} has length {len(snippet)}")
524 # Build metadata
525 metadata = {
526 "doc_id": str(doc_id),
527 "correspondent": doc_data.get("correspondent_name", ""),
528 "document_type": doc_data.get("document_type_name", ""),
529 "created": doc_data.get("created", ""),
530 "modified": doc_data.get("modified", ""),
531 "archive_serial_number": doc_data.get("archive_serial_number"),
532 "search_score": search_score,
533 "search_rank": search_rank,
534 }
536 # Add tags if present
537 tags = doc_data.get("tags_list", [])
538 if isinstance(tags, list) and tags:
539 metadata["tags"] = ", ".join(str(tag) for tag in tags)
541 # Build enhanced title with available metadata for better citations
542 title_parts = []
544 # Add correspondent/author if available
545 correspondent = doc_data.get("correspondent_name", "")
546 if correspondent:
547 title_parts.append(f"{correspondent}.")
548 logger.debug(f"Added correspondent to title: {correspondent}")
550 # Add the document title
551 title_parts.append(title)
553 # Add document type if it's meaningful (not just generic types)
554 doc_type = doc_data.get("document_type_name", "")
555 if doc_type and doc_type not in ["Letter", "Other", "Document", ""]:
556 title_parts.append(f"({doc_type})")
557 logger.debug(f"Added document type to title: {doc_type}")
559 # Add year from created date if available
560 created_date = doc_data.get("created", "")
561 if created_date and len(created_date) >= 4:
562 year = created_date[:4]
563 title_parts.append(year)
564 logger.debug(f"Added year to title: {year}")
566 # Format the enhanced title for display in sources list
567 if title_parts: 567 ↛ 570line 567 didn't jump to line 570 because the condition on line 567 was always true
568 enhanced_title = " ".join(title_parts)
569 else:
570 enhanced_title = title
572 logger.info(f"Enhanced title for doc {doc_id}: '{enhanced_title}'")
574 # Build the preview
575 preview = {
576 "title": enhanced_title, # Use enhanced title with bibliographic info
577 "url": url,
578 "link": url, # Add 'link' key for compatibility with search utilities
579 "snippet": snippet,
580 "author": doc_data.get("correspondent_name", ""),
581 "date": doc_data.get("created", ""),
582 "source": "Paperless", # Keep source as the system name like other engines
583 "metadata": metadata,
584 "_raw_data": doc_data, # Store raw data for full content retrieval
585 }
587 logger.info(
588 f"Built preview for doc {doc_id}: URL={url}, snippet_len={len(snippet)}, has_author={bool(preview['author'])}, has_date={bool(preview['date'])}"
589 )
591 # Check if we have multiple highlights to return as separate results
592 if len(all_highlights) > 1:
593 # Create multiple previews, one for each highlight
594 previews = []
595 for i, highlight in enumerate(all_highlights):
596 # Clean each highlight
597 clean_snippet = re.sub(r"<span[^>]*>", "**", str(highlight))
598 clean_snippet = re.sub(r"</span>", "**", clean_snippet)
599 clean_snippet = re.sub(r"<[^>]+>", "", clean_snippet)
601 # Create a preview for this highlight
602 highlight_preview = {
603 "title": f"{enhanced_title} (excerpt {i + 1})", # Differentiate each excerpt
604 "url": url,
605 "link": url,
606 "snippet": clean_snippet,
607 "author": doc_data.get("correspondent_name", ""),
608 "date": doc_data.get("created", ""),
609 "source": "Paperless",
610 "metadata": {
611 **metadata,
612 "excerpt_number": i + 1,
613 "total_excerpts": len(all_highlights),
614 },
615 "_raw_data": doc_data,
616 }
617 previews.append(highlight_preview)
619 logger.info(
620 f"Created {len(previews)} separate previews from highlights for doc {doc_id}"
621 )
622 return previews
623 # Single preview (original behavior)
624 return preview
626 def _get_full_content(
627 self, relevant_items: List[Dict[str, Any]]
628 ) -> List[Dict[str, Any]]:
629 """
630 Get full content for relevant documents.
632 Args:
633 relevant_items: List of relevant preview dictionaries
635 Returns:
636 List of dictionaries with full content
637 """
638 if not self.include_content:
639 # If content inclusion is disabled, just return previews
640 return relevant_items
642 logger.info(f"Getting full content for {len(relevant_items)} documents")
643 results = []
644 for idx, item in enumerate(relevant_items):
645 try:
646 logger.info(
647 f"Processing document {idx + 1}: title='{item.get('title', 'No title')[:50]}...', url={item.get('url', 'No URL')}"
648 )
649 logger.debug(f"Document {idx + 1} keys: {item.keys()}")
650 logger.debug(
651 f"Document {idx + 1} has snippet of length: {len(item.get('snippet', ''))}"
652 )
654 # Get the full document content if we have the raw data
655 if "_raw_data" in item:
656 doc_data = item["_raw_data"]
657 full_content = doc_data.get("content", "")
659 if not full_content:
660 # Try to fetch the document details
661 doc_id = item["metadata"].get("doc_id")
662 if doc_id: 662 ↛ 671line 662 didn't jump to line 671 because the condition on line 662 was always true
663 detail_response = self._make_request(
664 f"/api/documents/{doc_id}/"
665 )
666 if detail_response:
667 full_content = detail_response.get(
668 "content", ""
669 )
671 item["full_content"] = full_content or item["snippet"]
672 logger.info(
673 f"Document {idx + 1} full content length: {len(item['full_content'])}"
674 )
675 else:
676 # Fallback to snippet if no raw data
677 item["full_content"] = item["snippet"]
678 logger.info(
679 f"Document {idx + 1} using snippet as full content (no raw data)"
680 )
682 # Log the final document structure for debugging citation issues
683 logger.info(
684 f"Document {idx + 1} final structure: title='{item.get('title', '')[:50]}...', has_link={bool(item.get('link'))}, has_url={bool(item.get('url'))}, source='{item.get('source', 'Unknown')}'"
685 )
687 # Remove the raw data from the result
688 item.pop("_raw_data", None)
689 results.append(item)
691 except Exception:
692 logger.exception("Error getting full content for document")
693 item["full_content"] = item["snippet"]
694 item.pop("_raw_data", None)
695 results.append(item)
697 return results
699 def run(
700 self, query: str, research_context: Dict[str, Any] | None = None
701 ) -> List[Dict[str, Any]]:
702 """
703 Execute search on Paperless-ngx.
705 Args:
706 query: Search query
707 research_context: Context from previous research
709 Returns:
710 List of search results in LDR format
711 """
712 try:
713 # Get previews
714 previews = self._get_previews(query)
716 if not previews:
717 return []
719 # Apply LLM relevance filtering if enabled by the factory
720 enable_llm_filter = getattr(
721 self, "enable_llm_relevance_filter", False
722 )
723 if enable_llm_filter and self.llm: 723 ↛ 724line 723 didn't jump to line 724 because the condition on line 723 was never true
724 filtered_previews = self._filter_for_relevance(previews, query)
725 if not filtered_previews:
726 logger.info(
727 f"LLM relevance filter returned no results "
728 f"from {len(previews)} previews for query: {query}"
729 )
730 else:
731 filtered_previews = previews
733 # Get full content for relevant items
734 results = self._get_full_content(filtered_previews)
736 logger.info(
737 f"Search completed successfully, returning {len(results)} results"
738 )
739 # Enhanced logging to track document structure for citation debugging
740 for i, r in enumerate(results[:3], 1):
741 logger.info(
742 f"Result {i}: title='{r.get('title', '')[:50]}...', "
743 f"has_full_content={bool(r.get('full_content'))}, "
744 f"full_content_len={len(r.get('full_content', ''))}, "
745 f"snippet_len={len(r.get('snippet', ''))}, "
746 f"url={r.get('url', '')[:50]}"
747 )
749 return results
751 except Exception:
752 logger.exception("Error in Paperless-ngx search")
753 return []
755 async def arun(self, query: str) -> List[Dict[str, Any]]:
756 """
757 Async version of search.
759 Currently falls back to sync version.
760 """
761 return self.run(query)
763 def test_connection(self) -> bool:
764 """
765 Test the connection to Paperless-ngx.
767 Returns:
768 True if connection successful, False otherwise
769 """
770 try:
771 response = self._make_request("/api/")
772 return bool(response)
773 except Exception:
774 logger.exception("Failed to connect to Paperless-ngx")
775 return False
777 def get_document_count(self) -> int:
778 """
779 Get the total number of documents in Paperless-ngx.
781 Returns:
782 Number of documents, or -1 if error
783 """
784 try:
785 response = self._make_request(
786 "/api/documents/", params={"page_size": 1}
787 )
788 return int(response.get("count", -1))
789 except Exception:
790 logger.debug("Failed to fetch document count", exc_info=True)
791 return -1