Coverage for src/local_deep_research/web_search_engines/engines/search_engine_paperless.py: 81%
295 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1"""
2Paperless-ngx search engine implementation for Local Deep Research.
4This module provides a proper search engine implementation that connects to a Paperless-ngx
5instance, allowing LDR to search and retrieve documents from your personal
6document management system.
7"""
9import re
10from typing import Any, Dict, List, Optional
11import requests
12from urllib.parse import urljoin
14from langchain_core.language_models import BaseLLM
15from loguru import logger
17from ..search_engine_base import BaseSearchEngine
18from ...security import safe_get
21class PaperlessSearchEngine(BaseSearchEngine):
22 """Paperless-ngx search engine implementation with full LDR integration."""
24 # Class constants for magic numbers
25 MAX_SNIPPET_LENGTH = 3000 # Reasonable limit to avoid context window issues
26 SNIPPET_CONTEXT_BEFORE = 500 # Characters before matched term in snippet
27 SNIPPET_CONTEXT_AFTER = 2500 # Characters after matched term in snippet
29 def __init__(
30 self,
31 api_url: Optional[str] = None,
32 api_key: Optional[str] = None,
33 api_token: Optional[str] = None, # Support both for backwards compatibility
34 max_results: int = 10,
35 timeout: int = 30,
36 verify_ssl: bool = True,
37 include_content: bool = True,
38 llm: Optional[BaseLLM] = None,
39 settings_snapshot: Optional[Dict[str, Any]] = None,
40 **kwargs,
41 ):
42 """
43 Initialize the Paperless-ngx search engine.
45 Args:
46 api_url: Base URL of Paperless-ngx instance (e.g., "http://localhost:8000")
47 If not provided, falls back to the settings snapshot (default: http://localhost:8000)
48 api_key: API token for authentication (preferred parameter name)
49 api_token: API token for authentication (backwards compatibility)
50 If not provided, falls back to the settings snapshot
51 max_results: Maximum number of search results
52 timeout: Request timeout in seconds
53 verify_ssl: Whether to verify SSL certificates
54 include_content: Whether to include document content in results
55 llm: Language model for relevance filtering (optional)
56 settings_snapshot: Settings snapshot for thread context
57 **kwargs: Additional parameters passed to parent
58 """
59 super().__init__(
60 max_results=max_results,
61 llm=llm,
62 settings_snapshot=settings_snapshot,
63 **kwargs,
64 )
66 # Use provided configuration or get from settings
67 self.api_url = api_url
68 # Support both api_key and api_token for compatibility
69 self.api_token = api_key or api_token
71 # If no API URL provided, try to get from settings_snapshot
72 if not self.api_url and settings_snapshot:
73 self.api_url = settings_snapshot.get(
74 "search.engine.web.paperless.default_params.api_url",
75 "http://localhost:8000",
76 )
78 # If no API token provided, try to get from settings_snapshot
79 if not self.api_token and settings_snapshot:
80 self.api_token = settings_snapshot.get(
81 "search.engine.web.paperless.api_key", ""
82 )
84 # Fix AttributeError: Check if api_url is None before calling rstrip
85 if self.api_url:
86 # Remove trailing slash from API URL
87 self.api_url = self.api_url.rstrip("/")
88 else:
89 # Default to localhost if nothing provided
90 self.api_url = "http://localhost:8000"
91 logger.warning(
92 "No Paperless API URL provided, using default: http://localhost:8000"
93 )
95 self.timeout = timeout
96 self.verify_ssl = verify_ssl
97 self.include_content = include_content
99 # Set up headers for authentication
100 self.headers = {}
101 if self.api_token:
102 self.headers["Authorization"] = f"Token {self.api_token}"
104 logger.info(
105 f"Initialized Paperless-ngx search engine for {self.api_url}"
106 )
108 def _make_request(
109 self, endpoint: str, params: Optional[Dict] = None
110 ) -> Dict[str, Any]:
111 """
112 Make a request to the Paperless-ngx API.
114 Args:
115 endpoint: API endpoint path
116 params: Query parameters
118 Returns:
119 JSON response from the API
120 """
121 url = urljoin(self.api_url, endpoint)
123 logger.debug(f"Making request to: {url}")
124 logger.debug(f"Request params: {params}")
125 logger.debug(
126 f"Headers: {self.headers.keys() if self.headers else 'None'}"
127 )
129 try:
130 # Paperless is typically a local/private network service
131 response = safe_get(
132 url,
133 params=params,
134 headers=self.headers,
135 timeout=self.timeout,
136 verify=self.verify_ssl,
137 allow_private_ips=True,
138 allow_localhost=True,
139 )
140 response.raise_for_status()
141 result = response.json()
143 # Log response details
144 if isinstance(result, dict): 144 ↛ 162: line 144 didn't jump to line 162 because the condition on line 144 was always true
145 if "results" in result:
146 logger.info(
147 f"API returned {len(result.get('results', []))} results, total count: {result.get('count', 'unknown')}"
148 )
149 # Log first result details if available
150 if result.get("results"):
151 first = result["results"][0]
152 logger.debug(
153 f"First result: id={first.get('id')}, title='{first.get('title', 'No title')[:50]}...'"
154 )
155 if "__search_hit__" in first: 155 ↛ 162line 155 didn't jump to line 162 because the condition on line 155 was always true
156 logger.debug(
157 f"Has search hit data with score={first['__search_hit__'].get('score')}"
158 )
159 else:
160 logger.debug(f"API response keys: {result.keys()}")
162 return result
163 except requests.exceptions.RequestException:
164 logger.exception("Error making request to Paperless-ngx")
165 logger.debug(f"Failed URL: {url}, params: {params}")
166 return {}
168 def _expand_query_with_llm(self, query: str) -> str:
169 """
170 Use LLM to expand query with relevant keywords and synonyms.
172 Args:
173 query: Original search query
175 Returns:
176 Expanded query with keywords
177 """
178 if not self.llm:
179 logger.info(
180 f"No LLM available for query expansion, using original: '{query}'"
181 )
182 return query
184 try:
185 prompt = f"""Paperless-ngx uses TF-IDF keyword search, not semantic search.
186Convert this query into keywords that would appear in documents.
188Query: "{query}"
190Output format: keyword1 OR keyword2 OR "multi word phrase" OR keyword3
191Include synonyms, plural forms, and technical terms.
193IMPORTANT: Output ONLY the search query. No explanations, no additional text."""
195 logger.debug(
196 f"Sending query expansion prompt to LLM for: '{query}'"
197 )
198 response = self.llm.invoke(prompt)
199 expanded = response.content.strip()
201 logger.debug(
202 f"Raw LLM response (first 500 chars): {expanded[:500]}"
203 )
205 # Clean up the response - remove any explanatory text
206 if "\n" in expanded: 206 ↛ 207line 206 didn't jump to line 207 because the condition on line 206 was never true
207 expanded = expanded.split("\n")[0]
208 logger.debug("Took first line of LLM response")
210 # Always trust the LLM's expansion - it knows better than hard-coded rules
211 logger.info(
212 f"LLM expanded query from '{query}' to {len(expanded)} chars with {expanded.count('OR')} ORs"
213 )
214 logger.debug(
215 f"Expanded query preview (first 200 chars): {expanded[:200]}..."
216 )
217 return expanded
219 except Exception:
220 logger.exception("Failed to expand query with LLM")
221 return query
223 def _multi_pass_search(self, query: str) -> List[Dict[str, Any]]:
224 """
225 Perform multiple search passes with different strategies.
227 Args:
228 query: Original search query
230 Returns:
231 Combined and deduplicated results
232 """
233 logger.info(f"Starting multi-pass search for query: '{query}'")
234 all_results = {} # Use dict to deduplicate by doc_id
236 # Pass 1: Original query
237 params = {
238 "query": query,
239 "page_size": self.max_results,
240 "ordering": "-score",
241 }
243 logger.info(
244 f"Pass 1 - Original query: '{query}' (max_results={self.max_results})"
245 )
246 response = self._make_request("/api/documents/", params=params)
248 if response and "results" in response:
249 pass1_count = len(response["results"])
250 logger.info(f"Pass 1 returned {pass1_count} documents")
251 for doc in response["results"]:
252 doc_id = doc.get("id")
253 if doc_id and doc_id not in all_results: 253 ↛ 251: line 253 didn't jump to line 251 because the condition on line 253 was always true
254 all_results[doc_id] = doc
255 logger.debug(
256 f"Added doc {doc_id}: {doc.get('title', 'No title')}"
257 )
258 else:
259 logger.warning(
260 f"Pass 1 returned no results or invalid response: {response}"
261 )
263 # Pass 2: LLM-expanded keywords (if LLM available)
264 if self.llm: 264 ↛ 265: line 264 didn't jump to line 265 because the condition on line 264 was never true
265 expanded_query = self._expand_query_with_llm(query)
266 if expanded_query != query:
267 params["query"] = expanded_query
268 params["page_size"] = self.max_results * 2 # Get more results
270 logger.info(
271 f"Pass 2 - Using expanded query with {expanded_query.count('OR')} ORs"
272 )
273 logger.debug(
274 f"Pass 2 - Full expanded query (first 500 chars): '{expanded_query[:500]}...'"
275 )
276 logger.info(
277 f"Pass 2 - Max results set to: {params['page_size']}"
278 )
279 response = self._make_request("/api/documents/", params=params)
281 if response and "results" in response:
282 pass2_new = 0
283 for doc in response["results"]:
284 doc_id = doc.get("id")
285 if doc_id and doc_id not in all_results:
286 all_results[doc_id] = doc
287 pass2_new += 1
288 logger.debug(
289 f"Pass 2 added new doc {doc_id}: {doc.get('title', 'No title')}"
290 )
291 logger.info(
292 f"Pass 2 found {len(response['results'])} docs, added {pass2_new} new"
293 )
294 else:
295 logger.warning("Pass 2 returned no results")
296 else:
297 logger.info("Pass 2 skipped - expanded query same as original")
298 else:
299 logger.info("Pass 2 skipped - no LLM available")
301 # Sort by relevance score if available
302 logger.info(f"Total unique documents collected: {len(all_results)}")
303 sorted_results = sorted(
304 all_results.values(),
305 key=lambda x: x.get("__search_hit__", {}).get("score", 0),
306 reverse=True,
307 )
309 final_results = sorted_results[: self.max_results]
310 logger.info(
311 f"Returning top {len(final_results)} documents after sorting by score"
312 )
314 # Log titles and scores of final results
315 for i, doc in enumerate(final_results[:5], 1): # Log first 5
316 score = doc.get("__search_hit__", {}).get("score", 0)
317 logger.debug(
318 f"Result {i}: '{doc.get('title', 'No title')}' (score={score})"
319 )
321 return final_results
323 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
324 """
325 Get preview results from Paperless-ngx using multi-pass strategy.
327 Args:
328 query: Search query
330 Returns:
331 List of preview dictionaries
332 """
333 try:
334 # Use multi-pass search strategy
335 results = self._multi_pass_search(query)
337 if not results:
338 return []
340 # Convert documents to preview format
341 # Note: Each document may return multiple previews (one per highlight)
342 previews = []
343 for doc_data in results:
344 doc_previews = self._convert_document_to_preview(
345 doc_data, query
346 )
347 # Handle both single preview and list of previews
348 if isinstance(doc_previews, list): 348 ↛ 349: line 348 didn't jump to line 349 because the condition on line 348 was never true
349 previews.extend(doc_previews)
350 else:
351 previews.append(doc_previews)
353 logger.info(
354 f"Found {len(previews)} documents in Paperless-ngx for query: {query}"
355 )
356 return previews
358 except Exception:
359 logger.exception("Error getting previews from Paperless-ngx")
360 return []
362 def _convert_document_to_preview(
363 self, doc_data: Dict[str, Any], query: str = ""
364 ) -> Dict[str, Any] | List[Dict[str, Any]]:
365 """
366 Convert a Paperless-ngx document to LDR preview format.
368 Args:
369 doc_data: Document data from the API
370 query: Original search query (for context)
372 Returns:
373 Preview dictionary in LDR format, or a list of previews when the search hit contains multiple highlights
374 """
375 # Extract title
376 title = doc_data.get("title", f"Document {doc_data.get('id')}")
377 doc_id = doc_data.get("id")
379 logger.info(
380 f"Converting document {doc_id}: '{title}' to preview format"
381 )
383 # Build URL - use the web interface URL for user access
384 url = f"{self.api_url}/documents/{doc_id}/details"
385 logger.debug(f"Generated URL for doc {doc_id}: {url}")
387 # Extract snippet - prefer highlighted content from search
388 snippet = ""
389 search_score = 0.0
390 search_rank = None
391 all_highlights = [] # Initialize empty highlights list
393 if "__search_hit__" in doc_data:
394 search_hit = doc_data["__search_hit__"]
395 logger.debug(
396 f"Found __search_hit__ data for doc {doc_id}: score={search_hit.get('score')}, rank={search_hit.get('rank')}"
397 )
399 # Get highlights - this is the search snippet with matched terms
400 if search_hit.get("highlights"): 400 ↛ 453: line 400 didn't jump to line 453 because the condition on line 400 was always true
401 # Highlights can be a string or list
402 highlights = search_hit.get("highlights")
403 logger.info(
404 f"Found highlights for doc {doc_id}: type={type(highlights).__name__}, length={len(str(highlights))}"
405 )
407 if isinstance(highlights, list):
408 logger.debug(
409 f"Highlights is list with {len(highlights)} items"
410 )
411 # IMPORTANT: Store highlights list for processing later
412 # Each highlight will become a separate search result for proper citation
413 all_highlights = highlights
414 # Use first highlight for the default snippet
415 snippet = highlights[0] if highlights else ""
416 logger.info(
417 f"Will create {len(highlights)} separate results from highlights"
418 )
419 else:
420 all_highlights = [
421 str(highlights)
422 ] # Single highlight as list
423 snippet = str(highlights)
425 logger.debug(
426 f"Raw snippet before cleaning (first 200 chars): {snippet[:200]}"
427 )
429 # Clean HTML tags but preserve the matched text
430 snippet = re.sub(r"<span[^>]*>", "**", snippet)
431 snippet = re.sub(r"</span>", "**", snippet)
432 snippet = re.sub(r"<[^>]+>", "", snippet)
434 logger.debug(
435 f"Cleaned snippet (first 200 chars): {snippet[:200]}"
436 )
438 # Limit snippet length to avoid context window issues
439 if ( 439 ↛ 444: line 439 didn't jump to line 444 because the condition on line 439 was never true
440 self.MAX_SNIPPET_LENGTH
441 and len(snippet) > self.MAX_SNIPPET_LENGTH
442 ):
443 # Cut at word boundary to avoid mid-word truncation
444 snippet = (
445 snippet[: self.MAX_SNIPPET_LENGTH].rsplit(" ", 1)[0]
446 + "..."
447 )
448 logger.debug(
449 f"Truncated snippet to {self.MAX_SNIPPET_LENGTH} chars"
450 )
452 # Get search relevance metadata
453 search_score = search_hit.get("score", 0.0)
454 search_rank = search_hit.get("rank")
455 logger.info(
456 f"Search metadata for doc {doc_id}: score={search_score}, rank={search_rank}"
457 )
458 else:
459 logger.warning(
460 f"No __search_hit__ data for doc {doc_id}, will use content fallback"
461 )
463 if not snippet:
464 logger.info(
465 f"No snippet from highlights for doc {doc_id}, using content fallback"
466 )
467 # Fallback to content preview if no highlights available
468 content = doc_data.get("content", "")
469 if content: 469 ↛ 512: line 469 didn't jump to line 512 because the condition on line 469 was always true
470 logger.debug(f"Document has content of length {len(content)}")
471 # Try to find context around query terms if possible
472 if query: 472 ↛ 504: line 472 didn't jump to line 504 because the condition on line 472 was always true
473 query_terms = query.lower().split()
474 content_lower = content.lower()
475 logger.debug(
476 f"Searching for query terms in content: {query_terms}"
477 )
479 # Find first occurrence of any query term
480 best_pos = -1
481 for term in query_terms:
482 pos = content_lower.find(term)
483 if pos != -1 and (best_pos == -1 or pos < best_pos): 483 ↛ 481: line 483 didn't jump to line 481 because the condition on line 483 was always true
484 best_pos = pos
485 logger.debug(
486 f"Found term '{term}' at position {pos}"
487 )
489 if best_pos != -1: 489 ↛ 499: line 489 didn't jump to line 499 because the condition on line 489 was always true
490 # Extract context around the found term - much larger context for research
491 start = max(0, best_pos - 2000)
492 end = min(len(content), best_pos + 8000)
493 snippet = "..." + content[start:end] + "..."
494 logger.info(
495 f"Extracted snippet around query term at position {best_pos}"
496 )
497 else:
498 # Just take the beginning - use 10000 chars for research
499 snippet = content[:10000]
500 logger.info(
501 "No query terms found, using first 10000 chars of content"
502 )
503 else:
504 snippet = content[:10000]
505 logger.info(
506 "No query provided, using first 10000 chars of content"
507 )
509 if len(content) > 10000: 509 ↛ 510: line 509 didn't jump to line 510 because the condition on line 509 was never true
510 snippet += "..."
511 else:
512 logger.warning(f"No content available for doc {doc_id}")
514 logger.info(f"Final snippet for doc {doc_id} has length {len(snippet)}")
516 # Build metadata
517 metadata = {
518 "doc_id": str(doc_id),
519 "correspondent": doc_data.get("correspondent_name", ""),
520 "document_type": doc_data.get("document_type_name", ""),
521 "created": doc_data.get("created", ""),
522 "modified": doc_data.get("modified", ""),
523 "archive_serial_number": doc_data.get("archive_serial_number"),
524 "search_score": search_score,
525 "search_rank": search_rank,
526 }
528 # Add tags if present
529 tags = doc_data.get("tags_list", [])
530 if isinstance(tags, list) and tags: 530 ↛ 531: line 530 didn't jump to line 531 because the condition on line 530 was never true
531 metadata["tags"] = ", ".join(str(tag) for tag in tags)
533 # Build enhanced title with available metadata for better citations
534 title_parts = []
536 # Add correspondent/author if available
537 correspondent = doc_data.get("correspondent_name", "")
538 if correspondent:
539 title_parts.append(f"{correspondent}.")
540 logger.debug(f"Added correspondent to title: {correspondent}")
542 # Add the document title
543 title_parts.append(title)
545 # Add document type if it's meaningful (not just generic types)
546 doc_type = doc_data.get("document_type_name", "")
547 if doc_type and doc_type not in ["Letter", "Other", "Document", ""]:
548 title_parts.append(f"({doc_type})")
549 logger.debug(f"Added document type to title: {doc_type}")
551 # Add year from created date if available
552 created_date = doc_data.get("created", "")
553 if created_date and len(created_date) >= 4:
554 year = created_date[:4]
555 title_parts.append(year)
556 logger.debug(f"Added year to title: {year}")
558 # Format the enhanced title for display in sources list
559 if title_parts: 559 ↛ 562: line 559 didn't jump to line 562 because the condition on line 559 was always true
560 enhanced_title = " ".join(title_parts)
561 else:
562 enhanced_title = title
564 logger.info(f"Enhanced title for doc {doc_id}: '{enhanced_title}'")
566 # Build the preview
567 preview = {
568 "title": enhanced_title, # Use enhanced title with bibliographic info
569 "url": url,
570 "link": url, # Add 'link' key for compatibility with search utilities
571 "snippet": snippet,
572 "author": doc_data.get("correspondent_name", ""),
573 "date": doc_data.get("created", ""),
574 "source": "Paperless", # Keep source as the system name like other engines
575 "metadata": metadata,
576 "_raw_data": doc_data, # Store raw data for full content retrieval
577 }
579 logger.info(
580 f"Built preview for doc {doc_id}: URL={url}, snippet_len={len(snippet)}, has_author={bool(preview['author'])}, has_date={bool(preview['date'])}"
581 )
583 # Check if we have multiple highlights to return as separate results
584 if len(all_highlights) > 1:
585 # Create multiple previews, one for each highlight
586 previews = []
587 for i, highlight in enumerate(all_highlights):
588 # Clean each highlight
589 clean_snippet = re.sub(r"<span[^>]*>", "**", str(highlight))
590 clean_snippet = re.sub(r"</span>", "**", clean_snippet)
591 clean_snippet = re.sub(r"<[^>]+>", "", clean_snippet)
593 # Create a preview for this highlight
594 highlight_preview = {
595 "title": f"{enhanced_title} (excerpt {i + 1})", # Differentiate each excerpt
596 "url": url,
597 "link": url,
598 "snippet": clean_snippet,
599 "author": doc_data.get("correspondent_name", ""),
600 "date": doc_data.get("created", ""),
601 "source": "Paperless",
602 "metadata": {
603 **metadata,
604 "excerpt_number": i + 1,
605 "total_excerpts": len(all_highlights),
606 },
607 "_raw_data": doc_data,
608 }
609 previews.append(highlight_preview)
611 logger.info(
612 f"Created {len(previews)} separate previews from highlights for doc {doc_id}"
613 )
614 return previews
615 else:
616 # Single preview (original behavior)
617 return preview
619 def _get_full_content(
620 self, relevant_items: List[Dict[str, Any]]
621 ) -> List[Dict[str, Any]]:
622 """
623 Get full content for relevant documents.
625 Args:
626 relevant_items: List of relevant preview dictionaries
628 Returns:
629 List of dictionaries with full content
630 """
631 if not self.include_content:
632 # If content inclusion is disabled, just return previews
633 return relevant_items
635 logger.info(f"Getting full content for {len(relevant_items)} documents")
636 results = []
637 for idx, item in enumerate(relevant_items):
638 try:
639 logger.info(
640 f"Processing document {idx + 1}: title='{item.get('title', 'No title')[:50]}...', url={item.get('url', 'No URL')}"
641 )
642 logger.debug(f"Document {idx + 1} keys: {item.keys()}")
643 logger.debug(
644 f"Document {idx + 1} has snippet of length: {len(item.get('snippet', ''))}"
645 )
647 # Get the full document content if we have the raw data
648 if "_raw_data" in item:
649 doc_data = item["_raw_data"]
650 full_content = doc_data.get("content", "")
652 if not full_content:
653 # Try to fetch the document details
654 doc_id = item["metadata"].get("doc_id")
655 if doc_id: 655 ↛ 664: line 655 didn't jump to line 664 because the condition on line 655 was always true
656 detail_response = self._make_request(
657 f"/api/documents/{doc_id}/"
658 )
659 if detail_response: 659 ↛ 664: line 659 didn't jump to line 664 because the condition on line 659 was always true
660 full_content = detail_response.get(
661 "content", ""
662 )
664 item["full_content"] = full_content or item["snippet"]
665 logger.info(
666 f"Document {idx + 1} full content length: {len(item['full_content'])}"
667 )
668 else:
669 # Fallback to snippet if no raw data
670 item["full_content"] = item["snippet"]
671 logger.info(
672 f"Document {idx + 1} using snippet as full content (no raw data)"
673 )
675 # Log the final document structure for debugging citation issues
676 logger.info(
677 f"Document {idx + 1} final structure: title='{item.get('title', '')[:50]}...', has_link={bool(item.get('link'))}, has_url={bool(item.get('url'))}, source='{item.get('source', 'Unknown')}'"
678 )
680 # Remove the raw data from the result
681 item.pop("_raw_data", None)
682 results.append(item)
684 except Exception:
685 logger.exception("Error getting full content for document")
686 item["full_content"] = item["snippet"]
687 item.pop("_raw_data", None)
688 results.append(item)
690 return results
692 def run(
693 self, query: str, research_context: Dict[str, Any] | None = None
694 ) -> List[Dict[str, Any]]:
695 """
696 Execute search on Paperless-ngx.
698 Args:
699 query: Search query
700 research_context: Context from previous research
702 Returns:
703 List of search results in LDR format
704 """
705 try:
706 # Get previews
707 previews = self._get_previews(query)
709 if not previews:
710 return []
712 # Apply LLM filtering if available
713 if ( 713 ↛ 718: line 713 didn't jump to line 718 because the condition on line 713 was never true
714 self.llm
715 and hasattr(self, "_content_filters")
716 and self._content_filters
717 ):
718 filtered_previews = self._apply_content_filters(previews, query)
719 else:
720 filtered_previews = previews
722 # Get full content for relevant items
723 results = self._get_full_content(filtered_previews)
725 logger.info(
726 f"Search completed successfully, returning {len(results)} results"
727 )
728 # Enhanced logging to track document structure for citation debugging
729 for i, r in enumerate(results[:3], 1):
730 logger.info(
731 f"Result {i}: title='{r.get('title', '')[:50]}...', "
732 f"has_full_content={bool(r.get('full_content'))}, "
733 f"full_content_len={len(r.get('full_content', ''))}, "
734 f"snippet_len={len(r.get('snippet', ''))}, "
735 f"url={r.get('url', '')[:50]}"
736 )
738 return results
740 except Exception:
741 logger.exception("Error in Paperless-ngx search")
742 return []
744 async def arun(self, query: str) -> List[Dict[str, Any]]:
745 """
746 Async version of search.
748 Currently falls back to sync version.
749 """
750 return self.run(query)
752 def test_connection(self) -> bool:
753 """
754 Test the connection to Paperless-ngx.
756 Returns:
757 True if connection successful, False otherwise
758 """
759 try:
760 response = self._make_request("/api/")
761 return bool(response)
762 except Exception:
763 logger.exception("Failed to connect to Paperless-ngx")
764 return False
766 def get_document_count(self) -> int:
767 """
768 Get the total number of documents in Paperless-ngx.
770 Returns:
771 Number of documents, or -1 if error
772 """
773 try:
774 response = self._make_request(
775 "/api/documents/", params={"page_size": 1}
776 )
777 return response.get("count", -1)
778 except Exception:
779 return -1
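
For orientation, a minimal usage sketch of the engine listed above. This is an illustration under assumptions, not part of the covered module: the import path is inferred from the file path in the report header, and the URL, token, and query values are placeholders.

# Minimal usage sketch (hypothetical values; import path inferred from the report header).
from local_deep_research.web_search_engines.engines.search_engine_paperless import (
    PaperlessSearchEngine,
)

engine = PaperlessSearchEngine(
    api_url="http://localhost:8000",    # base URL of the Paperless-ngx instance
    api_key="REPLACE_WITH_API_TOKEN",   # token created in the Paperless-ngx UI
    max_results=5,
    include_content=True,               # fetch full document text for relevant hits
)

if engine.test_connection():
    for result in engine.run("invoice 2024"):
        # Each result is an LDR-style dict with title, url/link, snippet, and metadata.
        print(result["title"], result["url"])
else:
    print("Could not reach Paperless-ngx; check the URL and API token.")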