Coverage for src/local_deep_research/web_search_engines/engines/search_engine_paperless.py: 81%

295 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Paperless-ngx search engine implementation for Local Deep Research. 

3 

4This module provides a proper search engine implementation that connects to a Paperless-ngx 

5instance, allowing LDR to search and retrieve documents from your personal 

6document management system. 

7""" 

8 

9import re 

10from typing import Any, Dict, List, Optional 

11import requests 

12from urllib.parse import urljoin 

13 

14from langchain_core.language_models import BaseLLM 

15from loguru import logger 

16 

17from ..search_engine_base import BaseSearchEngine 

18from ...security import safe_get 

19 

20 

21class PaperlessSearchEngine(BaseSearchEngine): 

22 """Paperless-ngx search engine implementation with full LDR integration.""" 

23 

24 # Class constants replacing inline magic numbers 

25 MAX_SNIPPET_LENGTH = 3000 # Reasonable limit to avoid context window issues 

26 SNIPPET_CONTEXT_BEFORE = 500 # Characters before matched term in snippet 

27 SNIPPET_CONTEXT_AFTER = 2500 # Characters after matched term in snippet 

28 

29 def __init__( 

30 self, 

31 api_url: Optional[str] = None, 

32 api_key: Optional[str] = None, 

33 api_token: Optional[str] = None, # Support both for backwards compatibility 

34 max_results: int = 10, 

35 timeout: int = 30, 

36 verify_ssl: bool = True, 

37 include_content: bool = True, 

38 llm: Optional[BaseLLM] = None, 

39 settings_snapshot: Optional[Dict[str, Any]] = None, 

40 **kwargs, 

41 ): 

42 """ 

43 Initialize the Paperless-ngx search engine. 

44 

45 Args: 

46 api_url: Base URL of Paperless-ngx instance (e.g., "http://localhost:8000") 

47 If not provided, falls back to the settings snapshot, then to http://localhost:8000 

48 api_key: API token for authentication (preferred parameter name) 

49 api_token: API token for authentication (backwards compatibility) 

50 If not provided, falls back to the settings snapshot 

51 max_results: Maximum number of search results 

52 timeout: Request timeout in seconds 

53 verify_ssl: Whether to verify SSL certificates 

54 include_content: Whether to include document content in results 

55 llm: Language model for relevance filtering (optional) 

56 settings_snapshot: Settings snapshot for thread context 

57 **kwargs: Additional parameters passed to parent 

58 """ 

59 super().__init__( 

60 max_results=max_results, 

61 llm=llm, 

62 settings_snapshot=settings_snapshot, 

63 **kwargs, 

64 ) 

65 

66 # Use provided configuration or get from settings 

67 self.api_url = api_url 

68 # Support both api_key and api_token for compatibility 

69 self.api_token = api_key or api_token 

70 

71 # If no API URL provided, try to get from settings_snapshot 

72 if not self.api_url and settings_snapshot: 

73 self.api_url = settings_snapshot.get( 

74 "search.engine.web.paperless.default_params.api_url", 

75 "http://localhost:8000", 

76 ) 

77 

78 # If no API token provided, try to get from settings_snapshot 

79 if not self.api_token and settings_snapshot: 

80 self.api_token = settings_snapshot.get( 

81 "search.engine.web.paperless.api_key", "" 

82 ) 

83 

84 # Guard against AttributeError: api_url may be None, so check before calling rstrip 

85 if self.api_url: 

86 # Remove trailing slash from API URL 

87 self.api_url = self.api_url.rstrip("/") 

88 else: 

89 # Default to localhost if nothing provided 

90 self.api_url = "http://localhost:8000" 

91 logger.warning( 

92 "No Paperless API URL provided, using default: http://localhost:8000" 

93 ) 

94 

95 self.timeout = timeout 

96 self.verify_ssl = verify_ssl 

97 self.include_content = include_content 

98 

99 # Set up headers for authentication 

100 self.headers = {} 

101 if self.api_token: 

102 self.headers["Authorization"] = f"Token {self.api_token}" 

103 

104 logger.info( 

105 f"Initialized Paperless-ngx search engine for {self.api_url}" 

106 ) 

107 
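
A minimal construction sketch for the engine above. The import path follows this module's package layout as shown in the report header; the URL and token are placeholders, not working credentials:

    from local_deep_research.web_search_engines.engines.search_engine_paperless import (
        PaperlessSearchEngine,
    )

    # Placeholder values; point these at a real Paperless-ngx instance.
    engine = PaperlessSearchEngine(
        api_url="http://localhost:8000",
        api_key="example-token",  # sent as "Authorization: Token <api_key>"
        max_results=5,
        verify_ssl=False,  # e.g. for a self-signed local instance
    )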

108 def _make_request( 

109 self, endpoint: str, params: Optional[Dict] = None 

110 ) -> Dict[str, Any]: 

111 """ 

112 Make a request to the Paperless-ngx API. 

113 

114 Args: 

115 endpoint: API endpoint path 

116 params: Query parameters 

117 

118 Returns: 

119 JSON response from the API 

120 """ 

121 url = urljoin(self.api_url, endpoint) 

122 

123 logger.debug(f"Making request to: {url}") 

124 logger.debug(f"Request params: {params}") 

125 logger.debug( 

126 f"Headers: {self.headers.keys() if self.headers else 'None'}" 

127 ) 

128 

129 try: 

130 # Paperless is typically a local/private network service 

131 response = safe_get( 

132 url, 

133 params=params, 

134 headers=self.headers, 

135 timeout=self.timeout, 

136 verify=self.verify_ssl, 

137 allow_private_ips=True, 

138 allow_localhost=True, 

139 ) 

140 response.raise_for_status() 

141 result = response.json() 

142 

143 # Log response details 

144 if isinstance(result, dict):  # 144 ↛ 162: didn't jump to line 162 because the condition was always true

145 if "results" in result: 

146 logger.info( 

147 f"API returned {len(result.get('results', []))} results, total count: {result.get('count', 'unknown')}" 

148 ) 

149 # Log first result details if available 

150 if result.get("results"): 

151 first = result["results"][0] 

152 logger.debug( 

153 f"First result: id={first.get('id')}, title='{first.get('title', 'No title')[:50]}...'" 

154 ) 

155 if "__search_hit__" in first: 155 ↛ 162line 155 didn't jump to line 162 because the condition on line 155 was always true

156 logger.debug( 

157 f"Has search hit data with score={first['__search_hit__'].get('score')}" 

158 ) 

159 else: 

160 logger.debug(f"API response keys: {result.keys()}") 

161 

162 return result 

163 except requests.exceptions.RequestException: 

164 logger.exception("Error making request to Paperless-ngx") 

165 logger.debug(f"Failed URL: {url}, params: {params}") 

166 return {} 

167 
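
For reference, the request this helper issues is equivalent in shape to the plain requests call sketched below; safe_get layers SSRF protections (explicit private-IP and localhost allowances) on top. The endpoint, params, and header format are the ones used elsewhere in this file; the token is a placeholder:

    import requests

    response = requests.get(
        "http://localhost:8000/api/documents/",
        params={"query": "invoice", "page_size": 10, "ordering": "-score"},
        headers={"Authorization": "Token example-token"},  # placeholder token
        timeout=30,
    )
    response.raise_for_status()
    data = response.json()
    print(data.get("count"), "matching documents")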

168 def _expand_query_with_llm(self, query: str) -> str: 

169 """ 

170 Use LLM to expand query with relevant keywords and synonyms. 

171 

172 Args: 

173 query: Original search query 

174 

175 Returns: 

176 Expanded query with keywords 

177 """ 

178 if not self.llm: 

179 logger.info( 

180 f"No LLM available for query expansion, using original: '{query}'" 

181 ) 

182 return query 

183 

184 try: 

185 prompt = f"""Paperless-ngx uses TF-IDF keyword search, not semantic search. 

186Convert this query into keywords that would appear in documents. 

187 

188Query: "{query}" 

189 

190Output format: keyword1 OR keyword2 OR "multi word phrase" OR keyword3 

191Include synonyms, plural forms, and technical terms. 

192 

193IMPORTANT: Output ONLY the search query. No explanations, no additional text.""" 

194 

195 logger.debug( 

196 f"Sending query expansion prompt to LLM for: '{query}'" 

197 ) 

198 response = self.llm.invoke(prompt) 

199 expanded = response.content.strip() 

200 

201 logger.debug( 

202 f"Raw LLM response (first 500 chars): {expanded[:500]}" 

203 ) 

204 

205 # Clean up the response - remove any explanatory text 

206 if "\n" in expanded: 206 ↛ 207line 206 didn't jump to line 207 because the condition on line 206 was never true

207 expanded = expanded.split("\n")[0] 

208 logger.debug("Took first line of LLM response") 

209 

210 # Always trust the LLM's expansion - it knows better than hard-coded rules 

211 logger.info( 

212 f"LLM expanded query from '{query}' to {len(expanded)} chars with {expanded.count('OR')} ORs" 

213 ) 

214 logger.debug( 

215 f"Expanded query preview (first 200 chars): {expanded[:200]}..." 

216 ) 

217 return expanded 

218 

219 except Exception: 

220 logger.exception("Failed to expand query with LLM") 

221 return query 

222 
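
To make the prompt's output format concrete, here is a purely hypothetical expansion; the real output depends entirely on the model:

    original = "solar panel installation costs"
    # A plausible single-line LLM response in the requested format:
    expanded = (
        'solar OR photovoltaic OR PV OR "solar panels" '
        'OR installation OR install OR cost OR costs OR pricing'
    )
    # The expanded string is passed verbatim as the Paperless "query"
    # parameter, where OR-joined terms widen the TF-IDF keyword match.
    print(expanded.count("OR"), "OR operators")  # mirrors the log line above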

223 def _multi_pass_search(self, query: str) -> List[Dict[str, Any]]: 

224 """ 

225 Perform multiple search passes with different strategies. 

226 

227 Args: 

228 query: Original search query 

229 

230 Returns: 

231 Combined and deduplicated results 

232 """ 

233 logger.info(f"Starting multi-pass search for query: '{query}'") 

234 all_results = {} # Use dict to deduplicate by doc_id 

235 

236 # Pass 1: Original query 

237 params = { 

238 "query": query, 

239 "page_size": self.max_results, 

240 "ordering": "-score", 

241 } 

242 

243 logger.info( 

244 f"Pass 1 - Original query: '{query}' (max_results={self.max_results})" 

245 ) 

246 response = self._make_request("/api/documents/", params=params) 

247 

248 if response and "results" in response: 

249 pass1_count = len(response["results"]) 

250 logger.info(f"Pass 1 returned {pass1_count} documents") 

251 for doc in response["results"]: 

252 doc_id = doc.get("id") 

253 if doc_id and doc_id not in all_results:  # 253 ↛ 251: didn't jump to line 251 because the condition was always true

254 all_results[doc_id] = doc 

255 logger.debug( 

256 f"Added doc {doc_id}: {doc.get('title', 'No title')}" 

257 ) 

258 else: 

259 logger.warning( 

260 f"Pass 1 returned no results or invalid response: {response}" 

261 ) 

262 

263 # Pass 2: LLM-expanded keywords (if LLM available) 

264 if self.llm:  # 264 ↛ 265: didn't jump to line 265 because the condition was never true

265 expanded_query = self._expand_query_with_llm(query) 

266 if expanded_query != query: 

267 params["query"] = expanded_query 

268 params["page_size"] = self.max_results * 2 # Get more results 

269 

270 logger.info( 

271 f"Pass 2 - Using expanded query with {expanded_query.count('OR')} ORs" 

272 ) 

273 logger.debug( 

274 f"Pass 2 - Full expanded query (first 500 chars): '{expanded_query[:500]}...'" 

275 ) 

276 logger.info( 

277 f"Pass 2 - Max results set to: {params['page_size']}" 

278 ) 

279 response = self._make_request("/api/documents/", params=params) 

280 

281 if response and "results" in response: 

282 pass2_new = 0 

283 for doc in response["results"]: 

284 doc_id = doc.get("id") 

285 if doc_id and doc_id not in all_results: 

286 all_results[doc_id] = doc 

287 pass2_new += 1 

288 logger.debug( 

289 f"Pass 2 added new doc {doc_id}: {doc.get('title', 'No title')}" 

290 ) 

291 logger.info( 

292 f"Pass 2 found {len(response['results'])} docs, added {pass2_new} new" 

293 ) 

294 else: 

295 logger.warning("Pass 2 returned no results") 

296 else: 

297 logger.info("Pass 2 skipped - expanded query same as original") 

298 else: 

299 logger.info("Pass 2 skipped - no LLM available") 

300 

301 # Sort by relevance score if available 

302 logger.info(f"Total unique documents collected: {len(all_results)}") 

303 sorted_results = sorted( 

304 all_results.values(), 

305 key=lambda x: x.get("__search_hit__", {}).get("score", 0), 

306 reverse=True, 

307 ) 

308 

309 final_results = sorted_results[: self.max_results] 

310 logger.info( 

311 f"Returning top {len(final_results)} documents after sorting by score" 

312 ) 

313 

314 # Log titles and scores of final results 

315 for i, doc in enumerate(final_results[:5], 1): # Log first 5 

316 score = doc.get("__search_hit__", {}).get("score", 0) 

317 logger.debug( 

318 f"Result {i}: '{doc.get('title', 'No title')}' (score={score})" 

319 ) 

320 

321 return final_results 

322 
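
The dedup-and-rank step at the end of _multi_pass_search, shown in isolation with fabricated documents shaped like Paperless API results:

    # Fabricated sample documents; scores live under "__search_hit__".
    docs = [
        {"id": 1, "title": "Invoice", "__search_hit__": {"score": 0.42}},
        {"id": 2, "title": "Contract", "__search_hit__": {"score": 0.91}},
        {"id": 1, "title": "Invoice", "__search_hit__": {"score": 0.42}},  # dup
    ]

    # Deduplicate by id; later passes never overwrite earlier results.
    unique = {}
    for doc in docs:
        unique.setdefault(doc["id"], doc)

    # Sort by score, highest first, treating a missing score as 0.
    ranked = sorted(
        unique.values(),
        key=lambda d: d.get("__search_hit__", {}).get("score", 0),
        reverse=True,
    )
    print([d["id"] for d in ranked])  # [2, 1]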

323 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

324 """ 

325 Get preview results from Paperless-ngx using multi-pass strategy. 

326 

327 Args: 

328 query: Search query 

329 

330 Returns: 

331 List of preview dictionaries 

332 """ 

333 try: 

334 # Use multi-pass search strategy 

335 results = self._multi_pass_search(query) 

336 

337 if not results: 

338 return [] 

339 

340 # Convert documents to preview format 

341 # Note: Each document may return multiple previews (one per highlight) 

342 previews = [] 

343 for doc_data in results: 

344 doc_previews = self._convert_document_to_preview( 

345 doc_data, query 

346 ) 

347 # Handle both single preview and list of previews 

348 if isinstance(doc_previews, list):  # 348 ↛ 349: didn't jump to line 349 because the condition was never true

349 previews.extend(doc_previews) 

350 else: 

351 previews.append(doc_previews) 

352 

353 logger.info( 

354 f"Found {len(previews)} documents in Paperless-ngx for query: {query}" 

355 ) 

356 return previews 

357 

358 except Exception: 

359 logger.exception("Error getting previews from Paperless-ngx") 

360 return [] 

361 

362 def _convert_document_to_preview( 

363 self, doc_data: Dict[str, Any], query: str = "" 

364 ) -> Dict[str, Any] | List[Dict[str, Any]]: 

365 """ 

366 Convert a Paperless-ngx document to LDR preview format. 

367 

368 Args: 

369 doc_data: Document data from the API 

370 query: Original search query (for context) 

371 

372 Returns: 

373 Preview dictionary in LDR format, or a list of preview dictionaries (one per search highlight) 

374 """ 

375 # Extract title 

376 title = doc_data.get("title", f"Document {doc_data.get('id')}") 

377 doc_id = doc_data.get("id") 

378 

379 logger.info( 

380 f"Converting document {doc_id}: '{title}' to preview format" 

381 ) 

382 

383 # Build URL - use the web interface URL for user access 

384 url = f"{self.api_url}/documents/{doc_id}/details" 

385 logger.debug(f"Generated URL for doc {doc_id}: {url}") 

386 

387 # Extract snippet - prefer highlighted content from search 

388 snippet = "" 

389 search_score = 0.0 

390 search_rank = None 

391 all_highlights = [] # Initialize empty highlights list 

392 

393 if "__search_hit__" in doc_data: 

394 search_hit = doc_data["__search_hit__"] 

395 logger.debug( 

396 f"Found __search_hit__ data for doc {doc_id}: score={search_hit.get('score')}, rank={search_hit.get('rank')}" 

397 ) 

398 

399 # Get highlights - this is the search snippet with matched terms 

400 if search_hit.get("highlights"):  # 400 ↛ 453: didn't jump to line 453 because the condition was always true

401 # Highlights can be a string or list 

402 highlights = search_hit.get("highlights") 

403 logger.info( 

404 f"Found highlights for doc {doc_id}: type={type(highlights).__name__}, length={len(str(highlights))}" 

405 ) 

406 

407 if isinstance(highlights, list): 

408 logger.debug( 

409 f"Highlights is list with {len(highlights)} items" 

410 ) 

411 # IMPORTANT: Store highlights list for processing later 

412 # Each highlight will become a separate search result for proper citation 

413 all_highlights = highlights 

414 # Use first highlight for the default snippet 

415 snippet = highlights[0] if highlights else "" 

416 logger.info( 

417 f"Will create {len(highlights)} separate results from highlights" 

418 ) 

419 else: 

420 all_highlights = [ 

421 str(highlights) 

422 ] # Single highlight as list 

423 snippet = str(highlights) 

424 

425 logger.debug( 

426 f"Raw snippet before cleaning (first 200 chars): {snippet[:200]}" 

427 ) 

428 

429 # Clean HTML tags but preserve the matched text 

430 snippet = re.sub(r"<span[^>]*>", "**", snippet) 

431 snippet = re.sub(r"</span>", "**", snippet) 

432 snippet = re.sub(r"<[^>]+>", "", snippet) 

433 

434 logger.debug( 

435 f"Cleaned snippet (first 200 chars): {snippet[:200]}" 

436 ) 

437 

438 # Limit snippet length to avoid context window issues 

439 if (  # 439 ↛ 444: didn't jump to line 444 because the condition was never true

440 self.MAX_SNIPPET_LENGTH 

441 and len(snippet) > self.MAX_SNIPPET_LENGTH 

442 ): 

443 # Cut at word boundary to avoid mid-word truncation 

444 snippet = ( 

445 snippet[: self.MAX_SNIPPET_LENGTH].rsplit(" ", 1)[0] 

446 + "..." 

447 ) 

448 logger.debug( 

449 f"Truncated snippet to {self.MAX_SNIPPET_LENGTH} chars" 

450 ) 

451 

452 # Get search relevance metadata 

453 search_score = search_hit.get("score", 0.0) 

454 search_rank = search_hit.get("rank") 

455 logger.info( 

456 f"Search metadata for doc {doc_id}: score={search_score}, rank={search_rank}" 

457 ) 

458 else: 

459 logger.warning( 

460 f"No __search_hit__ data for doc {doc_id}, will use content fallback" 

461 ) 

462 

463 if not snippet: 

464 logger.info( 

465 f"No snippet from highlights for doc {doc_id}, using content fallback" 

466 ) 

467 # Fallback to content preview if no highlights available 

468 content = doc_data.get("content", "") 

469 if content:  # 469 ↛ 512: didn't jump to line 512 because the condition was always true

470 logger.debug(f"Document has content of length {len(content)}") 

471 # Try to find context around query terms if possible 

472 if query:  # 472 ↛ 504: didn't jump to line 504 because the condition was always true

473 query_terms = query.lower().split() 

474 content_lower = content.lower() 

475 logger.debug( 

476 f"Searching for query terms in content: {query_terms}" 

477 ) 

478 

479 # Find first occurrence of any query term 

480 best_pos = -1 

481 for term in query_terms: 

482 pos = content_lower.find(term) 

483 if pos != -1 and (best_pos == -1 or pos < best_pos):  # 483 ↛ 481: didn't jump to line 481 because the condition was always true

484 best_pos = pos 

485 logger.debug( 

486 f"Found term '{term}' at position {pos}" 

487 ) 

488 

489 if best_pos != -1:  # 489 ↛ 499: didn't jump to line 499 because the condition was always true

490 # Extract context around the found term - much larger context for research 

491 start = max(0, best_pos - 2000) 

492 end = min(len(content), best_pos + 8000) 

493 snippet = "..." + content[start:end] + "..." 

494 logger.info( 

495 f"Extracted snippet around query term at position {best_pos}" 

496 ) 

497 else: 

498 # Just take the beginning - use 10000 chars for research 

499 snippet = content[:10000] 

500 logger.info( 

501 "No query terms found, using first 10000 chars of content" 

502 ) 

503 else: 

504 snippet = content[:10000] 

505 logger.info( 

506 "No query provided, using first 10000 chars of content" 

507 ) 

508 

509 if len(content) > 10000:  # 509 ↛ 510: didn't jump to line 510 because the condition was never true

510 snippet += "..." 

511 else: 

512 logger.warning(f"No content available for doc {doc_id}") 

513 

514 logger.info(f"Final snippet for doc {doc_id} has length {len(snippet)}") 

515 

516 # Build metadata 

517 metadata = { 

518 "doc_id": str(doc_id), 

519 "correspondent": doc_data.get("correspondent_name", ""), 

520 "document_type": doc_data.get("document_type_name", ""), 

521 "created": doc_data.get("created", ""), 

522 "modified": doc_data.get("modified", ""), 

523 "archive_serial_number": doc_data.get("archive_serial_number"), 

524 "search_score": search_score, 

525 "search_rank": search_rank, 

526 } 

527 

528 # Add tags if present 

529 tags = doc_data.get("tags_list", []) 

530 if isinstance(tags, list) and tags:  # 530 ↛ 531: didn't jump to line 531 because the condition was never true

531 metadata["tags"] = ", ".join(str(tag) for tag in tags) 

532 

533 # Build enhanced title with available metadata for better citations 

534 title_parts = [] 

535 

536 # Add correspondent/author if available 

537 correspondent = doc_data.get("correspondent_name", "") 

538 if correspondent: 

539 title_parts.append(f"{correspondent}.") 

540 logger.debug(f"Added correspondent to title: {correspondent}") 

541 

542 # Add the document title 

543 title_parts.append(title) 

544 

545 # Add document type if it's meaningful (not just generic types) 

546 doc_type = doc_data.get("document_type_name", "") 

547 if doc_type and doc_type not in ["Letter", "Other", "Document", ""]: 

548 title_parts.append(f"({doc_type})") 

549 logger.debug(f"Added document type to title: {doc_type}") 

550 

551 # Add year from created date if available 

552 created_date = doc_data.get("created", "") 

553 if created_date and len(created_date) >= 4: 

554 year = created_date[:4] 

555 title_parts.append(year) 

556 logger.debug(f"Added year to title: {year}") 

557 

558 # Format the enhanced title for display in sources list 

559 if title_parts:  # 559 ↛ 562: didn't jump to line 562 because the condition was always true

560 enhanced_title = " ".join(title_parts) 

561 else: 

562 enhanced_title = title 

563 

564 logger.info(f"Enhanced title for doc {doc_id}: '{enhanced_title}'") 

565 

566 # Build the preview 

567 preview = { 

568 "title": enhanced_title, # Use enhanced title with bibliographic info 

569 "url": url, 

570 "link": url, # Add 'link' key for compatibility with search utilities 

571 "snippet": snippet, 

572 "author": doc_data.get("correspondent_name", ""), 

573 "date": doc_data.get("created", ""), 

574 "source": "Paperless", # Keep source as the system name like other engines 

575 "metadata": metadata, 

576 "_raw_data": doc_data, # Store raw data for full content retrieval 

577 } 

578 

579 logger.info( 

580 f"Built preview for doc {doc_id}: URL={url}, snippet_len={len(snippet)}, has_author={bool(preview['author'])}, has_date={bool(preview['date'])}" 

581 ) 

582 

583 # Check if we have multiple highlights to return as separate results 

584 if len(all_highlights) > 1: 

585 # Create multiple previews, one for each highlight 

586 previews = [] 

587 for i, highlight in enumerate(all_highlights): 

588 # Clean each highlight 

589 clean_snippet = re.sub(r"<span[^>]*>", "**", str(highlight)) 

590 clean_snippet = re.sub(r"</span>", "**", clean_snippet) 

591 clean_snippet = re.sub(r"<[^>]+>", "", clean_snippet) 

592 

593 # Create a preview for this highlight 

594 highlight_preview = { 

595 "title": f"{enhanced_title} (excerpt {i + 1})", # Differentiate each excerpt 

596 "url": url, 

597 "link": url, 

598 "snippet": clean_snippet, 

599 "author": doc_data.get("correspondent_name", ""), 

600 "date": doc_data.get("created", ""), 

601 "source": "Paperless", 

602 "metadata": { 

603 **metadata, 

604 "excerpt_number": i + 1, 

605 "total_excerpts": len(all_highlights), 

606 }, 

607 "_raw_data": doc_data, 

608 } 

609 previews.append(highlight_preview) 

610 

611 logger.info( 

612 f"Created {len(previews)} separate previews from highlights for doc {doc_id}" 

613 ) 

614 return previews 

615 else: 

616 # Single preview (original behavior) 

617 return preview 

618 
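
The three regex substitutions used above turn Paperless's <span>-highlighted HTML into markdown-style bold while stripping other tags; a standalone check on a made-up fragment:

    import re

    raw = 'Paid to <span class="match">ACME Corp</span> on <b>2023-01-15</b>.'

    snippet = re.sub(r"<span[^>]*>", "**", raw)  # opening spans -> bold marker
    snippet = re.sub(r"</span>", "**", snippet)  # closing spans -> bold marker
    snippet = re.sub(r"<[^>]+>", "", snippet)  # drop any remaining tags

    print(snippet)  # Paid to **ACME Corp** on 2023-01-15.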

619 def _get_full_content( 

620 self, relevant_items: List[Dict[str, Any]] 

621 ) -> List[Dict[str, Any]]: 

622 """ 

623 Get full content for relevant documents. 

624 

625 Args: 

626 relevant_items: List of relevant preview dictionaries 

627 

628 Returns: 

629 List of dictionaries with full content 

630 """ 

631 if not self.include_content: 

632 # If content inclusion is disabled, just return previews 

633 return relevant_items 

634 

635 logger.info(f"Getting full content for {len(relevant_items)} documents") 

636 results = [] 

637 for idx, item in enumerate(relevant_items): 

638 try: 

639 logger.info( 

640 f"Processing document {idx + 1}: title='{item.get('title', 'No title')[:50]}...', url={item.get('url', 'No URL')}" 

641 ) 

642 logger.debug(f"Document {idx + 1} keys: {item.keys()}") 

643 logger.debug( 

644 f"Document {idx + 1} has snippet of length: {len(item.get('snippet', ''))}" 

645 ) 

646 

647 # Get the full document content if we have the raw data 

648 if "_raw_data" in item: 

649 doc_data = item["_raw_data"] 

650 full_content = doc_data.get("content", "") 

651 

652 if not full_content: 

653 # Try to fetch the document details 

654 doc_id = item["metadata"].get("doc_id") 

655 if doc_id:  # 655 ↛ 664: didn't jump to line 664 because the condition was always true

656 detail_response = self._make_request( 

657 f"/api/documents/{doc_id}/" 

658 ) 

659 if detail_response:  # 659 ↛ 664: didn't jump to line 664 because the condition was always true

660 full_content = detail_response.get( 

661 "content", "" 

662 ) 

663 

664 item["full_content"] = full_content or item["snippet"] 

665 logger.info( 

666 f"Document {idx + 1} full content length: {len(item['full_content'])}" 

667 ) 

668 else: 

669 # Fallback to snippet if no raw data 

670 item["full_content"] = item["snippet"] 

671 logger.info( 

672 f"Document {idx + 1} using snippet as full content (no raw data)" 

673 ) 

674 

675 # Log the final document structure for debugging citation issues 

676 logger.info( 

677 f"Document {idx + 1} final structure: title='{item.get('title', '')[:50]}...', has_link={bool(item.get('link'))}, has_url={bool(item.get('url'))}, source='{item.get('source', 'Unknown')}'" 

678 ) 

679 

680 # Remove the raw data from the result 

681 item.pop("_raw_data", None) 

682 results.append(item) 

683 

684 except Exception: 

685 logger.exception("Error getting full content for document") 

686 item["full_content"] = item["snippet"] 

687 item.pop("_raw_data", None) 

688 results.append(item) 

689 

690 return results 

691 

692 def run( 

693 self, query: str, research_context: Dict[str, Any] | None = None 

694 ) -> List[Dict[str, Any]]: 

695 """ 

696 Execute search on Paperless-ngx. 

697 

698 Args: 

699 query: Search query 

700 research_context: Context from previous research 

701 

702 Returns: 

703 List of search results in LDR format 

704 """ 

705 try: 

706 # Get previews 

707 previews = self._get_previews(query) 

708 

709 if not previews: 

710 return [] 

711 

712 # Apply LLM filtering if available 

713 if (  # 713 ↛ 718: didn't jump to line 718 because the condition was never true

714 self.llm 

715 and hasattr(self, "_content_filters") 

716 and self._content_filters 

717 ): 

718 filtered_previews = self._apply_content_filters(previews, query) 

719 else: 

720 filtered_previews = previews 

721 

722 # Get full content for relevant items 

723 results = self._get_full_content(filtered_previews) 

724 

725 logger.info( 

726 f"Search completed successfully, returning {len(results)} results" 

727 ) 

728 # Enhanced logging to track document structure for citation debugging 

729 for i, r in enumerate(results[:3], 1): 

730 logger.info( 

731 f"Result {i}: title='{r.get('title', '')[:50]}...', " 

732 f"has_full_content={bool(r.get('full_content'))}, " 

733 f"full_content_len={len(r.get('full_content', ''))}, " 

734 f"snippet_len={len(r.get('snippet', ''))}, " 

735 f"url={r.get('url', '')[:50]}" 

736 ) 

737 

738 return results 

739 

740 except Exception: 

741 logger.exception("Error in Paperless-ngx search") 

742 return [] 

743 

744 async def arun(self, query: str) -> List[Dict[str, Any]]: 

745 """ 

746 Async version of search. 

747 

748 Currently falls back to sync version. 

749 """ 

750 return self.run(query) 

751 

752 def test_connection(self) -> bool: 

753 """ 

754 Test the connection to Paperless-ngx. 

755 

756 Returns: 

757 True if connection successful, False otherwise 

758 """ 

759 try: 

760 response = self._make_request("/api/") 

761 return bool(response) 

762 except Exception: 

763 logger.exception("Failed to connect to Paperless-ngx") 

764 return False 

765 

766 def get_document_count(self) -> int: 

767 """ 

768 Get the total number of documents in Paperless-ngx. 

769 

770 Returns: 

771 Number of documents, or -1 if error 

772 """ 

773 try: 

774 response = self._make_request( 

775 "/api/documents/", params={"page_size": 1} 

776 ) 

777 return response.get("count", -1) 

778 except Exception: 

779 return -1
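
Taken together, an end-to-end use of the engine might look like this hedged sketch; the URL and token are placeholders, and the result fields (title, url, snippet) match the previews built above:

    from local_deep_research.web_search_engines.engines.search_engine_paperless import (
        PaperlessSearchEngine,
    )

    engine = PaperlessSearchEngine(
        api_url="http://localhost:8000",  # placeholder
        api_key="example-token",  # placeholder
        max_results=5,
    )

    if engine.test_connection():
        print("Documents in archive:", engine.get_document_count())
        for result in engine.run("insurance policy"):
            print(result["title"], "->", result["url"])
            print(result["snippet"][:120], "...")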