Coverage for src / local_deep_research / web_search_engines / engines / search_engine_retriever.py: 100%
57 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""
2Search engine implementation that wraps any LangChain retriever.
3This allows using vector stores, databases, or any custom retriever as a search source in LDR.
4"""
6from typing import Any, Dict, List, Optional
8from langchain_core.retrievers import BaseRetriever, Document
9from loguru import logger
11from ...constants import SNIPPET_LENGTH_LONG
12from ..search_engine_base import BaseSearchEngine
class RetrieverSearchEngine(BaseSearchEngine):
    """
    Search engine that uses any LangChain retriever.

    This allows users to plug in any LangChain retriever (vector stores,
    databases, custom implementations) and use it as a search engine in LDR.
    Documents returned by the retriever are converted to plain result
    dictionaries by ``_convert_document_to_result``.
    """

    def __init__(
        self,
        retriever: BaseRetriever,
        max_results: int = 10,
        name: str | None = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the retriever-based search engine.

        Args:
            retriever: Any LangChain BaseRetriever instance
            max_results: Maximum number of results to return
            name: Display name for this retriever (defaults to retriever class name)
            settings_snapshot: Optional settings mapping, forwarded unchanged
                to the parent constructor
            **kwargs: Additional parameters passed to parent
        """
        super().__init__(
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )
        self.retriever = retriever
        self.name = name if name is not None else retriever.__class__.__name__

    def _convert_documents(self, docs) -> List[Dict[str, Any]]:
        """
        Convert the first ``self.max_results`` documents to LDR results.

        Args:
            docs: Sliceable sequence of LangChain Documents

        Returns:
            List of search results in LDR format, in retriever order
        """
        return [
            self._convert_document_to_result(doc, position)
            for position, doc in enumerate(docs[: self.max_results])
        ]

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute search using the LangChain retriever.

        Args:
            query: Search query
            research_context: Context from previous research to use.
                Accepted for interface compatibility; this method does not
                read it.

        Returns:
            List of search results in LDR format. Any exception raised while
            retrieving or converting is logged and an empty list is returned.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert LangChain documents to LDR search result format
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} results for query: {query}"
            )
            return results

        except Exception:
            # Best-effort search: a failing retriever yields no results
            # instead of propagating the error to the caller.
            logger.exception("Error in retriever search")
            return []

    def _convert_document_to_result(
        self, doc: Document, index: int
    ) -> Dict[str, Any]:
        """
        Convert a LangChain Document to LDR search result format.

        Args:
            doc: LangChain Document
            index: Zero-based result index, used to build fallback title/URL

        Returns:
            Search result in LDR format
        """
        # Extract metadata (treat a missing/empty mapping as no metadata)
        metadata = doc.metadata or {}

        # Build the result
        return {
            # Required fields for LDR
            "title": metadata.get("title", f"Document {index + 1}"),
            # Prefer "source", then "url", then a synthetic retriever:// URL
            "url": metadata.get(
                "source",
                metadata.get("url", f"retriever://{self.name}/doc_{index}"),
            ),
            "snippet": doc.page_content[:SNIPPET_LENGTH_LONG]
            if doc.page_content
            else "",
            # Optional fields
            "full_content": doc.page_content,
            "author": metadata.get("author", ""),
            "date": metadata.get("date", ""),
            # Include all metadata for flexibility
            "metadata": metadata,
            # Score if available
            "score": metadata.get("score", 1.0),
            # Source information
            "source": self.name,
            "retriever_type": self.retriever.__class__.__name__,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from the retriever.

        Args:
            query: Search query

        Returns:
            List of preview dictionaries (same shape as ``run`` results).
            Any exception is logged and an empty list is returned.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert to preview format
            previews = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(previews)} previews for query: {query}"
            )
            return previews

        except Exception:
            logger.exception("Error getting previews from retriever")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        For retrievers, previews already contain full content.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            Same list with full content (already included). Items are
            updated in place, not copied.
        """
        # For retrievers, the preview already contains the full content
        # Just ensure the 'full_content' field is present
        for item in relevant_items:
            if "full_content" not in item and "snippet" in item:
                item["full_content"] = item["snippet"]
        return relevant_items

    async def arun(self, query: str) -> List[Dict[str, Any]]:
        """
        Async version of search using the retriever.

        Args:
            query: Search query

        Returns:
            List of search results in LDR format. Any exception is logged
            and an empty list is returned.
        """
        try:
            # Prefer ``ainvoke`` (the async counterpart of the ``invoke`` call
            # used by ``run``). ``aget_relevant_documents`` is the legacy,
            # deprecated entry point and is kept only as a fallback for
            # retriever objects that expose nothing else.
            async_fetch = None
            for method_name in ("ainvoke", "aget_relevant_documents"):
                candidate = getattr(self.retriever, method_name, None)
                if callable(candidate):
                    async_fetch = candidate
                    break

            if async_fetch is None:
                # Fall back to sync version
                logger.debug(
                    f"Retriever '{self.name}' doesn't support async, using sync version"
                )
                return self.run(query)

            docs = await async_fetch(query)

            # Convert documents to results
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} async results for query: {query}"
            )
            return results

        except Exception:
            logger.exception("Error in async retriever search")
            return []