Coverage for src / local_deep_research / web_search_engines / engines / search_engine_retriever.py: 100%

57 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Search engine implementation that wraps any LangChain retriever. 

3This allows using vector stores, databases, or any custom retriever as a search source in LDR. 

4""" 

5 

6from typing import Any, Dict, List, Optional 

7 

8from langchain_core.retrievers import BaseRetriever, Document 

9from loguru import logger 

10 

11from ...constants import SNIPPET_LENGTH_LONG 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

class RetrieverSearchEngine(BaseSearchEngine):
    """
    Search engine that uses any LangChain retriever.

    This allows users to plug in any LangChain retriever (vector stores,
    databases, custom implementations) and use it as a search engine in LDR.
    Every document returned by the retriever is converted into an LDR result
    dictionary by ``_convert_document_to_result``.
    """

    def __init__(
        self,
        retriever: BaseRetriever,
        max_results: int = 10,
        name: str | None = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the retriever-based search engine.

        Args:
            retriever: Any LangChain BaseRetriever instance
            max_results: Maximum number of results to return
            name: Display name for this retriever (defaults to retriever class name)
            settings_snapshot: Optional settings mapping, forwarded unchanged
                to the parent constructor
            **kwargs: Additional parameters passed to parent
        """
        super().__init__(
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )
        self.retriever = retriever
        self.name = name if name is not None else retriever.__class__.__name__

    def _documents_to_results(
        self, docs: List[Document]
    ) -> List[Dict[str, Any]]:
        """
        Convert retrieved documents to LDR results, capped at ``max_results``.

        Args:
            docs: Documents returned by the retriever, in retriever order

        Returns:
            One result dictionary per document, for at most
            ``self.max_results`` documents
        """
        return [
            self._convert_document_to_result(doc, position)
            for position, doc in enumerate(docs[: self.max_results])
        ]

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute search using the LangChain retriever.

        Args:
            query: Search query
            research_context: Context from previous research. Accepted for
                interface compatibility; this implementation does not read it.

        Returns:
            List of search results in LDR format, or an empty list if the
            retriever (or the conversion) raises.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert LangChain documents to LDR search result format
            results = self._documents_to_results(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} results for query: {query}"
            )
            return results

        except Exception:
            # Best-effort search: log the traceback and report "no results"
            # instead of propagating retriever failures to the caller.
            logger.exception("Error in retriever search")
            return []

    def _convert_document_to_result(
        self, doc: Document, index: int
    ) -> Dict[str, Any]:
        """
        Convert a LangChain Document to LDR search result format.

        Args:
            doc: LangChain Document
            index: Zero-based position of the document in the result list;
                used to build fallback title and URL values

        Returns:
            Search result in LDR format
        """
        # Extract metadata (treat a missing/empty mapping as no metadata)
        metadata = doc.metadata or {}

        # URL preference: metadata "source", then metadata "url", then a
        # synthetic retriever:// identifier derived from name and position.
        fallback_url = metadata.get(
            "url", f"retriever://{self.name}/doc_{index}"
        )

        # Build the result
        return {
            # Required fields for LDR
            "title": metadata.get("title", f"Document {index + 1}"),
            "url": metadata.get("source", fallback_url),
            "snippet": doc.page_content[:SNIPPET_LENGTH_LONG]
            if doc.page_content
            else "",
            # Optional fields
            "full_content": doc.page_content,
            "author": metadata.get("author", ""),
            "date": metadata.get("date", ""),
            # Include all metadata for flexibility
            "metadata": metadata,
            # Score if available
            "score": metadata.get("score", 1.0),
            # Source information
            "source": self.name,
            "retriever_type": self.retriever.__class__.__name__,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from the retriever.

        Args:
            query: Search query

        Returns:
            List of preview dictionaries (same shape as ``run`` results), or
            an empty list if the retriever (or the conversion) raises.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert to preview format
            previews = self._documents_to_results(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(previews)} previews for query: {query}"
            )
            return previews

        except Exception:
            logger.exception("Error getting previews from retriever")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        For retrievers, previews already contain full content.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            The same list object; items lacking 'full_content' but having a
            'snippet' are updated in place to use the snippet as full content.
        """
        # For retrievers, the preview already contains the full content
        # Just ensure the 'full_content' field is present
        for item in relevant_items:
            if "full_content" not in item and "snippet" in item:
                item["full_content"] = item["snippet"]
        return relevant_items

    async def arun(self, query: str) -> List[Dict[str, Any]]:
        """
        Async version of search using the retriever.

        Prefers the retriever's ``ainvoke`` coroutine (the async counterpart
        of the ``invoke`` call used by ``run``), then the legacy
        ``aget_relevant_documents`` coroutine, and finally falls back to the
        synchronous ``run``.

        Args:
            query: Search query

        Returns:
            List of search results in LDR format, or an empty list if the
            retriever (or the conversion) raises.
        """
        try:
            # Use async retriever if available
            if hasattr(self.retriever, "ainvoke"):
                docs = await self.retriever.ainvoke(query)
            elif hasattr(self.retriever, "aget_relevant_documents"):
                # Legacy async API, kept for retriever-like objects that
                # only expose the older method name.
                docs = await self.retriever.aget_relevant_documents(query)
            else:
                # Fall back to sync version
                logger.debug(
                    f"Retriever '{self.name}' doesn't support async, using sync version"
                )
                return self.run(query)

            # Convert documents to results
            results = self._documents_to_results(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} async results for query: {query}"
            )
            return results

        except Exception:
            logger.exception("Error in async retriever search")
            return []