Coverage for src / local_deep_research / web_search_engines / engines / search_engine_retriever.py: 100%

57 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1""" 

2Search engine implementation that wraps any LangChain retriever. 

3This allows using vector stores, databases, or any custom retriever as a search source in LDR. 

4""" 

5 

6from typing import Any, Dict, List 

7 

8from langchain_core.retrievers import BaseRetriever, Document 

9from loguru import logger 

10 

11from ..search_engine_base import BaseSearchEngine 

12 

13 

class RetrieverSearchEngine(BaseSearchEngine):
    """
    Search engine that uses any LangChain retriever.

    This allows users to plug in any LangChain retriever (vector stores,
    databases, custom implementations) and use it as a search engine in LDR.
    """

    def __init__(
        self,
        retriever: BaseRetriever,
        max_results: int = 10,
        name: str | None = None,
        **kwargs,
    ):
        """
        Initialize the retriever-based search engine.

        Args:
            retriever: Any LangChain BaseRetriever instance
            max_results: Maximum number of results to return
            name: Display name for this retriever (defaults to retriever class name)
            **kwargs: Additional parameters passed to parent
        """
        super().__init__(max_results=max_results, **kwargs)
        self.retriever = retriever
        self.name = name if name is not None else retriever.__class__.__name__

    def _convert_documents(self, docs) -> List[Dict[str, Any]]:
        """
        Convert at most ``self.max_results`` documents to LDR result dicts.

        Args:
            docs: Sliceable sequence of LangChain Documents as returned by
                the retriever

        Returns:
            List of search results in LDR format, in retriever order
        """
        return [
            self._convert_document_to_result(doc, position)
            for position, doc in enumerate(docs[: self.max_results])
        ]

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute search using the LangChain retriever.

        Any exception raised by the retriever or during conversion is logged
        and an empty list is returned instead of propagating.

        Args:
            query: Search query
            research_context: Context from previous research. Accepted for
                interface compatibility; not used by this engine.

        Returns:
            List of search results in LDR format (empty on error)
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} results for query: {query}"
            )
            return results

        except Exception:
            # Best-effort: a failing retriever must not abort the search
            logger.exception("Error in retriever search")
            return []

    def _convert_document_to_result(
        self, doc: Document, index: int
    ) -> Dict[str, Any]:
        """
        Convert a LangChain Document to LDR search result format.

        Args:
            doc: LangChain Document
            index: Zero-based position of the document in the result list,
                used to build fallback title and URL values

        Returns:
            Search result in LDR format
        """
        metadata = doc.metadata or {}
        # Normalise missing content so snippet and full_content agree
        content = doc.page_content or ""

        # Use `or` rather than dict defaults so that keys that are present
        # but empty/None still fall through to the next candidate instead
        # of producing an unusable empty title or URL.
        title = metadata.get("title") or f"Document {index + 1}"
        url = (
            metadata.get("source")
            or metadata.get("url")
            or f"retriever://{self.name}/doc_{index}"
        )

        return {
            # Required fields for LDR
            "title": title,
            "url": url,
            "snippet": content[:500],
            # Optional fields
            "full_content": content,
            "author": metadata.get("author", ""),
            "date": metadata.get("date", ""),
            # Include all metadata for flexibility
            "metadata": metadata,
            # Score if available (0 is a legitimate score, so keep the
            # plain default lookup here)
            "score": metadata.get("score", 1.0),
            # Source information
            "source": self.name,
            "retriever_type": self.retriever.__class__.__name__,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from the retriever.

        Previews use the same dict format as full results, so they already
        carry ``full_content``. Errors are logged and yield an empty list.

        Args:
            query: Search query

        Returns:
            List of preview dictionaries (empty on error)
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)
            previews = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(previews)} previews for query: {query}"
            )
            return previews

        except Exception:
            logger.exception("Error getting previews from retriever")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        For retrievers, previews already contain full content.

        Items are updated in place: any item lacking ``full_content`` but
        having a ``snippet`` gets the snippet copied into ``full_content``.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            The same list object that was passed in
        """
        for item in relevant_items:
            if "full_content" not in item and "snippet" in item:
                item["full_content"] = item["snippet"]
        return relevant_items

    async def arun(self, query: str) -> List[Dict[str, Any]]:
        """
        Async version of search using the retriever.

        Prefers the retriever's ``ainvoke`` coroutine (the async counterpart
        of the ``invoke`` call used by ``run``), then the legacy
        ``aget_relevant_documents`` for older or duck-typed retrievers, and
        finally falls back to the synchronous ``run``.

        Args:
            query: Search query

        Returns:
            List of search results in LDR format (empty on error)
        """
        try:
            async_fetch = getattr(self.retriever, "ainvoke", None)
            if async_fetch is None:
                # Legacy async entry point kept for backward compatibility
                async_fetch = getattr(
                    self.retriever, "aget_relevant_documents", None
                )

            if async_fetch is None:
                # Fall back to sync version
                logger.debug(
                    f"Retriever '{self.name}' doesn't support async, using sync version"
                )
                return self.run(query)

            docs = await async_fetch(query)
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} async results for query: {query}"
            )
            return results

        except Exception:
            logger.exception("Error in async retriever search")
            return []

195 logger.exception("Error in async retriever search") 

196 return []