Coverage for src / local_deep_research / web_search_engines / engines / search_engine_retriever.py: 100%

58 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Search engine implementation that wraps any LangChain retriever. 

3This allows using vector stores, databases, or any custom retriever as a search source in LDR. 

4""" 

5 

6from typing import Any, Dict, List 

7 

8from langchain_core.retrievers import BaseRetriever, Document 

9from loguru import logger 

10 

11from ...constants import SNIPPET_LENGTH_LONG 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

class RetrieverSearchEngine(BaseSearchEngine):
    """
    Search engine that uses any LangChain retriever.

    This allows users to plug in any LangChain retriever (vector stores,
    databases, custom implementations) and use it as a search engine in LDR.
    """

    def __init__(
        self,
        retriever: BaseRetriever,
        max_results: int = 10,
        name: str | None = None,
        **kwargs,
    ):
        """
        Initialize the retriever-based search engine.

        Args:
            retriever: Any LangChain BaseRetriever instance
            max_results: Maximum number of results to return
            name: Display name for this retriever (defaults to retriever class name)
            **kwargs: Additional parameters passed to parent
        """
        super().__init__(max_results=max_results, **kwargs)
        self.retriever = retriever
        # Only fall back to the class name when no name was given at all;
        # an explicitly passed empty string is kept as-is.
        self.name = name if name is not None else retriever.__class__.__name__

    def _convert_documents(self, docs: List[Document]) -> List[Dict[str, Any]]:
        """
        Convert retrieved documents to LDR results, capped at ``max_results``.

        Shared by ``run``, ``_get_previews`` and ``arun`` so the three entry
        points truncate and convert documents in exactly the same way.

        Args:
            docs: Documents returned by the retriever

        Returns:
            List of search results in LDR format
        """
        return [
            self._convert_document_to_result(doc, index)
            for index, doc in enumerate(docs[: self.max_results])
        ]

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute search using the LangChain retriever.

        Args:
            query: Search query
            research_context: Context from previous research to use.
                Accepted for interface compatibility; not read by this method.

        Returns:
            List of search results in LDR format. An empty list is returned
            if the retriever or the conversion raises.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert LangChain documents to LDR search result format
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} results for query: {query}"
            )
            return results

        except Exception:
            # Best-effort: log the traceback and report "no results" instead
            # of propagating the failure to the caller.
            logger.exception("Error in retriever search")
            return []

    def _convert_document_to_result(
        self, doc: Document, index: int
    ) -> Dict[str, Any]:
        """
        Convert a LangChain Document to LDR search result format.

        Args:
            doc: LangChain Document
            index: Result index (0-based position in the result list)

        Returns:
            Search result in LDR format
        """
        # Extract metadata (treat a missing/empty metadata mapping as {})
        metadata = doc.metadata or {}

        # Build the result
        result = {
            # Required fields for LDR
            "title": metadata.get("title", f"Document {index + 1}"),
            # Prefer "source", then "url", then a synthetic retriever:// URL
            "url": metadata.get(
                "source",
                metadata.get("url", f"retriever://{self.name}/doc_{index}"),
            ),
            "snippet": doc.page_content[:SNIPPET_LENGTH_LONG]
            if doc.page_content
            else "",
            # Optional fields
            "full_content": doc.page_content,
            "author": metadata.get("author", ""),
            "date": metadata.get("date", ""),
            # Include all metadata for flexibility
            "metadata": metadata,
            # Score if available
            "score": metadata.get("score", 1.0),
            # Source information
            "source": self.name,
            "retriever_type": self.retriever.__class__.__name__,
        }

        return result

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from the retriever.

        Args:
            query: Search query

        Returns:
            List of preview dictionaries (same shape as ``run`` results).
            An empty list is returned if the retriever or the conversion raises.
        """
        try:
            # Use the retriever to get relevant documents
            docs = self.retriever.invoke(query)

            # Convert to preview format
            previews = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(previews)} previews for query: {query}"
            )
            return previews

        except Exception:
            logger.exception("Error getting previews from retriever")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        For retrievers, previews already contain full content.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            Same list (mutated in place) with 'full_content' filled in from
            'snippet' for any item that lacks it
        """
        # For retrievers, the preview already contains the full content
        # Just ensure the 'full_content' field is present
        for item in relevant_items:
            if "full_content" not in item and "snippet" in item:
                item["full_content"] = item["snippet"]
        return relevant_items

    async def arun(self, query: str) -> List[Dict[str, Any]]:
        """
        Async version of search using the retriever.

        Tries ``ainvoke`` first (the async counterpart of the ``invoke`` call
        used by ``run``), then the legacy ``aget_relevant_documents``, and
        finally falls back to the synchronous ``run``.

        Args:
            query: Search query

        Returns:
            List of search results in LDR format. An empty list is returned
            if the retriever or the conversion raises.
        """
        try:
            if hasattr(self.retriever, "ainvoke"):
                # Preferred: mirrors the sync path, which calls invoke()
                docs = await self.retriever.ainvoke(query)
            elif hasattr(self.retriever, "aget_relevant_documents"):
                # Legacy async API, kept for retrievers exposing only this
                docs = await self.retriever.aget_relevant_documents(query)
            else:
                # Fall back to sync version
                logger.debug(
                    f"Retriever '{self.name}' doesn't support async, using sync version"
                )
                return self.run(query)

            # Convert documents to results
            results = self._convert_documents(docs)

            logger.info(
                f"Retriever '{self.name}' returned {len(results)} async results for query: {query}"
            )
            return results

        except Exception:
            logger.exception("Error in async retriever search")
            return []