Coverage for src / local_deep_research / advanced_search_system / filters / cross_engine_filter.py: 98%

78 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1""" 

2Cross-engine search result filter implementation. 

3""" 

4 

5from typing import Dict, List 

6 

7from loguru import logger 

8 

9from ...utilities.json_utils import extract_json, get_llm_response_text 

10from .base_filter import BaseFilter 

11 

12 

class CrossEngineFilter(BaseFilter):
    """Filter that ranks and filters results from multiple search engines.

    A language model is shown short previews of the combined results and is
    asked for a JSON array of result indices ordered by relevance. Every
    failure path (no model, unparsable reply, exception) falls back to
    returning the original results capped at ``max_results``.
    """

    # Below or at this many results the LLM is not consulted at all.
    _MIN_RESULTS_FOR_FILTERING = 10
    # Number of original results returned when the LLM keeps nothing usable.
    _FALLBACK_RESULT_COUNT = 10
    # Snippets longer than this are truncated (with "...") in the prompt.
    _MAX_SNIPPET_CHARS = 200

    def __init__(
        self,
        model,
        max_results=None,
        default_reorder=True,
        default_reindex=True,
        settings_snapshot=None,
    ):
        """
        Initialize the cross-engine filter.

        Args:
            model: Language model to use for relevance assessment
            max_results: Maximum number of results to keep after filtering.
                When None, read from the "search.cross_engine_max_results"
                setting, falling back to 100.
            default_reorder: Default setting for reordering results by relevance
            default_reindex: Default setting for reindexing results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(model)
        # Import from thread_settings to avoid database dependencies
        from ...config.thread_settings import (
            get_setting_from_snapshot,
            NoSettingsContextError,
        )

        # Get max_results from settings if not provided; any lookup or
        # conversion problem falls back to 100.
        if max_results is None:
            try:
                max_results = get_setting_from_snapshot(
                    "search.cross_engine_max_results",
                    default=100,
                    settings_snapshot=settings_snapshot,
                )
                max_results = 100 if max_results is None else int(max_results)
            except (NoSettingsContextError, TypeError, ValueError):
                max_results = 100
        self.max_results = max_results

        # Max number of result previews shown to the LLM for relevance ranking.
        # Higher values let the LLM evaluate more candidates but increase prompt
        # size and latency.
        try:
            self.max_context_items = int(
                get_setting_from_snapshot(
                    "search.cross_engine_max_context_items",
                    default=30,
                    settings_snapshot=settings_snapshot,
                )
            )
        except (NoSettingsContextError, TypeError, ValueError):
            self.max_context_items = 30

        self.default_reorder = default_reorder
        self.default_reindex = default_reindex

    def _prepare_and_return(self, results, *, reindex, start_index):
        """Optionally reindex results and return them.

        When ``reindex`` is true, each result's "index" key is overwritten in
        place with the 1-based position (as a string) offset by
        ``start_index``.
        """
        if reindex:
            for i, result in enumerate(results):
                result["index"] = str(i + start_index + 1)
        return results

    @staticmethod
    def _clean_indices(raw_indices, limit):
        """Return the usable indices from an LLM reply, in reply order.

        Keeps each index in ``[0, limit)`` once (first occurrence wins).
        Integral floats (``3.0``) and decimal strings (``"3"``) are coerced to
        int; booleans and anything else are skipped rather than failing the
        whole ranking.
        """
        seen = set()
        cleaned = []
        for raw in raw_indices:
            # bool is a subclass of int; True/False are never real indices.
            if isinstance(raw, bool):
                continue
            if isinstance(raw, float) and raw.is_integer():
                raw = int(raw)
            elif isinstance(raw, str) and raw.strip().isdecimal():
                raw = int(raw.strip())
            if not isinstance(raw, int):
                continue
            if 0 <= raw < limit and raw not in seen:
                seen.add(raw)
                cleaned.append(raw)
        return cleaned

    def filter_results(
        self,
        results: List[Dict],
        query: str,
        reorder=None,
        reindex=None,
        start_index=0,
        **kwargs,
    ) -> List[Dict]:
        """
        Filter and rank search results from multiple engines by relevance.

        Args:
            results: Combined list of search results from all engines
            query: The original search query
            reorder: Whether to reorder results by relevance (default: use instance default)
            reindex: Whether to update result indices after filtering (default: use instance default)
            start_index: Starting index for the results (used for continuous indexing)
            **kwargs: Additional parameters (unused here)

        Returns:
            Filtered list of search results. Falls back to the original
            results (capped at ``max_results``) when there is no model, there
            are 10 or fewer results, the reply contains no JSON array, or an
            error occurs; falls back to the first 10 originals when the model
            keeps no valid result.
        """
        # Use instance defaults if not specified
        if reorder is None:
            reorder = self.default_reorder
        if reindex is None:
            reindex = self.default_reindex

        def finish(selected):
            return self._prepare_and_return(
                selected, reindex=reindex, start_index=start_index
            )

        # Don't filter if there is no model or only a few results.
        if not self.model or len(results) <= self._MIN_RESULTS_FOR_FILTERING:
            return finish(results[: self.max_results])

        try:
            # Build previews only for the results that will actually be shown
            # to the LLM. Missing or None fields fall back to placeholders so
            # a sparse result cannot abort the whole filtering step.
            preview_context = []
            for i, result in enumerate(results[: self.max_context_items]):
                title = str(result.get("title") or "Untitled").strip()
                snippet = str(result.get("snippet") or "").strip()
                engine = result.get("engine", "Unknown engine")

                # Clean up snippet if too long
                if len(snippet) > self._MAX_SNIPPET_CHARS:
                    snippet = snippet[: self._MAX_SNIPPET_CHARS] + "..."

                preview_context.append(
                    f"[{i}] Engine: {engine} | Title: {title}\nSnippet: {snippet}"
                )

            context = "\n\n".join(preview_context)

            prompt = f"""You are a search result filter. Your task is to rank search results from multiple engines by relevance to a query.

Query: "{query}"

Search Results:
{context}

Return the search results as a JSON array of indices, ranked from most to least relevant to the query.
Only include indices of results that are actually relevant to the query.
For example: [3, 0, 7, 1]

If no results seem relevant to the query, return an empty array: []"""

            # Get LLM's evaluation
            response = self.model.invoke(prompt)
            response_text = get_llm_response_text(response)
            ranked_indices = extract_json(response_text, expected_type=list)

            if ranked_indices is None:
                logger.info(
                    "Could not find JSON array in response, returning original results"
                )
                return finish(results[: self.max_results])

            # Drop duplicates / out-of-range / non-integer entries. The range
            # check is against len(results) to stay compatible with callers.
            kept_indices = self._clean_indices(ranked_indices, len(results))
            if not reorder:
                # Keep only the relevant results, in their original order.
                kept_indices.sort()
            selected = [results[idx] for idx in kept_indices]

            # If filtering removed everything, return top original results.
            if not selected:
                if reorder:
                    logger.info(
                        "Cross-engine filtering removed all results, returning top 10 originals instead"
                    )
                else:
                    logger.info(
                        "Cross-engine filtering removed all "
                        "results, returning top 10 originals"
                    )
                return finish(results[: self._FALLBACK_RESULT_COUNT])

            # Limit results if needed
            final_results = selected[: self.max_results]

            if reorder:
                logger.info(
                    f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results with reordering={reorder}, reindex={reindex}"
                )
            else:
                logger.info(
                    f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results without reordering"
                )
            return finish(final_results)

        except Exception:
            # Best-effort step: never let ranking failures lose the results.
            logger.exception("Cross-engine filtering error")
            return finish(results[: self.max_results])

232 reindex=reindex, 

233 start_index=start_index, 

234 )