Coverage for src / local_deep_research / advanced_search_system / filters / cross_engine_filter.py: 98%

74 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1""" 

2Cross-engine search result filter implementation. 

3""" 

4 

5from typing import Dict, List 

6 

7from loguru import logger 

8 

9from ...utilities.json_utils import extract_json, get_llm_response_text 

10from .base_filter import BaseFilter 

11 

12 

class CrossEngineFilter(BaseFilter):
    """Filter that ranks and filters results from multiple search engines.

    A language model is shown a short preview of the combined results and
    asked for a JSON array of the indices it considers relevant, most
    relevant first. Whenever the model is missing, the result list is small,
    the response cannot be parsed, or an error occurs, the original results
    are returned (truncated to ``max_results``) instead of failing.
    """

    # Result lists of this size or smaller are returned without asking the model.
    _MIN_RESULTS_TO_FILTER = 10
    # Upper bound on how many results are previewed in the prompt.
    _MAX_CONTEXT_ITEMS = 30
    # Snippets longer than this are cut and suffixed with "..." in the prompt.
    _MAX_SNIPPET_CHARS = 200
    # Number of original results returned when the model keeps nothing.
    _FALLBACK_RESULT_COUNT = 10
    # Used when no usable ``search.cross_engine_max_results`` setting exists.
    _DEFAULT_MAX_RESULTS = 100

    def __init__(
        self,
        model,
        max_results=None,
        default_reorder=True,
        default_reindex=True,
        settings_snapshot=None,
    ):
        """
        Initialize the cross-engine filter.

        Args:
            model: Language model to use for relevance assessment
            max_results: Maximum number of results to keep after filtering.
                When None, the value is read from the
                "search.cross_engine_max_results" setting (default 100).
            default_reorder: Default setting for reordering results by relevance
            default_reindex: Default setting for reindexing results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(model)
        if max_results is None:
            # Imported lazily from thread_settings to avoid database dependencies.
            from ...config.thread_settings import (
                get_setting_from_snapshot,
                NoSettingsContextError,
            )

            try:
                configured = get_setting_from_snapshot(
                    "search.cross_engine_max_results",
                    default=self._DEFAULT_MAX_RESULTS,
                    settings_snapshot=settings_snapshot,
                )
                # The setting may come back as a string; make sure it is an int.
                max_results = (
                    int(configured)
                    if configured is not None
                    else self._DEFAULT_MAX_RESULTS
                )
            except (NoSettingsContextError, TypeError, ValueError):
                max_results = self._DEFAULT_MAX_RESULTS  # Explicit default
        self.max_results = max_results
        self.default_reorder = default_reorder
        self.default_reindex = default_reindex

    def _prepare_and_return(self, results, *, reindex, start_index):
        """Optionally reindex results and return them.

        When ``reindex`` is true, each result dict is updated in place so that
        its "index" key holds the string form of its 1-based position offset
        by ``start_index``.
        """
        if reindex:
            for i, result in enumerate(results):
                result["index"] = str(i + start_index + 1)
        return results

    def _limit(self, items):
        """Return ``items`` truncated to at most ``self.max_results`` entries."""
        return items[: min(self.max_results, len(items))]

    @staticmethod
    def _normalize_indices(raw_indices, upper_bound):
        """Return unique, in-range integer indices in first-seen order.

        Model output is untrusted: entries may be repeated, out of range, or
        not integers at all. Integer-valued floats and numeric strings are
        converted; everything else (including booleans) is skipped so that a
        single bad entry does not discard the whole ranking.
        """
        seen = set()
        cleaned = []
        for raw in raw_indices:
            # bool is a subclass of int; true/false is not a meaningful index.
            if isinstance(raw, bool):
                continue
            if isinstance(raw, float):
                if not raw.is_integer():
                    continue
                idx = int(raw)
            elif isinstance(raw, (int, str)):
                try:
                    idx = int(raw)
                except ValueError:
                    continue
            else:
                continue
            if 0 <= idx < upper_bound and idx not in seen:
                seen.add(idx)
                cleaned.append(idx)
        return cleaned

    def _build_prompt(self, results, query):
        """Build the ranking prompt from a preview of the leading results.

        Only the first ``_MAX_CONTEXT_ITEMS`` results are previewed, to keep
        the prompt at a reasonable length.
        """
        previews = []
        for i, result in enumerate(results[: self._MAX_CONTEXT_ITEMS]):
            # A key may be present with a None value; fall back to the default
            # rather than failing on ``None.strip()``.
            title = str(result.get("title") or "Untitled").strip()
            snippet = str(result.get("snippet") or "").strip()
            engine = result.get("engine", "Unknown engine")

            # Clean up snippet if too long
            if len(snippet) > self._MAX_SNIPPET_CHARS:
                snippet = snippet[: self._MAX_SNIPPET_CHARS] + "..."

            previews.append(
                f"[{i}] Engine: {engine} | Title: {title}\nSnippet: {snippet}"
            )

        context = "\n\n".join(previews)

        return f"""You are a search result filter. Your task is to rank search results from multiple engines by relevance to a query.

Query: "{query}"

Search Results:
{context}

Return the search results as a JSON array of indices, ranked from most to least relevant to the query.
Only include indices of results that are actually relevant to the query.
For example: [3, 0, 7, 1]

If no results seem relevant to the query, return an empty array: []"""

    def filter_results(
        self,
        results: List[Dict],
        query: str,
        reorder=None,
        reindex=None,
        start_index=0,
        **kwargs,
    ) -> List[Dict]:
        """
        Filter and rank search results from multiple engines by relevance.

        Args:
            results: Combined list of search results from all engines
            query: The original search query
            reorder: Whether to reorder results by relevance (default: use instance default)
            reindex: Whether to update result indices after filtering (default: use instance default)
            start_index: Starting index for the results (used for continuous indexing)
            **kwargs: Additional parameters (unused here)

        Returns:
            Filtered list of search results. Each result appears at most once.
            If the model keeps nothing, the first 10 original results are
            returned; if the model response is unusable or an error occurs,
            the original results truncated to ``max_results`` are returned.
        """
        # Use instance defaults if not specified
        if reorder is None:
            reorder = self.default_reorder
        if reindex is None:
            reindex = self.default_reindex

        def finish(items):
            return self._prepare_and_return(
                items, reindex=reindex, start_index=start_index
            )

        # Don't filter if there is no model or only a few results
        if not self.model or len(results) <= self._MIN_RESULTS_TO_FILTER:
            return finish(self._limit(results))

        prompt = self._build_prompt(results, query)

        try:
            # Get LLM's evaluation
            response = self.model.invoke(prompt)
            response_text = get_llm_response_text(response)
            ranked_indices = extract_json(response_text, expected_type=list)

            if ranked_indices is None:
                logger.info(
                    "Could not find JSON array in response, returning original results"
                )
                return finish(self._limit(results))

            # NOTE(review): indices are validated against the full result
            # list even though only the first _MAX_CONTEXT_ITEMS were shown
            # to the model — confirm whether that is intended.
            indices = self._normalize_indices(ranked_indices, len(results))

            if not reorder:
                # Keep only the relevant results, in their original order
                indices.sort()
                final_results = self._limit([results[i] for i in indices])

                if not final_results:
                    logger.info(
                        "Cross-engine filtering removed all "
                        "results, returning top 10 originals"
                    )
                    return finish(results[: self._FALLBACK_RESULT_COUNT])

                logger.info(
                    f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results without reordering"
                )
                return finish(final_results)

            # Reordering: follow the model's ranking
            ranked_results = [results[i] for i in indices]

            # If filtering removed everything, return top results
            if not ranked_results:
                logger.info(
                    "Cross-engine filtering removed all results, returning top 10 originals instead"
                )
                return finish(results[: self._FALLBACK_RESULT_COUNT])

            final_results = self._limit(ranked_results)

            logger.info(
                f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results with reordering={reorder}, reindex={reindex}"
            )
            return finish(final_results)

        except Exception:
            # Best-effort boundary: any failure falls back to the originals.
            logger.exception("Cross-engine filtering error")
            return finish(self._limit(results))