Coverage for src / local_deep_research / advanced_search_system / filters / cross_engine_filter.py: 98%
74 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1"""
2Cross-engine search result filter implementation.
3"""
5from typing import Dict, List
7from loguru import logger
9from ...utilities.json_utils import extract_json, get_llm_response_text
10from .base_filter import BaseFilter
class CrossEngineFilter(BaseFilter):
    """Filter that ranks and filters results from multiple search engines.

    When a model is configured and more than 10 results are supplied, the
    model is asked to return a JSON array of result indices ordered by
    relevance. Results are then filtered (and optionally reordered and
    reindexed) according to that array. Any failure falls back to returning
    the original results, truncated to ``max_results``.
    """

    def __init__(
        self,
        model,
        max_results=None,
        default_reorder=True,
        default_reindex=True,
        settings_snapshot=None,
    ):
        """
        Initialize the cross-engine filter.

        Args:
            model: Language model to use for relevance assessment
            max_results: Maximum number of results to keep after filtering.
                When None, the value is read from the
                "search.cross_engine_max_results" setting (default 100).
            default_reorder: Default setting for reordering results by relevance
            default_reindex: Default setting for reindexing results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(model)
        # Get max_results from settings if not provided
        if max_results is None:
            # Import from thread_settings to avoid database dependencies
            from ...config.thread_settings import (
                get_setting_from_snapshot,
                NoSettingsContextError,
            )

            try:
                max_results = get_setting_from_snapshot(
                    "search.cross_engine_max_results",
                    default=100,
                    settings_snapshot=settings_snapshot,
                )
                # The setting may come back as None or a non-int value;
                # normalise it so slicing below always gets an integer.
                max_results = 100 if max_results is None else int(max_results)
            except (NoSettingsContextError, TypeError, ValueError):
                max_results = 100  # Explicit default
        self.max_results = max_results
        self.default_reorder = default_reorder
        self.default_reindex = default_reindex

    def _prepare_and_return(self, results, *, reindex, start_index):
        """Optionally reindex results and return them.

        When ``reindex`` is true, each result dict is mutated in place: its
        "index" key is set to the string form of its 1-based position offset
        by ``start_index``.

        Args:
            results: List of result dicts to return.
            reindex: Whether to overwrite each result's "index" value.
            start_index: Offset added to every generated index.

        Returns:
            The same ``results`` list that was passed in.
        """
        if reindex:
            for position, result in enumerate(results):
                result["index"] = str(position + start_index + 1)
        return results

    @staticmethod
    def _valid_unique_indices(raw_indices: List, total: int) -> List[int]:
        """Normalise model-supplied indices into unique, in-range integers.

        Integers are used as-is; integral floats (``3.0``) and decimal
        strings (``"3"``) are coerced. Booleans and every other value are
        skipped. Order of first appearance is preserved and repeated indices
        are dropped so that no result can be selected twice.

        Args:
            raw_indices: Items parsed from the model's JSON array.
            total: Number of available results; valid indices are
                ``0 <= idx < total``.

        Returns:
            List of unique valid indices in their original relative order.
        """
        seen = set()
        indices = []
        for raw in raw_indices:
            # bool is a subclass of int; True/False are not meaningful indices.
            if isinstance(raw, bool):
                continue
            if isinstance(raw, int):
                idx = raw
            elif isinstance(raw, float) and raw.is_integer():
                idx = int(raw)
            elif isinstance(raw, str) and raw.strip().isdecimal():
                idx = int(raw.strip())
            else:
                continue
            if 0 <= idx < total and idx not in seen:
                seen.add(idx)
                indices.append(idx)
        return indices

    def filter_results(
        self,
        results: List[Dict],
        query: str,
        reorder=None,
        reindex=None,
        start_index=0,
        **kwargs,
    ) -> List[Dict]:
        """
        Filter and rank search results from multiple engines by relevance.

        Fallback behaviour:
            * No model, or 10 or fewer results: results are returned
              unfiltered (truncated to ``max_results``).
            * The model response contains no JSON array, or any exception is
              raised while filtering: original results truncated to
              ``max_results``.
            * The model's ranking selects nothing: the first 10 original
              results.

        Args:
            results: Combined list of search results from all engines
            query: The original search query
            reorder: Whether to reorder results by relevance (default: use instance default)
            reindex: Whether to update result indices after filtering (default: use instance default)
            start_index: Starting index for the results (used for continuous indexing)
            **kwargs: Additional parameters (unused here)

        Returns:
            Filtered list of search results
        """
        # Use instance defaults if not specified
        if reorder is None:
            reorder = self.default_reorder
        if reindex is None:
            reindex = self.default_reindex

        def finish(selected):
            # Single exit path so every return applies the same reindexing.
            return self._prepare_and_return(
                selected,
                reindex=reindex,
                start_index=start_index,
            )

        if not self.model or len(results) <= 10:  # Don't filter if few results
            return finish(results[: self.max_results])

        # Create context for LLM. Only the first 30 results are shown to the
        # model, so only those previews are built.
        preview_context = []
        for i, result in enumerate(results[:30]):
            # `or` guards against keys that are present but set to None.
            title = str(result.get("title") or "Untitled").strip()
            snippet = str(result.get("snippet") or "").strip()
            engine = result.get("engine", "Unknown engine")

            # Clean up snippet if too long
            if len(snippet) > 200:
                snippet = snippet[:200] + "..."

            preview_context.append(
                f"[{i}] Engine: {engine} | Title: {title}\nSnippet: {snippet}"
            )

        context = "\n\n".join(preview_context)

        prompt = f"""You are a search result filter. Your task is to rank search results from multiple engines by relevance to a query.

Query: "{query}"

Search Results:
{context}

Return the search results as a JSON array of indices, ranked from most to least relevant to the query.
Only include indices of results that are actually relevant to the query.
For example: [3, 0, 7, 1]

If no results seem relevant to the query, return an empty array: []"""

        try:
            # Get LLM's evaluation
            response = self.model.invoke(prompt)
            response_text = get_llm_response_text(response)
            ranked_indices = extract_json(response_text, expected_type=list)

            if ranked_indices is None:
                logger.info(
                    "Could not find JSON array in response, returning original results"
                )
                return finish(results[: self.max_results])

            # Drop malformed, out-of-range and repeated indices up front so a
            # single bad entry cannot discard the whole ranking or duplicate
            # a result in the output.
            valid_indices = self._valid_unique_indices(
                ranked_indices, len(results)
            )

            if not reorder:
                # Just keep the results that were deemed relevant, sorted to
                # maintain the original order.
                filtered_results = [results[idx] for idx in sorted(valid_indices)]

                # Limit results if needed
                final_results = filtered_results[: self.max_results]

                if not final_results:
                    logger.info(
                        "Cross-engine filtering removed all "
                        "results, returning top 10 originals"
                    )
                    return finish(results[:10])

                logger.info(
                    f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results without reordering"
                )
                return finish(final_results)

            # Create ranked results list (reordering)
            ranked_results = [results[idx] for idx in valid_indices]

            # If filtering removed everything, return top results
            if not ranked_results:
                logger.info(
                    "Cross-engine filtering removed all results, returning top 10 originals instead"
                )
                return finish(results[:10])

            # Limit results if needed
            final_results = ranked_results[: self.max_results]

            logger.info(
                f"Cross-engine filtering kept {len(final_results)} out of {len(results)} results with reordering={reorder}, reindex={reindex}"
            )
            return finish(final_results)

        except Exception:
            # Deliberate best-effort: filtering must never lose the caller's
            # results, so log and fall back to the unfiltered list.
            logger.exception("Cross-engine filtering error")
            return finish(results[: self.max_results])