Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serper.py: 83%
144 statements
« prev ^ index » next — coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1from loguru import logger
2from typing import Any, Dict, List, Optional
3import requests
4from urllib.parse import urlparse
6from langchain_core.language_models import BaseLLM
8from ..search_engine_base import BaseSearchEngine
9from ..rate_limiting import RateLimitError
10from ...security import safe_post
class SerperSearchEngine(BaseSearchEngine):
    """Google search engine implementation using Serper API with two-phase approach.

    Phase 1 (_get_previews) fetches lightweight result previews from the
    Serper API; phase 2 (_get_full_content) optionally expands the relevant
    previews to full webpage content. The phase orchestration itself lives
    in BaseSearchEngine.run().
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    # Class constants
    BASE_URL = "https://google.serper.dev/search"
    DEFAULT_TIMEOUT = 30
    DEFAULT_REGION = "us"
    DEFAULT_LANGUAGE = "en"

    def __init__(
        self,
        max_results: int = 10,
        region: str = DEFAULT_REGION,
        time_period: Optional[str] = None,
        safe_search: bool = True,
        search_language: str = DEFAULT_LANGUAGE,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Serper search engine.

        Args:
            max_results: Maximum number of search results (default 10)
            region: Country code for localized results (e.g., 'us', 'gb', 'fr')
            time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time)
            safe_search: Whether to enable safe search
            search_language: Language code for results (e.g., 'en', 'es', 'fr')
            api_key: Serper API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key is provided and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.region = region
        self.time_period = time_period
        self.safe_search = safe_search
        self.search_language = search_language

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        serper_api_key = api_key
        if not serper_api_key:
            serper_api_key = get_setting_from_snapshot(
                "search.engine.web.serper.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not serper_api_key:
            raise ValueError(
                "Serper API key not found. Please provide api_key parameter or set it in the UI settings."
            )

        self.api_key = serper_api_key
        self.base_url = self.BASE_URL
        # Note: self.engine_type is automatically set by parent BaseSearchEngine class

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=None,  # We'll handle the search ourselves
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Serper API.

        Also captures per-query metadata (knowledge graph, related searches,
        "people also ask") on the instance for later use.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on non-rate-limit errors)

        Raises:
            RateLimitError: If the API reports a rate limit (HTTP 429 or a
                rate-limit pattern in a request error message).
        """
        logger.info("Getting search results from Serper API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        try:
            # Build request payload
            payload = {
                "q": query,
                "num": self.max_results,
                "gl": self.region,
                "hl": self.search_language,
            }

            # Add optional parameters
            if self.time_period:
                # Map time periods to Serper's format
                time_mapping = {
                    "day": "d",
                    "week": "w",
                    "month": "m",
                    "year": "y",
                }
                if self.time_period in time_mapping:
                    payload["tbs"] = f"qdr:{time_mapping[self.time_period]}"

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }

            response = safe_post(
                self.base_url,
                headers=headers,
                json=payload,
                timeout=self.DEFAULT_TIMEOUT,
            )

            # Check for rate limits
            if response.status_code == 429:
                raise RateLimitError(
                    f"Serper rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic", [])

            # Format results as previews
            previews = []
            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                display_link = ""
                link = result.get("link", "")
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = parsed_url.netloc or ""
                    except Exception:
                        logger.debug(
                            f"Failed to parse URL for display: {link[:50]}"
                        )
                        display_link = ""

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                }

                # Store full Serper result for later
                preview["_full_result"] = result

                # Only include optional fields if present to avoid None values
                # This keeps the preview dict cleaner and saves memory
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                if "date" in result:
                    preview["date"] = result["date"]

                if "attributes" in result:
                    preview["attributes"] = result["attributes"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Also store knowledge graph if available
            if "knowledgeGraph" in data:
                self._knowledge_graph = data["knowledgeGraph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}"
                )

            # Store related searches and people also ask
            if "relatedSearches" in data:
                self._related_searches = data["relatedSearches"]

            if "peopleAlsoAsk" in data:
                self._people_also_ask = data["peopleAlsoAsk"]

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception("Error getting Serper API results")

            # Check for rate limit patterns in error message
            if any(
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"Serper rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting Serper API results")
            return []

    def _finalize_results(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Convert preview dictionaries into final results without full content.

        Prefers the raw Serper payload captured in _get_previews, strips the
        internal '_full_result' bookkeeping field, and attaches the knowledge
        graph (if one was found) to the first result.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of cleaned result dictionaries
        """
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:
                result = item["_full_result"].copy()
            else:
                result = item.copy()

            # Clean up temporary fields
            result.pop("_full_result", None)

            results.append(result)

        # Include knowledge graph and other metadata on the first result
        if results and self._knowledge_graph:
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results; otherwise (or on
        retrieval failure) returns the cleaned Serper results as-is.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        from ...config import search_config

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            # Return the relevant items with their full Serper information
            return self._finalize_results(relevant_items)

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                return self.full_search._get_full_content(relevant_items)
            except Exception:
                logger.exception("Error retrieving full content")
                # Fall back to returning the items without full content

        # Return items with their full Serper information
        return self._finalize_results(relevant_items)

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using Serper API with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using Serper API (Google)---")

        # Use the implementation from the parent class which handles all phases
        # Note: super().run() internally calls our _get_previews() method
        results = super().run(query, research_context=research_context)

        # Clean up temporary attributes
        if hasattr(self, "_search_results"):
            del self._search_results
        # Reset (rather than delete) the per-query attributes that __init__
        # guarantees to exist, so the instance stays consistent between runs.
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        return results