Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serper.py: 72%
138 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
from loguru import logger
from typing import Any, Dict, List, Optional
import requests
from urllib.parse import urlparse

from langchain_core.language_models import BaseLLM

from ..search_engine_base import BaseSearchEngine
from ..rate_limiting import RateLimitError
from ...security import safe_post

class SerperSearchEngine(BaseSearchEngine):
    """Google search engine implementation using Serper API with two-phase approach"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    # Class constants
    BASE_URL = "https://google.serper.dev/search"
    DEFAULT_TIMEOUT = 30
    DEFAULT_REGION = "us"
    DEFAULT_LANGUAGE = "en"

    def __init__(
        self,
        max_results: int = 10,
        region: str = "us",
        time_period: Optional[str] = None,
        safe_search: bool = True,
        search_language: str = "en",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Serper search engine.

        Args:
            max_results: Maximum number of search results (default 10)
            region: Country code for localized results (e.g., 'us', 'gb', 'fr')
            time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time)
            safe_search: Whether to enable safe search
            search_language: Language code for results (e.g., 'en', 'es', 'fr')
            api_key: Serper API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.region = region
        self.time_period = time_period
        self.safe_search = safe_search
        self.search_language = search_language

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        serper_api_key = api_key
        if not serper_api_key:
            serper_api_key = get_setting_from_snapshot(
                "search.engine.web.serper.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not serper_api_key:
            raise ValueError(
                "Serper API key not found. Please provide api_key parameter or set it in the UI settings."
            )

        self.api_key = serper_api_key
        self.base_url = self.BASE_URL
        # Note: self.engine_type is automatically set by parent BaseSearchEngine class

        # If full content is requested, initialize FullSearchResults
        if include_full_content:  # coverage: condition never true in tests
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=None,  # We'll handle the search ourselves
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Serper API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting search results from Serper API")

        try:
            # Build request payload
            payload = {
                "q": query,
                "num": self.max_results,
                "gl": self.region,
                "hl": self.search_language,
            }
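            # Illustrative payload with default settings (example values added
            # by the editor, not from the original source):
            #     {"q": "open source llm", "num": 10, "gl": "us", "hl": "en"}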
            # Add optional parameters
            if self.time_period:
                # Map time periods to Serper's format
                time_mapping = {
                    "day": "d",
                    "week": "w",
                    "month": "m",
                    "year": "y",
                }
                if self.time_period in time_mapping:  # coverage: condition always true in tests
                    payload["tbs"] = f"qdr:{time_mapping[self.time_period]}"
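                    # e.g. time_period="week" yields payload["tbs"] == "qdr:w"
                    # (illustrative note added by the editor)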
            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }

            response = safe_post(
                self.base_url,
                headers=headers,
                json=payload,
                timeout=self.DEFAULT_TIMEOUT,
            )

            # Check for rate limits
            if response.status_code == 429:
                raise RateLimitError(
                    f"Serper rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            data = response.json()
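            # The fields read below suggest a response shaped roughly like the
            # sketch here (inferred from this code, not the full Serper schema):
            #     {"organic": [{"title": ..., "link": ..., "snippet": ..., "position": ...}, ...],
            #      "knowledgeGraph": {...}, "relatedSearches": [...], "peopleAlsoAsk": [...]}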
            # Extract organic results
            organic_results = data.get("organic", [])

            # Format results as previews
            previews = []
            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                display_link = ""
                link = result.get("link", "")
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = parsed_url.netloc or ""
                    except Exception:
                        logger.debug(
                            f"Failed to parse URL for display: {link[:50]}"
                        )
                        display_link = ""

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                }

                # Store full Serper result for later
                preview["_full_result"] = result

                # Only include optional fields if present to avoid None values
                # This keeps the preview dict cleaner and saves memory
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                if "date" in result:
                    preview["date"] = result["date"]

                if "attributes" in result:  # coverage: condition never true in tests
                    preview["attributes"] = result["attributes"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Also store knowledge graph if available
            if "knowledgeGraph" in data:
                self._knowledge_graph = data["knowledgeGraph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}"
                )

            # Store related searches and people also ask
            if "relatedSearches" in data:
                self._related_searches = data["relatedSearches"]

            if "peopleAlsoAsk" in data:  # coverage: condition never true in tests
                self._people_also_ask = data["peopleAlsoAsk"]

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception("Error getting Serper API results")

            # Check for rate limit patterns in error message
            if any(  # coverage: condition never true in tests
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"Serper rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting Serper API results")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        from ...config import search_config

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full Serper information
            results = []
            for item in relevant_items:
                # Use the full result if available, otherwise use the preview
                if "_full_result" in item:  # coverage: condition always true in tests
                    result = item["_full_result"].copy()
                else:
                    result = item.copy()

                # Clean up temporary fields
                if "_full_result" in result:  # coverage: condition never true in tests
                    del result["_full_result"]

                results.append(result)

            # Include knowledge graph and other metadata if this is the first call
            if results and hasattr(self, "_knowledge_graph"):  # coverage: condition never true in tests
                results[0]["knowledge_graph"] = self._knowledge_graph

            return results

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):  # coverage: condition never true in tests
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                results_with_content = self.full_search._get_full_content(
                    relevant_items
                )

                return results_with_content

            except Exception as e:
                logger.info(f"Error retrieving full content: {e}")
                # Fall back to returning the items without full content

        # Return items with their full Serper information
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:  # coverage: condition never true in tests
                result = item["_full_result"].copy()
            else:
                result = item.copy()

            # Clean up temporary fields
            if "_full_result" in result:  # coverage: condition never true in tests
                del result["_full_result"]

            results.append(result)

        # Include knowledge graph and other metadata if this is the first call
        if results and hasattr(self, "_knowledge_graph"):  # coverage: condition never true in tests
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using Serper API with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using Serper API (Google)---")

        # Use the implementation from the parent class which handles all phases
        # Note: super().run() internally calls our _get_previews() method
        results = super().run(query, research_context=research_context)

        # Clean up temporary attributes
        if hasattr(self, "_search_results"):
            del self._search_results
        if hasattr(self, "_knowledge_graph"):
            del self._knowledge_graph
        if hasattr(self, "_related_searches"):
            del self._related_searches
        if hasattr(self, "_people_also_ask"):
            del self._people_also_ask

        return results
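
# Example usage (editor's sketch, not part of the original module; assumes a
# valid Serper API key is supplied via the api_key parameter or the UI settings,
# and that the query and printed fields are placeholders):
#
#     engine = SerperSearchEngine(max_results=5, api_key="YOUR_SERPER_API_KEY")
#     for item in engine.run("latest python release"):
#         print(item["title"], item["link"])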