Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serper.py: 99% (91 statements)

from loguru import logger
from typing import Any, Dict, List, Optional
import requests
from urllib.parse import urlparse

from langchain_core.language_models import BaseLLM

from ..search_engine_base import BaseSearchEngine
from ..rate_limiting import RateLimitError
from ...security import safe_post


class SerperSearchEngine(BaseSearchEngine):
    """Google search engine implementation using the Serper API with a two-phase approach."""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    # Class constants
    BASE_URL = "https://google.serper.dev/search"
    DEFAULT_TIMEOUT = 30
    DEFAULT_REGION = "us"
    DEFAULT_LANGUAGE = "en"

    def __init__(
        self,
        max_results: int = 10,
        region: str = "us",
        time_period: Optional[str] = None,
        safe_search: bool = True,
        search_language: str = "en",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Serper search engine.

        Args:
            max_results: Maximum number of search results (default 10)
            region: Country code for localized results (e.g., 'us', 'gb', 'fr')
            time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time)
            safe_search: Whether to enable safe search
            search_language: Language code for results (e.g., 'en', 'es', 'fr')
            api_key: Serper API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)
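
        Example:
            A minimal construction sketch (the key below is a placeholder;
            in practice it is usually resolved from settings or env vars):

                engine = SerperSearchEngine(
                    max_results=5,
                    region="gb",
                    time_period="week",
                    api_key="YOUR_SERPER_API_KEY",
                )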
56 """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
        )
        self.region = region
        self.time_period = time_period
        self.safe_search = safe_search
        self.search_language = search_language

        # Get API key - check params, settings, or env vars
        serper_api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.serper.api_key",
            engine_name="Serper",
            settings_snapshot=settings_snapshot,
        )

        self.api_key = serper_api_key
        self.base_url = self.BASE_URL
        # Note: self.engine_type is automatically set by parent BaseSearchEngine class

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        # If full content is requested, initialize FullSearchResults
        self._init_full_search(
            web_search=None,  # We'll handle the search ourselves
            language=search_language,
            max_results=max_results,
            region=region,
            time_period=time_period,
            safe_search="Moderate" if safe_search else "Off",
        )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Serper API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting search results from Serper API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        try:
            # Build request payload
            payload = {
                "q": query,
                "num": self.max_results,
                "gl": self.region,
                "hl": self.search_language,
            }

            # Add optional parameters
            if self.time_period:
                # Map time periods to Serper's format
                time_mapping = {
                    "day": "d",
                    "week": "w",
                    "month": "m",
                    "year": "y",
                }
                if self.time_period in time_mapping:
                    payload["tbs"] = f"qdr:{time_mapping[self.time_period]}"
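                    # Illustrative: time_period="week" yields
                    # payload["tbs"] == "qdr:w" ("qdr" is Google's
                    # query-date-range URL parameter).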

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }

            response = safe_post(
                self.base_url,
                headers=headers,
                json=payload,
                timeout=self.DEFAULT_TIMEOUT,
            )

            # Check for rate limits
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic", [])
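            # Assumption: the loop below relies on Serper's documented
            # response shape, roughly
            #   {"organic": [{"title": ..., "link": ..., "snippet": ...,
            #                 "position": 1}, ...], "knowledgeGraph": {...}}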

            # Format results as previews
            previews = []
            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                display_link = ""
                link = result.get("link", "")
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = parsed_url.netloc or ""
                    except Exception:
                        logger.debug(
                            f"Failed to parse URL for display: {link[:50]}"
                        )
                        display_link = ""

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                }

                # Store full Serper result for later
                preview["_full_result"] = result

                # Only include optional fields if present to avoid None values
                # This keeps the preview dict cleaner and saves memory
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                if "date" in result:
                    preview["date"] = result["date"]

                if "attributes" in result:
                    preview["attributes"] = result["attributes"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Also store knowledge graph if available
            if "knowledgeGraph" in data:
                self._knowledge_graph = data["knowledgeGraph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}"
                )

            # Store related searches and people also ask
            if "relatedSearches" in data:
                self._related_searches = data["relatedSearches"]

            if "peopleAlsoAsk" in data:
                self._people_also_ask = data["peopleAlsoAsk"]

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            logger.exception("Error getting Serper API results")
            self._raise_if_rate_limit(e)
            return []
        except Exception:
            logger.exception("Unexpected error getting Serper API results")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        Extends base implementation to include knowledge graph data.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        results = super()._get_full_content(relevant_items)

        # Include knowledge graph if available
        if results and getattr(self, "_knowledge_graph", None):
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _temp_attributes(self):
        """Return list of temporary attribute names to clean up after run()."""
        return super()._temp_attributes() + [
            "_knowledge_graph",
            "_related_searches",
            "_people_also_ask",
        ]
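

# Minimal usage sketch (illustrative; not part of the engine). It assumes
# BaseSearchEngine exposes a public run(query) entry point, as hinted by the
# _temp_attributes() docstring, and that run() returns preview-style dicts.
# The API key below is a placeholder.
if __name__ == "__main__":
    engine = SerperSearchEngine(
        max_results=3,
        region="us",
        api_key="YOUR_SERPER_API_KEY",  # placeholder; normally from settings
    )
    for item in engine.run("open source deep research tools"):
        print(item.get("title"), "->", item.get("link"))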