Coverage for src / local_deep_research / web_search_engines / engines / search_engine_scaleserp.py: 95%
93 statements
« prev ^ index » next — coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1from loguru import logger
2from typing import Any, Dict, List, Optional
3import requests
4from urllib.parse import urlparse
6from langchain_core.language_models import BaseLLM
8from ..search_engine_base import BaseSearchEngine
9from ..rate_limiting import RateLimitError
10from ...security import safe_get
class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support."""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
        )
        self.location = location
        self.language = language
        self.device = device
        self.safe_search = safe_search
        self.enable_cache = enable_cache  # ScaleSerp's unique caching feature

        # Get API key - check params, settings, or env vars
        scaleserp_api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.scaleserp.api_key",
            engine_name="ScaleSerp",
            settings_snapshot=settings_snapshot,
        )

        self.api_key = scaleserp_api_key
        self.base_url = "https://api.scaleserp.com/search"

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        # If full content is requested, initialize FullSearchResults
        self._init_full_search(
            web_search=None,  # We'll handle the search ourselves
            language=language,
            max_results=max_results,
            region=location,
            time_period=None,
            safe_search="Moderate" if safe_search else "Off",
        )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from ScaleSerp API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on non-rate-limit errors)
        """
        logger.info("Getting search results from ScaleSerp API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        try:
            # Build request parameters
            params = {
                "api_key": self.api_key,
                "q": query,
                "num": min(self.max_results, 100),  # ScaleSerp max is 100
                "location": self.location,
                "hl": self.language,
                "device": self.device,
            }

            # Add safe search if enabled
            if self.safe_search:
                params["safe"] = "on"

            # ScaleSerp automatically caches identical queries for 1 hour
            # Cached results are served instantly and don't consume API credits
            if self.enable_cache:
                params["output"] = "json"  # Ensure JSON output for cache detection
                logger.debug(
                    "ScaleSerp caching enabled - identical searches within 1 hour are free"
                )

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            response = safe_get(self.base_url, params=params, timeout=30)

            # Check for rate limits
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic_results", [])

            # Format results as previews
            previews = []

            # Check if results were served from cache for monitoring
            from_cache = data.get("request_info", {}).get("cached", False)

            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                link = result.get("link", "")
                display_link = ""
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = (
                            parsed_url.netloc or parsed_url.path or ""
                        )
                    except Exception:
                        # Fallback to truncated URL if parsing fails
                        logger.debug("URL parsing failed, using truncation")
                        display_link = link[:50]

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                    "from_cache": from_cache,  # Add cache status for monitoring
                }

                # Store full ScaleSerp result for later
                preview["_full_result"] = result

                # Include rich snippets if available
                if "rich_snippet" in result:
                    preview["rich_snippet"] = result["rich_snippet"]

                # Include date if available
                if "date" in result:
                    preview["date"] = result["date"]

                # Include sitelinks if available
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Store knowledge graph if available
            if "knowledge_graph" in data:
                self._knowledge_graph = data["knowledge_graph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}"
                )

            # Store related searches
            if "related_searches" in data:
                self._related_searches = data["related_searches"]

            # Store related questions (People Also Ask)
            if "related_questions" in data:
                self._related_questions = data["related_questions"]

            # Log if result was served from cache
            if from_cache:
                logger.debug(
                    "Result served from ScaleSerp cache - no API credit used!"
                )

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            sanitized = self._sanitize_error_message(str(e))
            logger.exception(
                "Error getting ScaleSerp API results: {}. Check API docs: https://docs.scaleserp.com",
                sanitized,
            )
            # NOTE(review): _raise_if_rate_limit receives a status code above but an
            # exception here — presumably the helper accepts both; confirm its signature.
            self._raise_if_rate_limit(e)
            return []
        except Exception as e:
            sanitized = self._sanitize_error_message(str(e))
            logger.exception(
                "Unexpected error getting ScaleSerp API results: {}", sanitized
            )
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        Extends base implementation to include knowledge graph data.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        results = super()._get_full_content(relevant_items)

        # Include knowledge graph only when one was actually returned for this
        # query. __init__/_get_previews always set the attribute (to None), so a
        # bare hasattr() check would attach "knowledge_graph": None to every
        # first result; getattr keeps this safe after _temp_attributes cleanup.
        knowledge_graph = getattr(self, "_knowledge_graph", None)
        if results and knowledge_graph is not None:
            results[0]["knowledge_graph"] = knowledge_graph

        return results

    def _temp_attributes(self):
        """Return list of temporary attribute names to clean up after run()."""
        return super()._temp_attributes() + [
            "_knowledge_graph",
            "_related_searches",
            "_related_questions",
        ]