Coverage for src/local_deep_research/web_search_engines/engines/search_engine_scaleserp.py: 61%
137 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1from loguru import logger
2from typing import Any, Dict, List, Optional
3import requests
4from urllib.parse import urlparse
6from langchain_core.language_models import BaseLLM
8from ..search_engine_base import BaseSearchEngine
9from ..rate_limiting import RateLimitError
10from ...security import safe_get
13class ScaleSerpSearchEngine(BaseSearchEngine):
14 """Google search engine implementation using ScaleSerp API with caching support"""
16 # Mark as public search engine
17 is_public = True
18 # Mark as generic search engine (general web search via Google)
19 is_generic = True
21 def __init__(
22 self,
23 max_results: int = 10,
24 location: str = "United States",
25 language: str = "en",
26 device: str = "desktop",
27 safe_search: bool = True,
28 api_key: Optional[str] = None,
29 llm: Optional[BaseLLM] = None,
30 include_full_content: bool = False,
31 max_filtered_results: Optional[int] = None,
32 settings_snapshot: Optional[Dict[str, Any]] = None,
33 enable_cache: bool = True,
34 **kwargs,
35 ):
36 """
37 Initialize the ScaleSerp search engine.
39 Args:
40 max_results: Maximum number of search results (default 10, max 100)
41 location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
42 language: Language code for results (e.g., 'en', 'es', 'fr')
43 device: Device type for search ('desktop' or 'mobile')
44 safe_search: Whether to enable safe search
45 api_key: ScaleSerp API key (can also be set in settings)
46 llm: Language model for relevance filtering
47 include_full_content: Whether to include full webpage content in results
48 max_filtered_results: Maximum number of results to keep after filtering
49 settings_snapshot: Settings snapshot for thread context
50 enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
51 **kwargs: Additional parameters (ignored but accepted for compatibility)
52 """
53 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
54 super().__init__(
55 llm=llm,
56 max_filtered_results=max_filtered_results,
57 max_results=max_results,
58 )
59 self.include_full_content = include_full_content
60 self.location = location
61 self.language = language
62 self.device = device
63 self.safe_search = safe_search
64 self.enable_cache = enable_cache # ScaleSerp's unique caching feature
66 # Get API key: check the explicit parameter first, then the settings snapshot
67 from ...config.search_config import get_setting_from_snapshot
69 scaleserp_api_key = api_key
70 if not scaleserp_api_key:
71 scaleserp_api_key = get_setting_from_snapshot(
72 "search.engine.web.scaleserp.api_key",
73 settings_snapshot=settings_snapshot,
74 )
76 if not scaleserp_api_key:
77 raise ValueError(
78 "ScaleSerp API key not found. Please provide api_key parameter or set it in the UI settings. "
79 "Get your API key at https://scaleserp.com"
80 )
82 self.api_key = scaleserp_api_key
83 self.base_url = "https://api.scaleserp.com/search"
85 # If full content is requested, initialize FullSearchResults
86 if include_full_content:  # coverage: 86 ↛ 88, condition was never true
87 # Import FullSearchResults only if needed
88 try:
89 from .full_search import FullSearchResults
91 self.full_search = FullSearchResults(
92 llm=llm,
93 web_search=None, # We'll handle the search ourselves
94 language=language,
95 max_results=max_results,
96 region=location,
97 time=None,
98 safesearch="Moderate" if safe_search else "Off",
99 )
100 except ImportError:
101 logger.warning(
102 "Warning: FullSearchResults not available. Full content retrieval disabled."
103 )
104 self.include_full_content = False
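
For orientation, constructing the engine directly might look like the sketch below (the API key value is a placeholder; in normal operation it is resolved from the settings snapshot as shown above):

    engine = ScaleSerpSearchEngine(
        max_results=10,
        location="United States",
        language="en",
        device="desktop",
        safe_search=True,
        api_key="YOUR_SCALESERP_API_KEY",  # placeholder; normally set via search.engine.web.scaleserp.api_key
        enable_cache=True,  # identical queries within 1 hour hit ScaleSerp's cache and cost no credits
    )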
106 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
107 """
108 Get preview information from ScaleSerp API.
110 Args:
111 query: The search query
113 Returns:
114 List of preview dictionaries
115 """
116 logger.info("Getting search results from ScaleSerp API")
118 try:
119 # Build request parameters
120 params = {
121 "api_key": self.api_key,
122 "q": query,
123 "num": min(self.max_results, 100), # ScaleSerp max is 100
124 "location": self.location,
125 "hl": self.language,
126 "device": self.device,
127 }
129 # Add safe search if enabled
130 if self.safe_search:  # coverage: 130 ↛ 135, condition was always true
131 params["safe"] = "on"
133 # ScaleSerp automatically caches identical queries for 1 hour
134 # Cached results are served instantly and don't consume API credits
135 if self.enable_cache:  # coverage: 135 ↛ 144, condition was always true
136 params["output"] = (
137 "json" # Ensure JSON output for cache detection
138 )
139 logger.debug(
140 "ScaleSerp caching enabled - identical searches within 1 hour are free"
141 )
143 # Apply rate limiting before request
144 self._last_wait_time = self.rate_tracker.apply_rate_limit(
145 self.engine_type
146 )
148 # Make API request
149 response = safe_get(self.base_url, params=params, timeout=30)
151 # Check for rate limits
152 if response.status_code == 429:
153 raise RateLimitError(
154 f"ScaleSerp rate limit hit: {response.status_code} - {response.text}"
155 )
157 response.raise_for_status()
159 data = response.json()
161 # Extract organic results
162 organic_results = data.get("organic_results", [])
164 # Format results as previews
165 previews = []
167 # Check if results were served from cache for monitoring
168 from_cache = data.get("request_info", {}).get("cached", False)
170 for idx, result in enumerate(organic_results):
171 # Extract display link safely using urlparse
172 link = result.get("link", "")
173 display_link = ""
174 if link:  # coverage: 174 ↛ 184, condition was always true
175 try:
176 parsed_url = urlparse(link)
177 display_link = (
178 parsed_url.netloc or parsed_url.path or ""
179 )
180 except Exception:
181 # Fallback to truncated URL if parsing fails
182 display_link = link[:50]
184 preview = {
185 "id": idx,
186 "title": result.get("title", ""),
187 "link": link,
188 "snippet": result.get("snippet", ""),
189 "displayed_link": display_link,
190 "position": result.get("position", idx + 1),
191 "from_cache": from_cache, # Add cache status for monitoring
192 }
194 # Store full ScaleSerp result for later
195 preview["_full_result"] = result
197 # Include rich snippets if available
198 if "rich_snippet" in result:
199 preview["rich_snippet"] = result["rich_snippet"]
201 # Include date if available
202 if "date" in result: 202 ↛ 203line 202 didn't jump to line 203 because the condition on line 202 was never true
203 preview["date"] = result["date"]
205 # Include sitelinks if available
206 if "sitelinks" in result:
207 preview["sitelinks"] = result["sitelinks"]
209 previews.append(preview)
211 # Store the previews for potential full content retrieval
212 self._search_results = previews
214 # Store knowledge graph if available
215 if "knowledge_graph" in data:
216 self._knowledge_graph = data["knowledge_graph"]
217 logger.info(
218 f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}"
219 )
221 # Store related searches
222 if "related_searches" in data: 222 ↛ 223line 222 didn't jump to line 223 because the condition on line 222 was never true
223 self._related_searches = data["related_searches"]
225 # Store related questions (People Also Ask)
226 if "related_questions" in data: 226 ↛ 227line 226 didn't jump to line 227 because the condition on line 226 was never true
227 self._related_questions = data["related_questions"]
229 # Log if result was served from cache
230 if from_cache:
231 logger.debug(
232 "Result served from ScaleSerp cache - no API credit used!"
233 )
235 return previews
237 except RateLimitError:
238 raise # Re-raise rate limit errors
239 except requests.exceptions.RequestException as e:
240 error_msg = str(e)
241 logger.exception(
242 "Error getting ScaleSerp API results. Check API docs: https://docs.scaleserp.com"
243 )
245 # Check for rate limit patterns in error message
246 if any(  # coverage: 246 ↛ 255, condition was never true
247 pattern in error_msg.lower()
248 for pattern in [
249 "429",
250 "rate limit",
251 "quota",
252 "too many requests",
253 ]
254 ):
255 raise RateLimitError(f"ScaleSerp rate limit hit: {error_msg}")
257 return []
258 except Exception:
259 logger.exception("Unexpected error getting ScaleSerp API results")
260 return []
262 def _get_full_content(
263 self, relevant_items: List[Dict[str, Any]]
264 ) -> List[Dict[str, Any]]:
265 """
266 Get full content for the relevant search results.
267 If include_full_content is True and FullSearchResults is available,
268 retrieves full webpage content for the results.
270 Args:
271 relevant_items: List of relevant preview dictionaries
273 Returns:
274 List of result dictionaries with full content if requested
275 """
276 # Check if we should get full content
277 from ...config import search_config
279 if (  # coverage: 279 ↛ 283, condition was never true
280 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
281 and search_config.SEARCH_SNIPPETS_ONLY
282 ):
283 logger.info("Snippet-only mode, skipping full content retrieval")
285 # Return the relevant items with their full ScaleSerp information
286 results = []
287 for item in relevant_items:
288 # Use the full result if available, otherwise use the preview
289 if "_full_result" in item:
290 result = item["_full_result"].copy()
291 else:
292 result = item.copy()
294 # Clean up temporary fields
295 if "_full_result" in result:
296 del result["_full_result"]
298 results.append(result)
300 # Include knowledge graph and other metadata if this is the first call
301 if results and hasattr(self, "_knowledge_graph"):
302 results[0]["knowledge_graph"] = self._knowledge_graph
304 return results
306 # If full content retrieval is enabled
307 if self.include_full_content and hasattr(self, "full_search"):  # coverage: 307 ↛ 308, condition was never true
308 logger.info("Retrieving full webpage content")
310 try:
311 # Use FullSearchResults to get full content
312 results_with_content = self.full_search._get_full_content(
313 relevant_items
314 )
316 return results_with_content
318 except Exception as e:
319 logger.info(f"Error retrieving full content: {e}")
320 # Fall back to returning the items without full content
322 # Return items with their full ScaleSerp information
323 results = []
324 for item in relevant_items:
325 # Use the full result if available, otherwise use the preview
326 if "_full_result" in item: 326 ↛ 329line 326 didn't jump to line 329 because the condition on line 326 was always true
327 result = item["_full_result"].copy()
328 else:
329 result = item.copy()
331 # Clean up temporary fields
332 if "_full_result" in result: 332 ↛ 333line 332 didn't jump to line 333 because the condition on line 332 was never true
333 del result["_full_result"]
335 results.append(result)
337 # Include knowledge graph and other metadata if this is the first call
338 if results and hasattr(self, "_knowledge_graph"):  # coverage: 338 ↛ 339, condition was never true
339 results[0]["knowledge_graph"] = self._knowledge_graph
341 return results
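
The snippet-only branch above keys off a module-level flag on search_config; the hasattr guard implies the attribute may be absent. A caller-side sketch, assuming the flag is a plain writable attribute in your deployment:

    from local_deep_research.config import search_config

    # Assumption: SEARCH_SNIPPETS_ONLY is a plain module attribute.
    # Setting it skips full-page retrieval and returns ScaleSerp metadata only.
    search_config.SEARCH_SNIPPETS_ONLY = True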
343 def run(
344 self, query: str, research_context: Dict[str, Any] | None = None
345 ) -> List[Dict[str, Any]]:
346 """
347 Execute a search using ScaleSerp API with the two-phase approach.
349 Args:
350 query: The search query
351 research_context: Optional context carried over from previous research steps.
353 Returns:
354 List of search results
355 """
356 logger.info("---Execute a search using ScaleSerp API (Google)---")
358 # Use the implementation from the parent class which handles all phases
359 results = super().run(query, research_context=research_context)
361 # Clean up
362 if hasattr(self, "_search_results"):
363 del self._search_results
364 if hasattr(self, "_knowledge_graph"):
365 del self._knowledge_graph
366 if hasattr(self, "_related_searches"):
367 del self._related_searches
368 if hasattr(self, "_related_questions"):
369 del self._related_questions
371 return results
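
End to end, a typical invocation might look like the following sketch (placeholder API key; the query and printed fields are illustrative):

    engine = ScaleSerpSearchEngine(max_results=5, api_key="YOUR_SCALESERP_API_KEY")
    for result in engine.run("site reliability engineering basics"):
        print(result.get("position"), result.get("title"), result.get("link"))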