Coverage for src / local_deep_research / web_search_engines / engines / search_engine_scaleserp.py: 73%
143 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1from loguru import logger
2from typing import Any, Dict, List, Optional
3import requests
4from urllib.parse import urlparse
6from langchain_core.language_models import BaseLLM
8from ..search_engine_base import BaseSearchEngine
9from ..rate_limiting import RateLimitError
10from ...security import safe_get
class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key was supplied and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.location = location
        self.language = language
        self.device = device
        self.safe_search = safe_search
        self.enable_cache = enable_cache  # ScaleSerp's unique caching feature

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        scaleserp_api_key = api_key
        if not scaleserp_api_key:
            scaleserp_api_key = get_setting_from_snapshot(
                "search.engine.web.scaleserp.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not scaleserp_api_key:
            raise ValueError(
                "ScaleSerp API key not found. Please provide api_key parameter or set it in the UI settings. "
                "Get your API key at https://scaleserp.com"
            )

        self.api_key = scaleserp_api_key
        self.base_url = "https://api.scaleserp.com/search"

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=None,  # We'll handle the search ourselves
                    language=language,
                    max_results=max_results,
                    region=location,
                    time=None,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from ScaleSerp API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on non-rate-limit errors).

        Raises:
            RateLimitError: If the API responds with HTTP 429 or a rate-limit
                pattern appears in a request error message.
        """
        logger.info("Getting search results from ScaleSerp API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        try:
            # Build request parameters
            params = {
                "api_key": self.api_key,
                "q": query,
                "num": min(self.max_results, 100),  # ScaleSerp max is 100
                "location": self.location,
                "hl": self.language,
                "device": self.device,
            }

            # Add safe search if enabled
            if self.safe_search:
                params["safe"] = "on"

            # ScaleSerp automatically caches identical queries for 1 hour
            # Cached results are served instantly and don't consume API credits
            if self.enable_cache:
                params["output"] = (
                    "json"  # Ensure JSON output for cache detection
                )
                logger.debug(
                    "ScaleSerp caching enabled - identical searches within 1 hour are free"
                )

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            response = safe_get(self.base_url, params=params, timeout=30)

            # Check for rate limits
            if response.status_code == 429:
                raise RateLimitError(
                    f"ScaleSerp rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic_results", [])

            # Format results as previews
            previews = []

            # Check if results were served from cache for monitoring
            from_cache = data.get("request_info", {}).get("cached", False)

            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                link = result.get("link", "")
                display_link = ""
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = (
                            parsed_url.netloc or parsed_url.path or ""
                        )
                    except Exception:
                        # Fallback to truncated URL if parsing fails
                        display_link = link[:50]

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                    "from_cache": from_cache,  # Add cache status for monitoring
                }

                # Store full ScaleSerp result for later
                preview["_full_result"] = result

                # Include rich snippets if available
                if "rich_snippet" in result:
                    preview["rich_snippet"] = result["rich_snippet"]

                # Include date if available
                if "date" in result:
                    preview["date"] = result["date"]

                # Include sitelinks if available
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Store knowledge graph if available
            if "knowledge_graph" in data:
                self._knowledge_graph = data["knowledge_graph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}"
                )

            # Store related searches
            if "related_searches" in data:
                self._related_searches = data["related_searches"]

            # Store related questions (People Also Ask)
            if "related_questions" in data:
                self._related_questions = data["related_questions"]

            # Log if result was served from cache
            if from_cache:
                logger.debug(
                    "Result served from ScaleSerp cache - no API credit used!"
                )

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception(
                "Error getting ScaleSerp API results. Check API docs: https://docs.scaleserp.com"
            )

            # Check for rate limit patterns in error message
            if any(
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"ScaleSerp rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting ScaleSerp API results")
            return []

    def _format_results(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Build final result dicts from relevant preview items.

        Prefers the full ScaleSerp payload stashed under "_full_result",
        strips that temporary key, and attaches the knowledge graph (if one
        was captured for this query) to the first result.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of cleaned result dictionaries
        """
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:
                result = item["_full_result"].copy()
            else:
                result = item.copy()

            # Clean up temporary fields
            result.pop("_full_result", None)

            results.append(result)

        # Include knowledge graph and other metadata if this is the first call
        if results and self._knowledge_graph:
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results. Otherwise (or on
        failure), returns the items with their full ScaleSerp metadata only.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        from ...config import search_config

        if getattr(search_config, "SEARCH_SNIPPETS_ONLY", False):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full ScaleSerp information
            return self._format_results(relevant_items)

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                return self.full_search._get_full_content(relevant_items)

            except Exception:
                # Log with traceback, then fall back to returning the items
                # without full content
                logger.exception("Error retrieving full content")

        # Return items with their full ScaleSerp information
        return self._format_results(relevant_items)

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using ScaleSerp API with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using ScaleSerp API (Google)---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query, research_context=research_context)

        # Clean up per-query state so nothing leaks into the next search
        for attr in (
            "_search_results",
            "_knowledge_graph",
            "_related_searches",
            "_related_questions",
        ):
            if hasattr(self, attr):
                delattr(self, attr)

        return results