Coverage for src/local_deep_research/web_search_engines/engines/search_engine_scaleserp.py: 73%

143 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1from loguru import logger 

2from typing import Any, Dict, List, Optional 

3import requests 

4from urllib.parse import urlparse 

5 

6from langchain_core.language_models import BaseLLM 

7 

8from ..search_engine_base import BaseSearchEngine 

9from ..rate_limiting import RateLimitError 

10from ...security import safe_get 

11 

12 

class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support."""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key was passed and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.location = location
        self.language = language
        self.device = device
        self.safe_search = safe_search
        self.enable_cache = enable_cache  # ScaleSerp's unique caching feature

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        scaleserp_api_key = api_key
        if not scaleserp_api_key:
            scaleserp_api_key = get_setting_from_snapshot(
                "search.engine.web.scaleserp.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not scaleserp_api_key:
            raise ValueError(
                "ScaleSerp API key not found. Please provide api_key parameter or set it in the UI settings. "
                "Get your API key at https://scaleserp.com"
            )

        self.api_key = scaleserp_api_key
        self.base_url = "https://api.scaleserp.com/search"

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=None,  # We'll handle the search ourselves
                    language=language,
                    max_results=max_results,
                    region=location,
                    time=None,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from ScaleSerp API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on non-rate-limit errors)

        Raises:
            RateLimitError: If the API responds with HTTP 429 or the request
                error message matches a known rate-limit pattern.
        """
        logger.info("Getting search results from ScaleSerp API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        try:
            # Build request parameters
            params = {
                "api_key": self.api_key,
                "q": query,
                "num": min(self.max_results, 100),  # ScaleSerp max is 100
                "location": self.location,
                "hl": self.language,
                "device": self.device,
            }

            # Add safe search if enabled
            if self.safe_search:
                params["safe"] = "on"

            # ScaleSerp automatically caches identical queries for 1 hour.
            # Cached results are served instantly and don't consume API credits.
            if self.enable_cache:
                params["output"] = (
                    "json"  # Ensure JSON output for cache detection
                )
                logger.debug(
                    "ScaleSerp caching enabled - identical searches within 1 hour are free"
                )

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            response = safe_get(self.base_url, params=params, timeout=30)

            # Check for rate limits
            if response.status_code == 429:
                raise RateLimitError(
                    f"ScaleSerp rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic_results", [])

            # Format results as previews
            previews = []

            # Check if results were served from cache for monitoring
            from_cache = data.get("request_info", {}).get("cached", False)

            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                link = result.get("link", "")
                display_link = ""
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = (
                            parsed_url.netloc or parsed_url.path or ""
                        )
                    except Exception:
                        # Fallback to truncated URL if parsing fails
                        display_link = link[:50]

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                    "from_cache": from_cache,  # Add cache status for monitoring
                }

                # Store full ScaleSerp result for later
                preview["_full_result"] = result

                # Include rich snippets if available
                if "rich_snippet" in result:
                    preview["rich_snippet"] = result["rich_snippet"]

                # Include date if available
                if "date" in result:
                    preview["date"] = result["date"]

                # Include sitelinks if available
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Store knowledge graph if available
            if "knowledge_graph" in data:
                self._knowledge_graph = data["knowledge_graph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}"
                )

            # Store related searches
            if "related_searches" in data:
                self._related_searches = data["related_searches"]

            # Store related questions (People Also Ask)
            if "related_questions" in data:
                self._related_questions = data["related_questions"]

            # Log if result was served from cache
            if from_cache:
                logger.debug(
                    "Result served from ScaleSerp cache - no API credit used!"
                )

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception(
                "Error getting ScaleSerp API results. Check API docs: https://docs.scaleserp.com"
            )

            # Check for rate limit patterns in error message
            if any(
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"ScaleSerp rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting ScaleSerp API results")
            return []

    def _build_results(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Assemble final result dicts from relevant preview items.

        Prefers the full ScaleSerp result stashed under '_full_result' during
        preview building, strips that temporary key, and attaches the stored
        knowledge graph (if any) to the first result.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of cleaned result dictionaries
        """
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:
                result = item["_full_result"].copy()
            else:
                result = item.copy()

            # Clean up temporary fields
            result.pop("_full_result", None)

            results.append(result)

        # Include knowledge graph and other metadata if this is the first call
        if results and self._knowledge_graph:
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results; otherwise (or on
        failure) returns the ScaleSerp results themselves.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        from ...config import search_config

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full ScaleSerp information
            return self._build_results(relevant_items)

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                results_with_content = self.full_search._get_full_content(
                    relevant_items
                )

                return results_with_content

            except Exception as e:
                logger.info(f"Error retrieving full content: {e}")
                # Fall back to returning the items without full content

        # Return items with their full ScaleSerp information
        return self._build_results(relevant_items)

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using ScaleSerp API with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using ScaleSerp API (Google)---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query, research_context=research_context)

        # Clean up per-query state. The metadata attributes are reset to None
        # rather than deleted so that later attribute access (they are
        # initialized in __init__) can never raise AttributeError.
        if hasattr(self, "_search_results"):
            del self._search_results
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        return results