Coverage for src/local_deep_research/web_search_engines/engines/search_engine_scaleserp.py: 61%

137 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

from loguru import logger
from typing import Any, Dict, List, Optional
import requests
from urllib.parse import urlparse

from langchain_core.language_models import BaseLLM

from ..search_engine_base import BaseSearchEngine
from ..rate_limiting import RateLimitError
from ...security import safe_get


class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)
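
        Example:
            A minimal usage sketch (not taken from the test suite); the key
            below is a placeholder and must be replaced with a real ScaleSerp
            API key:

                engine = ScaleSerpSearchEngine(
                    max_results=5,
                    location="United States",
                    api_key="YOUR_SCALESERP_KEY",  # placeholder, not a real key
                )
                results = engine.run("open source search engines")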

52 """ 

53 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

54 super().__init__( 

55 llm=llm, 

56 max_filtered_results=max_filtered_results, 

57 max_results=max_results, 

58 ) 

59 self.include_full_content = include_full_content 

60 self.location = location 

61 self.language = language 

62 self.device = device 

63 self.safe_search = safe_search 

64 self.enable_cache = enable_cache # ScaleSerp's unique caching feature 

65 

66 # Get API key - check params, env vars, or database 

67 from ...config.search_config import get_setting_from_snapshot 

68 

69 scaleserp_api_key = api_key 

70 if not scaleserp_api_key: 

71 scaleserp_api_key = get_setting_from_snapshot( 

72 "search.engine.web.scaleserp.api_key", 

73 settings_snapshot=settings_snapshot, 

74 ) 

75 

76 if not scaleserp_api_key: 

77 raise ValueError( 

78 "ScaleSerp API key not found. Please provide api_key parameter or set it in the UI settings. " 

79 "Get your API key at https://scaleserp.com" 

80 ) 

81 

82 self.api_key = scaleserp_api_key 

83 self.base_url = "https://api.scaleserp.com/search" 

84 

85 # If full content is requested, initialize FullSearchResults 

86 if include_full_content: 86 ↛ 88line 86 didn't jump to line 88 because the condition on line 86 was never true

87 # Import FullSearchResults only if needed 

88 try: 

89 from .full_search import FullSearchResults 

90 

91 self.full_search = FullSearchResults( 

92 llm=llm, 

93 web_search=None, # We'll handle the search ourselves 

94 language=language, 

95 max_results=max_results, 

96 region=location, 

97 time=None, 

98 safesearch="Moderate" if safe_search else "Off", 

99 ) 

100 except ImportError: 

101 logger.warning( 

102 "Warning: FullSearchResults not available. Full content retrieval disabled." 

103 ) 

104 self.include_full_content = False 

105 

106 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

107 """ 

108 Get preview information from ScaleSerp API. 

109 

110 Args: 

111 query: The search query 

112 

113 Returns: 

114 List of preview dictionaries 
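
        Illustrative preview shape (keys mirror the dict built below; the
        values here are invented placeholders):

            {
                "id": 0,
                "title": "Example result",
                "link": "https://example.com/page",
                "snippet": "A short snippet...",
                "displayed_link": "example.com",
                "position": 1,
                "from_cache": False,
            }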

115 """ 

116 logger.info("Getting search results from ScaleSerp API") 

117 

118 try: 

119 # Build request parameters 

120 params = { 

121 "api_key": self.api_key, 

122 "q": query, 

123 "num": min(self.max_results, 100), # ScaleSerp max is 100 

124 "location": self.location, 

125 "hl": self.language, 

126 "device": self.device, 

127 } 

128 

129 # Add safe search if enabled 

130 if self.safe_search: 130 ↛ 135line 130 didn't jump to line 135 because the condition on line 130 was always true

131 params["safe"] = "on" 

132 

133 # ScaleSerp automatically caches identical queries for 1 hour 

134 # Cached results are served instantly and don't consume API credits 

135 if self.enable_cache: 135 ↛ 144line 135 didn't jump to line 144 because the condition on line 135 was always true

136 params["output"] = ( 

137 "json" # Ensure JSON output for cache detection 

138 ) 

139 logger.debug( 

140 "ScaleSerp caching enabled - identical searches within 1 hour are free" 

141 ) 

142 

143 # Apply rate limiting before request 

144 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

145 self.engine_type 

146 ) 

147 

148 # Make API request 

149 response = safe_get(self.base_url, params=params, timeout=30) 

150 

151 # Check for rate limits 

152 if response.status_code == 429: 

153 raise RateLimitError( 

154 f"ScaleSerp rate limit hit: {response.status_code} - {response.text}" 

155 ) 

156 

157 response.raise_for_status() 

158 

159 data = response.json() 

160 

161 # Extract organic results 

162 organic_results = data.get("organic_results", []) 

163 

164 # Format results as previews 

165 previews = [] 

166 

167 # Check if results were served from cache for monitoring 

168 from_cache = data.get("request_info", {}).get("cached", False) 

169 

170 for idx, result in enumerate(organic_results): 

171 # Extract display link safely using urlparse 

172 link = result.get("link", "") 

173 display_link = "" 

174 if link: 174 ↛ 184line 174 didn't jump to line 184 because the condition on line 174 was always true

175 try: 

176 parsed_url = urlparse(link) 

177 display_link = ( 

178 parsed_url.netloc or parsed_url.path or "" 

179 ) 

180 except Exception: 

181 # Fallback to truncated URL if parsing fails 

182 display_link = link[:50] 

183 

184 preview = { 

185 "id": idx, 

186 "title": result.get("title", ""), 

187 "link": link, 

188 "snippet": result.get("snippet", ""), 

189 "displayed_link": display_link, 

190 "position": result.get("position", idx + 1), 

191 "from_cache": from_cache, # Add cache status for monitoring 

192 } 

193 

194 # Store full ScaleSerp result for later 

195 preview["_full_result"] = result 

196 

197 # Include rich snippets if available 

198 if "rich_snippet" in result: 

199 preview["rich_snippet"] = result["rich_snippet"] 

200 

201 # Include date if available 

202 if "date" in result: 202 ↛ 203line 202 didn't jump to line 203 because the condition on line 202 was never true

203 preview["date"] = result["date"] 

204 

205 # Include sitelinks if available 

206 if "sitelinks" in result: 

207 preview["sitelinks"] = result["sitelinks"] 

208 

209 previews.append(preview) 

210 

211 # Store the previews for potential full content retrieval 

212 self._search_results = previews 

213 

214 # Store knowledge graph if available 

215 if "knowledge_graph" in data: 

216 self._knowledge_graph = data["knowledge_graph"] 

217 logger.info( 

218 f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}" 

219 ) 

220 

221 # Store related searches 

222 if "related_searches" in data: 222 ↛ 223line 222 didn't jump to line 223 because the condition on line 222 was never true

223 self._related_searches = data["related_searches"] 

224 

225 # Store related questions (People Also Ask) 

226 if "related_questions" in data: 226 ↛ 227line 226 didn't jump to line 227 because the condition on line 226 was never true

227 self._related_questions = data["related_questions"] 

228 

229 # Log if result was served from cache 

230 if from_cache: 

231 logger.debug( 

232 "Result served from ScaleSerp cache - no API credit used!" 

233 ) 

234 

235 return previews 

236 

237 except RateLimitError: 

238 raise # Re-raise rate limit errors 

239 except requests.exceptions.RequestException as e: 

240 error_msg = str(e) 

241 logger.exception( 

242 "Error getting ScaleSerp API results. Check API docs: https://docs.scaleserp.com" 

243 ) 

244 

245 # Check for rate limit patterns in error message 

246 if any( 246 ↛ 255line 246 didn't jump to line 255 because the condition on line 246 was never true

247 pattern in error_msg.lower() 

248 for pattern in [ 

249 "429", 

250 "rate limit", 

251 "quota", 

252 "too many requests", 

253 ] 

254 ): 

255 raise RateLimitError(f"ScaleSerp rate limit hit: {error_msg}") 

256 

257 return [] 

258 except Exception: 

259 logger.exception("Unexpected error getting ScaleSerp API results") 

260 return [] 

261 

262 def _get_full_content( 

263 self, relevant_items: List[Dict[str, Any]] 

264 ) -> List[Dict[str, Any]]: 

265 """ 

266 Get full content for the relevant search results. 

267 If include_full_content is True and FullSearchResults is available, 

268 retrieves full webpage content for the results. 

269 

270 Args: 

271 relevant_items: List of relevant preview dictionaries 

272 

273 Returns: 

274 List of result dictionaries with full content if requested 
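
        Illustrative call (hypothetical input; the raw ScaleSerp payload in
        "_full_result" becomes the returned record, minus that key):

            items = [
                {"id": 0, "link": "https://example.com", "_full_result": {"title": "Example"}},
            ]
            results = engine._get_full_content(items)
            # results == [{"title": "Example"}]  (assuming no knowledge graph was stored)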

275 """ 

276 # Check if we should get full content 

277 from ...config import search_config 

278 

279 if ( 279 ↛ 283line 279 didn't jump to line 283 because the condition on line 279 was never true

280 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

281 and search_config.SEARCH_SNIPPETS_ONLY 

282 ): 

283 logger.info("Snippet-only mode, skipping full content retrieval") 

284 

285 # Return the relevant items with their full ScaleSerp information 

286 results = [] 

287 for item in relevant_items: 

288 # Use the full result if available, otherwise use the preview 

289 if "_full_result" in item: 

290 result = item["_full_result"].copy() 

291 else: 

292 result = item.copy() 

293 

294 # Clean up temporary fields 

295 if "_full_result" in result: 

296 del result["_full_result"] 

297 

298 results.append(result) 

299 

300 # Include knowledge graph and other metadata if this is the first call 

301 if results and hasattr(self, "_knowledge_graph"): 

302 results[0]["knowledge_graph"] = self._knowledge_graph 

303 

304 return results 

305 

306 # If full content retrieval is enabled 

307 if self.include_full_content and hasattr(self, "full_search"): 307 ↛ 308line 307 didn't jump to line 308 because the condition on line 307 was never true

308 logger.info("Retrieving full webpage content") 

309 

310 try: 

311 # Use FullSearchResults to get full content 

312 results_with_content = self.full_search._get_full_content( 

313 relevant_items 

314 ) 

315 

316 return results_with_content 

317 

318 except Exception as e: 

319 logger.info(f"Error retrieving full content: {e}") 

320 # Fall back to returning the items without full content 

321 

322 # Return items with their full ScaleSerp information 

323 results = [] 

324 for item in relevant_items: 

325 # Use the full result if available, otherwise use the preview 

326 if "_full_result" in item: 326 ↛ 329line 326 didn't jump to line 329 because the condition on line 326 was always true

327 result = item["_full_result"].copy() 

328 else: 

329 result = item.copy() 

330 

331 # Clean up temporary fields 

332 if "_full_result" in result: 332 ↛ 333line 332 didn't jump to line 333 because the condition on line 332 was never true

333 del result["_full_result"] 

334 

335 results.append(result) 

336 

337 # Include knowledge graph and other metadata if this is the first call 

338 if results and hasattr(self, "_knowledge_graph"): 338 ↛ 339line 338 didn't jump to line 339 because the condition on line 338 was never true

339 results[0]["knowledge_graph"] = self._knowledge_graph 

340 

341 return results 

342 

343 def run( 

344 self, query: str, research_context: Dict[str, Any] | None = None 

345 ) -> List[Dict[str, Any]]: 

346 """ 

347 Execute a search using ScaleSerp API with the two-phase approach. 

348 

349 Args: 

350 query: The search query 

351 research_context: Context from previous research to use. 

352 

353 Returns: 

354 List of search results 
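
        Example (sketch; assumes the engine was constructed with a valid
        API key and that the parent class performs the actual search phases):

            engine = ScaleSerpSearchEngine(api_key="YOUR_SCALESERP_KEY")
            results = engine.run("latest python release")
            for r in results:
                print(r.get("title"), r.get("link"))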

355 """ 

356 logger.info("---Execute a search using ScaleSerp API (Google)---") 

357 

358 # Use the implementation from the parent class which handles all phases 

359 results = super().run(query, research_context=research_context) 

360 

361 # Clean up 

362 if hasattr(self, "_search_results"): 

363 del self._search_results 

364 if hasattr(self, "_knowledge_graph"): 

365 del self._knowledge_graph 

366 if hasattr(self, "_related_searches"): 

367 del self._related_searches 

368 if hasattr(self, "_related_questions"): 

369 del self._related_questions 

370 

371 return results