Coverage for src/local_deep_research/web_search_engines/engines/search_engine_scaleserp.py: 95%

93 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from loguru import logger 

2from typing import Any, Dict, List, Optional 

3import requests 

4from urllib.parse import urlparse 

5 

6from langchain_core.language_models import BaseLLM 

7 

8from ..search_engine_base import BaseSearchEngine 

9from ..rate_limiting import RateLimitError 

10from ...security import safe_get 

11 

12 

class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support"""

    # Exposed as a public search engine
    is_public = True
    # Generic engine: general-purpose web search through Google
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States',
                'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching
                (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Let the base class wire up the LLM, result filtering, and
        # full-content options.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
        )

        # Engine-specific request options.
        self.location = location
        self.language = language
        self.device = device
        self.safe_search = safe_search
        # ScaleSerp's unique 1-hour server-side caching feature.
        self.enable_cache = enable_cache

        # Resolve the API key from the explicit argument, settings, or
        # environment variables.
        self.api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.scaleserp.api_key",
            engine_name="ScaleSerp",
            settings_snapshot=settings_snapshot,
        )
        self.base_url = "https://api.scaleserp.com/search"

        # Per-query attributes; reset in _get_previews() on every search.
        self._knowledge_graph = None
        self._related_searches = None
        self._related_questions = None

        # Set up FullSearchResults in case full content is requested.
        self._init_full_search(
            web_search=None,  # We'll handle the search ourselves
            language=language,
            max_results=max_results,
            region=location,
            time_period=None,
            safe_search="Moderate" if safe_search else "Off",
        )


93 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

94 """ 

95 Get preview information from ScaleSerp API. 

96 

97 Args: 

98 query: The search query 

99 

100 Returns: 

101 List of preview dictionaries 

102 """ 

103 logger.info("Getting search results from ScaleSerp API") 

104 

105 # Reset per-query attributes to prevent leakage between searches 

106 self._knowledge_graph = None 

107 self._related_searches = None 

108 self._related_questions = None 

109 

110 try: 

111 # Build request parameters 

112 params = { 

113 "api_key": self.api_key, 

114 "q": query, 

115 "num": min(self.max_results, 100), # ScaleSerp max is 100 

116 "location": self.location, 

117 "hl": self.language, 

118 "device": self.device, 

119 } 

120 

121 # Add safe search if enabled 

122 if self.safe_search: 122 ↛ 127line 122 didn't jump to line 127 because the condition on line 122 was always true

123 params["safe"] = "on" 

124 

125 # ScaleSerp automatically caches identical queries for 1 hour 

126 # Cached results are served instantly and don't consume API credits 

127 if self.enable_cache: 127 ↛ 136line 127 didn't jump to line 136 because the condition on line 127 was always true

128 params["output"] = ( 

129 "json" # Ensure JSON output for cache detection 

130 ) 

131 logger.debug( 

132 "ScaleSerp caching enabled - identical searches within 1 hour are free" 

133 ) 

134 

135 # Apply rate limiting before request 

136 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

137 self.engine_type 

138 ) 

139 

140 # Make API request 

141 response = safe_get(self.base_url, params=params, timeout=30) 

142 

143 # Check for rate limits 

144 self._raise_if_rate_limit(response.status_code) 

145 

146 response.raise_for_status() 

147 

148 data = response.json() 

149 

150 # Extract organic results 

151 organic_results = data.get("organic_results", []) 

152 

153 # Format results as previews 

154 previews = [] 

155 

156 # Check if results were served from cache for monitoring 

157 from_cache = data.get("request_info", {}).get("cached", False) 

158 

159 for idx, result in enumerate(organic_results): 

160 # Extract display link safely using urlparse 

161 link = result.get("link", "") 

162 display_link = "" 

163 if link: 

164 try: 

165 parsed_url = urlparse(link) 

166 display_link = ( 

167 parsed_url.netloc or parsed_url.path or "" 

168 ) 

169 except Exception: 

170 # Fallback to truncated URL if parsing fails 

171 logger.debug("URL parsing failed, using truncation") 

172 display_link = link[:50] 

173 

174 preview = { 

175 "id": idx, 

176 "title": result.get("title", ""), 

177 "link": link, 

178 "snippet": result.get("snippet", ""), 

179 "displayed_link": display_link, 

180 "position": result.get("position", idx + 1), 

181 "from_cache": from_cache, # Add cache status for monitoring 

182 } 

183 

184 # Store full ScaleSerp result for later 

185 preview["_full_result"] = result 

186 

187 # Include rich snippets if available 

188 if "rich_snippet" in result: 

189 preview["rich_snippet"] = result["rich_snippet"] 

190 

191 # Include date if available 

192 if "date" in result: 

193 preview["date"] = result["date"] 

194 

195 # Include sitelinks if available 

196 if "sitelinks" in result: 

197 preview["sitelinks"] = result["sitelinks"] 

198 

199 previews.append(preview) 

200 

201 # Store the previews for potential full content retrieval 

202 self._search_results = previews 

203 

204 # Store knowledge graph if available 

205 if "knowledge_graph" in data: 

206 self._knowledge_graph = data["knowledge_graph"] 

207 logger.info( 

208 f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}" 

209 ) 

210 

211 # Store related searches 

212 if "related_searches" in data: 

213 self._related_searches = data["related_searches"] 

214 

215 # Store related questions (People Also Ask) 

216 if "related_questions" in data: 

217 self._related_questions = data["related_questions"] 

218 

219 # Log if result was served from cache 

220 if from_cache: 

221 logger.debug( 

222 "Result served from ScaleSerp cache - no API credit used!" 

223 ) 

224 

225 return previews 

226 

227 except RateLimitError: 

228 raise # Re-raise rate limit errors 

229 except requests.exceptions.RequestException as e: 

230 sanitized = self._sanitize_error_message(str(e)) 

231 logger.exception( 

232 "Error getting ScaleSerp API results: {}. Check API docs: https://docs.scaleserp.com", 

233 sanitized, 

234 ) 

235 self._raise_if_rate_limit(e) 

236 return [] 

237 except Exception as e: 

238 sanitized = self._sanitize_error_message(str(e)) 

239 logger.exception( 

240 "Unexpected error getting ScaleSerp API results: {}", sanitized 

241 ) 

242 return [] 

243 

244 def _get_full_content( 

245 self, relevant_items: List[Dict[str, Any]] 

246 ) -> List[Dict[str, Any]]: 

247 """ 

248 Get full content for the relevant search results. 

249 Extends base implementation to include knowledge graph data. 

250 

251 Args: 

252 relevant_items: List of relevant preview dictionaries 

253 

254 Returns: 

255 List of result dictionaries with full content if requested 

256 """ 

257 results = super()._get_full_content(relevant_items) 

258 

259 # Include knowledge graph if available 

260 if results and hasattr(self, "_knowledge_graph"): 260 ↛ 263line 260 didn't jump to line 263 because the condition on line 260 was always true

261 results[0]["knowledge_graph"] = self._knowledge_graph 

262 

263 return results 

264 

265 def _temp_attributes(self): 

266 """Return list of temporary attribute names to clean up after run().""" 

267 return super()._temp_attributes() + [ 

268 "_knowledge_graph", 

269 "_related_searches", 

270 "_related_questions", 

271 ]