Coverage for src / local_deep_research / web_search_engines / engines / search_engine_serper.py: 83%

144 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1from loguru import logger 

2from typing import Any, Dict, List, Optional 

3import requests 

4from urllib.parse import urlparse 

5 

6from langchain_core.language_models import BaseLLM 

7 

8from ..search_engine_base import BaseSearchEngine 

9from ..rate_limiting import RateLimitError 

10from ...security import safe_post 

11 

12 

class SerperSearchEngine(BaseSearchEngine):
    """Google search engine implementation using Serper API with two-phase approach"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    # Class constants
    BASE_URL = "https://google.serper.dev/search"
    DEFAULT_TIMEOUT = 30
    DEFAULT_REGION = "us"
    DEFAULT_LANGUAGE = "en"

    def __init__(
        self,
        max_results: int = 10,
        region: str = "us",
        time_period: Optional[str] = None,
        safe_search: bool = True,
        search_language: str = "en",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Serper search engine.

        Args:
            max_results: Maximum number of search results (default 10)
            region: Country code for localized results (e.g., 'us', 'gb', 'fr')
            time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time)
            safe_search: Whether to enable safe search
            search_language: Language code for results (e.g., 'en', 'es', 'fr')
            api_key: Serper API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key was passed in and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.region = region
        self.time_period = time_period
        self.safe_search = safe_search
        self.search_language = search_language

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        serper_api_key = api_key
        if not serper_api_key:
            serper_api_key = get_setting_from_snapshot(
                "search.engine.web.serper.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not serper_api_key:
            raise ValueError(
                "Serper API key not found. Please provide api_key parameter or set it in the UI settings."
            )

        self.api_key = serper_api_key
        self.base_url = self.BASE_URL
        # Note: self.engine_type is automatically set by parent BaseSearchEngine class

        # Per-query attributes: reset in _get_previews() before each search and
        # cleared again in run() so no state leaks between searches. They are
        # always defined (None when absent) so direct access never raises.
        self._search_results = None
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=None,  # We'll handle the search ourselves
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                # Degrade gracefully to snippet-only results rather than fail.
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Serper API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on non-rate-limit errors).

        Raises:
            RateLimitError: If the API returns HTTP 429 or the request error
                message matches a known rate-limit pattern.
        """
        logger.info("Getting search results from Serper API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        try:
            # Build request payload
            payload = {
                "q": query,
                "num": self.max_results,
                "gl": self.region,
                "hl": self.search_language,
            }

            # Add optional parameters
            if self.time_period:
                # Map time periods to Serper's format (Google "tbs=qdr:" codes)
                time_mapping = {
                    "day": "d",
                    "week": "w",
                    "month": "m",
                    "year": "y",
                }
                if self.time_period in time_mapping:
                    payload["tbs"] = f"qdr:{time_mapping[self.time_period]}"

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }

            response = safe_post(
                self.base_url,
                headers=headers,
                json=payload,
                timeout=self.DEFAULT_TIMEOUT,
            )

            # Check for rate limits
            if response.status_code == 429:
                raise RateLimitError(
                    f"Serper rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            data = response.json()

            # Extract organic results
            organic_results = data.get("organic", [])

            # Format results as previews
            previews = []
            for idx, result in enumerate(organic_results):
                # Extract display link safely using urlparse
                display_link = ""
                link = result.get("link", "")
                if link:
                    try:
                        parsed_url = urlparse(link)
                        display_link = parsed_url.netloc or ""
                    except Exception:
                        logger.debug(
                            f"Failed to parse URL for display: {link[:50]}"
                        )
                        display_link = ""

                preview = {
                    "id": idx,
                    "title": result.get("title", ""),
                    "link": link,
                    "snippet": result.get("snippet", ""),
                    "displayed_link": display_link,
                    "position": result.get("position", idx + 1),
                }

                # Store full Serper result for later
                preview["_full_result"] = result

                # Only include optional fields if present to avoid None values
                # This keeps the preview dict cleaner and saves memory
                if "sitelinks" in result:
                    preview["sitelinks"] = result["sitelinks"]

                if "date" in result:
                    preview["date"] = result["date"]

                if "attributes" in result:
                    preview["attributes"] = result["attributes"]

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Also store knowledge graph if available
            if "knowledgeGraph" in data:
                self._knowledge_graph = data["knowledgeGraph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}"
                )

            # Store related searches and people also ask
            if "relatedSearches" in data:
                self._related_searches = data["relatedSearches"]

            if "peopleAlsoAsk" in data:
                self._people_also_ask = data["peopleAlsoAsk"]

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception("Error getting Serper API results")

            # Check for rate limit patterns in error message
            if any(
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"Serper rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting Serper API results")
            return []

    def _clean_results(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Build final result dicts from previews, stripping temporary fields.

        Prefers the full Serper payload stashed under "_full_result" over the
        trimmed preview, drops that temporary key from the copies, and attaches
        the knowledge graph (if one was captured) to the first result.

        Args:
            relevant_items: List of preview dictionaries.

        Returns:
            List of cleaned result dictionaries.
        """
        results = []
        for item in relevant_items:
            # Use the full Serper result if available, otherwise the preview
            source = item.get("_full_result", item)
            # Copy while excluding the temporary "_full_result" field
            results.append(
                {k: v for k, v in source.items() if k != "_full_result"}
            )

        # Include knowledge graph metadata on the first result if available
        if results and self._knowledge_graph:
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        from ...config import search_config

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full Serper information
            return self._clean_results(relevant_items)

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                results_with_content = self.full_search._get_full_content(
                    relevant_items
                )

                return results_with_content

            except Exception as e:
                logger.info(f"Error retrieving full content: {e}")
                # Fall back to returning the items without full content

        # Return items with their full Serper information
        return self._clean_results(relevant_items)

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using Serper API with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using Serper API (Google)---")

        # Use the implementation from the parent class which handles all phases
        # Note: super().run() internally calls our _get_previews() method
        results = super().run(query, research_context=research_context)

        # Reset per-query state instead of deleting the attributes: __init__
        # guarantees these exist, and `del` would make any later direct access
        # (e.g. _get_full_content called outside run()) raise AttributeError.
        self._search_results = None
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        return results