Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serper.py: 72%

138 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1 from loguru import logger

2 from typing import Any, Dict, List, Optional

3 import requests

4 from urllib.parse import urlparse

5 

6 from langchain_core.language_models import BaseLLM

7 

8 from ..search_engine_base import BaseSearchEngine

9 from ..rate_limiting import RateLimitError

10 from ...security import safe_post

11 

12 

13 class SerperSearchEngine(BaseSearchEngine):

14 """Google search engine implementation using Serper API with two-phase approach""" 

15 

16 # Mark as public search engine 

17 is_public = True 

18 # Mark as generic search engine (general web search via Google) 

19 is_generic = True 

20 

21 # Class constants 

22 BASE_URL = "https://google.serper.dev/search" 

23 DEFAULT_TIMEOUT = 30 

24 DEFAULT_REGION = "us" 

25 DEFAULT_LANGUAGE = "en" 

26 

27 def __init__( 

28 self, 

29 max_results: int = 10, 

30 region: str = "us", 

31 time_period: Optional[str] = None, 

32 safe_search: bool = True, 

33 search_language: str = "en", 

34 api_key: Optional[str] = None, 

35 llm: Optional[BaseLLM] = None, 

36 include_full_content: bool = False, 

37 max_filtered_results: Optional[int] = None, 

38 settings_snapshot: Optional[Dict[str, Any]] = None, 

39 **kwargs, 

40 ): 

41 """ 

42 Initialize the Serper search engine. 

43 

44 Args: 

45 max_results: Maximum number of search results (default 10) 

46 region: Country code for localized results (e.g., 'us', 'gb', 'fr') 

47 time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time) 

48 safe_search: Whether to enable safe search 

49 search_language: Language code for results (e.g., 'en', 'es', 'fr') 

50 api_key: Serper API key (can also be set in settings) 

51 llm: Language model for relevance filtering 

52 include_full_content: Whether to include full webpage content in results 

53 max_filtered_results: Maximum number of results to keep after filtering 

54 settings_snapshot: Settings snapshot for thread context 

55 **kwargs: Additional parameters (ignored but accepted for compatibility) 

56 """ 

57 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

58 super().__init__( 

59 llm=llm, 

60 max_filtered_results=max_filtered_results, 

61 max_results=max_results, 

62 ) 

63 self.include_full_content = include_full_content 

64 self.region = region 

65 self.time_period = time_period 

66 self.safe_search = safe_search 

67 self.search_language = search_language 

68 

69 # Get API key - check params, env vars, or database 

70 from ...config.search_config import get_setting_from_snapshot 

71 

72 serper_api_key = api_key 

73 if not serper_api_key: 

74 serper_api_key = get_setting_from_snapshot( 

75 "search.engine.web.serper.api_key", 

76 settings_snapshot=settings_snapshot, 

77 ) 

78 

79 if not serper_api_key: 

80 raise ValueError( 

81 "Serper API key not found. Please provide api_key parameter or set it in the UI settings." 

82 ) 

83 

84 self.api_key = serper_api_key 

85 self.base_url = self.BASE_URL 

86 # Note: self.engine_type is automatically set by parent BaseSearchEngine class 

87 

88 # If full content is requested, initialize FullSearchResults 

89 if include_full_content:  [89 ↛ 91: line 89 didn't jump to line 91 because the condition on line 89 was never true]

90 # Import FullSearchResults only if needed 

91 try: 

92 from .full_search import FullSearchResults 

93 

94 self.full_search = FullSearchResults( 

95 llm=llm, 

96 web_search=None, # We'll handle the search ourselves 

97 language=search_language, 

98 max_results=max_results, 

99 region=region, 

100 time=time_period, 

101 safesearch="Moderate" if safe_search else "Off", 

102 ) 

103 except ImportError: 

104 logger.warning( 

105 "Warning: FullSearchResults not available. Full content retrieval disabled." 

106 ) 

107 self.include_full_content = False 

108 

109 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

110 """ 

111 Get preview information from Serper API. 

112 

113 Args: 

114 query: The search query 

115 

116 Returns: 

117 List of preview dictionaries 

118 """ 

119 logger.info("Getting search results from Serper API") 

120 

121 try: 

122 # Build request payload 

123 payload = { 

124 "q": query, 

125 "num": self.max_results, 

126 "gl": self.region, 

127 "hl": self.search_language, 

128 } 

129 

130 # Add optional parameters 

131 if self.time_period: 

132 # Map time periods to Serper's format 

133 time_mapping = { 

134 "day": "d", 

135 "week": "w", 

136 "month": "m", 

137 "year": "y", 

138 } 

139 if self.time_period in time_mapping:  [139 ↛ 143: line 139 didn't jump to line 143 because the condition on line 139 was always true]

140 payload["tbs"] = f"qdr:{time_mapping[self.time_period]}" 

141 

142 # Apply rate limiting before request 

143 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

144 self.engine_type 

145 ) 

146 

147 # Make API request 

148 headers = { 

149 "X-API-KEY": self.api_key, 

150 "Content-Type": "application/json", 

151 } 

152 

153 response = safe_post( 

154 self.base_url, 

155 headers=headers, 

156 json=payload, 

157 timeout=self.DEFAULT_TIMEOUT, 

158 ) 

159 

160 # Check for rate limits 

161 if response.status_code == 429: 

162 raise RateLimitError( 

163 f"Serper rate limit hit: {response.status_code} - {response.text}" 

164 ) 

165 

166 response.raise_for_status() 

167 

168 data = response.json() 

169 

170 # Extract organic results 

171 organic_results = data.get("organic", []) 

172 

173 # Format results as previews 

174 previews = [] 

175 for idx, result in enumerate(organic_results): 

176 # Extract display link safely using urlparse 

177 display_link = "" 

178 link = result.get("link", "") 

179 if link: 

180 try: 

181 parsed_url = urlparse(link) 

182 display_link = parsed_url.netloc or "" 

183 except Exception: 

184 logger.debug( 

185 f"Failed to parse URL for display: {link[:50]}" 

186 ) 

187 display_link = "" 

188 

189 preview = { 

190 "id": idx, 

191 "title": result.get("title", ""), 

192 "link": link, 

193 "snippet": result.get("snippet", ""), 

194 "displayed_link": display_link, 

195 "position": result.get("position", idx + 1), 

196 } 

197 

198 # Store full Serper result for later 

199 preview["_full_result"] = result 

200 

201 # Only include optional fields if present to avoid None values 

202 # This keeps the preview dict cleaner and saves memory 

203 if "sitelinks" in result: 

204 preview["sitelinks"] = result["sitelinks"] 

205 

206 if "date" in result: 

207 preview["date"] = result["date"] 

208 

209 if "attributes" in result: 209 ↛ 210line 209 didn't jump to line 210 because the condition on line 209 was never true

210 preview["attributes"] = result["attributes"] 

211 

212 previews.append(preview) 

213 

214 # Store the previews for potential full content retrieval 

215 self._search_results = previews 

216 

217 # Also store knowledge graph if available 

218 if "knowledgeGraph" in data: 

219 self._knowledge_graph = data["knowledgeGraph"] 

220 logger.info( 

221 f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}" 

222 ) 

223 

224 # Store related searches and people also ask 

225 if "relatedSearches" in data: 

226 self._related_searches = data["relatedSearches"] 

227 

228 if "peopleAlsoAsk" in data: 228 ↛ 229line 228 didn't jump to line 229 because the condition on line 228 was never true

229 self._people_also_ask = data["peopleAlsoAsk"] 

230 

231 return previews 

232 

233 except RateLimitError: 

234 raise # Re-raise rate limit errors 

235 except requests.exceptions.RequestException as e: 

236 error_msg = str(e) 

237 logger.exception("Error getting Serper API results") 

238 

239 # Check for rate limit patterns in error message 

240 if any(  [240 ↛ 249: line 240 didn't jump to line 249 because the condition on line 240 was never true]

241 pattern in error_msg.lower() 

242 for pattern in [ 

243 "429", 

244 "rate limit", 

245 "quota", 

246 "too many requests", 

247 ] 

248 ): 

249 raise RateLimitError(f"Serper rate limit hit: {error_msg}") 

250 

251 return [] 

252 except Exception: 

253 logger.exception("Unexpected error getting Serper API results") 

254 return [] 

255 

256 def _get_full_content( 

257 self, relevant_items: List[Dict[str, Any]] 

258 ) -> List[Dict[str, Any]]: 

259 """ 

260 Get full content for the relevant search results. 

261 If include_full_content is True and FullSearchResults is available, 

262 retrieves full webpage content for the results. 

263 

264 Args: 

265 relevant_items: List of relevant preview dictionaries 

266 

267 Returns: 

268 List of result dictionaries with full content if requested 

269 """ 

270 # Check if we should get full content 

271 from ...config import search_config 

272 

273 if ( 

274 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

275 and search_config.SEARCH_SNIPPETS_ONLY 

276 ): 

277 logger.info("Snippet-only mode, skipping full content retrieval") 

278 

279 # Return the relevant items with their full Serper information 

280 results = [] 

281 for item in relevant_items: 

282 # Use the full result if available, otherwise use the preview 

283 if "_full_result" in item: 283 ↛ 286line 283 didn't jump to line 286 because the condition on line 283 was always true

284 result = item["_full_result"].copy() 

285 else: 

286 result = item.copy() 

287 

288 # Clean up temporary fields 

289 if "_full_result" in result: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true

290 del result["_full_result"] 

291 

292 results.append(result) 

293 

294 # Include knowledge graph and other metadata if this is the first call 

295 if results and hasattr(self, "_knowledge_graph"):  [295 ↛ 296: line 295 didn't jump to line 296 because the condition on line 295 was never true]

296 results[0]["knowledge_graph"] = self._knowledge_graph 

297 

298 return results 

299 

300 # If full content retrieval is enabled 

301 if self.include_full_content and hasattr(self, "full_search"):  [301 ↛ 302: line 301 didn't jump to line 302 because the condition on line 301 was never true]

302 logger.info("Retrieving full webpage content") 

303 

304 try: 

305 # Use FullSearchResults to get full content 

306 results_with_content = self.full_search._get_full_content( 

307 relevant_items 

308 ) 

309 

310 return results_with_content 

311 

312 except Exception as e: 

313 logger.info(f"Error retrieving full content: {e}") 

314 # Fall back to returning the items without full content 

315 

316 # Return items with their full Serper information 

317 results = [] 

318 for item in relevant_items: 

319 # Use the full result if available, otherwise use the preview 

320 if "_full_result" in item: 320 ↛ 321line 320 didn't jump to line 321 because the condition on line 320 was never true

321 result = item["_full_result"].copy() 

322 else: 

323 result = item.copy() 

324 

325 # Clean up temporary fields 

326 if "_full_result" in result: 326 ↛ 327line 326 didn't jump to line 327 because the condition on line 326 was never true

327 del result["_full_result"] 

328 

329 results.append(result) 

330 

331 # Include knowledge graph and other metadata if this is the first call 

332 if results and hasattr(self, "_knowledge_graph"):  [332 ↛ 333: line 332 didn't jump to line 333 because the condition on line 332 was never true]

333 results[0]["knowledge_graph"] = self._knowledge_graph 

334 

335 return results 

336 

337 def run( 

338 self, query: str, research_context: Dict[str, Any] | None = None 

339 ) -> List[Dict[str, Any]]: 

340 """ 

341 Execute a search using Serper API with the two-phase approach. 

342 

343 Args: 

344 query: The search query 

345 research_context: Context from previous research to use. 

346 

347 Returns: 

348 List of search results 

349 """ 

350 logger.info("---Execute a search using Serper API (Google)---") 

351 

352 # Use the implementation from the parent class which handles all phases 

353 # Note: super().run() internally calls our _get_previews() method 

354 results = super().run(query, research_context=research_context) 

355 

356 # Clean up temporary attributes 

357 if hasattr(self, "_search_results"): 

358 del self._search_results 

359 if hasattr(self, "_knowledge_graph"): 

360 del self._knowledge_graph 

361 if hasattr(self, "_related_searches"): 

362 del self._related_searches 

363 if hasattr(self, "_people_also_ask"): 

364 del self._people_also_ask 

365 

366 return results
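
The listing above shows the two-phase flow: run() delegates to BaseSearchEngine.run(), which calls _get_previews() for snippet-level results and _get_full_content() for optional full-page retrieval. A minimal usage sketch follows; it is illustrative only and not part of the measured source above. It assumes the package is importable as local_deep_research and that the placeholder API key string is replaced with a real Serper key (or supplied through settings).

# Minimal usage sketch (not part of the measured module above).
# Assumes `local_deep_research` is installed/importable and that
# "YOUR_SERPER_API_KEY" is replaced with a real key.
from local_deep_research.web_search_engines.engines.search_engine_serper import (
    SerperSearchEngine,
)

engine = SerperSearchEngine(
    api_key="YOUR_SERPER_API_KEY",  # placeholder; may also come from settings
    max_results=5,
    region="us",
    search_language="en",
    time_period="week",  # mapped to Serper's "tbs" parameter as "qdr:w"
)

# run() delegates to BaseSearchEngine.run(), which internally calls
# _get_previews() and, if full content is enabled, _get_full_content().
results = engine.run("open source deep research tools")
for item in results:
    print(item.get("title"), item.get("link"), item.get("snippet"))

The bracketed annotations in the listing mark partial branches that the recorded test run only ever took in one direction, which is where most of the missing 28% of coverage lies.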