Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serper.py: 72%

138 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1 from loguru import logger

2 from typing import Any, Dict, List, Optional

3 import requests

4 from urllib.parse import urlparse

5 

6 from langchain_core.language_models import BaseLLM

7 

8 from ..search_engine_base import BaseSearchEngine

9 from ..rate_limiting import RateLimitError

10 from ...security import safe_post

11 

12 

13 class SerperSearchEngine(BaseSearchEngine):

14 """Google search engine implementation using Serper API with two-phase approach""" 

15 

16 # Mark as public search engine 

17 is_public = True 

18 # Mark as generic search engine (general web search via Google) 

19 is_generic = True 

20 

21 # Class constants 

22 BASE_URL = "https://google.serper.dev/search" 

23 DEFAULT_TIMEOUT = 30 

24 DEFAULT_REGION = "us" 

25 DEFAULT_LANGUAGE = "en" 

26 

27 def __init__( 

28 self, 

29 max_results: int = 10, 

30 region: str = "us", 

31 time_period: Optional[str] = None, 

32 safe_search: bool = True, 

33 search_language: str = "en", 

34 api_key: Optional[str] = None, 

35 llm: Optional[BaseLLM] = None, 

36 include_full_content: bool = False, 

37 max_filtered_results: Optional[int] = None, 

38 settings_snapshot: Optional[Dict[str, Any]] = None, 

39 **kwargs, 

40 ): 

41 """ 

42 Initialize the Serper search engine. 

43 

44 Args: 

45 max_results: Maximum number of search results (default 10) 

46 region: Country code for localized results (e.g., 'us', 'gb', 'fr') 

47 time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time) 

48 safe_search: Whether to enable safe search 

49 search_language: Language code for results (e.g., 'en', 'es', 'fr') 

50 api_key: Serper API key (can also be set in settings) 

51 llm: Language model for relevance filtering 

52 include_full_content: Whether to include full webpage content in results 

53 max_filtered_results: Maximum number of results to keep after filtering 

54 settings_snapshot: Settings snapshot for thread context 

55 **kwargs: Additional parameters (ignored but accepted for compatibility) 

56 """ 

57 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

58 super().__init__( 

59 llm=llm, 

60 max_filtered_results=max_filtered_results, 

61 max_results=max_results, 

62 ) 

63 self.include_full_content = include_full_content 

64 self.region = region 

65 self.time_period = time_period 

66 self.safe_search = safe_search 

67 self.search_language = search_language 

68 

69 # Get API key - check params, env vars, or database 

70 from ...config.search_config import get_setting_from_snapshot 

71 

72 serper_api_key = api_key 

73 if not serper_api_key: 

74 serper_api_key = get_setting_from_snapshot( 

75 "search.engine.web.serper.api_key", 

76 settings_snapshot=settings_snapshot, 

77 ) 

78 

79 if not serper_api_key: 

80 raise ValueError( 

81 "Serper API key not found. Please provide api_key parameter or set it in the UI settings." 

82 ) 

83 

84 self.api_key = serper_api_key 

85 self.base_url = self.BASE_URL 

86 # Note: self.engine_type is automatically set by parent BaseSearchEngine class 

87 

88 # If full content is requested, initialize FullSearchResults 

89 if include_full_content:  [89 ↛ 91: line 89 didn't jump to line 91 because the condition on line 89 was never true]

90 # Import FullSearchResults only if needed 

91 try: 

92 from .full_search import FullSearchResults 

93 

94 self.full_search = FullSearchResults( 

95 llm=llm, 

96 web_search=None, # We'll handle the search ourselves 

97 language=search_language, 

98 max_results=max_results, 

99 region=region, 

100 time=time_period, 

101 safesearch="Moderate" if safe_search else "Off", 

102 ) 

103 except ImportError: 

104 logger.warning( 

105 "Warning: FullSearchResults not available. Full content retrieval disabled." 

106 ) 

107 self.include_full_content = False 

108 

109 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

110 """ 

111 Get preview information from Serper API. 

112 

113 Args: 

114 query: The search query 

115 

116 Returns: 

117 List of preview dictionaries 

118 """ 

119 logger.info("Getting search results from Serper API") 

120 

121 try: 

122 # Build request payload 

123 payload = { 

124 "q": query, 

125 "num": self.max_results, 

126 "gl": self.region, 

127 "hl": self.search_language, 

128 } 

129 

130 # Add optional parameters 

131 if self.time_period: 

132 # Map time periods to Serper's format 

133 time_mapping = { 

134 "day": "d", 

135 "week": "w", 

136 "month": "m", 

137 "year": "y", 

138 } 

139 if self.time_period in time_mapping:  [139 ↛ 143: line 139 didn't jump to line 143 because the condition on line 139 was always true]

140 payload["tbs"] = f"qdr:{time_mapping[self.time_period]}" 

141 

142 # Apply rate limiting before request 

143 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

144 self.engine_type 

145 ) 

146 

147 # Make API request 

148 headers = { 

149 "X-API-KEY": self.api_key, 

150 "Content-Type": "application/json", 

151 } 

152 

153 response = safe_post( 

154 self.base_url, 

155 headers=headers, 

156 json=payload, 

157 timeout=self.DEFAULT_TIMEOUT, 

158 ) 

159 

160 # Check for rate limits 

161 if response.status_code == 429: 

162 raise RateLimitError( 

163 f"Serper rate limit hit: {response.status_code} - {response.text}" 

164 ) 

165 

166 response.raise_for_status() 

167 

168 data = response.json() 

169 

170 # Extract organic results 

171 organic_results = data.get("organic", []) 

172 

173 # Format results as previews 

174 previews = [] 

175 for idx, result in enumerate(organic_results): 

176 # Extract display link safely using urlparse 

177 display_link = "" 

178 link = result.get("link", "") 

179 if link: 

180 try: 

181 parsed_url = urlparse(link) 

182 display_link = parsed_url.netloc or "" 

183 except Exception: 

184 logger.debug( 

185 f"Failed to parse URL for display: {link[:50]}" 

186 ) 

187 display_link = "" 

188 

189 preview = { 

190 "id": idx, 

191 "title": result.get("title", ""), 

192 "link": link, 

193 "snippet": result.get("snippet", ""), 

194 "displayed_link": display_link, 

195 "position": result.get("position", idx + 1), 

196 } 

197 

198 # Store full Serper result for later 

199 preview["_full_result"] = result 

200 

201 # Only include optional fields if present to avoid None values 

202 # This keeps the preview dict cleaner and saves memory 

203 if "sitelinks" in result: 

204 preview["sitelinks"] = result["sitelinks"] 

205 

206 if "date" in result: 

207 preview["date"] = result["date"] 

208 

209 if "attributes" in result: 209 ↛ 210line 209 didn't jump to line 210 because the condition on line 209 was never true

210 preview["attributes"] = result["attributes"] 

211 

212 previews.append(preview) 

213 

214 # Store the previews for potential full content retrieval 

215 self._search_results = previews 

216 

217 # Also store knowledge graph if available 

218 if "knowledgeGraph" in data: 

219 self._knowledge_graph = data["knowledgeGraph"] 

220 logger.info( 

221 f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}" 

222 ) 

223 

224 # Store related searches and people also ask 

225 if "relatedSearches" in data: 

226 self._related_searches = data["relatedSearches"] 

227 

228 if "peopleAlsoAsk" in data: 228 ↛ 229line 228 didn't jump to line 229 because the condition on line 228 was never true

229 self._people_also_ask = data["peopleAlsoAsk"] 

230 

231 return previews 

232 

233 except RateLimitError: 

234 raise # Re-raise rate limit errors 

235 except requests.exceptions.RequestException as e: 

236 error_msg = str(e) 

237 logger.exception("Error getting Serper API results") 

238 

239 # Check for rate limit patterns in error message 

240 if any(  [240 ↛ 249: line 240 didn't jump to line 249 because the condition on line 240 was never true]

241 pattern in error_msg.lower() 

242 for pattern in [ 

243 "429", 

244 "rate limit", 

245 "quota", 

246 "too many requests", 

247 ] 

248 ): 

249 raise RateLimitError(f"Serper rate limit hit: {error_msg}") 

250 

251 return [] 

252 except Exception: 

253 logger.exception("Unexpected error getting Serper API results") 

254 return [] 

255 

256 def _get_full_content( 

257 self, relevant_items: List[Dict[str, Any]] 

258 ) -> List[Dict[str, Any]]: 

259 """ 

260 Get full content for the relevant search results. 

261 If include_full_content is True and FullSearchResults is available, 

262 retrieves full webpage content for the results. 

263 

264 Args: 

265 relevant_items: List of relevant preview dictionaries 

266 

267 Returns: 

268 List of result dictionaries with full content if requested 

269 """ 

270 # Check if we should get full content 

271 from ...config import search_config 

272 

273 if ( 

274 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

275 and search_config.SEARCH_SNIPPETS_ONLY 

276 ): 

277 logger.info("Snippet-only mode, skipping full content retrieval") 

278 

279 # Return the relevant items with their full Serper information 

280 results = [] 

281 for item in relevant_items: 

282 # Use the full result if available, otherwise use the preview 

283 if "_full_result" in item: 283 ↛ 286line 283 didn't jump to line 286 because the condition on line 283 was always true

284 result = item["_full_result"].copy() 

285 else: 

286 result = item.copy() 

287 

288 # Clean up temporary fields 

289 if "_full_result" in result: 289 ↛ 290line 289 didn't jump to line 290 because the condition on line 289 was never true

290 del result["_full_result"] 

291 

292 results.append(result) 

293 

294 # Include knowledge graph and other metadata if this is the first call 

295 if results and hasattr(self, "_knowledge_graph"):  [295 ↛ 296: line 295 didn't jump to line 296 because the condition on line 295 was never true]

296 results[0]["knowledge_graph"] = self._knowledge_graph 

297 

298 return results 

299 

300 # If full content retrieval is enabled 

301 if self.include_full_content and hasattr(self, "full_search"):  [301 ↛ 302: line 301 didn't jump to line 302 because the condition on line 301 was never true]

302 logger.info("Retrieving full webpage content") 

303 

304 try: 

305 # Use FullSearchResults to get full content 

306 results_with_content = self.full_search._get_full_content( 

307 relevant_items 

308 ) 

309 

310 return results_with_content 

311 

312 except Exception as e: 

313 logger.info(f"Error retrieving full content: {e}") 

314 # Fall back to returning the items without full content 

315 

316 # Return items with their full Serper information 

317 results = [] 

318 for item in relevant_items: 

319 # Use the full result if available, otherwise use the preview 

320 if "_full_result" in item: 320 ↛ 321line 320 didn't jump to line 321 because the condition on line 320 was never true

321 result = item["_full_result"].copy() 

322 else: 

323 result = item.copy() 

324 

325 # Clean up temporary fields 

326 if "_full_result" in result: 326 ↛ 327line 326 didn't jump to line 327 because the condition on line 326 was never true

327 del result["_full_result"] 

328 

329 results.append(result) 

330 

331 # Include knowledge graph and other metadata if this is the first call 

332 if results and hasattr(self, "_knowledge_graph"):  [332 ↛ 333: line 332 didn't jump to line 333 because the condition on line 332 was never true]

333 results[0]["knowledge_graph"] = self._knowledge_graph 

334 

335 return results 

336 

337 def run( 

338 self, query: str, research_context: Dict[str, Any] | None = None 

339 ) -> List[Dict[str, Any]]: 

340 """ 

341 Execute a search using Serper API with the two-phase approach. 

342 

343 Args: 

344 query: The search query 

345 research_context: Context from previous research to use. 

346 

347 Returns: 

348 List of search results 

349 """ 

350 logger.info("---Execute a search using Serper API (Google)---") 

351 

352 # Use the implementation from the parent class which handles all phases 

353 # Note: super().run() internally calls our _get_previews() method 

354 results = super().run(query, research_context=research_context) 

355 

356 # Clean up temporary attributes 

357 if hasattr(self, "_search_results"): 

358 del self._search_results 

359 if hasattr(self, "_knowledge_graph"): 

360 del self._knowledge_graph 

361 if hasattr(self, "_related_searches"): 

362 del self._related_searches 

363 if hasattr(self, "_people_also_ask"): 

364 del self._people_also_ask 

365 

366 return results
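
The listing above shows the two-phase flow: run() delegates to BaseSearchEngine.run(), which calls _get_previews() for snippet-level results and _get_full_content() for optional full-page retrieval. A minimal usage sketch follows; it is illustrative only and not part of the measured source above. It assumes the package is importable as local_deep_research and that the placeholder API key string is replaced with a real Serper key (or supplied through settings).

# Minimal usage sketch (not part of the measured module above).
# Assumes `local_deep_research` is installed/importable and that
# "YOUR_SERPER_API_KEY" is replaced with a real key.
from local_deep_research.web_search_engines.engines.search_engine_serper import (
    SerperSearchEngine,
)

engine = SerperSearchEngine(
    api_key="YOUR_SERPER_API_KEY",  # placeholder; may also come from settings
    max_results=5,
    region="us",
    search_language="en",
    time_period="week",  # mapped to Serper's "tbs" parameter as "qdr:w"
)

# run() delegates to BaseSearchEngine.run(), which internally calls
# _get_previews() and, if full content is enabled, _get_full_content().
results = engine.run("open source deep research tools")
for item in results:
    print(item.get("title"), item.get("link"), item.get("snippet"))

The bracketed annotations in the listing mark partial branches that the recorded test run only ever took in one direction, which is where most of the missing 28% of coverage lies.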