Coverage for src/local_deep_research/web_search_engines/engines/search_engine_scaleserp.py: 61%

137 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

from loguru import logger
from typing import Any, Dict, List, Optional
import requests
from urllib.parse import urlparse

from langchain_core.language_models import BaseLLM

from ..search_engine_base import BaseSearchEngine
from ..rate_limiting import RateLimitError
from ...security import safe_get


class ScaleSerpSearchEngine(BaseSearchEngine):
    """Google search engine implementation using ScaleSerp API with caching support"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        location: str = "United States",
        language: str = "en",
        device: str = "desktop",
        safe_search: bool = True,
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        enable_cache: bool = True,
        **kwargs,
    ):
        """
        Initialize the ScaleSerp search engine.

        Args:
            max_results: Maximum number of search results (default 10, max 100)
            location: Location for localized results (e.g., 'United States', 'London,England,United Kingdom')
            language: Language code for results (e.g., 'en', 'es', 'fr')
            device: Device type for search ('desktop' or 'mobile')
            safe_search: Whether to enable safe search
            api_key: ScaleSerp API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            enable_cache: Whether to use ScaleSerp's 1-hour caching (saves costs for repeated searches)
            **kwargs: Additional parameters (ignored but accepted for compatibility)
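
        Example:
            A minimal usage sketch (not taken from the test suite); the key
            below is a placeholder and must be replaced with a real ScaleSerp
            API key:

                engine = ScaleSerpSearchEngine(
                    max_results=5,
                    location="United States",
                    api_key="YOUR_SCALESERP_KEY",  # placeholder, not a real key
                )
                results = engine.run("open source search engines")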

52 """ 

53 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

54 super().__init__( 

55 llm=llm, 

56 max_filtered_results=max_filtered_results, 

57 max_results=max_results, 

58 ) 

59 self.include_full_content = include_full_content 

60 self.location = location 

61 self.language = language 

62 self.device = device 

63 self.safe_search = safe_search 

64 self.enable_cache = enable_cache # ScaleSerp's unique caching feature 

65 

66 # Get API key - check params, env vars, or database 

67 from ...config.search_config import get_setting_from_snapshot 

68 

69 scaleserp_api_key = api_key 

70 if not scaleserp_api_key: 

71 scaleserp_api_key = get_setting_from_snapshot( 

72 "search.engine.web.scaleserp.api_key", 

73 settings_snapshot=settings_snapshot, 

74 ) 

75 

76 if not scaleserp_api_key: 

77 raise ValueError( 

78 "ScaleSerp API key not found. Please provide api_key parameter or set it in the UI settings. " 

79 "Get your API key at https://scaleserp.com" 

80 ) 

81 

82 self.api_key = scaleserp_api_key 

83 self.base_url = "https://api.scaleserp.com/search" 

84 

85 # If full content is requested, initialize FullSearchResults 

86 if include_full_content: 86 ↛ 88line 86 didn't jump to line 88 because the condition on line 86 was never true

87 # Import FullSearchResults only if needed 

88 try: 

89 from .full_search import FullSearchResults 

90 

91 self.full_search = FullSearchResults( 

92 llm=llm, 

93 web_search=None, # We'll handle the search ourselves 

94 language=language, 

95 max_results=max_results, 

96 region=location, 

97 time=None, 

98 safesearch="Moderate" if safe_search else "Off", 

99 ) 

100 except ImportError: 

101 logger.warning( 

102 "Warning: FullSearchResults not available. Full content retrieval disabled." 

103 ) 

104 self.include_full_content = False 

105 

106 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

107 """ 

108 Get preview information from ScaleSerp API. 

109 

110 Args: 

111 query: The search query 

112 

113 Returns: 

114 List of preview dictionaries 
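
        Illustrative preview shape (keys mirror the dict built below; the
        values here are invented placeholders):

            {
                "id": 0,
                "title": "Example result",
                "link": "https://example.com/page",
                "snippet": "A short snippet...",
                "displayed_link": "example.com",
                "position": 1,
                "from_cache": False,
            }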

115 """ 

116 logger.info("Getting search results from ScaleSerp API") 

117 

118 try: 

119 # Build request parameters 

120 params = { 

121 "api_key": self.api_key, 

122 "q": query, 

123 "num": min(self.max_results, 100), # ScaleSerp max is 100 

124 "location": self.location, 

125 "hl": self.language, 

126 "device": self.device, 

127 } 

128 

129 # Add safe search if enabled 

130 if self.safe_search: 130 ↛ 135line 130 didn't jump to line 135 because the condition on line 130 was always true

131 params["safe"] = "on" 

132 

133 # ScaleSerp automatically caches identical queries for 1 hour 

134 # Cached results are served instantly and don't consume API credits 

135 if self.enable_cache: 135 ↛ 144line 135 didn't jump to line 144 because the condition on line 135 was always true

136 params["output"] = ( 

137 "json" # Ensure JSON output for cache detection 

138 ) 

139 logger.debug( 

140 "ScaleSerp caching enabled - identical searches within 1 hour are free" 

141 ) 

142 

143 # Apply rate limiting before request 

144 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

145 self.engine_type 

146 ) 

147 

148 # Make API request 

149 response = safe_get(self.base_url, params=params, timeout=30) 

150 

151 # Check for rate limits 

152 if response.status_code == 429: 

153 raise RateLimitError( 

154 f"ScaleSerp rate limit hit: {response.status_code} - {response.text}" 

155 ) 

156 

157 response.raise_for_status() 

158 

159 data = response.json() 

160 

161 # Extract organic results 

162 organic_results = data.get("organic_results", []) 

163 

164 # Format results as previews 

165 previews = [] 

166 

167 # Check if results were served from cache for monitoring 

168 from_cache = data.get("request_info", {}).get("cached", False) 

169 

170 for idx, result in enumerate(organic_results): 

171 # Extract display link safely using urlparse 

172 link = result.get("link", "") 

173 display_link = "" 

174 if link: 174 ↛ 184line 174 didn't jump to line 184 because the condition on line 174 was always true

175 try: 

176 parsed_url = urlparse(link) 

177 display_link = ( 

178 parsed_url.netloc or parsed_url.path or "" 

179 ) 

180 except Exception: 

181 # Fallback to truncated URL if parsing fails 

182 display_link = link[:50] 

183 

184 preview = { 

185 "id": idx, 

186 "title": result.get("title", ""), 

187 "link": link, 

188 "snippet": result.get("snippet", ""), 

189 "displayed_link": display_link, 

190 "position": result.get("position", idx + 1), 

191 "from_cache": from_cache, # Add cache status for monitoring 

192 } 

193 

194 # Store full ScaleSerp result for later 

195 preview["_full_result"] = result 

196 

197 # Include rich snippets if available 

198 if "rich_snippet" in result: 

199 preview["rich_snippet"] = result["rich_snippet"] 

200 

201 # Include date if available 

202 if "date" in result: 202 ↛ 203line 202 didn't jump to line 203 because the condition on line 202 was never true

203 preview["date"] = result["date"] 

204 

205 # Include sitelinks if available 

206 if "sitelinks" in result: 

207 preview["sitelinks"] = result["sitelinks"] 

208 

209 previews.append(preview) 

210 

211 # Store the previews for potential full content retrieval 

212 self._search_results = previews 

213 

214 # Store knowledge graph if available 

215 if "knowledge_graph" in data: 

216 self._knowledge_graph = data["knowledge_graph"] 

217 logger.info( 

218 f"Found knowledge graph for query: {data['knowledge_graph'].get('title', 'Unknown')}" 

219 ) 

220 

221 # Store related searches 

222 if "related_searches" in data: 222 ↛ 223line 222 didn't jump to line 223 because the condition on line 222 was never true

223 self._related_searches = data["related_searches"] 

224 

225 # Store related questions (People Also Ask) 

226 if "related_questions" in data: 226 ↛ 227line 226 didn't jump to line 227 because the condition on line 226 was never true

227 self._related_questions = data["related_questions"] 

228 

229 # Log if result was served from cache 

230 if from_cache: 

231 logger.debug( 

232 "Result served from ScaleSerp cache - no API credit used!" 

233 ) 

234 

235 return previews 

236 

237 except RateLimitError: 

238 raise # Re-raise rate limit errors 

239 except requests.exceptions.RequestException as e: 

240 error_msg = str(e) 

241 logger.exception( 

242 "Error getting ScaleSerp API results. Check API docs: https://docs.scaleserp.com" 

243 ) 

244 

245 # Check for rate limit patterns in error message 

246 if any( 246 ↛ 255line 246 didn't jump to line 255 because the condition on line 246 was never true

247 pattern in error_msg.lower() 

248 for pattern in [ 

249 "429", 

250 "rate limit", 

251 "quota", 

252 "too many requests", 

253 ] 

254 ): 

255 raise RateLimitError(f"ScaleSerp rate limit hit: {error_msg}") 

256 

257 return [] 

258 except Exception: 

259 logger.exception("Unexpected error getting ScaleSerp API results") 

260 return [] 

261 

262 def _get_full_content( 

263 self, relevant_items: List[Dict[str, Any]] 

264 ) -> List[Dict[str, Any]]: 

265 """ 

266 Get full content for the relevant search results. 

267 If include_full_content is True and FullSearchResults is available, 

268 retrieves full webpage content for the results. 

269 

270 Args: 

271 relevant_items: List of relevant preview dictionaries 

272 

273 Returns: 

274 List of result dictionaries with full content if requested 
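
        Illustrative call (hypothetical input; the raw ScaleSerp payload in
        "_full_result" becomes the returned record, minus that key):

            items = [
                {"id": 0, "link": "https://example.com", "_full_result": {"title": "Example"}},
            ]
            results = engine._get_full_content(items)
            # results == [{"title": "Example"}]  (assuming no knowledge graph was stored)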

275 """ 

276 # Check if we should get full content 

277 from ...config import search_config 

278 

279 if ( 279 ↛ 283line 279 didn't jump to line 283 because the condition on line 279 was never true

280 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

281 and search_config.SEARCH_SNIPPETS_ONLY 

282 ): 

283 logger.info("Snippet-only mode, skipping full content retrieval") 

284 

285 # Return the relevant items with their full ScaleSerp information 

286 results = [] 

287 for item in relevant_items: 

288 # Use the full result if available, otherwise use the preview 

289 if "_full_result" in item: 

290 result = item["_full_result"].copy() 

291 else: 

292 result = item.copy() 

293 

294 # Clean up temporary fields 

295 if "_full_result" in result: 

296 del result["_full_result"] 

297 

298 results.append(result) 

299 

300 # Include knowledge graph and other metadata if this is the first call 

301 if results and hasattr(self, "_knowledge_graph"): 

302 results[0]["knowledge_graph"] = self._knowledge_graph 

303 

304 return results 

305 

306 # If full content retrieval is enabled 

307 if self.include_full_content and hasattr(self, "full_search"): 307 ↛ 308line 307 didn't jump to line 308 because the condition on line 307 was never true

308 logger.info("Retrieving full webpage content") 

309 

310 try: 

311 # Use FullSearchResults to get full content 

312 results_with_content = self.full_search._get_full_content( 

313 relevant_items 

314 ) 

315 

316 return results_with_content 

317 

318 except Exception as e: 

319 logger.info(f"Error retrieving full content: {e}") 

320 # Fall back to returning the items without full content 

321 

322 # Return items with their full ScaleSerp information 

323 results = [] 

324 for item in relevant_items: 

325 # Use the full result if available, otherwise use the preview 

326 if "_full_result" in item: 326 ↛ 329line 326 didn't jump to line 329 because the condition on line 326 was always true

327 result = item["_full_result"].copy() 

328 else: 

329 result = item.copy() 

330 

331 # Clean up temporary fields 

332 if "_full_result" in result: 332 ↛ 333line 332 didn't jump to line 333 because the condition on line 332 was never true

333 del result["_full_result"] 

334 

335 results.append(result) 

336 

337 # Include knowledge graph and other metadata if this is the first call 

338 if results and hasattr(self, "_knowledge_graph"): 338 ↛ 339line 338 didn't jump to line 339 because the condition on line 338 was never true

339 results[0]["knowledge_graph"] = self._knowledge_graph 

340 

341 return results 

342 

343 def run( 

344 self, query: str, research_context: Dict[str, Any] | None = None 

345 ) -> List[Dict[str, Any]]: 

346 """ 

347 Execute a search using ScaleSerp API with the two-phase approach. 

348 

349 Args: 

350 query: The search query 

351 research_context: Context from previous research to use. 

352 

353 Returns: 

354 List of search results 
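
        Example (sketch; assumes the engine was constructed with a valid
        API key and that the parent class performs the actual search phases):

            engine = ScaleSerpSearchEngine(api_key="YOUR_SCALESERP_KEY")
            results = engine.run("latest python release")
            for r in results:
                print(r.get("title"), r.get("link"))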

355 """ 

356 logger.info("---Execute a search using ScaleSerp API (Google)---") 

357 

358 # Use the implementation from the parent class which handles all phases 

359 results = super().run(query, research_context=research_context) 

360 

361 # Clean up 

362 if hasattr(self, "_search_results"): 

363 del self._search_results 

364 if hasattr(self, "_knowledge_graph"): 

365 del self._knowledge_graph 

366 if hasattr(self, "_related_searches"): 

367 del self._related_searches 

368 if hasattr(self, "_related_questions"): 

369 del self._related_questions 

370 

371 return results