Coverage for src / local_deep_research / web_search_engines / engines / search_engine_serper.py: 99%

91 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from loguru import logger 

2from typing import Any, Dict, List, Optional 

3import requests 

4from urllib.parse import urlparse 

5 

6from langchain_core.language_models import BaseLLM 

7 

8from ..search_engine_base import BaseSearchEngine 

9from ..rate_limiting import RateLimitError 

10from ...security import safe_post 

11 

12 

class SerperSearchEngine(BaseSearchEngine):
    """Google search engine implementation using Serper API with two-phase approach"""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    # Class constants
    BASE_URL = "https://google.serper.dev/search"
    DEFAULT_TIMEOUT = 30
    DEFAULT_REGION = "us"
    DEFAULT_LANGUAGE = "en"

    # Maps this engine's time-period names to Serper's "tbs=qdr:X" codes.
    # Class-level so the mapping is not rebuilt on every search.
    _TIME_PERIOD_CODES = {
        "day": "d",
        "week": "w",
        "month": "m",
        "year": "y",
    }

    def __init__(
        self,
        max_results: int = 10,
        region: str = "us",
        time_period: Optional[str] = None,
        safe_search: bool = True,
        search_language: str = "en",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Serper search engine.

        Args:
            max_results: Maximum number of search results (default 10)
            region: Country code for localized results (e.g., 'us', 'gb', 'fr')
            time_period: Time filter for results ('day', 'week', 'month', 'year', or None for all time)
            safe_search: Whether to enable safe search
            search_language: Language code for results (e.g., 'en', 'es', 'fr')
            api_key: Serper API key (can also be set in settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
        )
        self.region = region
        self.time_period = time_period
        self.safe_search = safe_search
        self.search_language = search_language

        # Get API key - check params, settings, or env vars
        self.api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.serper.api_key",
            engine_name="Serper",
            settings_snapshot=settings_snapshot,
        )
        self.base_url = self.BASE_URL
        # Note: self.engine_type is automatically set by parent BaseSearchEngine class

        # Initialize per-query attributes (reset in _get_previews per search)
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        # If full content is requested, initialize FullSearchResults
        self._init_full_search(
            web_search=None,  # We'll handle the search ourselves
            language=search_language,
            max_results=max_results,
            region=region,
            time_period=time_period,
            safe_search="Moderate" if safe_search else "Off",
        )

    def _build_payload(self, query: str) -> Dict[str, Any]:
        """
        Build the JSON request payload for Serper's /search endpoint.

        Args:
            query: The search query

        Returns:
            Payload dict with query, result count, locale, and an optional
            "tbs" date-range filter when a known time period is configured.
        """
        payload = {
            "q": query,
            "num": self.max_results,
            "gl": self.region,
            "hl": self.search_language,
        }
        # Unknown or None time periods are silently ignored (no filter applied)
        if self.time_period in self._TIME_PERIOD_CODES:
            payload["tbs"] = f"qdr:{self._TIME_PERIOD_CODES[self.time_period]}"
        return payload

    @staticmethod
    def _make_preview(idx: int, result: Dict[str, Any]) -> Dict[str, Any]:
        """
        Convert one Serper organic result into a preview dictionary.

        Args:
            idx: Zero-based position of the result in the organic list
            result: Raw Serper organic-result dict

        Returns:
            Preview dict; optional fields (sitelinks/date/attributes) are only
            included when present in the raw result to avoid None values.
        """
        # Extract display link safely using urlparse
        display_link = ""
        link = result.get("link", "")
        if link:
            try:
                display_link = urlparse(link).netloc or ""
            except Exception:
                logger.debug(f"Failed to parse URL for display: {link[:50]}")
                display_link = ""

        preview = {
            "id": idx,
            "title": result.get("title", ""),
            "link": link,
            "snippet": result.get("snippet", ""),
            "displayed_link": display_link,
            "position": result.get("position", idx + 1),
            # Store full Serper result for later
            "_full_result": result,
        }

        # Only include optional fields if present to avoid None values
        # This keeps the preview dict cleaner and saves memory
        for key in ("sitelinks", "date", "attributes"):
            if key in result:
                preview[key] = result[key]

        return preview

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Serper API.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on request failure)

        Raises:
            RateLimitError: When the API reports a rate-limit condition.
        """
        logger.info("Getting search results from Serper API")

        # Reset per-query attributes to prevent leakage between searches
        self._knowledge_graph = None
        self._related_searches = None
        self._people_also_ask = None

        try:
            payload = self._build_payload(query)

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make API request
            headers = {
                "X-API-KEY": self.api_key,
                "Content-Type": "application/json",
            }
            response = safe_post(
                self.base_url,
                headers=headers,
                json=payload,
                timeout=self.DEFAULT_TIMEOUT,
            )

            # Check for rate limits before raising on other HTTP errors
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()

            data = response.json()

            # Format organic results as previews
            previews = [
                self._make_preview(idx, result)
                for idx, result in enumerate(data.get("organic", []))
            ]

            # Store the previews for potential full content retrieval
            self._search_results = previews

            # Also store knowledge graph if available
            if "knowledgeGraph" in data:
                self._knowledge_graph = data["knowledgeGraph"]
                logger.info(
                    f"Found knowledge graph for query: {data['knowledgeGraph'].get('title', 'Unknown')}"
                )

            # Store related searches and people also ask
            if "relatedSearches" in data:
                self._related_searches = data["relatedSearches"]

            if "peopleAlsoAsk" in data:
                self._people_also_ask = data["peopleAlsoAsk"]

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            logger.exception("Error getting Serper API results")
            # NOTE(review): _raise_if_rate_limit is also given the exception
            # here (vs. a status code above) — presumably it inspects either
            # form and re-raises as RateLimitError when applicable; confirm
            # against BaseSearchEngine.
            self._raise_if_rate_limit(e)
            return []
        except Exception:
            logger.exception("Unexpected error getting Serper API results")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        Extends base implementation to include knowledge graph data.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        results = super()._get_full_content(relevant_items)

        # Attach the knowledge graph only when one was actually found.
        # BUG FIX: the previous hasattr() check was always true (the attribute
        # is initialized in __init__ and reset in _get_previews), so a
        # "knowledge_graph": None entry was attached even when no knowledge
        # graph existed. getattr() keeps the guard safe even after the
        # attribute is cleaned up via _temp_attributes().
        if results and getattr(self, "_knowledge_graph", None) is not None:
            results[0]["knowledge_graph"] = self._knowledge_graph

        return results

    def _temp_attributes(self):
        """Return list of temporary attribute names to clean up after run()."""
        return super()._temp_attributes() + [
            "_knowledge_graph",
            "_related_searches",
            "_people_also_ask",
        ]