Coverage for src / local_deep_research / web_search_engines / engines / search_engine_tavily.py: 72%

108 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1from typing import Any, Dict, List, Optional 

2 

3import requests 

4from langchain_core.language_models import BaseLLM 

5from loguru import logger 

6 

7from ...config import search_config 

8from ...security.safe_requests import safe_post 

9from ..rate_limiting import RateLimitError 

10from ..search_engine_base import BaseSearchEngine 

11 

12 

class TavilySearchEngine(BaseSearchEngine):
    """Tavily search engine implementation with two-phase approach.

    ``_get_previews`` queries the Tavily ``/search`` endpoint and converts
    each hit into a preview dictionary; ``_get_full_content`` converts
    previews back into result dictionaries, optionally with full page
    content.  ``run`` delegates the orchestration of both phases to
    ``BaseSearchEngine.run``.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        region: str = "US",
        time_period: str = "y",
        safe_search: bool = True,
        search_language: str = "English",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = True,
        max_filtered_results: Optional[int] = None,
        search_depth: str = "basic",
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Tavily search engine.

        Args:
            max_results: Maximum number of search results
            region: Region code; not sent to Tavily, only forwarded to
                FullSearchResults
            time_period: Time period; not sent to Tavily, only forwarded to
                FullSearchResults
            safe_search: Safe-search flag; not sent to Tavily, only forwarded
                to FullSearchResults ("moderate" when True, "off" otherwise)
            search_language: Language; not sent to Tavily, only forwarded to
                FullSearchResults
            api_key: Tavily API key; when falsy, the key is looked up via
                get_setting_from_snapshot("search.engine.web.tavily.api_key")
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            search_depth: "basic" or "advanced" - controls search quality vs speed
            include_domains: List of domains to include in search
            exclude_domains: List of domains to exclude from search
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key is given and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.search_depth = search_depth
        # Normalize None to an empty list so later truthiness checks are simple.
        self.include_domains = include_domains or []
        self.exclude_domains = exclude_domains or []

        # Get API key - explicit parameter first, then the settings lookup.
        # Imported locally, as in the original module.
        from ...config.search_config import get_setting_from_snapshot

        tavily_api_key = api_key
        if not tavily_api_key:
            tavily_api_key = get_setting_from_snapshot(
                "search.engine.web.tavily.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not tavily_api_key:
            raise ValueError(
                "Tavily API key not found. Please provide api_key parameter, "
                "set it in the UI settings, or set TAVILY_API_KEY environment variable."
            )

        self.api_key = tavily_api_key
        self.base_url = "https://api.tavily.com"

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                # Create a simple wrapper for Tavily API calls: it exposes
                # this engine's preview search through a ``run(query)`` method.
                class TavilyWrapper:
                    def __init__(self, parent):
                        self.parent = parent

                    def run(self, query):
                        return self.parent._get_previews(query)

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=TavilyWrapper(self),
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="moderate" if safe_search else "off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Tavily Search.

        Args:
            query: The search query (only the first 400 characters are sent)

        Returns:
            List of preview dictionaries with the keys ``id``, ``title``,
            ``link``, ``snippet``, ``displayed_link``, ``position`` and
            ``_full_result`` (the untouched Tavily hit).  An empty list is
            returned when the request fails for a reason other than rate
            limiting.

        Raises:
            RateLimitError: On HTTP 429, or when a request exception message
                contains a rate-limit pattern.
        """
        logger.info("Getting search results from Tavily")

        try:
            # Prepare the request payload
            payload = {
                "api_key": self.api_key,
                "query": query[:400],  # Limit query length
                "search_depth": self.search_depth,
                "max_results": min(
                    20, self.max_results
                ),  # Tavily has a max limit
                "include_answer": False,  # We don't need the AI answer
                "include_images": False,  # We don't need images
                "include_raw_content": self.include_full_content,  # Get content if requested
            }

            # Add domain filters if specified
            if self.include_domains:
                payload["include_domains"] = self.include_domains
            if self.exclude_domains:
                payload["exclude_domains"] = self.exclude_domains

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make the API request
            response = safe_post(
                f"{self.base_url}/search",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=30,
            )

            # Check for errors; 429 is surfaced as RateLimitError before the
            # generic HTTP status check below.
            if response.status_code == 429:
                raise RateLimitError(
                    f"Tavily rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            # Parse the response
            data = response.json()
            results = data.get("results", [])

            # Format results as previews
            previews = []
            for i, result in enumerate(results):
                preview = {
                    "id": result.get("url", str(i)),  # Use URL as ID
                    "title": result.get("title", ""),
                    "link": result.get("url", ""),
                    "snippet": result.get(
                        "content", ""
                    ),  # Tavily calls it "content"
                    "displayed_link": result.get("url", ""),
                    "position": i,
                    # Store full Tavily result for later
                    "_full_result": result,
                }
                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception("Error getting Tavily results")

            # Check for rate limit patterns in error message
            lowered = error_msg.lower()
            rate_limit_patterns = (
                "429",
                "rate limit",
                "quota",
                "too many requests",
            )
            if any(pattern in lowered for pattern in rate_limit_patterns):
                raise RateLimitError(f"Tavily rate limit hit: {error_msg}")

            return []
        except Exception:
            logger.exception("Unexpected error getting Tavily results")
            return []

    @staticmethod
    def _unwrap_tavily_results(
        relevant_items: List[Dict[str, Any]], prefer_raw_content: bool
    ) -> List[Dict[str, Any]]:
        """Convert preview dictionaries into plain result dictionaries.

        For an item carrying ``_full_result``, a shallow copy of that stored
        Tavily hit is returned; otherwise a shallow copy of the item itself.
        The temporary ``_full_result`` key never appears in the output.

        Args:
            relevant_items: Preview dictionaries to convert
            prefer_raw_content: When True and a stored Tavily hit has a
                ``raw_content`` key, use its value as ``content``

        Returns:
            List of result dictionaries (copies; inputs are not mutated)
        """
        results = []
        for item in relevant_items:
            if "_full_result" in item:
                result = item["_full_result"].copy()

                if prefer_raw_content and "raw_content" in result:
                    # The key can be present with a None/empty value; fall
                    # back to the snippet instead of overwriting it.
                    result["content"] = result.get(
                        "raw_content"
                    ) or result.get("content", "")
            else:
                result = item.copy()

            # Remove temporary field
            result.pop("_full_result", None)
            results.append(result)

        return results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        Three paths, in order:
        1. ``search_config.SEARCH_SNIPPETS_ONLY`` is set and truthy: return
           the stored Tavily hits without any content substitution.
        2. Full content is enabled and ``self.full_search`` exists: delegate
           to ``self.full_search._get_full_content``.
        3. Otherwise, or if step 2 raised: return the stored Tavily hits,
           using ``raw_content`` as ``content`` when full content is enabled.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if available
        """
        # Check if we should get full content
        if getattr(search_config, "SEARCH_SNIPPETS_ONLY", False):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return self._unwrap_tavily_results(
                relevant_items, prefer_raw_content=False
            )

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                return self.full_search._get_full_content(relevant_items)
            except Exception:
                logger.exception("Error retrieving full content")
                # Fall back to returning the items without full content

        # Return items with their full Tavily information
        return self._unwrap_tavily_results(
            relevant_items, prefer_raw_content=bool(self.include_full_content)
        )

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using Tavily with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using Tavily---")

        try:
            # Use the implementation from the parent class which handles all phases
            return super().run(query, research_context=research_context)
        finally:
            # Clean up the previews cached by _get_previews, even when the
            # parent run raised.
            if hasattr(self, "_search_results"):
                del self._search_results