Coverage for src/local_deep_research/web_search_engines/engines/search_engine_tavily.py: 72%
108 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1from typing import Any, Dict, List, Optional
3import requests
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...config import search_config
8from ...security.safe_requests import safe_post
9from ..rate_limiting import RateLimitError
10from ..search_engine_base import BaseSearchEngine
13class TavilySearchEngine(BaseSearchEngine):
14 """Tavily search engine implementation with two-phase approach"""
16 # Mark as public search engine
17 is_public = True
18 # Mark as generic search engine (general web search)
19 is_generic = True
21 def __init__(
22 self,
23 max_results: int = 10,
24 region: str = "US",
25 time_period: str = "y",
26 safe_search: bool = True,
27 search_language: str = "English",
28 api_key: Optional[str] = None,
29 llm: Optional[BaseLLM] = None,
30 include_full_content: bool = True,
31 max_filtered_results: Optional[int] = None,
32 search_depth: str = "basic",
33 include_domains: Optional[List[str]] = None,
34 exclude_domains: Optional[List[str]] = None,
35 settings_snapshot: Optional[Dict[str, Any]] = None,
36 **kwargs,
37 ):
38 """
39 Initialize the Tavily search engine.
41 Args:
42 max_results: Maximum number of search results
43 region: Region code for search results (not used by Tavily currently)
44 time_period: Time period for search results (not used by Tavily currently)
45 safe_search: Whether to enable safe search (not used by Tavily currently)
46 search_language: Language for search results (not used by Tavily currently)
47 api_key: Tavily API key (can also be set in TAVILY_API_KEY env)
48 llm: Language model for relevance filtering
49 include_full_content: Whether to include full webpage content in results
50 max_filtered_results: Maximum number of results to keep after filtering
51 search_depth: "basic" or "advanced" - controls search quality vs speed
52 include_domains: List of domains to include in search
53 exclude_domains: List of domains to exclude from search
54 settings_snapshot: Settings snapshot for thread context
55 **kwargs: Additional parameters (ignored but accepted for compatibility)
56 """
57 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
58 super().__init__(
59 llm=llm,
60 max_filtered_results=max_filtered_results,
61 max_results=max_results,
62 )
63 self.include_full_content = include_full_content
64 self.search_depth = search_depth
65 self.include_domains = include_domains or []
66 self.exclude_domains = exclude_domains or []
68 # Get API key - check params, database, or env vars
69 from ...config.search_config import get_setting_from_snapshot
71 tavily_api_key = api_key
72 if not tavily_api_key:
73 tavily_api_key = get_setting_from_snapshot(
74 "search.engine.web.tavily.api_key",
75 settings_snapshot=settings_snapshot,
76 )
78 if not tavily_api_key:
79 raise ValueError(
80 "Tavily API key not found. Please provide api_key parameter, "
81 "set it in the UI settings, or set TAVILY_API_KEY environment variable."
82 )
84 self.api_key = tavily_api_key
85 self.base_url = "https://api.tavily.com"
87 # If full content is requested, initialize FullSearchResults
88 if include_full_content:
89 # Import FullSearchResults only if needed
90 try:
91 from .full_search import FullSearchResults
93 # Create a simple wrapper for Tavily API calls
94 class TavilyWrapper:
95 def __init__(self, parent):
96 self.parent = parent
98 def run(self, query):
99 return self.parent._get_previews(query)
101 self.full_search = FullSearchResults(
102 llm=llm,
103 web_search=TavilyWrapper(self),
104 language=search_language,
105 max_results=max_results,
106 region=region,
107 time=time_period,
108 safesearch="moderate" if safe_search else "off",
109 )
110 except ImportError:
111 logger.warning(
112 "Warning: FullSearchResults not available. Full content retrieval disabled."
113 )
114 self.include_full_content = False
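
# --- Illustrative usage sketch (editorial example, not part of the measured file) ---
# Shows one way to construct the engine directly; the api_key value and domain
# list below are hypothetical, and in-app the key is normally resolved from the
# settings snapshot or the TAVILY_API_KEY environment variable.
from local_deep_research.web_search_engines.engines.search_engine_tavily import (
    TavilySearchEngine,
)

engine = TavilySearchEngine(
    api_key="tvly-...",             # hypothetical key
    max_results=5,
    search_depth="advanced",        # "basic" (default) favors speed over quality
    include_domains=["example.org"],
    include_full_content=False,     # snippets only; skips FullSearchResults setup
)
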
116 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
117 """
118 Get preview information from Tavily Search.
120 Args:
121 query: The search query
123 Returns:
124 List of preview dictionaries
125 """
126 logger.info("Getting search results from Tavily")
128 try:
129 # Prepare the request payload
130 payload = {
131 "api_key": self.api_key,
132 "query": query[:400], # Limit query length
133 "search_depth": self.search_depth,
134 "max_results": min(
135 20, self.max_results
136 ), # Tavily has a max limit
137 "include_answer": False, # We don't need the AI answer
138 "include_images": False, # We don't need images
139 "include_raw_content": self.include_full_content, # Get content if requested
140 }
142 # Add domain filters if specified
143 if self.include_domains:
144 payload["include_domains"] = self.include_domains
145 if self.exclude_domains:
146 payload["exclude_domains"] = self.exclude_domains
148 # Apply rate limiting before request
149 self._last_wait_time = self.rate_tracker.apply_rate_limit(
150 self.engine_type
151 )
153 # Make the API request
154 response = safe_post(
155 f"{self.base_url}/search",
156 json=payload,
157 headers={"Content-Type": "application/json"},
158 timeout=30,
159 )
161 # Check for errors
162 if response.status_code == 429:
163 raise RateLimitError(
164 f"Tavily rate limit hit: {response.status_code} - {response.text}"
165 )
167 response.raise_for_status()
169 # Parse the response
170 data = response.json()
171 results = data.get("results", [])
173 # Format results as previews
174 previews = []
175 for i, result in enumerate(results):
176 preview = {
177 "id": result.get("url", str(i)), # Use URL as ID
178 "title": result.get("title", ""),
179 "link": result.get("url", ""),
180 "snippet": result.get(
181 "content", ""
182 ), # Tavily calls it "content"
183 "displayed_link": result.get("url", ""),
184 "position": i,
185 }
187 # Store full Tavily result for later
188 preview["_full_result"] = result
190 previews.append(preview)
192 # Store the previews for potential full content retrieval
193 self._search_results = previews
195 return previews
197 except RateLimitError:
198 raise # Re-raise rate limit errors
199 except requests.exceptions.RequestException as e:
200 error_msg = str(e)
201 logger.exception("Error getting Tavily results")
203 # Check for rate limit patterns in error message
204 if any(  204 ↛ 215 (line 204 didn't jump to line 215 because the condition on line 204 was always true)
205 pattern in error_msg.lower()
206 for pattern in [
207 "429",
208 "rate limit",
209 "quota",
210 "too many requests",
211 ]
212 ):
213 raise RateLimitError(f"Tavily rate limit hit: {error_msg}")
215 return []
216 except Exception:
217 logger.exception("Unexpected error getting Tavily results")
218 return []
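
# --- Illustrative data shapes (editorial example, not part of the measured file) ---
# The payload mirrors what _get_previews builds above, and the preview dict shows
# the per-result mapping it returns; all concrete values here are made up.
payload = {
    "api_key": "tvly-...",          # hypothetical key
    "query": "example query",       # truncated to 400 characters in the real call
    "search_depth": "basic",
    "max_results": 10,              # min(20, max_results); Tavily caps at 20
    "include_answer": False,
    "include_images": False,
    "include_raw_content": True,
}

preview = {
    "id": "https://example.org/page",           # URL doubles as the ID
    "title": "Example result",
    "link": "https://example.org/page",
    "snippet": "Tavily's 'content' field",
    "displayed_link": "https://example.org/page",
    "position": 0,
    "_full_result": {"url": "https://example.org/page", "content": "..."},
}
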
220 def _get_full_content(
221 self, relevant_items: List[Dict[str, Any]]
222 ) -> List[Dict[str, Any]]:
223 """
224 Get full content for the relevant search results.
225 If include_full_content is True and raw content was retrieved,
226 includes it in the results.
228 Args:
229 relevant_items: List of relevant preview dictionaries
231 Returns:
232 List of result dictionaries with full content if available
233 """
234 # Check if we should get full content
235 if (  235 ↛ 239 (line 235 didn't jump to line 239 because the condition on line 235 was never true)
236 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
237 and search_config.SEARCH_SNIPPETS_ONLY
238 ):
239 logger.info("Snippet-only mode, skipping full content retrieval")
241 # Return the relevant items with their full Tavily information
242 results = []
243 for item in relevant_items:
244 # Use the full result if available, otherwise use the preview
245 if "_full_result" in item:
246 result = item["_full_result"]
247 # Remove temporary field
248 if "_full_result" in result:
249 del result["_full_result"]
250 else:
251 result = item
253 results.append(result)
255 return results
257 # If full content retrieval is enabled
258 if self.include_full_content and hasattr(self, "full_search"):  258 ↛ 259 (line 258 didn't jump to line 259 because the condition on line 258 was never true)
259 logger.info("Retrieving full webpage content")
261 try:
262 # Use FullSearchResults to get full content
263 results_with_content = self.full_search._get_full_content(
264 relevant_items
265 )
267 return results_with_content
269 except Exception:
270 logger.exception("Error retrieving full content")
271 # Fall back to returning the items without full content
273 # Return items with their full Tavily information
274 results = []
275 for item in relevant_items:
276 # Use the full result if available, otherwise use the preview
277 if "_full_result" in item: 277 ↛ 290line 277 didn't jump to line 290 because the condition on line 277 was always true
278 result = item["_full_result"].copy()
280 # If Tavily provided raw_content, include it
281 if "raw_content" in result and self.include_full_content: 281 ↛ 282line 281 didn't jump to line 282 because the condition on line 281 was never true
282 result["content"] = result.get(
283 "raw_content", result.get("content", "")
284 )
286 # Remove temporary field
287 if "_full_result" in result: 287 ↛ 288line 287 didn't jump to line 288 because the condition on line 287 was never true
288 del result["_full_result"]
289 else:
290 result = item.copy()
291 if "_full_result" in result:
292 del result["_full_result"]
294 results.append(result)
296 return results
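
# --- Illustrative post-processing sketch (editorial example) ---
# Condenses the branch above: when include_full_content is set and Tavily returned
# raw_content, the longer text replaces the snippet-level "content" field; the
# input dict here is made up.
item = {
    "_full_result": {
        "url": "https://example.org/page",
        "content": "short snippet",
        "raw_content": "full page text ...",
    }
}
result = item["_full_result"].copy()
result["content"] = result.get("raw_content", result.get("content", ""))
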
298 def run(
299 self, query: str, research_context: Dict[str, Any] | None = None
300 ) -> List[Dict[str, Any]]:
301 """
302 Execute a search using Tavily with the two-phase approach.
304 Args:
305 query: The search query
306 research_context: Context from previous research to use.
308 Returns:
309 List of search results
310 """
311 logger.info("---Execute a search using Tavily---")
313 # Use the implementation from the parent class which handles all phases
314 results = super().run(query, research_context=research_context)
316 # Clean up
317 if hasattr(self, "_search_results"):  317 ↛ 320 (line 317 didn't jump to line 320 because the condition on line 317 was always true)
318 del self._search_results
320 return results
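
# --- Illustrative end-to-end call (editorial example, not part of the measured file) ---
# Assumes the `engine` instance sketched after __init__ above; field names follow
# the Tavily result mapping used in this module.
results = engine.run("open source deep research tools")
for r in results:
    print(r.get("title"), r.get("link") or r.get("url"))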