Coverage for src/local_deep_research/web_search_engines/engines/search_engine_tavily.py

1from typing import Any, Dict, List, Optional

3import requests

4from langchain_core.language_models import BaseLLM

5from loguru import logger

7from ...config import search_config

8from ...security.safe_requests import safe_post

9from ..rate_limiting import RateLimitError

10from ..search_engine_base import BaseSearchEngine

class TavilySearchEngine(BaseSearchEngine):
    """Tavily search engine implementation with two-phase approach.

    Phase one (``_get_previews``) queries the Tavily ``/search`` endpoint and
    converts each hit into a preview dictionary. Phase two
    (``_get_full_content``) turns the selected previews into final results.
    ``run`` delegates the sequencing of the phases to the parent class.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        region: str = "US",
        time_period: str = "y",
        safe_search: bool = True,
        search_language: str = "English",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = True,
        max_filtered_results: Optional[int] = None,
        search_depth: str = "basic",
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Tavily search engine.

        Args:
            max_results: Maximum number of search results
            region: Region code; not sent to Tavily, only forwarded to
                FullSearchResults when full content is enabled
            time_period: Time period; not sent to Tavily, only forwarded to
                FullSearchResults when full content is enabled
            safe_search: Safe-search flag; not sent to Tavily, forwarded to
                FullSearchResults as "moderate" (True) or "off" (False)
            search_language: Language; not sent to Tavily, only forwarded to
                FullSearchResults when full content is enabled
            api_key: Tavily API key. When empty, the
                "search.engine.web.tavily.api_key" setting is looked up
                through ``get_setting_from_snapshot``
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            search_depth: "basic" or "advanced" - controls search quality vs speed
            include_domains: List of domains to include in search
            exclude_domains: List of domains to exclude from search
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)

        Raises:
            ValueError: If no API key is given and none is found in settings.
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content
        self.search_depth = search_depth
        self.include_domains = include_domains or []
        self.exclude_domains = exclude_domains or []

        # Get API key - explicit parameter first, then the settings lookup
        from ...config.search_config import get_setting_from_snapshot

        tavily_api_key = api_key
        if not tavily_api_key:
            tavily_api_key = get_setting_from_snapshot(
                "search.engine.web.tavily.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not tavily_api_key:
            raise ValueError(
                "Tavily API key not found. Please provide api_key parameter, "
                "set it in the UI settings, or set TAVILY_API_KEY environment variable."
            )

        self.api_key = tavily_api_key
        self.base_url = "https://api.tavily.com"

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                # Create a simple wrapper for Tavily API calls
                class TavilyWrapper:
                    """Adapter exposing ``run(query)`` on top of ``_get_previews``."""

                    def __init__(self, parent):
                        self.parent = parent

                    def run(self, query):
                        return self.parent._get_previews(query)

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=TavilyWrapper(self),
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="moderate" if safe_search else "off",
                )
            except ImportError:
                logger.warning(
                    "Warning: FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Tavily Search.

        Args:
            query: The search query (only its first 400 characters are sent)

        Returns:
            List of preview dictionaries with the keys "id", "title", "link",
            "snippet", "displayed_link", "position" and "_full_result" (the
            untouched Tavily hit). An empty list is returned when the request
            fails for a reason other than rate limiting.

        Raises:
            RateLimitError: On HTTP 429, or when a request error message
                contains a rate-limit pattern.
        """
        logger.info("Getting search results from Tavily")

        try:
            # Prepare the request payload
            payload = {
                "api_key": self.api_key,
                "query": query[:400],  # Limit query length
                "search_depth": self.search_depth,
                "max_results": min(
                    20, self.max_results
                ),  # Tavily has a max limit
                "include_answer": False,  # We don't need the AI answer
                "include_images": False,  # We don't need images
                "include_raw_content": self.include_full_content,  # Get content if requested
            }

            # Add domain filters if specified
            if self.include_domains:
                payload["include_domains"] = self.include_domains
            if self.exclude_domains:
                payload["exclude_domains"] = self.exclude_domains

            # Apply rate limiting before request
            # NOTE(review): rate_tracker and engine_type are not set in this
            # class; presumably provided by BaseSearchEngine - confirm.
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make the API request
            response = safe_post(
                f"{self.base_url}/search",
                json=payload,
                headers={"Content-Type": "application/json"},
                timeout=30,
            )

            # Check for errors
            if response.status_code == 429:
                raise RateLimitError(
                    f"Tavily rate limit hit: {response.status_code} - {response.text}"
                )

            response.raise_for_status()

            # Parse the response
            data = response.json()
            results = data.get("results", [])

            # Format results as previews
            previews = []
            for i, result in enumerate(results):
                preview = {
                    "id": result.get("url", str(i)),  # Use URL as ID
                    "title": result.get("title", ""),
                    "link": result.get("url", ""),
                    "snippet": result.get(
                        "content", ""
                    ),  # Tavily calls it "content"
                    "displayed_link": result.get("url", ""),
                    "position": i,
                }

                # Store full Tavily result for later
                preview["_full_result"] = result

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            error_msg = str(e)
            logger.exception("Error getting Tavily results")

            # Check for rate limit patterns in error message
            if any(
                pattern in error_msg.lower()
                for pattern in [
                    "429",
                    "rate limit",
                    "quota",
                    "too many requests",
                ]
            ):
                raise RateLimitError(f"Tavily rate limit hit: {error_msg}")

            return []
        except Exception:
            # Deliberate best-effort: any other failure yields no results.
            logger.exception("Unexpected error getting Tavily results")
            return []

    @staticmethod
    def _tavily_result_from_preview(
        item: Dict[str, Any], *, use_raw_content: bool
    ) -> Dict[str, Any]:
        """Return a copy of the final result dict for one preview item.

        The stored Tavily hit ("_full_result") is preferred over the preview
        itself. When ``use_raw_content`` is true and the hit carries a
        non-empty "raw_content", that text replaces "content"; a missing,
        ``None`` or empty "raw_content" leaves the original snippet intact.
        The temporary "_full_result" key never appears in the returned dict.
        """
        if "_full_result" in item:
            result = dict(item["_full_result"])
            raw_content = result.get("raw_content")
            if use_raw_content and raw_content:
                result["content"] = raw_content
        else:
            result = dict(item)

        # Remove temporary field
        result.pop("_full_result", None)
        return result

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        If include_full_content is True and raw content was retrieved,
        includes it in the results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if available.
            Except when FullSearchResults produces the results, the returned
            dictionaries are copies, so the caller's items are not modified.
        """
        # Check if we should get full content
        if getattr(search_config, "SEARCH_SNIPPETS_ONLY", False):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full Tavily information,
            # without substituting raw page content.
            return [
                self._tavily_result_from_preview(item, use_raw_content=False)
                for item in relevant_items
            ]

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                return self.full_search._get_full_content(relevant_items)
            except Exception:
                logger.exception("Error retrieving full content")
                # Fall back to returning the items without full content

        # Return items with their full Tavily information
        return [
            self._tavily_result_from_preview(
                item, use_raw_content=self.include_full_content
            )
            for item in relevant_items
        ]

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using Tavily with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using Tavily---")

        try:
            # Use the implementation from the parent class which handles all phases
            return super().run(query, research_context=research_context)
        finally:
            # Clean up even when the search raises (e.g. RateLimitError), so
            # previews from a failed run do not linger on the instance.
            if hasattr(self, "_search_results"):
                del self._search_results

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_tavily.py: 77%

108 statements