Coverage report for src/local_deep_research/web_search_engines/engines/search_engine_exa.py: 96% of 82 statements covered (coverage.py v7.13.5, generated 2026-04-14 23:55 +0000).
1from typing import Any, Dict, List, Optional
2from urllib.parse import urlparse
4import requests
5from langchain_core.language_models import BaseLLM
6from loguru import logger
8from ...security.safe_requests import safe_post
9from ..rate_limiting import RateLimitError
10from ..search_engine_base import BaseSearchEngine
class ExaSearchEngine(BaseSearchEngine):
    """Exa.ai search engine implementation with neural search capabilities"""

    # Mark as public search engine (available without private credentials flag)
    is_public = True
    # Mark as generic search engine (general web search, not domain-specific)
    is_generic = True
21 def __init__(
22 self,
23 max_results: int = 10,
24 region: str = "US",
25 time_period: str = "y",
26 safe_search: bool = True,
27 search_language: str = "English",
28 api_key: Optional[str] = None,
29 llm: Optional[BaseLLM] = None,
30 include_full_content: bool = True,
31 max_filtered_results: Optional[int] = None,
32 search_type: str = "auto",
33 include_domains: Optional[List[str]] = None,
34 exclude_domains: Optional[List[str]] = None,
35 start_published_date: Optional[str] = None,
36 end_published_date: Optional[str] = None,
37 category: Optional[str] = None,
38 settings_snapshot: Optional[Dict[str, Any]] = None,
39 **kwargs,
40 ):
41 """
42 Initialize the Exa search engine.
44 Args:
45 max_results: Maximum number of search results
46 region: Region code for search results (not used by Exa currently)
47 time_period: Time period for search results (not used by Exa currently)
48 safe_search: Whether to enable safe search (not used by Exa currently)
49 search_language: Language for search results (not used by Exa currently)
50 api_key: Exa API key (can also be set in UI settings)
51 llm: Language model for relevance filtering
52 include_full_content: Whether to include full webpage content in results
53 max_filtered_results: Maximum number of results to keep after filtering
54 search_type: "auto" (default), "neural", "fast", or "deep"
55 include_domains: List of domains to include in search
56 exclude_domains: List of domains to exclude from search
57 start_published_date: Only links published after this date (YYYY-MM-DD)
58 end_published_date: Only links published before this date (YYYY-MM-DD)
59 category: Data category to focus on (e.g. 'company', 'news', 'research paper')
60 settings_snapshot: Settings snapshot for thread context
61 **kwargs: Additional parameters (ignored but accepted for compatibility)
62 """
63 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
64 super().__init__(
65 llm=llm,
66 max_filtered_results=max_filtered_results,
67 max_results=max_results,
68 include_full_content=include_full_content,
69 settings_snapshot=settings_snapshot,
70 )
71 self.search_type = search_type
72 self.include_domains = include_domains or []
73 self.exclude_domains = exclude_domains or []
74 self.start_published_date = start_published_date
75 self.end_published_date = end_published_date
76 self.category = category
78 # Resolve API key using base class method
79 self.api_key = self._resolve_api_key(
80 api_key,
81 "search.engine.web.exa.api_key",
82 engine_name="Exa",
83 settings_snapshot=settings_snapshot,
84 )
85 self.base_url = "https://api.exa.ai"
87 # Exa handles full content natively via its API (payload["contents"]),
88 # so _init_full_search() is intentionally not called here.
90 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
91 """
92 Get preview information from Exa Search.
94 Args:
95 query: The search query
97 Returns:
98 List of preview dictionaries
99 """
100 logger.info("Getting search results from Exa")
102 try:
103 # Prepare the request payload
104 payload = {
105 "query": query[:400], # Limit query length
106 "type": self.search_type,
107 "numResults": min(
108 100, self.max_results
109 ), # Exa supports up to 100
110 }
112 # Add optional parameters if specified
113 if self.include_domains:
114 payload["includeDomains"] = self.include_domains
115 if self.exclude_domains:
116 payload["excludeDomains"] = self.exclude_domains
117 if self.start_published_date:
118 payload["startPublishedDate"] = self.start_published_date
119 if self.end_published_date:
120 payload["endPublishedDate"] = self.end_published_date
121 if self.category:
122 payload["category"] = self.category
124 # Request text content if full content is enabled
125 if self.include_full_content:
126 payload["contents"] = {
127 "text": {"maxCharacters": 10000},
128 "highlights": {"maxCharacters": 500, "query": query},
129 "summary": {"query": query},
130 }
132 # Apply rate limiting before request
133 self._last_wait_time = self.rate_tracker.apply_rate_limit(
134 self.engine_type
135 )
137 # Make the API request
138 response = safe_post(
139 f"{self.base_url}/search",
140 json=payload,
141 headers={
142 "Content-Type": "application/json",
143 "x-api-key": self.api_key,
144 },
145 timeout=30,
146 )
148 # Check for rate limits
149 self._raise_if_rate_limit(response.status_code)
151 response.raise_for_status()
153 # Parse the response
154 data = response.json()
155 results = data.get("results", [])
157 # Format results as previews
158 previews = []
159 for i, result in enumerate(results):
160 # Extract text content if available
161 text_content = result.get("text", "")
163 # Use highlights or summary as snippet if available, otherwise use text
164 snippet = ""
165 highlights = result.get("highlights")
166 if highlights and isinstance(highlights, list):
167 # Join highlights with ellipsis
168 snippet = " ... ".join(highlights[:3])
169 elif "summary" in result:
170 snippet = result.get("summary", "")
171 elif text_content: 171 ↛ 176line 171 didn't jump to line 176 because the condition on line 171 was always true
172 # Use first 500 chars of text as snippet
173 snippet = text_content[:500]
175 # Extract display link safely using urlparse
176 link = result.get("url", "")
177 display_link = ""
178 if link: 178 ↛ 187line 178 didn't jump to line 187 because the condition on line 178 was always true
179 try:
180 parsed_url = urlparse(link)
181 display_link = parsed_url.netloc or ""
182 except Exception:
183 logger.debug(
184 f"Failed to parse URL for display: {link[:50]}"
185 )
187 preview = {
188 "id": result.get("id", result.get("url", str(i))),
189 "title": result.get("title", ""),
190 "link": link,
191 "snippet": snippet,
192 "displayed_link": display_link,
193 "position": i,
194 }
196 # Add optional fields if available
197 if "publishedDate" in result:
198 preview["published_date"] = result["publishedDate"]
199 if "author" in result:
200 preview["author"] = result["author"]
201 if "score" in result:
202 preview["score"] = result["score"]
204 # Store full Exa result for later
205 preview["_full_result"] = result
207 previews.append(preview)
209 logger.info(f"Exa returned {len(previews)} results")
210 return previews
212 except RateLimitError:
213 raise # Re-raise rate limit errors
214 except requests.exceptions.RequestException as e:
215 logger.exception("Error getting Exa results")
216 self._raise_if_rate_limit(e)
217 return []
218 except Exception:
219 logger.exception("Unexpected error getting Exa results")
220 return []