Coverage for src / local_deep_research / web_search_engines / engines / search_engine_exa.py: 96%

82 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from typing import Any, Dict, List, Optional 

2from urllib.parse import urlparse 

3 

4import requests 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...security.safe_requests import safe_post 

9from ..rate_limiting import RateLimitError 

10from ..search_engine_base import BaseSearchEngine 

11 

12 

class ExaSearchEngine(BaseSearchEngine):
    """Exa.ai search engine implementation with neural search capabilities."""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        region: str = "US",
        time_period: str = "y",
        safe_search: bool = True,
        search_language: str = "English",
        api_key: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = True,
        max_filtered_results: Optional[int] = None,
        search_type: str = "auto",
        include_domains: Optional[List[str]] = None,
        exclude_domains: Optional[List[str]] = None,
        start_published_date: Optional[str] = None,
        end_published_date: Optional[str] = None,
        category: Optional[str] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Exa search engine.

        Args:
            max_results: Maximum number of search results
            region: Region code for search results (not used by Exa currently)
            time_period: Time period for search results (not used by Exa currently)
            safe_search: Whether to enable safe search (not used by Exa currently)
            search_language: Language for search results (not used by Exa currently)
            api_key: Exa API key (can also be set in UI settings)
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            search_type: "auto" (default), "neural", "fast", or "deep"
            include_domains: List of domains to include in search
            exclude_domains: List of domains to exclude from search
            start_published_date: Only links published after this date (YYYY-MM-DD)
            end_published_date: Only links published before this date (YYYY-MM-DD)
            category: Data category to focus on (e.g. 'company', 'news', 'research paper')
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
        )
        self.search_type = search_type
        self.include_domains = include_domains or []
        self.exclude_domains = exclude_domains or []
        self.start_published_date = start_published_date
        self.end_published_date = end_published_date
        self.category = category

        # Resolve API key using base class method
        self.api_key = self._resolve_api_key(
            api_key,
            "search.engine.web.exa.api_key",
            engine_name="Exa",
            settings_snapshot=settings_snapshot,
        )
        self.base_url = "https://api.exa.ai"

        # Exa handles full content natively via its API (payload["contents"]),
        # so _init_full_search() is intentionally not called here.

    def _build_payload(self, query: str) -> Dict[str, Any]:
        """
        Assemble the JSON payload for the Exa /search endpoint.

        Args:
            query: The search query (truncated to Exa's length limit)

        Returns:
            Dict ready to be sent as the request body
        """
        payload: Dict[str, Any] = {
            "query": query[:400],  # Limit query length
            "type": self.search_type,
            "numResults": min(100, self.max_results),  # Exa supports up to 100
        }

        # Add optional parameters if specified
        if self.include_domains:
            payload["includeDomains"] = self.include_domains
        if self.exclude_domains:
            payload["excludeDomains"] = self.exclude_domains
        if self.start_published_date:
            payload["startPublishedDate"] = self.start_published_date
        if self.end_published_date:
            payload["endPublishedDate"] = self.end_published_date
        if self.category:
            payload["category"] = self.category

        # Request text content if full content is enabled
        if self.include_full_content:
            payload["contents"] = {
                "text": {"maxCharacters": 10000},
                "highlights": {"maxCharacters": 500, "query": query},
                "summary": {"query": query},
            }
        return payload

    @staticmethod
    def _format_preview(
        result: Dict[str, Any], position: int
    ) -> Dict[str, Any]:
        """
        Convert one raw Exa result into the preview dict shape used downstream.

        Args:
            result: A single result object from the Exa API response
            position: Zero-based rank of the result in the response

        Returns:
            Preview dict with id/title/link/snippet plus optional metadata
        """
        # Extract text content if available
        text_content = result.get("text", "")

        # Use highlights or summary as snippet if available, otherwise use text
        snippet = ""
        highlights = result.get("highlights")
        if highlights and isinstance(highlights, list):
            # Join highlights with ellipsis; coerce entries to str so one
            # non-string highlight cannot raise and drop the whole batch
            snippet = " ... ".join(str(h) for h in highlights[:3])
        elif "summary" in result:
            # `or ""` guards against an explicit null summary in the payload
            snippet = result.get("summary") or ""
        elif text_content:
            # Use first 500 chars of text as snippet
            snippet = text_content[:500]

        # Extract display link safely using urlparse
        link = result.get("url", "")
        display_link = ""
        if link:
            try:
                display_link = urlparse(link).netloc or ""
            except Exception:
                logger.debug(f"Failed to parse URL for display: {link[:50]}")

        preview = {
            "id": result.get("id", result.get("url", str(position))),
            "title": result.get("title", ""),
            "link": link,
            "snippet": snippet,
            "displayed_link": display_link,
            "position": position,
        }

        # Add optional fields if available
        if "publishedDate" in result:
            preview["published_date"] = result["publishedDate"]
        if "author" in result:
            preview["author"] = result["author"]
        if "score" in result:
            preview["score"] = result["score"]

        # Store full Exa result for later
        preview["_full_result"] = result
        return preview

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from Exa Search.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on request failure)
        """
        logger.info("Getting search results from Exa")

        try:
            payload = self._build_payload(query)

            # Apply rate limiting before request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Make the API request
            response = safe_post(
                f"{self.base_url}/search",
                json=payload,
                headers={
                    "Content-Type": "application/json",
                    "x-api-key": self.api_key,
                },
                timeout=30,
            )

            # Check for rate limits before surfacing other HTTP errors
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()

            # Parse the response and format results as previews
            results = response.json().get("results", [])
            previews = [
                self._format_preview(result, i)
                for i, result in enumerate(results)
            ]

            logger.info(f"Exa returned {len(previews)} results")
            return previews

        except RateLimitError:
            raise  # Re-raise rate limit errors
        except requests.exceptions.RequestException as e:
            logger.exception("Error getting Exa results")
            self._raise_if_rate_limit(e)
            return []
        except Exception:
            logger.exception("Unexpected error getting Exa results")
            return []