Coverage for src/local_deep_research/web_search_engines/engines/search_engine

1from typing import Any, Dict, List, Optional

3import requests

4from langchain_core.language_models import BaseLLM

5from ...security.secure_logging import logger

7from ...security.safe_requests import safe_post

8from ..rate_limiting import RateLimitError

9from ..search_engine_base import BaseSearchEngine, Exposure, Sensitivity

12class ExaSearchEngine(BaseSearchEngine):

13 """Exa.ai search engine implementation with neural search capabilities"""

15 # Mark as public search engine

16 is_public = True

17 egress_sensitivity = Sensitivity.NON_SENSITIVE

18 egress_exposure = Exposure.EXPOSING

19 # Mark as generic search engine (general web search)

20 is_generic = True

22 def __init__(

23 self,

24 max_results: int = 10,

25 region: str = "US",

26 time_period: str = "y",

27 safe_search: bool = True,

28 search_language: str = "English",

29 api_key: Optional[str] = None,

30 llm: Optional[BaseLLM] = None,

31 include_full_content: bool = True,

32 max_filtered_results: Optional[int] = None,

33 search_type: str = "auto",

34 include_domains: Optional[List[str]] = None,

35 exclude_domains: Optional[List[str]] = None,

36 start_published_date: Optional[str] = None,

37 end_published_date: Optional[str] = None,

38 category: Optional[str] = None,

39 settings_snapshot: Optional[Dict[str, Any]] = None,

40 **kwargs,

41 ):

42 """

43 Initialize the Exa search engine.

45 Args:

46 max_results: Maximum number of search results

47 region: Region code for search results (not used by Exa currently)

48 time_period: Time period for search results (not used by Exa currently)

49 safe_search: Whether to enable safe search (not used by Exa currently)

50 search_language: Language for search results (not used by Exa currently)

51 api_key: Exa API key (can also be set in UI settings)

52 llm: Language model for relevance filtering

53 include_full_content: Whether to include full webpage content in results

54 max_filtered_results: Maximum number of results to keep after filtering

55 search_type: "auto" (default), "neural", "fast", or "deep"

56 include_domains: List of domains to include in search

57 exclude_domains: List of domains to exclude from search

58 start_published_date: Only links published after this date (YYYY-MM-DD)

59 end_published_date: Only links published before this date (YYYY-MM-DD)

60 category: Data category to focus on (e.g. 'company', 'news', 'research paper')

61 settings_snapshot: Settings snapshot for thread context

62 **kwargs: Additional parameters (ignored but accepted for compatibility)

63 """

64 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results

65 super().__init__(

66 llm=llm,

67 max_filtered_results=max_filtered_results,

68 max_results=max_results,

69 include_full_content=include_full_content,

70 settings_snapshot=settings_snapshot,

71 )

72 self.search_type = search_type

73 self.include_domains = include_domains or []

74 self.exclude_domains = exclude_domains or []

75 self.start_published_date = start_published_date

76 self.end_published_date = end_published_date

77 self.category = category

79 # Resolve API key using base class method

80 self.api_key = self._resolve_api_key(

81 api_key,

82 "search.engine.web.exa.api_key",

83 engine_name="Exa",

84 settings_snapshot=settings_snapshot,

85 )

86 self.base_url = "https://api.exa.ai"

88 # Exa handles full content natively via its API (payload["contents"]),

89 # so _init_full_search() is intentionally not called here.

91 def _get_previews(self, query: str) -> List[Dict[str, Any]]:

92 """

93 Get preview information from Exa Search.

95 Args:

96 query: The search query

98 Returns:

99 List of preview dictionaries

100 """

101 logger.info("Getting search results from Exa")

102

103 try:

104 # Prepare the request payload

105 payload = {

106 "query": query[:400], # Limit query length

107 "type": self.search_type,

108 "numResults": min(

109 100, self.max_results

110 ), # Exa supports up to 100

111 }

112

113 # Add optional parameters if specified

114 if self.include_domains:

115 payload["includeDomains"] = self.include_domains

116 if self.exclude_domains:

117 payload["excludeDomains"] = self.exclude_domains

118 if self.start_published_date:

119 payload["startPublishedDate"] = self.start_published_date

120 if self.end_published_date:

121 payload["endPublishedDate"] = self.end_published_date

122 if self.category:

123 payload["category"] = self.category

124

125 # Request text content if full content is enabled

126 if self.include_full_content:

127 payload["contents"] = {

128 "text": {"maxCharacters": 10000},

129 "highlights": {"maxCharacters": 500, "query": query},

130 "summary": {"query": query},

131 }

132

133 # Apply rate limiting before request

134 self._last_wait_time = self.rate_tracker.apply_rate_limit(

135 self.engine_type

136 )

137

138 # Make the API request

139 response = safe_post(

140 f"{self.base_url}/search",

141 json=payload,

142 headers={

143 "Content-Type": "application/json",

144 "x-api-key": self.api_key,

145 },

146 timeout=30,

147 )

148

149 # Check for rate limits

150 self._raise_if_rate_limit(response.status_code)

151

152 response.raise_for_status()

153

154 # Parse the response

155 data = response.json()

156 results = data.get("results", [])

157

158 # Format results as previews

159 previews = []

160 for i, result in enumerate(results):

161 # Extract text content if available

162 text_content = result.get("text", "")

163

164 # Use highlights or summary as snippet if available, otherwise use text

165 snippet = ""

166 highlights = result.get("highlights")

167 if highlights and isinstance(highlights, list):

168 # Join highlights with ellipsis

169 snippet = " ... ".join(highlights[:3])

170 elif "summary" in result:

171 snippet = result.get("summary", "")

172 elif text_content: 172 ↛ 177line 172 didn't jump to line 177 because the condition on line 172 was always true

173 # Use first 500 chars of text as snippet

174 snippet = text_content[:500]

175

176 # Extract display link

177 link = result.get("url", "")

178 display_link = self._extract_display_link(link)

179

180 preview = {

181 "id": result.get("id", result.get("url", str(i))),

182 "title": result.get("title", ""),

183 "link": link,

184 "snippet": snippet,

185 "displayed_link": display_link,

186 "position": i,

187 }

188

189 # Add optional fields if available

190 if "publishedDate" in result:

191 preview["published_date"] = result["publishedDate"]

192 if "author" in result:

193 preview["author"] = result["author"]

194 if "score" in result:

195 preview["score"] = result["score"]

196

197 # Store full Exa result for later

198 preview["_full_result"] = result

199

200 previews.append(preview)

201

202 logger.info(f"Exa returned {len(previews)} results")

203 return previews

204

205 except RateLimitError:

206 raise # Re-raise rate limit errors

207 except requests.exceptions.RequestException as e:

208 safe_msg = self._scrub_error(e)

209 logger.warning(f"Error getting Exa results: {safe_msg}")

210 self._raise_if_rate_limit(e)

211 return []

212 except Exception as e:

213 safe_msg = self._scrub_error(e)

214 logger.warning(f"Unexpected error getting Exa results: {safe_msg}")

215 return []

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_exa.py: 99%

79 statements