Coverage for src/local_deep_research/web_search_engines/engines/search_engine_serpapi.py: 71%
79 statements
from loguru import logger
from typing import Any, Dict, List, Optional

from langchain_community.utilities import SerpAPIWrapper
from langchain_core.language_models import BaseLLM

from ...config import search_config
from ..search_engine_base import BaseSearchEngine


class SerpAPISearchEngine(BaseSearchEngine):
    """Google search engine implementation using SerpAPI with a two-phase approach."""

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search via Google)
    is_generic = True

    def __init__(
        self,
        max_results: int = 10,
        region: str = "us",
        time_period: str = "y",
        safe_search: bool = True,
        search_language: str = "English",
        api_key: Optional[str] = None,
        language_code_mapping: Optional[Dict[str, str]] = None,
        llm: Optional[BaseLLM] = None,
        include_full_content: bool = False,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the SerpAPI search engine.

        Args:
            max_results: Maximum number of search results
            region: Region code for search results
            time_period: Time period for search results
            safe_search: Whether to enable safe search
            search_language: Language for search results
            api_key: SerpAPI API key (can also be set via the SERP_API_KEY
                environment variable)
            language_code_mapping: Mapping from language names to codes
            llm: Language model for relevance filtering
            include_full_content: Whether to include full webpage content in results
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_full_content = include_full_content

        # Set up language code mapping
        if language_code_mapping is None:
            language_code_mapping = {
                "english": "en",
                "spanish": "es",
                "chinese": "zh",
                "hindi": "hi",
                "french": "fr",
                "arabic": "ar",
                "bengali": "bn",
                "portuguese": "pt",
                "russian": "ru",
            }

        # Get API key - check params, env vars, or database
        from ...config.search_config import get_setting_from_snapshot

        serpapi_api_key = api_key
        if not serpapi_api_key:
            serpapi_api_key = get_setting_from_snapshot(
                "search.engine.web.serpapi.api_key",
                settings_snapshot=settings_snapshot,
            )

        if not serpapi_api_key:
            raise ValueError(
                "SerpAPI key not found. Please provide the api_key parameter, "
                "set the SERP_API_KEY environment variable, or set it in the UI settings."
            )

        # Get language code
        language_code = language_code_mapping.get(search_language.lower(), "en")
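
        # The params below are standard Google URL parameters passed through
        # by SerpAPI: "hl" is the interface language, "gl" the country for
        # results, "safe" toggles SafeSearch ("active"/"off"), "tbs" with
        # "qdr:<x>" restricts results to a recent window (h=hour, d=day,
        # w=week, m=month, y=year), and "num" caps results per page.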
        # Initialize SerpAPI wrapper
        self.engine = SerpAPIWrapper(
            serpapi_api_key=serpapi_api_key,
            params={
                "engine": "google",
                "hl": language_code,
                "gl": region,
                "safe": "active" if safe_search else "off",
                "tbs": f"qdr:{time_period}",
                "num": max_results,
            },
        )

        # If full content is requested, initialize FullSearchResults
        if include_full_content:
            # Import FullSearchResults only if needed
            try:
                from .full_search import FullSearchResults

                self.full_search = FullSearchResults(
                    llm=llm,
                    web_search=self.engine,
                    language=search_language,
                    max_results=max_results,
                    region=region,
                    time=time_period,
                    safesearch="Moderate" if safe_search else "Off",
                )
            except ImportError:
                logger.warning(
                    "FullSearchResults not available. Full content retrieval disabled."
                )
                self.include_full_content = False
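
    # Two-phase flow, orchestrated by BaseSearchEngine.run() below:
    # phase one (_get_previews) fetches lightweight snippet previews from
    # SerpAPI; the base class then filters them for relevance (using the
    # LLM when one is configured) before phase two (_get_full_content)
    # optionally upgrades the surviving items to full results.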
    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information from SerpAPI.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting search results from SerpAPI")

        try:
            # Get search results from SerpAPI
            organic_results = self.engine.results(query).get(
                "organic_results", []
            )

            # Format results as previews
            previews = []
            for result in organic_results:
                preview = {
                    "id": result.get(
                        "position", len(previews)
                    ),  # Use position as ID
                    "title": result.get("title", ""),
                    "link": result.get("link", ""),
                    "snippet": result.get("snippet", ""),
                    "displayed_link": result.get("displayed_link", ""),
                    "position": result.get("position"),
                }

                # Store full SerpAPI result for later
                preview["_full_result"] = result

                previews.append(preview)

            # Store the previews for potential full content retrieval
            self._search_results = previews

            return previews

        except Exception:
            logger.exception("Error getting SerpAPI results")
            return []
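
    # Shape of a single preview dict built in _get_previews (illustrative
    # values only):
    #   {"id": 1, "title": "…", "link": "https://…", "snippet": "…",
    #    "displayed_link": "…", "position": 1, "_full_result": {…}}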
    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.
        If include_full_content is True and FullSearchResults is available,
        retrieves full webpage content for the results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content if requested
        """
        # Check if we should get full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")

            # Return the relevant items with their full SerpAPI information
            results = []
            for item in relevant_items:
                # Use the full result if available, otherwise use the preview
                if "_full_result" in item:
                    result = item["_full_result"]
                    # Remove temporary field
                    if "_full_result" in result:
                        del result["_full_result"]
                else:
                    result = item

                results.append(result)

            return results

        # If full content retrieval is enabled
        if self.include_full_content and hasattr(self, "full_search"):
            logger.info("Retrieving full webpage content")

            try:
                # Use FullSearchResults to get full content
                # This is a simplified approach - in a real implementation,
                # you would need to fetch and process the URLs
                results_with_content = self.full_search._get_full_content(
                    relevant_items
                )

                return results_with_content

            except Exception:
                logger.exception("Error retrieving full content")
                # Fall back to returning the items without full content

        # Return items with their full SerpAPI information
        results = []
        for item in relevant_items:
            # Use the full result if available, otherwise use the preview
            if "_full_result" in item:
                result = item["_full_result"].copy()
                # Remove temporary field
                if "_full_result" in result:
                    del result["_full_result"]
            else:
                result = item.copy()
                if "_full_result" in result:
                    del result["_full_result"]

            results.append(result)

        return results
    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using SerpAPI with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Executing a search using SerpAPI (Google)---")

        # Use the implementation from the parent class, which handles all phases
        results = super().run(query, research_context=research_context)

        # Clean up
        if hasattr(self, "_search_results"):
            del self._search_results

        return results
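

# Minimal usage sketch (illustrative, not part of the module; the API key
# and query are placeholders, and the relative imports mean this class is
# normally used from within the package rather than run as a script):
#
#     engine = SerpAPISearchEngine(
#         max_results=5,
#         api_key="YOUR_SERPAPI_KEY",  # or rely on SERP_API_KEY / UI settings
#     )
#     for item in engine.run("open problems in graph theory"):
#         print(item.get("title"), "->", item.get("link"))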