Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikipedia.py: 96%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

"""Wikipedia search engine with a two-phase (preview, then full content) approach."""

from typing import Any, Dict, List, Optional
from urllib.parse import quote

import wikipedia
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ..search_engine_base import BaseSearchEngine


class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with two-phase approach"""

    # Mark as public search engine
    is_public = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        include_content: bool = True,
        sentences: int = 5,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Optional settings snapshot passed to the base engine
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )
        self.include_content = include_content
        self.sentences = sentences

        # Remember the language so article URLs can be built for the right
        # Wikipedia edition (previously links were hard-coded to en.wikipedia.org).
        self.language = language

        # Set the Wikipedia language (module-level global in the `wikipedia` library)
        wikipedia.set_lang(language)

    def _article_url(self, title: str) -> str:
        """
        Build the canonical article URL for the configured language edition.

        Spaces become underscores (Wikipedia convention) and the remainder is
        percent-encoded so titles containing '&', '%', '?', etc. produce valid URLs.

        Args:
            title: Article title

        Returns:
            Fully-qualified article URL
        """
        return (
            f"https://{self.language}.wikipedia.org/wiki/"
            f"{quote(title.replace(' ', '_'))}"
        )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(
                f"Found {len(search_results)} Wikipedia results: {search_results}"
            )

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Generate previews with summaries.
            # NOTE: This loop is intentionally sequential. Do NOT parallelize with
            # ThreadPoolExecutor because:
            # 1. The `wikipedia` PyPI library is not thread-safe — it uses global
            #    mutable state (API_URL, RATE_LIMIT_LAST_CALL) and an unlocked cache.
            #    Concurrent threads would corrupt the library's built-in rate limiting.
            # 2. self._last_wait_time is a shared instance attribute with no lock —
            #    concurrent writes would feed incorrect data to record_outcome().
            # 3. Downstream _filter_for_relevance uses positional indices — random
            #    completion order would cause the LLM to select wrong articles.
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be more precise
                    summary = None
                    try:
                        # Apply rate limiting before summary request
                        self._last_wait_time = (
                            self.rate_tracker.apply_rate_limit(self.engine_type)
                        )

                        summary = wikipedia.summary(
                            title, sentences=self.sentences, auto_suggest=False
                        )
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(
                                f"Disambiguation for '{title}', trying first option: {e.options[0]}"
                            )
                            try:
                                summary = wikipedia.summary(
                                    e.options[0],
                                    sentences=self.sentences,
                                    auto_suggest=False,
                                )
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.exception(
                                    f"Error with disambiguation option: {inner_e}"
                                )
                                continue
                        else:
                            logger.warning(
                                f"Disambiguation with no options for '{title}'"
                            )
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            # Use the configured language edition, not hard-coded "en"
                            "link": self._article_url(title),
                            "source": "Wikipedia",
                        }

                        previews.append(preview)

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException,
                ):
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}'")
                    continue
                except Exception:
                    logger.exception(f"Unexpected error for '{title}'")
                    continue

            logger.info(
                f"Successfully created {len(previews)} previews from Wikipedia"
            )
            return previews

        except Exception:
            logger.exception("Error getting Wikipedia previews")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(
            f"Getting full content for {len(relevant_items)} relevant Wikipedia pages"
        )

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                # No title means we cannot look the page up; keep the preview as-is
                results.append(item)
                continue

            try:
                # Apply rate limiting before page request
                self._last_wait_time = self.rate_tracker.apply_rate_limit(
                    self.engine_type
                )

                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", ""),  # Keep existing snippet
                    "source": "Wikipedia",
                }

                # Add additional information
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

                results.append(result)

            except (
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.WikipediaException,
            ):
                # If error, use the preview
                logger.warning(f"Error getting full content for '{title}'")
                results.append(item)
            except Exception:
                logger.exception(
                    f"Unexpected error getting full content for '{title}'"
                )
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and no disambiguation options are available.
        """
        sentences = sentences or self.sentences
        try:
            return str(
                wikipedia.summary(
                    title, sentences=sentences, auto_suggest=False
                )
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option when one exists
            if e.options and len(e.options) > 0:
                return str(
                    wikipedia.summary(
                        e.options[0], sentences=sentences, auto_suggest=False
                    )
                )
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and no disambiguation options are available.
        """
        # Initialize include_content with our instance value
        include_content = self.include_content

        # Check if we should override with config setting
        if hasattr(search_config, "SEARCH_SNIPPETS_ONLY"):
            include_content = not search_config.SEARCH_SNIPPETS_ONLY

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences),
                "source": "Wikipedia",
            }

            # Add additional information if requested
            if include_content:
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            # Retry with the first disambiguation option when one exists
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        # Keep our own record in sync so preview links use the right edition
        self.language = language
        wikipedia.set_lang(language)