Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikipedia.py: 96%

115 statements  

« prev     ^ index     » next       coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1from typing import Any, Dict, List, Optional 

2 

3import wikipedia 

4from langchain_core.language_models import BaseLLM 

5from loguru import logger 

6 

7from ...config import search_config 

8from ..search_engine_base import BaseSearchEngine 

9 

10 

class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with a two-phase approach.

    Phase 1 (``_get_previews``) fetches titles and short summaries for the
    query; phase 2 (``_get_full_content``) retrieves full page content only
    for the items that survived relevance filtering in the base class.
    """

    # Mark as public search engine (no API key required).
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        include_content: bool = True,
        sentences: int = 5,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        **kwargs,
    ):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results,
        # and max_results.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_content = include_content
        self.sentences = sentences
        # Remember the language so result links point at the matching
        # Wikipedia edition (previously links were hard-coded to "en"
        # regardless of the configured language).
        self.language = language

        # Set the Wikipedia language (module-level state in the
        # `wikipedia` package, shared by all subsequent calls).
        wikipedia.set_lang(language)

    def _article_url(self, title: str) -> str:
        """Build the article URL for *title* in the configured language."""
        return (
            f"https://{self.language}.wikipedia.org/wiki/"
            f"{title.replace(' ', '_')}"
        )

    @staticmethod
    def _add_page_details(
        result: Dict[str, Any], page: Any
    ) -> Dict[str, Any]:
        """Copy full-content fields from a wikipedia page object into *result*.

        Shared by ``_get_full_content`` and ``get_page`` so both produce the
        same set of keys. Returns *result* for convenience.
        """
        result["content"] = page.content
        result["full_content"] = page.content
        result["categories"] = page.categories
        result["references"] = page.references
        result["links"] = page.links
        result["images"] = page.images
        result["sections"] = page.sections
        return result

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(
                f"Found {len(search_results)} Wikipedia results: {search_results}"
            )

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Cache for full pages. NOTE(review): nothing in this class
            # populates or reads it; kept for backward compatibility in
            # case external code inspects the attribute.
            self._page_cache = {}

            # Generate previews with summaries
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be
                    # more precise (auto-suggest can silently swap titles).
                    summary = None
                    try:
                        # Apply rate limiting before summary request
                        self._last_wait_time = (
                            self.rate_tracker.apply_rate_limit(self.engine_type)
                        )

                        summary = wikipedia.summary(
                            title, sentences=self.sentences, auto_suggest=False
                        )
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(
                                f"Disambiguation for '{title}', trying first option: {e.options[0]}"
                            )
                            try:
                                summary = wikipedia.summary(
                                    e.options[0],
                                    sentences=self.sentences,
                                    auto_suggest=False,
                                )
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.exception(
                                    f"Error with disambiguation option: {inner_e}"
                                )
                                continue
                        else:
                            logger.warning(
                                f"Disambiguation with no options for '{title}'"
                            )
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            # Link built for the configured language edition.
                            "link": self._article_url(title),
                            "source": "Wikipedia",
                        }

                        previews.append(preview)

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException,
                ) as e:
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}': {e}")
                    continue
                except Exception:
                    logger.exception(f"Unexpected error for '{title}'")
                    continue

            logger.info(
                f"Successfully created {len(previews)} previews from Wikipedia"
            )
            return previews

        except Exception:
            # Best-effort engine: return no results rather than propagate.
            logger.exception("Error getting Wikipedia previews")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(
            f"Getting full content for {len(relevant_items)} relevant Wikipedia pages"
        )

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                # No title to look up: pass the preview through unchanged.
                results.append(item)
                continue

            try:
                # Apply rate limiting before page request
                self._last_wait_time = self.rate_tracker.apply_rate_limit(
                    self.engine_type
                )

                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", ""),  # Keep existing snippet
                    "source": "Wikipedia",
                }

                # Add content, categories, references, links, images, sections.
                results.append(self._add_page_details(result, page))

            except (
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.WikipediaException,
            ) as e:
                # If error, fall back to the preview
                logger.warning(f"Error getting full content for '{title}': {e}")
                results.append(item)
            except Exception as e:
                logger.exception(
                    f"Unexpected error getting full content for '{title}': {e}"
                )
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is
                ambiguous and has no options to fall back to.
        """
        sentences = sentences or self.sentences
        try:
            return wikipedia.summary(
                title, sentences=sentences, auto_suggest=False
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option, if any.
            if e.options and len(e.options) > 0:
                return wikipedia.summary(
                    e.options[0], sentences=sentences, auto_suggest=False
                )
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is
                ambiguous and has no options to fall back to.
        """
        # Initialize include_content with our instance value
        include_content = self.include_content

        # Check if we should override with config setting
        if hasattr(search_config, "SEARCH_SNIPPETS_ONLY"):
            include_content = not search_config.SEARCH_SNIPPETS_ONLY

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences),
                "source": "Wikipedia",
            }

            # Add additional information if requested
            if include_content:
                self._add_page_details(result, page)

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            # Recurse into the first disambiguation option, if any.
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        # Keep self.language in sync so preview links are built for the
        # newly selected edition.
        self.language = language
        wikipedia.set_lang(language)