Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikipedia.py: 96%

116 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

"""Wikipedia search engine with a two-phase (preview, then full content) approach."""

from typing import Any, Dict, List, Optional
from urllib.parse import quote

import wikipedia
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ..search_engine_base import BaseSearchEngine


class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with two-phase approach"""

    # Mark as public search engine
    is_public = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        include_content: bool = True,
        sentences: int = 5,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Optional settings snapshot passed to the base engine
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )
        self.include_content = include_content
        self.sentences = sentences

        # Remember the language so article URLs can be built for the right
        # Wikipedia edition (previously links were hard-coded to en.wikipedia.org).
        self.language = language

        # Set the Wikipedia language (module-level global in the `wikipedia` library)
        wikipedia.set_lang(language)

    def _article_url(self, title: str) -> str:
        """
        Build the canonical article URL for the configured language edition.

        Spaces become underscores (Wikipedia convention) and the remainder is
        percent-encoded so titles containing '&', '%', '?', etc. produce valid URLs.

        Args:
            title: Article title

        Returns:
            Fully-qualified article URL
        """
        return (
            f"https://{self.language}.wikipedia.org/wiki/"
            f"{quote(title.replace(' ', '_'))}"
        )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(
                f"Found {len(search_results)} Wikipedia results: {search_results}"
            )

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Generate previews with summaries.
            # NOTE: This loop is intentionally sequential. Do NOT parallelize with
            # ThreadPoolExecutor because:
            # 1. The `wikipedia` PyPI library is not thread-safe — it uses global
            #    mutable state (API_URL, RATE_LIMIT_LAST_CALL) and an unlocked cache.
            #    Concurrent threads would corrupt the library's built-in rate limiting.
            # 2. self._last_wait_time is a shared instance attribute with no lock —
            #    concurrent writes would feed incorrect data to record_outcome().
            # 3. Downstream _filter_for_relevance uses positional indices — random
            #    completion order would cause the LLM to select wrong articles.
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be more precise
                    summary = None
                    try:
                        # Apply rate limiting before summary request
                        self._last_wait_time = (
                            self.rate_tracker.apply_rate_limit(self.engine_type)
                        )

                        summary = wikipedia.summary(
                            title, sentences=self.sentences, auto_suggest=False
                        )
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(
                                f"Disambiguation for '{title}', trying first option: {e.options[0]}"
                            )
                            try:
                                summary = wikipedia.summary(
                                    e.options[0],
                                    sentences=self.sentences,
                                    auto_suggest=False,
                                )
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.exception(
                                    f"Error with disambiguation option: {inner_e}"
                                )
                                continue
                        else:
                            logger.warning(
                                f"Disambiguation with no options for '{title}'"
                            )
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            # Use the configured language edition, not hard-coded "en"
                            "link": self._article_url(title),
                            "source": "Wikipedia",
                        }

                        previews.append(preview)

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException,
                ):
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}'")
                    continue
                except Exception:
                    logger.exception(f"Unexpected error for '{title}'")
                    continue

            logger.info(
                f"Successfully created {len(previews)} previews from Wikipedia"
            )
            return previews

        except Exception:
            logger.exception("Error getting Wikipedia previews")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(
            f"Getting full content for {len(relevant_items)} relevant Wikipedia pages"
        )

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                # No title means we cannot look the page up; keep the preview as-is
                results.append(item)
                continue

            try:
                # Apply rate limiting before page request
                self._last_wait_time = self.rate_tracker.apply_rate_limit(
                    self.engine_type
                )

                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", ""),  # Keep existing snippet
                    "source": "Wikipedia",
                }

                # Add additional information
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

                results.append(result)

            except (
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.WikipediaException,
            ):
                # If error, use the preview
                logger.warning(f"Error getting full content for '{title}'")
                results.append(item)
            except Exception:
                logger.exception(
                    f"Unexpected error getting full content for '{title}'"
                )
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and no disambiguation options are available.
        """
        sentences = sentences or self.sentences
        try:
            return str(
                wikipedia.summary(
                    title, sentences=sentences, auto_suggest=False
                )
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option when one exists
            if e.options and len(e.options) > 0:
                return str(
                    wikipedia.summary(
                        e.options[0], sentences=sentences, auto_suggest=False
                    )
                )
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and no disambiguation options are available.
        """
        # Initialize include_content with our instance value
        include_content = self.include_content

        # Check if we should override with config setting
        if hasattr(search_config, "SEARCH_SNIPPETS_ONLY"):
            include_content = not search_config.SEARCH_SNIPPETS_ONLY

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences),
                "source": "Wikipedia",
            }

            # Add additional information if requested
            if include_content:
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            # Retry with the first disambiguation option when one exists
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        # Keep our own record in sync so preview links use the right edition
        self.language = language
        wikipedia.set_lang(language)