Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikipedia.py: 96%
116 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1from typing import Any, Dict, List, Optional
3import wikipedia
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...config import search_config
8from ..search_engine_base import BaseSearchEngine
class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with two-phase approach.

    Phase 1 (`_get_previews`) fetches titles and short summaries; phase 2
    (`_get_full_content`) fetches full page content for the items that
    survived relevance filtering.
    """

    # Mark as public search engine
    is_public = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        include_content: bool = True,
        sentences: int = 5,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Optional settings snapshot passed to the base engine
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )
        self.include_content = include_content
        self.sentences = sentences

        # Remember the configured language so result links point at the
        # correct Wikipedia subdomain (previously links were hardcoded to
        # "en" regardless of this setting).
        self.language = language

        # Set the Wikipedia language (module-level/global setting in the
        # `wikipedia` library).
        wikipedia.set_lang(language)

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on search failure)
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(
                f"Found {len(search_results)} Wikipedia results: {search_results}"
            )

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Generate previews with summaries.
            # NOTE: This loop is intentionally sequential. Do NOT parallelize with
            # ThreadPoolExecutor because:
            # 1. The `wikipedia` PyPI library is not thread-safe — it uses global
            #    mutable state (API_URL, RATE_LIMIT_LAST_CALL) and an unlocked cache.
            #    Concurrent threads would corrupt the library's built-in rate limiting.
            # 2. self._last_wait_time is a shared instance attribute with no lock —
            #    concurrent writes would feed incorrect data to record_outcome().
            # 3. Downstream _filter_for_relevance uses positional indices — random
            #    completion order would cause the LLM to select wrong articles.
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be more precise
                    summary = None
                    try:
                        # Apply rate limiting before summary request
                        self._last_wait_time = (
                            self.rate_tracker.apply_rate_limit(self.engine_type)
                        )

                        summary = wikipedia.summary(
                            title, sentences=self.sentences, auto_suggest=False
                        )
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(
                                f"Disambiguation for '{title}', trying first option: {e.options[0]}"
                            )
                            try:
                                summary = wikipedia.summary(
                                    e.options[0],
                                    sentences=self.sentences,
                                    auto_suggest=False,
                                )
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.exception(
                                    f"Error with disambiguation option: {inner_e}"
                                )
                                continue
                        else:
                            logger.warning(
                                f"Disambiguation with no options for '{title}'"
                            )
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            # Build the link from the configured language so
                            # non-English engines link to the right subdomain.
                            "link": f"https://{self.language}.wikipedia.org/wiki/{title.replace(' ', '_')}",
                            "source": "Wikipedia",
                        }

                        previews.append(preview)

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException,
                ):
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}'")
                    continue
                except Exception:
                    logger.exception(f"Unexpected error for '{title}'")
                    continue

            logger.info(
                f"Successfully created {len(previews)} previews from Wikipedia"
            )
            return previews

        except Exception:
            logger.exception("Error getting Wikipedia previews")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content (falls back to the
            preview item when page retrieval fails)
        """
        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(
            f"Getting full content for {len(relevant_items)} relevant Wikipedia pages"
        )

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                results.append(item)
                continue

            try:
                # Apply rate limiting before page request
                self._last_wait_time = self.rate_tracker.apply_rate_limit(
                    self.engine_type
                )

                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", ""),  # Keep existing snippet
                    "source": "Wikipedia",
                }

                # Add additional information
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

                results.append(result)

            except (
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.WikipediaException,
            ):
                # If error, use the preview
                logger.warning(f"Error getting full content for '{title}'")
                results.append(item)
            except Exception:
                logger.exception(
                    f"Unexpected error getting full content for '{title}'"
                )
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and has no options to fall back to
        """
        sentences = sentences or self.sentences
        try:
            return str(
                wikipedia.summary(
                    title, sentences=sentences, auto_suggest=False
                )
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option when available
            if e.options and len(e.options) > 0:
                return str(
                    wikipedia.summary(
                        e.options[0], sentences=sentences, auto_suggest=False
                    )
                )
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and has no options to fall back to
        """
        # Initialize include_content with our instance value
        include_content = self.include_content

        # Check if we should override with config setting
        if hasattr(search_config, "SEARCH_SNIPPETS_ONLY"):
            include_content = not search_config.SEARCH_SNIPPETS_ONLY

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences),
                "source": "Wikipedia",
            }

            # Add additional information if requested
            if include_content:
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            # Recurse into the first disambiguation option when available
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        # Keep the stored language in sync so result links use the new
        # subdomain as well.
        self.language = language
        wikipedia.set_lang(language)