Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikipedia.py: 96%
115 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1from typing import Any, Dict, List, Optional
3import wikipedia
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...config import search_config
8from ..search_engine_base import BaseSearchEngine
class WikipediaSearchEngine(BaseSearchEngine):
    """Wikipedia search engine implementation with two-phase approach.

    Phase 1 (:meth:`_get_previews`) fetches titles and short summaries;
    phase 2 (:meth:`_get_full_content`) retrieves full page content for
    the items judged relevant.
    """

    # Mark as public search engine
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        language: str = "en",
        include_content: bool = True,
        sentences: int = 5,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        **kwargs,
    ):
        """
        Initialize the Wikipedia search engine.

        Args:
            max_results: Maximum number of search results
            language: Language code for Wikipedia (e.g., 'en', 'fr', 'es')
            include_content: Whether to include full page content in results
            sentences: Number of sentences to include in summary
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            **kwargs: Additional parameters (ignored but accepted for compatibility)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        self.include_content = include_content
        self.sentences = sentences
        # Remember the language so generated article links can target the
        # matching Wikipedia subdomain (links were previously hard-coded
        # to en.wikipedia.org regardless of this setting).
        self.language = language

        # Set the Wikipedia language
        wikipedia.set_lang(language)

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information (titles and summaries) for Wikipedia pages.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Wikipedia page previews for query: {query}")

        try:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get search results (just titles)
            search_results = wikipedia.search(query, results=self.max_results)

            logger.info(
                f"Found {len(search_results)} Wikipedia results: {search_results}"
            )

            if not search_results:
                logger.info(f"No Wikipedia results found for query: {query}")
                return []

            # Create a cache for full pages (will be populated on-demand)
            self._page_cache = {}

            # Generate previews with summaries
            previews = []
            for title in search_results:
                try:
                    # Get just the summary, with auto_suggest=False to be more precise
                    summary = None
                    try:
                        # Apply rate limiting before summary request
                        self._last_wait_time = (
                            self.rate_tracker.apply_rate_limit(self.engine_type)
                        )

                        summary = wikipedia.summary(
                            title, sentences=self.sentences, auto_suggest=False
                        )
                    except wikipedia.exceptions.DisambiguationError as e:
                        # If disambiguation error, try the first option
                        if e.options and len(e.options) > 0:
                            logger.info(
                                f"Disambiguation for '{title}', trying first option: {e.options[0]}"
                            )
                            try:
                                summary = wikipedia.summary(
                                    e.options[0],
                                    sentences=self.sentences,
                                    auto_suggest=False,
                                )
                                title = e.options[0]  # Use the new title
                            except Exception as inner_e:
                                logger.exception(
                                    f"Error with disambiguation option: {inner_e}"
                                )
                                continue
                        else:
                            logger.warning(
                                f"Disambiguation with no options for '{title}'"
                            )
                            continue

                    if summary:
                        preview = {
                            "id": title,  # Use title as ID
                            "title": title,
                            "snippet": summary,
                            # Build the link from the configured language so
                            # non-English searches don't point at en.wikipedia.org
                            "link": f"https://{self.language}.wikipedia.org/wiki/{title.replace(' ', '_')}",
                            "source": "Wikipedia",
                        }

                        previews.append(preview)

                except (
                    wikipedia.exceptions.PageError,
                    wikipedia.exceptions.WikipediaException,
                ) as e:
                    # Skip pages with errors
                    logger.warning(f"Error getting summary for '{title}': {e}")
                    continue
                except Exception:
                    logger.exception(f"Unexpected error for '{title}'")
                    continue

            logger.info(
                f"Successfully created {len(previews)} previews from Wikipedia"
            )
            return previews

        except Exception:
            logger.exception("Error getting Wikipedia previews")
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Wikipedia pages.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info(
            f"Getting full content for {len(relevant_items)} relevant Wikipedia pages"
        )

        results = []
        for item in relevant_items:
            title = item.get("id")  # Title stored as ID

            if not title:
                # Nothing to look up; pass the preview through unchanged
                results.append(item)
                continue

            try:
                # Apply rate limiting before page request
                self._last_wait_time = self.rate_tracker.apply_rate_limit(
                    self.engine_type
                )

                # Get the full page
                page = wikipedia.page(title, auto_suggest=False)

                # Create a full result with all information
                result = {
                    "title": page.title,
                    "link": page.url,
                    "snippet": item.get("snippet", ""),  # Keep existing snippet
                    "source": "Wikipedia",
                }

                # Add additional information
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

                results.append(result)

            except (
                wikipedia.exceptions.DisambiguationError,
                wikipedia.exceptions.PageError,
                wikipedia.exceptions.WikipediaException,
            ) as e:
                # If error, use the preview
                logger.warning(f"Error getting full content for '{title}': {e}")
                results.append(item)
            except Exception as e:
                logger.exception(
                    f"Unexpected error getting full content for '{title}': {e}"
                )
                results.append(item)

        return results

    def get_summary(self, title: str, sentences: Optional[int] = None) -> str:
        """
        Get a summary of a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page
            sentences: Number of sentences to include (defaults to self.sentences)

        Returns:
            Summary of the page

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and has no disambiguation options to fall back to.
        """
        sentences = sentences or self.sentences
        try:
            return wikipedia.summary(
                title, sentences=sentences, auto_suggest=False
            )
        except wikipedia.exceptions.DisambiguationError as e:
            # Fall back to the first disambiguation option, if any
            if e.options and len(e.options) > 0:
                return wikipedia.summary(
                    e.options[0], sentences=sentences, auto_suggest=False
                )
            raise

    def get_page(self, title: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific Wikipedia page.

        Args:
            title: Title of the Wikipedia page

        Returns:
            Dictionary with page information

        Raises:
            wikipedia.exceptions.DisambiguationError: If the title is ambiguous
                and has no disambiguation options to fall back to.
        """
        # Initialize include_content with our instance value
        include_content = self.include_content

        # Check if we should override with config setting
        if hasattr(search_config, "SEARCH_SNIPPETS_ONLY"):
            include_content = not search_config.SEARCH_SNIPPETS_ONLY

        try:
            page = wikipedia.page(title, auto_suggest=False)

            result = {
                "title": page.title,
                "link": page.url,
                "snippet": self.get_summary(title, self.sentences),
                "source": "Wikipedia",
            }

            # Add additional information if requested
            if include_content:
                result["content"] = page.content
                result["full_content"] = page.content
                result["categories"] = page.categories
                result["references"] = page.references
                result["links"] = page.links
                result["images"] = page.images
                result["sections"] = page.sections

            return result
        except wikipedia.exceptions.DisambiguationError as e:
            # Recurse into the first disambiguation option, if any
            if e.options and len(e.options) > 0:
                return self.get_page(e.options[0])
            raise

    def set_language(self, language: str) -> None:
        """
        Change the Wikipedia language.

        Args:
            language: Language code (e.g., 'en', 'fr', 'es')
        """
        # Keep the stored language in sync so preview links stay correct
        self.language = language
        wikipedia.set_lang(language)