Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py: 58%
163 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1from datetime import datetime, timedelta, UTC
2from typing import Any, Dict, List, Optional, Tuple
4import json
5import html
6import re
7import requests
8from langchain_core.language_models import BaseLLM
9from loguru import logger
11from ...utilities.search_utilities import remove_think_tags
12from ..search_engine_base import BaseSearchEngine
13from ...utilities.search_utilities import LANGUAGE_CODE_MAP
14from ...security import safe_get
16HEADERS = {
17 "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
18}
19WIKINEWS_LANGUAGES = [
20 "ru",
21 "sr",
22 "pt",
23 "fr",
24 "pl",
25 "en",
26 "zh",
27 "de",
28 "it",
29 "es",
30 "cs",
31 "nl",
32 "ca",
33 "ar",
34 "ja",
35]
36TIMEOUT = 5 # Seconds
37TIME_PERIOD_DELTAS = {
38 "all": None, # No time filter
39 "y": timedelta(days=365), # 1 year
40 "m": timedelta(days=30), # 1 month
41 "w": timedelta(days=7), # 1 week
42 "d": timedelta(days=1), # 24 hours
43}
44DEFAULT_RECENT_BACKWARD_DAYS = 60
45MAX_RETRIES = 3
48class WikinewsSearchEngine(BaseSearchEngine):
49 """Wikinews search engine implementation with LLM query optimization"""
51 # Mark as public and news search engine
52 is_public = True
53 is_news = True
55 def __init__(
56 self,
57 search_language: str = "english",
58 adaptive_search: bool = True,
59 time_period: str = "y",
60 llm: Optional[BaseLLM] = None,
61 max_filtered_results: Optional[int] = None,
62 max_results: int = 10,
63 search_snippets_only: bool = True,
64 **kwargs,
65 ):
66 """
67 Initialize the Wikinews search engine.
69 Args:
70 search_language (str): Language for Wikinews search (e.g. "english").
71 adaptive_search (bool): Whether to expand or shrink date ranges based on query.
72 time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
73 llm (Optional[BaseLLM]): Language model used for query optimization and classification.
74 max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
75 max_results (int): Maximum number of search results to return.
76 search_snippets_only (bool): If True, full article content is ignored.
77 """
79 super().__init__(
80 llm=llm,
81 max_filtered_results=max_filtered_results,
82 max_results=max_results,
83 search_snippets_only=search_snippets_only,
84 **kwargs,
85 )
87 # Language initialization
88 lang_code = LANGUAGE_CODE_MAP.get(
89 search_language.lower(),
90 "en", # Default to English if not found
91 )
93 if lang_code not in WIKINEWS_LANGUAGES:    [93 ↛ 94] line 93 didn't jump to line 94 because the condition on line 93 was never true
94 logger.warning(
95 f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
96 )
97 lang_code = "en"
99 self.lang_code: str = lang_code
101 # Adaptive search
102 self.adaptive_search: bool = adaptive_search
104 # Date range initialization
105 now = datetime.now(UTC)
106 delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
107 self.from_date: datetime = (
108 now - delta if delta else datetime.min.replace(tzinfo=UTC)
109 )
110 self.to_date: datetime = now
112 # Preserve original date range so adaptive search can restore it
113 self._original_date_range = (self.from_date, self.to_date)
115 # API base URL
116 self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"
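As an illustrative sketch only (not part of the measured module), the constructor could be exercised as follows; the keyword values are invented, the mapping of "french" to "fr" via LANGUAGE_CODE_MAP is an assumption, and BaseSearchEngine is assumed to need no extra setup:

    # Hypothetical instantiation for demonstration purposes.
    engine = WikinewsSearchEngine(
        search_language="french",   # assumed to map to "fr" in LANGUAGE_CODE_MAP
        time_period="m",            # look-back window of roughly 30 days
        adaptive_search=False,      # no LLM attached, so no date-range adaptation
        max_results=5,
    )
    # Expected state per the code above:
    # engine.lang_code == "fr"
    # engine.to_date - engine.from_date == timedelta(days=30)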
118 def _optimize_query_for_wikinews(self, query: str) -> str:
119 """
120 Optimize a natural language query for Wikinews search.
121 Uses LLM to transform questions into effective news search queries.
123 Args:
124 query (str): Natural language query
126 Returns:
127 Optimized search query for Wikinews
128 """
129 if not self.llm:
130 return query
132 try:
133 # Prompt for query optimization
134 prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.
136Input question:
137"{query}"
139STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1401. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
1412. The JSON must be valid, minified, and contain no trailing text.
1423. The refined query must be extremely short: MAXIMUM 3–4 words.
1434. Include only the essential keywords (proper names, events, entities, places).
1445. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
1456. DO NOT add Boolean operators (AND, OR).
1467. DO NOT use quotes inside the query.
1478. DO NOT add explanations or comments.
149EXAMPLES:
150- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
151- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
152- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
153- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}
155NOW RETURN ONLY THE JSON OBJECT.
156"""
157 # Get response from LLM
158 response = self.llm.invoke(prompt)
160 response_text = (
161 getattr(response, "content", None)
162 or getattr(response, "text", None)
163 or str(response)
164 )
166 # Find possible JSON object boundaries
167 start = response_text.find("{")
168 end = response_text.rfind("}")
170 # Validate boundaries before slicing
171 if start == -1 or end == -1 or end <= start:
172 raise ValueError("No valid JSON boundaries found")
174 json_str = response_text[start : end + 1]
176 data = json.loads(json_str)
178 if not isinstance(data, dict):    [178 ↛ 179] line 178 didn't jump to line 179 because the condition on line 178 was never true
179 raise ValueError("Extracted JSON is not an object")
181 optimized_query = (data.get("query", "")).strip()
183 if not optimized_query:
184 raise ValueError("Query field missing or empty")
186 except (
187 ValueError,
188 TypeError,
189 AttributeError,
190 json.JSONDecodeError,
191 ) as e:
192 logger.warning(
193 f"Error optimizing query for Wikinews: {e}. Using original query."
194 )
195 return query
197 logger.info(f"Original query: '{query}'")
198 logger.info(f"Optimized for Wikinews: '{optimized_query}'")
200 return optimized_query
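A standalone sketch of the extraction step above, using an invented LLM reply: the first "{" and the last "}" bound the candidate JSON even when the model wraps it in extra prose.

    import json

    sample_reply = 'Sure! {"query": "Trump political activity"} Hope that helps.'
    start, end = sample_reply.find("{"), sample_reply.rfind("}")
    if start == -1 or end == -1 or end <= start:
        raise ValueError("No valid JSON boundaries found")
    data = json.loads(sample_reply[start : end + 1])
    print(data["query"])  # -> Trump political activity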
202 def _adapt_date_range_for_query(self, query: str) -> None:
203 """
204 Adapt the date range based on the query type (historical vs recent events).
206 Args:
207 query (str): The search query
208 """
209 # Reset to original date parameters first
210 self.from_date, self.to_date = self._original_date_range
212 if not self.adaptive_search or not self.llm:
213 return
215 # Do not adapt for very short queries (not enough context)
216 if len(query.split()) <= 4:
217 return
219 try:
220 prompt = f"""Classify this query based on temporal scope.
222Query: "{query}"
224Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
225Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT
227Classification rules:
228- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
229- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
230- UNCLEAR: Ambiguous temporal context
232Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
233 # Get response from LLM
234 response = self.llm.invoke(prompt)
235 response_text = (
236 getattr(response, "content", None)
237 or getattr(response, "text", None)
238 or str(response)
239 )
240 answer = remove_think_tags(response_text).upper()
242 if "CURRENT" in answer:
243 # For current events, focus on recent content
244 logger.info(
245 f"Query '{query}' classified as CURRENT - focusing on recent content"
246 )
247 self.from_date = datetime.now(UTC) - timedelta(
248 days=DEFAULT_RECENT_BACKWARD_DAYS
249 )
250 elif "HISTORICAL" in answer:    [250 ↛ 257] line 250 didn't jump to line 257 because the condition on line 250 was always true
251 # For historical queries, go back as far as possible
252 logger.info(
253 f"Query '{query}' classified as HISTORICAL - extending search timeframe"
254 )
255 self.from_date = datetime.min.replace(tzinfo=UTC)
256 else:
257 logger.info(
258 f"Query '{query}' classified as UNCLEAR - keeping original date range"
259 )
261 except (AttributeError, TypeError, ValueError, RuntimeError) as e:
262 # Keep original date parameters on error
263 logger.exception(
264 f"Error adapting date range for query: {e}. Keeping original date range."
265 )
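For illustration, a hypothetical stub LLM shows how the one-word classification drives the date window (continuing the instantiation sketch above; remove_think_tags is assumed to pass plain strings through unchanged):

    class _StubLLM:
        # Hypothetical stand-in whose invoke() always answers HISTORICAL.
        def invoke(self, prompt):
            return "HISTORICAL"

    engine.llm = _StubLLM()
    engine.adaptive_search = True
    engine._adapt_date_range_for_query(
        "what happened during the 2008 financial crisis bank bailouts"
    )
    # engine.from_date is now datetime.min (UTC): no lower bound on publication date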
267 def _fetch_search_results(
268 self, query: str, sroffset: int
269 ) -> List[Dict[str, Any]]:
270 """Fetch search results from Wikinews API.
272 Args:
273 query (str): The search query.
274 sroffset (int): The result offset for pagination.
276 Returns:
277 List of search result items.
278 """
279 retries = 0
280 while retries < MAX_RETRIES:
281 params = {
282 "action": "query",
283 "list": "search",
284 "srsearch": query,
285 "srprop": "snippet|timestamp",
286 "srlimit": 50,
287 "sroffset": sroffset,
288 "format": "json",
289 }
291 # Apply rate limiting before search request
292 self._last_wait_time = self.rate_tracker.apply_rate_limit(
293 self.engine_type
294 )
296 try:
297 response = safe_get(
298 self.api_url.format(lang_code=self.lang_code),
299 params=params,
300 headers=HEADERS,
301 timeout=TIMEOUT,
302 )
303 response.raise_for_status()
304 data = response.json()
305 return data.get("query", {}).get("search", [])
306 except (
307 requests.exceptions.RequestException,
308 json.JSONDecodeError,
309 ) as e:
310 logger.warning(
311 f"Error fetching search results: {e}, retrying..."
312 )
313 retries += 1
315 return []
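The parameters above target the standard MediaWiki list=search module; a direct sketch against the public endpoint (bypassing safe_get, retries, and rate limiting, purely for illustration):

    import requests

    params = {
        "action": "query",
        "list": "search",
        "srsearch": "Ukraine Russia negotiations",
        "srprop": "snippet|timestamp",
        "srlimit": 50,
        "sroffset": 0,
        "format": "json",
    }
    resp = requests.get(
        "https://en.wikinews.org/w/api.php",
        params=params,
        headers=HEADERS,
        timeout=TIMEOUT,
    )
    resp.raise_for_status()
    for item in resp.json().get("query", {}).get("search", []):
        print(item["pageid"], item["title"], item["timestamp"])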
317 def _process_search_result(
318 self, result: Dict[str, Any], query: str
319 ) -> Optional[Dict[str, Any]]:
320 """Process and filter a single search result.
322 Args:
323 result (Dict[str, Any]): A single search result item.
324 query (str): The search query.
326 Returns:
327 Processed result or None if filtered out.
328 """
329 page_id = result.get("pageid")
330 title = result.get("title", "")
331 snippet = _clean_wikinews_snippet(result.get("snippet", ""))
333 try:
334 last_edit_timestamp = result.get("timestamp", "")
335 last_edit_date = datetime.fromisoformat(
336 last_edit_timestamp.replace("Z", "+00:00")
337 )
338 except ValueError:
339 logger.warning(
340 f"Error parsing last edit date for page {page_id}, using current date as fallback."
341 )
342 last_edit_date = datetime.now(UTC)
344 # First filter: last edit date must be after from_date
345 if last_edit_date < self.from_date:
346 # In this case we can skip fetching full content
347 return None
349 # Fetch full article content and extract actual publication date
350 # Note: the Wikinews API does not allow retrieving the publication date in batched search results
351 full_content, publication_date = self._fetch_full_content_and_pubdate(
352 page_id, last_edit_date
353 )
355 # Second filter: publication date within range
356 if publication_date < self.from_date or publication_date > self.to_date:
357 return None
359 # Third filter: check if all query words are in title or content
360 # Note: Wikinews search returns false positives if query words appear in the "related articles" section
361 # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
362 combined_text = f"{title} {full_content}".lower()
363 query_words = [
364 w.lower() for w in query.split() if len(w) > 1
365 ] # Skip single chars
366 if query_words and not all(
367 re.search(rf"\b{re.escape(word)}\b", combined_text)
368 for word in query_words
369 ):
370 return None
372 # If only snippets are requested, use the snippet as the full content
373 if self.search_snippets_only:
374 full_content = snippet
376 return {
377 "id": page_id,
378 "title": title,
379 "snippet": snippet,
380 "source": "wikinews",
381 "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}", # Used by '_filter_for_relevance' function
382 "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}", # Used by citation handler
383 "content": full_content,
384 "full_content": full_content,
385 "publication_date": publication_date.isoformat(timespec="seconds"),
386 }
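The third filter relies on word-boundary matching rather than plain substring checks; a small sketch with invented strings shows the difference:

    import re

    text = "This article covers the summit."
    assert "is" in text                        # substring check: false positive via "This"
    assert not re.search(r"\bis\b", text)      # word-boundary check: no standalone "is"
    assert re.search(r"\bsummit\b", text)      # genuine whole-word match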
388 def _fetch_full_content_and_pubdate(
389 self, page_id: int, fallback_date: datetime
390 ) -> Tuple[str, datetime]:
391 """Fetch full article content and publication date from Wikinews API.
393 Args:
394 page_id (int): The Wikinews page ID.
395 fallback_date (datetime): Fallback date if publication date cannot be determined.
397 Returns:
398 Tuple of (full_content, publication_date)
399 """
400 try:
401 content_params = {
402 "action": "query",
403 "prop": "revisions|extracts",
404 "pageids": page_id,
405 "rvprop": "timestamp",
406 "rvdir": "newer", # Older revisions first
407 "rvlimit": 1, # Get the first revision (i.e. publication)
408 "explaintext": True,
409 "format": "json",
410 }
412 # Apply rate limiting before content request
413 self._last_wait_time = self.rate_tracker.apply_rate_limit(
414 self.engine_type
415 )
417 content_resp = safe_get(
418 self.api_url.format(lang_code=self.lang_code),
419 params=content_params,
420 headers=HEADERS,
421 timeout=TIMEOUT,
422 )
423 content_resp.raise_for_status()
424 content_data = content_resp.json()
426 page_data = (
427 content_data.get("query", {})
428 .get("pages", {})
429 .get(str(page_id), {})
430 )
431 full_content = page_data.get("extract", "")
432 revisions = page_data.get("revisions", [])
434 if revisions:
435 try:
436 # First revision timestamp is the publication date
437 publication_date = datetime.fromisoformat(
438 revisions[0]["timestamp"].replace("Z", "+00:00")
439 )
440 except ValueError:
441 logger.warning(
442 f"Error parsing publication date for page {page_id}, using fallback date."
443 )
444 publication_date = fallback_date
445 else:
446 logger.warning(
447 f"No revisions found for page {page_id}, using fallback date."
448 )
449 publication_date = fallback_date
451 return full_content, publication_date
453 except (
454 requests.exceptions.RequestException,
455 json.JSONDecodeError,
456 ) as e:
457 logger.warning(f"Error fetching content for page {page_id}: {e}")
458 return "", fallback_date
460 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
461 """
462 Retrieve article previews from Wikinews based on the query.
464 Args:
465 query (str): The search query
467 Returns:
468 List of relevant article previews
469 """
470 # Adapt date range based on query and optimize query (if LLM is available)
471 self._adapt_date_range_for_query(query)
472 optimized_query = self._optimize_query_for_wikinews(query)
474 articles = []
475 sroffset = 0
477 while len(articles) < self.max_results:
478 search_results = self._fetch_search_results(
479 optimized_query, sroffset
480 )
481 if not search_results:
482 # No more results available (or multiple retries failed)
483 break
485 for result in search_results:
486 article = self._process_search_result(result, optimized_query)
487 if article:
488 articles.append(article)
489 if len(articles) >= self.max_results:
490 break
492 sroffset += len(search_results)
494 return articles
496 def _get_full_content(
497 self, relevant_items: List[Dict[str, Any]]
498 ) -> List[Dict[str, Any]]:
499 """
500 Retrieve full content for relevant Wikinews articles.
502 Args:
503 relevant_items (List[Dict[str, Any]]): List of relevant article previews
505 Returns:
506 List of articles with full content
507 """
508 # Since full content is already fetched in _get_previews, just return relevant items
509 return relevant_items
512def _clean_wikinews_snippet(snippet: str) -> str:
513 """
514 Clean a Wikinews search snippet.
516 Args:
517 snippet (str): Raw snippet from Wikinews API
519 Returns:
520 Clean human-readable text
521 """
522 if not snippet:
523 return ""
525 # Unescape HTML entities
526 unescaped = html.unescape(snippet)
528 # Remove HTML tags
529 clean_text = re.sub(r"<.*?>", "", unescaped)
531 # Normalize whitespace
532 clean_text = re.sub(r"\s+", " ", clean_text).strip()
534 return clean_text
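Search snippets arrive with highlight markup and HTML entities; a quick illustration of the cleaning above, using a representative invented snippet:

    raw = '<span class="searchmatch">Ukraine</span> &amp; Russia held\n  talks'
    print(_clean_wikinews_snippet(raw))  # -> Ukraine & Russia held talks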