Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py: 89%
158 statements
from datetime import datetime, timedelta, UTC
from typing import Any, Dict, List, Optional, Tuple

import json
import html
import re
import requests
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...utilities.json_utils import extract_json, get_llm_response_text
from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP
from ..search_engine_base import BaseSearchEngine
from ...security import safe_get

HEADERS = {
    "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
}
WIKINEWS_LANGUAGES = [
    "ru",
    "sr",
    "pt",
    "fr",
    "pl",
    "en",
    "zh",
    "de",
    "it",
    "es",
    "cs",
    "nl",
    "ca",
    "ar",
    "ja",
]
TIMEOUT = 5  # Seconds
TIME_PERIOD_DELTAS = {
    "all": None,  # No time filter
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),  # 1 month
    "w": timedelta(days=7),  # 1 week
    "d": timedelta(days=1),  # 24 hours
}
DEFAULT_RECENT_BACKWARD_DAYS = 60
MAX_RETRIES = 3
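
# Illustrative example (not in the original module): how time_period maps to a
# date window in __init__ below, e.g. for time_period="w":
#   from_date = datetime.now(UTC) - TIME_PERIOD_DELTAS["w"]  # now minus 7 days
#   to_date   = datetime.now(UTC)
# "all" maps to None, which disables the lower bound (from_date = datetime.min).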


class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization"""

    # Mark as public and news search engine
    is_public = True
    is_news = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            **kwargs,
        )

        # Language initialization
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"
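
        # Illustrative (not in the original): the template above expands to e.g.
        #   self.api_url.format(lang_code="en") -> "https://en.wikinews.org/w/api.php"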

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None:
                raise ValueError("No valid JSON found in response")

            optimized_query = data.get("query", "").strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ) as e:
            logger.warning(
                f"Error optimizing query for Wikinews: {e}. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error
            logger.exception(
                f"Error adapting date range for query '{query}'. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items.
        """
        retries = 0
        while retries < MAX_RETRIES:
            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srprop": "snippet|timestamp",
                "srlimit": 50,
                "sroffset": sroffset,
                "format": "json",
            }
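
            # Illustrative request this builds (assuming lang_code="en"):
            #   https://en.wikinews.org/w/api.php?action=query&list=search
            #     &srsearch=<query>&srprop=snippet|timestamp&srlimit=50
            #     &sroffset=<offset>&format=json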

            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    self.api_url.format(lang_code=self.lang_code),
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ) as e:
                logger.warning(
                    f"Error fetching search results: {e}, retrying..."
                )
                retries += 1

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_timestamp = result.get("timestamp", "")
            last_edit_date = datetime.fromisoformat(
                last_edit_timestamp.replace("Z", "+00:00")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: the Wikinews API does not expose the publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            page_id, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search returns false positives when query words appear only in the "related" articles section
        # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
        combined_text = f"{title} {full_content}".lower()
        query_words = [
            w.lower() for w in query.split() if len(w) > 1
        ]  # Skip single chars
        if query_words and not all(
            re.search(rf"\b{re.escape(word)}\b", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date)
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }
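
            # Illustrative response shape (abridged; timestamp value is a
            # made-up example):
            #   {"query": {"pages": {"<page_id>": {
            #       "extract": "<plain-text article body>",
            #       "revisions": [{"timestamp": "2024-01-01T00:00:00Z"}]}}}}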

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = datetime.fromisoformat(
                        revisions[0]["timestamp"].replace("Z", "+00:00")
                    )
                except ValueError:
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ) as e:
            logger.warning(f"Error fetching content for page {page_id}: {e}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                if len(articles) >= self.max_results:
                    break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items


def _clean_wikinews_snippet(snippet: str) -> str:
    """
    Clean a Wikinews search snippet.

    Args:
        snippet (str): Raw snippet from Wikinews API

    Returns:
        Clean human-readable text
    """
    if not snippet:
        return ""

    # Unescape HTML entities
    unescaped = html.unescape(snippet)

    # Remove HTML tags
    clean_text = re.sub(r"<.*?>", "", unescaped)

    # Normalize whitespace
    clean_text = re.sub(r"\s+", " ", clean_text).strip()

    return clean_text
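

# Minimal usage sketch (illustrative, not part of the module). It drives the
# engine through its private preview method; real callers would go through the
# public entry point of BaseSearchEngine, which is not shown in this file.
if __name__ == "__main__":
    engine = WikinewsSearchEngine(
        search_language="english",
        time_period="m",  # only articles from roughly the last 30 days
        max_results=5,
        llm=None,  # without an LLM, query optimization and date adaptation are skipped
    )
    for article in engine._get_previews("Ukraine Russia negotiations"):
        print(article["publication_date"], article["title"], article["link"])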