Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py: 95%
160 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1from datetime import datetime, timedelta, UTC
2from typing import Any, Dict, List, Optional, Tuple
4import json
5import html
6import re
7import requests
8from langchain_core.language_models import BaseLLM
9from loguru import logger
11from ...constants import USER_AGENT
12from ...utilities.json_utils import extract_json, get_llm_response_text
13from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP
14from ..search_engine_base import BaseSearchEngine
15from ...security import safe_get
# HTTP headers sent with every Wikinews API request
HEADERS = {"User-Agent": USER_AGENT}

# Wikinews language editions accepted by this engine (ISO 639-1 codes)
WIKINEWS_LANGUAGES = [
    "ru", "sr", "pt", "fr", "pl",
    "en", "zh", "de", "it", "es",
    "cs", "nl", "ca", "ar", "ja",
]

# Per-request HTTP timeout, in seconds
TIMEOUT = 5

# Look-back window for each supported `time_period` key (None disables the filter)
TIME_PERIOD_DELTAS = {
    "all": None,
    "y": timedelta(days=365),
    "m": timedelta(days=30),
    "w": timedelta(days=7),
    "d": timedelta(days=1),
}

# Number of days back that still counts as "current" for adaptive search
DEFAULT_RECENT_BACKWARD_DAYS = 60

# Maximum number of attempts for a single search request
MAX_RETRIES = 3
class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization.

    Searches the MediaWiki API of a Wikinews language edition, filters the
    results by date range and by the presence of the query words, and
    optionally uses an LLM to shorten the query and to adapt the date range.
    """

    # Mark as public and news search engine
    is_public = True
    is_news = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # Languages written without spaces between words: word-boundary matching
    # cannot work there, so plain substring matching is used instead.
    _UNSEGMENTED_LANGUAGES = frozenset({"zh", "ja"})

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results
                ("all", "y", "m", "w", "d"). Unknown values fall back to one year.
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
            settings_snapshot (Optional[Dict[str, Any]]): Forwarded unchanged to the base class.
            **kwargs: Additional keyword arguments forwarded to the base class.
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Language initialization
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL (the language code is substituted per request)
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    @staticmethod
    def _parse_wikinews_timestamp(value: Any) -> datetime:
        """Parse an ISO-8601 timestamp into a timezone-aware datetime.

        Args:
            value (Any): Timestamp as returned by the API (e.g. "2024-01-31T12:00:00Z").

        Returns:
            Timezone-aware datetime; values without an offset are treated as UTC.

        Raises:
            ValueError: If the value is not a string or cannot be parsed.
        """
        if not isinstance(value, str):
            raise ValueError(f"Timestamp is not a string: {value!r}")  # noqa: TRY004 — callers handle ValueError only

        parsed = datetime.fromisoformat(value.replace("Z", "+00:00"))

        # A naive datetime cannot be compared with the timezone-aware range bounds
        return parsed if parsed.tzinfo is not None else parsed.replace(tzinfo=UTC)

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews, or the original query when
            no LLM is configured or the LLM response cannot be used.
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None or not isinstance(data, dict):
                raise ValueError("No valid JSON found in response")  # noqa: TRY301 — caught by except ValueError to fall back to original query

            optimized_query: str = str(data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")  # noqa: TRY301 — caught by except ValueError to fall back to original query

        except (
            ValueError,
            TypeError,
            AttributeError,
            RuntimeError,  # Same best-effort policy as _adapt_date_range_for_query
            json.JSONDecodeError,
        ):
            logger.warning(
                "Error optimizing query for Wikinews. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        The range is always reset to the one computed at construction time
        first; it is then narrowed or widened only when adaptive search is
        enabled, an LLM is available and the query has more than four words.

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error
            logger.exception(
                "Error adapting date range for query. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        The request is attempted up to MAX_RETRIES times; rate limiting is
        applied before every attempt.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty if every attempt failed).
        """
        # Request parameters and URL do not change between attempts
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srprop": "snippet|timestamp",
            "srlimit": 50,
            "sroffset": sroffset,
            "format": "json",
        }
        url = self.api_url.format(lang_code=self.lang_code)

        for _attempt in range(MAX_RETRIES):
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    url,
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])  # type: ignore[no-any-return]
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ):
                logger.warning("Error fetching search results, retrying...")

        return []

    def _contains_all_query_words(self, query: str, text: str) -> bool:
        """Check whether every significant query word occurs in the text.

        Args:
            query (str): The search query (split on whitespace).
            text (str): Text to look the words up in (case-insensitive).

        Returns:
            True if all words longer than one character are found, or if
            there is no such word to check.
        """
        words = [w.lower() for w in query.split() if len(w) > 1]  # Skip single chars
        if not words:
            return True

        haystack = text.lower()

        if self.lang_code in self._UNSEGMENTED_LANGUAGES:
            # No spaces between words in these scripts: boundaries never match
            return all(word in haystack for word in words)

        # Lookarounds instead of \b so that words starting or ending with
        # punctuation (e.g. "U.S.", "C++") can still match, while substring
        # hits inside longer words (e.g. "is" in "This") are still rejected.
        return all(
            re.search(rf"(?<!\w){re.escape(word)}(?!\w)", haystack)
            for word in words
        )

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        try:
            numeric_page_id = int(page_id)  # type: ignore[arg-type]
        except (TypeError, ValueError):
            # Without a usable page ID neither the content request nor the
            # article URL can be built
            logger.warning("Skipping Wikinews search result without a valid page ID.")
            return None

        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_date = self._parse_wikinews_timestamp(
                result.get("timestamp", "")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: the Wikinews API does not return the publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            numeric_page_id, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search returns false positives if query words are in the "related" articles section
        if not self._contains_all_query_words(query, f"{title} {full_content}"):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        article_url = f"https://{self.lang_code}.wikinews.org/?curid={page_id}"

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": article_url,  # Used by '_filter_for_relevance' function
            "link": article_url,  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date). On a request or JSON
            error the content is empty and the date is the fallback date.
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = self._parse_wikinews_timestamp(
                        revisions[0]["timestamp"]
                    )
                except (ValueError, KeyError, TypeError, IndexError):
                    # Malformed or missing timestamp in the revision entry
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ):
            logger.warning(f"Error fetching content for page {page_id}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Pages through the search results until `max_results` articles have
        passed the filters or the API returns no further results.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles: list[dict[str, Any]] = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                    if len(articles) >= self.max_results:
                        break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items
498def _clean_wikinews_snippet(snippet: str) -> str:
499 """
500 Clean a Wikinews search snippet.
502 Args:
503 snippet (str): Raw snippet from Wikinews API
505 Returns:
506 Clean human-readable text
507 """
508 if not snippet:
509 return ""
511 # Unescape HTML entities
512 unescaped = html.unescape(snippet)
514 # Remove HTML tags
515 clean_text = re.sub(r"<.*?>", "", unescaped)
517 # Normalize whitespace
518 return re.sub(r"\s+", " ", clean_text).strip()