Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikinews.py: 95%
159 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1from datetime import datetime, timedelta, UTC
2from typing import Any, Dict, List, Optional, Tuple
4import json
5import html
6import re
7import requests
8from langchain_core.language_models import BaseLLM
9from loguru import logger
11from ...utilities.json_utils import extract_json, get_llm_response_text
12from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP
13from ..search_engine_base import BaseSearchEngine
14from ...security import safe_get
# Identify this client to the Wikimedia servers, per their User-Agent policy.
HEADERS = {
    "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
}

# Wikinews language editions this engine is able to query.
WIKINEWS_LANGUAGES = [
    "ru", "sr", "pt", "fr", "pl",
    "en", "zh", "de", "it", "es",
    "cs", "nl", "ca", "ar", "ja",
]

# HTTP request timeout, in seconds.
TIMEOUT = 5

# Look-back window per time-period code; None means no date filtering at all.
TIME_PERIOD_DELTAS = {
    "all": None,           # No time filter
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),   # 1 month
    "w": timedelta(days=7),    # 1 week
    "d": timedelta(days=1),    # 24 hours
}

# How many days back counts as "recent" when a query is classified as CURRENT.
DEFAULT_RECENT_BACKWARD_DAYS = 60

# Maximum attempts for a failing search API request.
MAX_RETRIES = 3
class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization"""

    # Mark as public and news search engine
    is_public = True
    is_news = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Language initialization: map a human-readable language name to an
        # ISO-style code, then ensure Wikinews actually has that edition.
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization. "all" maps to None in TIME_PERIOD_DELTAS,
        # which disables the lower bound (datetime.min).
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL (lang_code is substituted per-request)
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews (falls back to the original
            query if no LLM is configured or optimization fails).
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None or not isinstance(data, dict):
                raise ValueError("No valid JSON found in response")  # noqa: TRY301 — caught by except ValueError to fall back to original query

            optimized_query: str = str(data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")  # noqa: TRY301 — caught by except ValueError to fall back to original query

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ):
            # Fixed: message previously read "...WikinewsUsing original query."
            logger.warning(
                "Error optimizing query for Wikinews. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM (use the shared helper for consistency
            # with _optimize_query_for_wikinews instead of a getattr chain)
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error.
            # Fixed: message previously lost the query interpolation ("query: .")
            logger.exception(
                f"Error adapting date range for query '{query}'. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty list after MAX_RETRIES failures).
        """
        retries = 0
        while retries < MAX_RETRIES:
            params = {
                "action": "query",
                "list": "search",
                "srsearch": query,
                "srprop": "snippet|timestamp",
                "srlimit": 50,
                "sroffset": sroffset,
                "format": "json",
            }

            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    self.api_url.format(lang_code=self.lang_code),
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])  # type: ignore[no-any-return]
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ):
                # Fixed: message previously read "...search resultsretrying..."
                logger.warning("Error fetching search results, retrying...")
                retries += 1

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_timestamp = result.get("timestamp", "")
            last_edit_date = datetime.fromisoformat(
                last_edit_timestamp.replace("Z", "+00:00")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: Wikinews API do not allow to retrieve publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            int(page_id) if page_id is not None else 0, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search return false positive if query words are in "related" articles section
        # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
        combined_text = f"{title} {full_content}".lower()
        query_words = [
            w.lower() for w in query.split() if len(w) > 1
        ]  # Skip single chars
        if query_words and not all(
            re.search(rf"\b{re.escape(word)}\b", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date)
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = datetime.fromisoformat(
                        revisions[0]["timestamp"].replace("Z", "+00:00")
                    )
                except ValueError:
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ):
            logger.warning(f"Error fetching content for page {page_id}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles: list[dict[str, Any]] = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                    if len(articles) >= self.max_results:
                        break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items
499def _clean_wikinews_snippet(snippet: str) -> str:
500 """
501 Clean a Wikinews search snippet.
503 Args:
504 snippet (str): Raw snippet from Wikinews API
506 Returns:
507 Clean human-readable text
508 """
509 if not snippet:
510 return ""
512 # Unescape HTML entities
513 unescaped = html.unescape(snippet)
515 # Remove HTML tags
516 clean_text = re.sub(r"<.*?>", "", unescaped)
518 # Normalize whitespace
519 return re.sub(r"\s+", " ", clean_text).strip()