Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikinews.py: 95%

159 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1from datetime import datetime, timedelta, UTC 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4import json 

5import html 

6import re 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...utilities.json_utils import extract_json, get_llm_response_text 

12from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP 

13from ..search_engine_base import BaseSearchEngine 

14from ...security import safe_get 

15 

# Identifying User-Agent required by Wikimedia's API etiquette policy.
HEADERS = {
    "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
}
# Wikinews language editions supported by this engine (subdomain codes).
WIKINEWS_LANGUAGES = [
    "ru",
    "sr",
    "pt",
    "fr",
    "pl",
    "en",
    "zh",
    "de",
    "it",
    "es",
    "cs",
    "nl",
    "ca",
    "ar",
    "ja",
]
# HTTP request timeout for all Wikinews API calls.
TIMEOUT = 5  # Seconds
# Look-back window for each supported `time_period` setting.
TIME_PERIOD_DELTAS = {
    "all": None,  # No time filter
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),  # 1 month
    "w": timedelta(days=7),  # 1 week
    "d": timedelta(days=1),  # 24 hours
}
# Look-back window (days) used when a query is classified as CURRENT.
DEFAULT_RECENT_BACKWARD_DAYS = 60
# Maximum attempts for a failing search API request.
MAX_RETRIES = 3

46 

47 

class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization"""

    # Mark as public and news search engine
    is_public = True
    is_news = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
        """

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Language initialization: map a human-readable language name to an
        # ISO-style code, falling back to English when unknown.
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization (timezone-aware, UTC). Unknown time_period
        # values fall back to a one-year window.
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL template; formatted with the language code per request.
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews
        """
        # Without an LLM there is nothing to optimize with.
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None or not isinstance(data, dict):
                raise ValueError("No valid JSON found in response")  # noqa: TRY301 — caught by except ValueError to fall back to original query

            optimized_query: str = str(data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")  # noqa: TRY301 — caught by except ValueError to fall back to original query

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ):
            # Any optimization failure degrades gracefully to the raw query.
            logger.warning(
                "Error optimizing query for Wikinews. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM; tolerate the different response shapes
            # returned by various LLM wrappers (.content, .text, or str()).
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error
            logger.exception(
                "Error adapting date range for query. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty after MAX_RETRIES failures).
        """
        # Request parameters are invariant across retries; build them once.
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srprop": "snippet|timestamp",
            "srlimit": 50,
            "sroffset": sroffset,
            "format": "json",
        }

        retries = 0
        while retries < MAX_RETRIES:
            # Apply rate limiting before each search request (retries included)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    self.api_url.format(lang_code=self.lang_code),
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])  # type: ignore[no-any-return]
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ):
                logger.warning("Error fetching search results, retrying...")
                retries += 1

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            # API timestamps use a trailing "Z"; normalize for fromisoformat.
            last_edit_timestamp = result.get("timestamp", "")
            last_edit_date = datetime.fromisoformat(
                last_edit_timestamp.replace("Z", "+00:00")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: Wikinews API do not allow to retrieve publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            int(page_id) if page_id is not None else 0, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search return false positive if query words are in "related" articles section
        # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
        combined_text = f"{title} {full_content}".lower()
        query_words = [
            w.lower() for w in query.split() if len(w) > 1
        ]  # Skip single chars
        if query_words and not all(
            re.search(rf"\b{re.escape(word)}\b", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date)
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = datetime.fromisoformat(
                        revisions[0]["timestamp"].replace("Z", "+00:00")
                    )
                except ValueError:
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ):
            logger.warning(f"Error fetching content for page {page_id}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles: list[dict[str, Any]] = []
        sroffset = 0

        # Paginate through search results until enough articles survive the
        # filters or the API runs out of results.
        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                if len(articles) >= self.max_results:
                    break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items

497 

498 

499def _clean_wikinews_snippet(snippet: str) -> str: 

500 """ 

501 Clean a Wikinews search snippet. 

502 

503 Args: 

504 snippet (str): Raw snippet from Wikinews API 

505 

506 Returns: 

507 Clean human-readable text 

508 """ 

509 if not snippet: 

510 return "" 

511 

512 # Unescape HTML entities 

513 unescaped = html.unescape(snippet) 

514 

515 # Remove HTML tags 

516 clean_text = re.sub(r"<.*?>", "", unescaped) 

517 

518 # Normalize whitespace 

519 return re.sub(r"\s+", " ", clean_text).strip()