Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py: 95%

160 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1from datetime import datetime, timedelta, UTC 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4import json 

5import html 

6import re 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...constants import USER_AGENT 

12from ...utilities.json_utils import extract_json, get_llm_response_text 

13from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP 

14from ..search_engine_base import BaseSearchEngine 

15from ...security import safe_get 

16 

# HTTP headers sent with every Wikinews API request.
HEADERS = {"User-Agent": USER_AGENT}

# Language codes accepted by this engine; any other code falls back to "en".
WIKINEWS_LANGUAGES = [
    "ru", "sr", "pt", "fr", "pl",
    "en", "zh", "de", "it", "es",
    "cs", "nl", "ca", "ar", "ja",
]

# Per-request HTTP timeout, in seconds.
TIMEOUT = 5

# Look-back window for each supported ``time_period`` key.
# ``None`` means "no time filter".
TIME_PERIOD_DELTAS = {
    "all": None,
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),  # 1 month
    "w": timedelta(weeks=1),  # 1 week
    "d": timedelta(hours=24),  # 24 hours
}

# Size (in days) of the window used when a query is classified as CURRENT.
DEFAULT_RECENT_BACKWARD_DAYS = 60

# Maximum number of attempts for a single search request.
MAX_RETRIES = 3

45 

46 

class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization.

    Results come from the MediaWiki search API of the selected Wikinews
    language edition. Each candidate is filtered by last-edit date, by
    publication date (timestamp of the first revision) and by the presence
    of every query word in the title or article text.
    """

    # Mark as public and news search engine
    is_public = True
    is_news = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
                Unknown values fall back to one year.
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
            settings_snapshot (Optional[Dict[str, Any]]): Forwarded unchanged to the base class.
            **kwargs: Extra keyword arguments forwarded unchanged to the base class.
        """

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Language initialization
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        # A falsy delta ("all") means no lower bound: use the earliest
        # representable timezone-aware datetime.
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL (a template; formatted with ``lang_code`` at request time)
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    @staticmethod
    def _parse_api_timestamp(timestamp: Any) -> datetime:
        """Parse an ISO-8601 timestamp from the API into an aware datetime.

        Args:
            timestamp (Any): Raw timestamp value (expected: a string such as
                "2024-01-31T12:00:00Z").

        Returns:
            Timezone-aware datetime. Values without an offset are treated as UTC
            so they can be compared with ``from_date`` / ``to_date``.

        Raises:
            ValueError: If the value is not a string or cannot be parsed.
        """
        if not isinstance(timestamp, str):
            raise ValueError(f"Invalid timestamp: {timestamp!r}")

        parsed = datetime.fromisoformat(timestamp.replace("Z", "+00:00"))
        if parsed.tzinfo is None:
            # Comparing naive and aware datetimes raises TypeError
            parsed = parsed.replace(tzinfo=UTC)
        return parsed

    @staticmethod
    def _extract_query_words(query: str) -> List[str]:
        """Split a query into lower-cased words usable for whole-word matching.

        Leading/trailing punctuation is stripped (e.g. "Ukraine?" -> "ukraine"),
        otherwise a word-boundary regex could never match such a word.
        Words of a single character (after stripping) are skipped.
        """
        stripped = (
            re.sub(r"^\W+|\W+$", "", raw) for raw in query.lower().split()
        )
        return [word for word in stripped if len(word) > 1]

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews. The original query is returned
            when no LLM is configured or the LLM answer cannot be parsed.
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None or not isinstance(data, dict):
                raise ValueError("No valid JSON found in response")  # noqa: TRY301 — caught by except ValueError to fall back to original query

            optimized_query: str = str(data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")  # noqa: TRY301 — caught by except ValueError to fall back to original query

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ):
            logger.warning(
                "Error optimizing query for Wikinews. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        The range is always reset to the one computed in ``__init__`` first.
        It is then changed only when adaptive search is enabled, an LLM is
        available and the query has more than four words.

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error
            logger.exception(
                "Error adapting date range for query. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        The request is attempted up to ``MAX_RETRIES`` times; the rate limiter
        is applied before every attempt.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty if every attempt failed).
        """
        # Request parameters and URL do not change between attempts
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srprop": "snippet|timestamp",
            "srlimit": 50,
            "sroffset": sroffset,
            "format": "json",
        }
        url = self.api_url.format(lang_code=self.lang_code)

        for _attempt in range(MAX_RETRIES):
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    url,
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])  # type: ignore[no-any-return]
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ):
                logger.warning("Error fetching search results, retrying...")

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_date = self._parse_api_timestamp(
                result.get("timestamp", "")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: Wikinews API does not allow retrieving the publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            int(page_id) if page_id is not None else 0, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search returns false positives if query words are in "related" articles section
        # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
        combined_text = f"{title} {full_content}".lower()
        query_words = self._extract_query_words(query)
        if query_words and not all(
            re.search(rf"\b{re.escape(word)}\b", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date). On a request or JSON
            decoding error the tuple is ("", fallback_date).
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = self._parse_api_timestamp(
                        revisions[0]["timestamp"]
                    )
                except (ValueError, KeyError, TypeError):
                    # KeyError/TypeError: revision entry without a usable "timestamp"
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ):
            logger.warning(f"Error fetching content for page {page_id}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Pages through the search results until ``max_results`` articles have
        passed the filters or the API returns no more results.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles: List[Dict[str, Any]] = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                    if len(articles) >= self.max_results:
                        break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items

496 

497 

498def _clean_wikinews_snippet(snippet: str) -> str: 

499 """ 

500 Clean a Wikinews search snippet. 

501 

502 Args: 

503 snippet (str): Raw snippet from Wikinews API 

504 

505 Returns: 

506 Clean human-readable text 

507 """ 

508 if not snippet: 

509 return "" 

510 

511 # Unescape HTML entities 

512 unescaped = html.unescape(snippet) 

513 

514 # Remove HTML tags 

515 clean_text = re.sub(r"<.*?>", "", unescaped) 

516 

517 # Normalize whitespace 

518 return re.sub(r"\s+", " ", clean_text).strip()