Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py: 58%

163 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

1from datetime import datetime, timedelta, UTC 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4import json 

5import html 

6import re 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...utilities.search_utilities import remove_think_tags 

12from ..search_engine_base import BaseSearchEngine 

13from ...utilities.search_utilities import LANGUAGE_CODE_MAP 

14from ...security import safe_get 

15 

16HEADERS = { 

17 "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)" 

18} 

19WIKINEWS_LANGUAGES = [ 

20 "ru", 

21 "sr", 

22 "pt", 

23 "fr", 

24 "pl", 

25 "en", 

26 "zh", 

27 "de", 

28 "it", 

29 "es", 

30 "cs", 

31 "nl", 

32 "ca", 

33 "ar", 

34 "ja", 

35] 

36TIMEOUT = 5 # Seconds 

37TIME_PERIOD_DELTAS = { 

38 "all": None, # No time filter 

39 "y": timedelta(days=365), # 1 year 

40 "m": timedelta(days=30), # 1 month 

41 "w": timedelta(days=7), # 1 week 

42 "d": timedelta(days=1), # 24 hours 

43} 

44DEFAULT_RECENT_BACKWARD_DAYS = 60 

45MAX_RETRIES = 3 

46 

47 

48class WikinewsSearchEngine(BaseSearchEngine): 

49 """Wikinews search engine implementation with LLM query optimization""" 

50 

51 # Mark as a public news search engine 

52 is_public = True 

53 is_news = True 

54 

55 def __init__( 

56 self, 

57 search_language: str = "english", 

58 adaptive_search: bool = True, 

59 time_period: str = "y", 

60 llm: Optional[BaseLLM] = None, 

61 max_filtered_results: Optional[int] = None, 

62 max_results: int = 10, 

63 search_snippets_only: bool = True, 

64 **kwargs, 

65 ): 

66 """ 

67 Initialize the Wikinews search engine. 

68 

69 Args: 

70 search_language (str): Language for Wikinews search (e.g. "english"). 

71 adaptive_search (bool): Whether to expand or shrink date ranges based on query. 

72 time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d"). 

73 llm (Optional[BaseLLM]): Language model used for query optimization and classification. 

74 max_filtered_results (Optional[int]): Maximum number of results to keep after filtering. 

75 max_results (int): Maximum number of search results to return. 

76 search_snippets_only (bool): If True, full article content is ignored. 

77 """ 

78 

79 super().__init__( 

80 llm=llm, 

81 max_filtered_results=max_filtered_results, 

82 max_results=max_results, 

83 search_snippets_only=search_snippets_only, 

84 **kwargs, 

85 ) 

86 

87 # Language initialization 

88 lang_code = LANGUAGE_CODE_MAP.get( 

89 search_language.lower(), 

90 "en", # Default to English if not found 

91 ) 

92 

93 if lang_code not in WIKINEWS_LANGUAGES: 93 ↛ 94 (line 93 didn't jump to line 94 because the condition on line 93 was never true)

94 logger.warning( 

95 f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English." 

96 ) 

97 lang_code = "en" 

98 

99 self.lang_code: str = lang_code 

100 

101 # Adaptive search 

102 self.adaptive_search: bool = adaptive_search 

103 

104 # Date range initialization 

105 now = datetime.now(UTC) 

106 delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365)) 

107 self.from_date: datetime = ( 

108 now - delta if delta else datetime.min.replace(tzinfo=UTC) 

109 ) 

110 self.to_date: datetime = now 

111 

112 # Preserve original date range so adaptive search can restore it 

113 self._original_date_range = (self.from_date, self.to_date) 

114 

115 # API base URL 

116 self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php" 

117 

118 def _optimize_query_for_wikinews(self, query: str) -> str: 

119 """ 

120 Optimize a natural language query for Wikinews search. 

121 Uses LLM to transform questions into effective news search queries. 

122 

123 Args: 

124 query (str): Natural language query 

125 

126 Returns: 

127 Optimized search query for Wikinews 

128 """ 

129 if not self.llm: 

130 return query 

131 

132 try: 

133 # Prompt for query optimization 

134 prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query. 

135 

136Input question: 

137"{query}" 

138 

139STRICT OUTPUT REQUIREMENTS (follow ALL of them): 

1401. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}. 

1412. The JSON must be valid, minified, and contain no trailing text. 

1423. The refined query must be extremely short: MAXIMUM 3–4 words. 

1434. Include only the essential keywords (proper names, events, entities, places). 

1445. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is"). 

1456. DO NOT add Boolean operators (AND, OR). 

1467. DO NOT use quotes inside the query. 

1478. DO NOT add explanations or comments. 

148 

149EXAMPLES: 

150- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}} 

151- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}} 

152- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}} 

153- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}} 

154 

155NOW RETURN ONLY THE JSON OBJECT. 

156""" 

157 # Get response from LLM 

158 response = self.llm.invoke(prompt) 

159 

160 response_text = ( 

161 getattr(response, "content", None) 

162 or getattr(response, "text", None) 

163 or str(response) 

164 ) 

165 

166 # Find possible JSON object boundaries 

167 start = response_text.find("{") 

168 end = response_text.rfind("}") 

169 

170 # Validate boundaries before slicing 

171 if start == -1 or end == -1 or end <= start: 

172 raise ValueError("No valid JSON boundaries found") 

173 

174 json_str = response_text[start : end + 1] 

175 

176 data = json.loads(json_str) 

177 

178 if not isinstance(data, dict): 178 ↛ 179 (line 178 didn't jump to line 179 because the condition on line 178 was never true)

179 raise ValueError("Extracted JSON is not an object") 

180 

181 optimized_query = (data.get("query", "")).strip() 

182 

183 if not optimized_query: 

184 raise ValueError("Query field missing or empty") 

185 

186 except ( 

187 ValueError, 

188 TypeError, 

189 AttributeError, 

190 json.JSONDecodeError, 

191 ) as e: 

192 logger.warning( 

193 f"Error optimizing query for Wikinews: {e}. Using original query." 

194 ) 

195 return query 

196 

197 logger.info(f"Original query: '{query}'") 

198 logger.info(f"Optimized for Wikinews: '{optimized_query}'") 

199 

200 return optimized_query 

201 

202 def _adapt_date_range_for_query(self, query: str) -> None: 

203 """ 

204 Adapt the date range based on the query type (historical vs recent events). 

205 

206 Args: 

207 query (str): The search query 

208 """ 

209 # Reset to original date parameters first 

210 self.from_date, self.to_date = self._original_date_range 

211 

212 if not self.adaptive_search or not self.llm: 

213 return 

214 

215 # Do not adapt for very short queries (not enough context) 

216 if len(query.split()) <= 4: 

217 return 

218 

219 try: 

220 prompt = f"""Classify this query based on temporal scope. 

221 

222Query: "{query}" 

223 

224Current date: {datetime.now(UTC).strftime("%Y-%m-%d")} 

225Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT 

226 

227Classification rules: 

228- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week" 

229- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of") 

230- UNCLEAR: Ambiguous temporal context 

231 

232Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR""" 

233 # Get response from LLM 

234 response = self.llm.invoke(prompt) 

235 response_text = ( 

236 getattr(response, "content", None) 

237 or getattr(response, "text", None) 

238 or str(response) 

239 ) 

240 answer = remove_think_tags(response_text).upper() 

241 

242 if "CURRENT" in answer: 

243 # For current events, focus on recent content 

244 logger.info( 

245 f"Query '{query}' classified as CURRENT - focusing on recent content" 

246 ) 

247 self.from_date = datetime.now(UTC) - timedelta( 

248 days=DEFAULT_RECENT_BACKWARD_DAYS 

249 ) 

250 elif "HISTORICAL" in answer: 250 ↛ 257line 250 didn't jump to line 257 because the condition on line 250 was always true

251 # For historical queries, go back as far as possible 

252 logger.info( 

253 f"Query '{query}' classified as HISTORICAL - extending search timeframe" 

254 ) 

255 self.from_date = datetime.min.replace(tzinfo=UTC) 

256 else: 

257 logger.info( 

258 f"Query '{query}' classified as UNCLEAR - keeping original date range" 

259 ) 

260 

261 except (AttributeError, TypeError, ValueError, RuntimeError) as e: 

262 # Keep original date parameters on error 

263 logger.exception( 

264 f"Error adapting date range for query: {e}. Keeping original date range." 

265 ) 

266 

267 def _fetch_search_results( 

268 self, query: str, sroffset: int 

269 ) -> List[Dict[str, Any]]: 

270 """Fetch search results from Wikinews API. 

271 

272 Args: 

273 query (str): The search query. 

274 sroffset (int): The result offset for pagination. 

275 

276 Returns: 

277 List of search result items. 

278 """ 

279 retries = 0 

280 while retries < MAX_RETRIES: 

281 params = { 

282 "action": "query", 

283 "list": "search", 

284 "srsearch": query, 

285 "srprop": "snippet|timestamp", 

286 "srlimit": 50, 

287 "sroffset": sroffset, 

288 "format": "json", 

289 } 

290 

291 # Apply rate limiting before search request 

292 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

293 self.engine_type 

294 ) 

295 

296 try: 

297 response = safe_get( 

298 self.api_url.format(lang_code=self.lang_code), 

299 params=params, 

300 headers=HEADERS, 

301 timeout=TIMEOUT, 

302 ) 

303 response.raise_for_status() 

304 data = response.json() 

305 return data.get("query", {}).get("search", []) 

306 except ( 

307 requests.exceptions.RequestException, 

308 json.JSONDecodeError, 

309 ) as e: 

310 logger.warning( 

311 f"Error fetching search results: {e}, retrying..." 

312 ) 

313 retries += 1 

314 

315 return [] 

316 

317 def _process_search_result( 

318 self, result: Dict[str, Any], query: str 

319 ) -> Optional[Dict[str, Any]]: 

320 """Process and filter a single search result. 

321 

322 Args: 

323 result (Dict[str, Any]): A single search result item. 

324 query (str): The search query. 

325 

326 Returns: 

327 Processed result or None if filtered out. 

328 """ 

329 page_id = result.get("pageid") 

330 title = result.get("title", "") 

331 snippet = _clean_wikinews_snippet(result.get("snippet", "")) 

332 

333 try: 

334 last_edit_timestamp = result.get("timestamp", "") 

335 last_edit_date = datetime.fromisoformat( 

336 last_edit_timestamp.replace("Z", "+00:00") 

337 ) 

338 except ValueError: 

339 logger.warning( 

340 f"Error parsing last edit date for page {page_id}, using current date as fallback." 

341 ) 

342 last_edit_date = datetime.now(UTC) 

343 

344 # First filter: last edit date must be after from_date 

345 if last_edit_date < self.from_date: 

346 # In this case we can skip fetching full content 

347 return None 

348 

349 # Fetch full article content and extract actual publication date 

350 # Note: the Wikinews API does not allow retrieving the publication date in batched search results 

351 full_content, publication_date = self._fetch_full_content_and_pubdate( 

352 page_id, last_edit_date 

353 ) 

354 

355 # Second filter: publication date within range 

356 if publication_date < self.from_date or publication_date > self.to_date: 

357 return None 

358 

359 # Third filter: check if all query words are in title or content 

360 # Note: Wikinews search returns false positives if query words appear in the "related" articles section 

361 # Use word boundary matching to avoid substring matches (e.g., "is" matching "This") 

362 combined_text = f"{title} {full_content}".lower() 

363 query_words = [ 

364 w.lower() for w in query.split() if len(w) > 1 

365 ] # Skip single chars 

366 if query_words and not all( 

367 re.search(rf"\b{re.escape(word)}\b", combined_text) 

368 for word in query_words 

369 ): 

370 return None 

371 

372 # If only snippets are requested, use the snippet as the full content 

373 if self.search_snippets_only: 

374 full_content = snippet 

375 

376 return { 

377 "id": page_id, 

378 "title": title, 

379 "snippet": snippet, 

380 "source": "wikinews", 

381 "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}", # Used by '_filter_for_relevance' function 

382 "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}", # Used by citation handler 

383 "content": full_content, 

384 "full_content": full_content, 

385 "publication_date": publication_date.isoformat(timespec="seconds"), 

386 } 

387 

388 def _fetch_full_content_and_pubdate( 

389 self, page_id: int, fallback_date: datetime 

390 ) -> Tuple[str, datetime]: 

391 """Fetch full article content and publication date from Wikinews API. 

392 

393 Args: 

394 page_id (int): The Wikinews page ID. 

395 fallback_date (datetime): Fallback date if publication date cannot be determined. 

396 

397 Returns: 

398 Tuple of (full_content, publication_date) 

399 """ 

400 try: 

401 content_params = { 

402 "action": "query", 

403 "prop": "revisions|extracts", 

404 "pageids": page_id, 

405 "rvprop": "timestamp", 

406 "rvdir": "newer", # Older revisions first 

407 "rvlimit": 1, # Get the first revision (i.e. publication) 

408 "explaintext": True, 

409 "format": "json", 

410 } 

411 

412 # Apply rate limiting before content request 

413 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

414 self.engine_type 

415 ) 

416 

417 content_resp = safe_get( 

418 self.api_url.format(lang_code=self.lang_code), 

419 params=content_params, 

420 headers=HEADERS, 

421 timeout=TIMEOUT, 

422 ) 

423 content_resp.raise_for_status() 

424 content_data = content_resp.json() 

425 

426 page_data = ( 

427 content_data.get("query", {}) 

428 .get("pages", {}) 

429 .get(str(page_id), {}) 

430 ) 

431 full_content = page_data.get("extract", "") 

432 revisions = page_data.get("revisions", []) 

433 

434 if revisions: 

435 try: 

436 # First revision timestamp is the publication date 

437 publication_date = datetime.fromisoformat( 

438 revisions[0]["timestamp"].replace("Z", "+00:00") 

439 ) 

440 except ValueError: 

441 logger.warning( 

442 f"Error parsing publication date for page {page_id}, using fallback date." 

443 ) 

444 publication_date = fallback_date 

445 else: 

446 logger.warning( 

447 f"No revisions found for page {page_id}, using fallback date." 

448 ) 

449 publication_date = fallback_date 

450 

451 return full_content, publication_date 

452 

453 except ( 

454 requests.exceptions.RequestException, 

455 json.JSONDecodeError, 

456 ) as e: 

457 logger.warning(f"Error fetching content for page {page_id}: {e}") 

458 return "", fallback_date 

459 

460 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

461 """ 

462 Retrieve article previews from Wikinews based on the query. 

463 

464 Args: 

465 query (str): The search query 

466 

467 Returns: 

468 List of relevant article previews 

469 """ 

470 # Adapt date range based on query and optimize query (if LLM is available) 

471 self._adapt_date_range_for_query(query) 

472 optimized_query = self._optimize_query_for_wikinews(query) 

473 

474 articles = [] 

475 sroffset = 0 

476 

477 while len(articles) < self.max_results: 

478 search_results = self._fetch_search_results( 

479 optimized_query, sroffset 

480 ) 

481 if not search_results: 

482 # No more results available (or multiple retries failed) 

483 break 

484 

485 for result in search_results: 

486 article = self._process_search_result(result, optimized_query) 

487 if article: 

488 articles.append(article) 

489 if len(articles) >= self.max_results: 

490 break 

491 

492 sroffset += len(search_results) 

493 

494 return articles 

495 

496 def _get_full_content( 

497 self, relevant_items: List[Dict[str, Any]] 

498 ) -> List[Dict[str, Any]]: 

499 """ 

500 Retrieve full content for relevant Wikinews articles. 

501 

502 Args: 

503 relevant_items (List[Dict[str, Any]]): List of relevant article previews 

504 

505 Returns: 

506 List of articles with full content 

507 """ 

508 # Since full content is already fetched in _get_previews, just return relevant items 

509 return relevant_items 

510 

511 

512def _clean_wikinews_snippet(snippet: str) -> str: 

513 """ 

514 Clean a Wikinews search snippet. 

515 

516 Args: 

517 snippet (str): Raw snippet from Wikinews API 

518 

519 Returns: 

520 Clean human-readable text 

521 """ 

522 if not snippet: 

523 return "" 

524 

525 # Unescape HTML entities 

526 unescaped = html.unescape(snippet) 

527 

528 # Remove HTML tags 

529 clean_text = re.sub(r"<.*?>", "", unescaped) 

530 

531 # Normalize whitespace 

532 clean_text = re.sub(r"\s+", " ", clean_text).strip() 

533 

534 return clean_text
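
The class above implements the _get_previews and _get_full_content hooks that its BaseSearchEngine parent calls; the public search entry point lives in that base class and is not shown in this listing. The sketch below is a minimal, illustrative way to drive the module directly through its own _get_previews hook. It assumes the package is importable under the path implied by the report header and that instantiating the engine without an LLM is acceptable (in that case query optimization and adaptive date ranges are simply skipped, per the code above); it is not the project's documented public API.

# Usage sketch (illustrative only, not part of the module above).
from local_deep_research.web_search_engines.engines.search_engine_wikinews import (
    WikinewsSearchEngine,
)

engine = WikinewsSearchEngine(
    search_language="english",  # mapped to "en" through LANGUAGE_CODE_MAP
    time_period="m",            # look back roughly 30 days (see TIME_PERIOD_DELTAS)
    max_results=5,
    llm=None,                   # no LLM: query optimization and adaptive dates are skipped
)

# _get_previews runs the Wikinews search, applies the date and keyword filters,
# and returns the article dictionaries built in _process_search_result.
for article in engine._get_previews("Ukraine Russia negotiations"):
    print(article["publication_date"], article["title"], article["link"])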