Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikinews.py: 89%

158 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1from datetime import datetime, timedelta, UTC 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4import json 

5import html 

6import re 

7import requests 

8from langchain_core.language_models import BaseLLM 

9from loguru import logger 

10 

11from ...utilities.json_utils import extract_json, get_llm_response_text 

12from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP 

13from ..search_engine_base import BaseSearchEngine 

14from ...security import safe_get 

15 

# HTTP headers for all Wikinews API requests. Wikimedia API etiquette asks
# for a descriptive User-Agent identifying the client and a contact point.
HEADERS = {
    "User-Agent": "local-deep-research-wikinews-search-engine (github.com/LearningCircuit/local-deep-research)"
}
# Wikinews language editions supported by this engine; a requested language
# outside this list falls back to English in __init__.
WIKINEWS_LANGUAGES = [
    "ru",
    "sr",
    "pt",
    "fr",
    "pl",
    "en",
    "zh",
    "de",
    "it",
    "es",
    "cs",
    "nl",
    "ca",
    "ar",
    "ja",
]
TIMEOUT = 5  # Seconds — per-request HTTP timeout for the Wikinews API
# Maps a time-period key ("all", "y", "m", "w", "d") to the look-back window
# used to filter results by date; None means no time filter at all.
TIME_PERIOD_DELTAS = {
    "all": None,  # No time filter
    "y": timedelta(days=365),  # 1 year
    "m": timedelta(days=30),  # 1 month
    "w": timedelta(days=7),  # 1 week
    "d": timedelta(days=1),  # 24 hours
}
# Look-back window (days) applied when adaptive search classifies a query
# as CURRENT; also quoted inside the classification prompt.
DEFAULT_RECENT_BACKWARD_DAYS = 60
# Maximum attempts per search API call before giving up and returning [].
MAX_RETRIES = 3

46 

47 

class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization"""

    # Mark as public and news search engine
    is_public = True
    is_news = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
        """

        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            **kwargs,
        )

        # Language initialization: map a human-readable language name to its
        # ISO code, then fall back to English for unsupported editions.
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization: unknown time_period keys default to the
        # one-year window; "all" (delta None) means search from the beginning.
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL template; formatted with the language code per request.
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews (or the original query when
            no LLM is configured or optimization fails).
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None:
                raise ValueError("No valid JSON found in response")

            optimized_query = (data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ) as e:
            # Any malformed LLM output degrades gracefully to the raw query.
            logger.warning(
                f"Error optimizing query for Wikinews: {e}. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM. Use the shared helper for response-text
            # extraction so this stays consistent with
            # _optimize_query_for_wikinews.
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error.
            # (Fixed: the previous message contained an empty f-string
            # placeholder remnant and never included the query.)
            logger.exception(
                f"Error adapting date range for query '{query}'. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty after MAX_RETRIES failures).
        """
        # Request parameters are invariant across retries — build them once.
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srprop": "snippet|timestamp",
            "srlimit": 50,
            "sroffset": sroffset,
            "format": "json",
        }

        retries = 0
        while retries < MAX_RETRIES:
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    self.api_url.format(lang_code=self.lang_code),
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ) as e:
                logger.warning(
                    f"Error fetching search results: {e}, retrying..."
                )
                retries += 1

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        try:
            last_edit_timestamp = result.get("timestamp", "")
            # MediaWiki timestamps end in "Z"; normalize for fromisoformat.
            last_edit_date = datetime.fromisoformat(
                last_edit_timestamp.replace("Z", "+00:00")
            )
        except ValueError:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: Wikinews API do not allow to retrieve publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            page_id, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search return false positive if query words are in "related" articles section
        # Use word boundary matching to avoid substring matches (e.g., "is" matching "This")
        combined_text = f"{title} {full_content}".lower()
        query_words = [
            w.lower() for w in query.split() if len(w) > 1
        ]  # Skip single chars
        if query_words and not all(
            re.search(rf"\b{re.escape(word)}\b", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by '_filter_for_relevance' function
            "link": f"https://{self.lang_code}.wikinews.org/?curid={page_id}",  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date)
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()

            page_data = (
                content_data.get("query", {})
                .get("pages", {})
                .get(str(page_id), {})
            )
            full_content = page_data.get("extract", "")
            revisions = page_data.get("revisions", [])

            if revisions:
                try:
                    # First revision timestamp is the publication date
                    publication_date = datetime.fromisoformat(
                        revisions[0]["timestamp"].replace("Z", "+00:00")
                    )
                except ValueError:
                    logger.warning(
                        f"Error parsing publication date for page {page_id}, using fallback date."
                    )
                    publication_date = fallback_date
            else:
                logger.warning(
                    f"No revisions found for page {page_id}, using fallback date."
                )
                publication_date = fallback_date

            return full_content, publication_date

        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ) as e:
            logger.warning(f"Error fetching content for page {page_id}: {e}")
            return "", fallback_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                    if len(articles) >= self.max_results:
                        break

            # Advance by the raw page size so filtered-out items are skipped too.
            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items

495 

496 

497def _clean_wikinews_snippet(snippet: str) -> str: 

498 """ 

499 Clean a Wikinews search snippet. 

500 

501 Args: 

502 snippet (str): Raw snippet from Wikinews API 

503 

504 Returns: 

505 Clean human-readable text 

506 """ 

507 if not snippet: 

508 return "" 

509 

510 # Unescape HTML entities 

511 unescaped = html.unescape(snippet) 

512 

513 # Remove HTML tags 

514 clean_text = re.sub(r"<.*?>", "", unescaped) 

515 

516 # Normalize whitespace 

517 clean_text = re.sub(r"\s+", " ", clean_text).strip() 

518 

519 return clean_text