Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wikinews.py

1from datetime import datetime, timedelta, UTC

2from typing import Any, Dict, List, Optional, Tuple

4import json

5import html

6import re

7import requests

8from langchain_core.language_models import BaseLLM

9from loguru import logger

11from ...utilities.json_utils import extract_json, get_llm_response_text

12from ...utilities.search_utilities import remove_think_tags, LANGUAGE_CODE_MAP

13from ..search_engine_base import BaseSearchEngine

14from ...security import safe_get

# Identifying User-Agent sent with every Wikinews API request.
HEADERS: Dict[str, str] = {
    "User-Agent": (
        "local-deep-research-wikinews-search-engine "
        "(github.com/LearningCircuit/local-deep-research)"
    ),
}

# Language codes accepted by this engine; any other code falls back to "en".
WIKINEWS_LANGUAGES: List[str] = [
    "ru", "sr", "pt", "fr", "pl",
    "en", "zh", "de", "it", "es",
    "cs", "nl", "ca", "ar", "ja",
]

# Per-request HTTP timeout, in seconds.
TIMEOUT: int = 5

# Look-back window for each supported ``time_period`` key.
# ``None`` means "no lower bound on the date".
TIME_PERIOD_DELTAS: Dict[str, Optional[timedelta]] = {
    "all": None,
    "y": timedelta(days=365),
    "m": timedelta(days=30),
    "w": timedelta(days=7),
    "d": timedelta(days=1),
}

# Size (in days) of the window used when a query is classified as CURRENT.
DEFAULT_RECENT_BACKWARD_DAYS: int = 60

# Maximum number of attempts for a single search request.
MAX_RETRIES: int = 3

class WikinewsSearchEngine(BaseSearchEngine):
    """Wikinews search engine implementation with LLM query optimization.

    The engine queries the Wikinews search API, then filters each hit by
    last-edit date, publication date (timestamp of the first revision) and
    presence of every query word in the article title or text.
    """

    # Mark as public and news search engine
    is_public = True
    is_news = True

    def __init__(
        self,
        search_language: str = "english",
        adaptive_search: bool = True,
        time_period: str = "y",
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        max_results: int = 10,
        search_snippets_only: bool = True,
        **kwargs,
    ):
        """
        Initialize the Wikinews search engine.

        Args:
            search_language (str): Language for Wikinews search (e.g. "english").
            adaptive_search (bool): Whether to expand or shrink date ranges based on query.
            time_period (str): Defines the look-back window used to filter search results ("all", "y", "m", "w", "d").
                Unknown values fall back to one year.
            llm (Optional[BaseLLM]): Language model used for query optimization and classification.
            max_filtered_results (Optional[int]): Maximum number of results to keep after filtering.
            max_results (int): Maximum number of search results to return.
            search_snippets_only (bool): If True, full article content is ignored.
            **kwargs: Forwarded unchanged to the base class constructor.
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            search_snippets_only=search_snippets_only,
            **kwargs,
        )

        # Language initialization
        lang_code = LANGUAGE_CODE_MAP.get(
            search_language.lower(),
            "en",  # Default to English if not found
        )

        if lang_code not in WIKINEWS_LANGUAGES:
            logger.warning(
                f"Wikinews does not support language '{search_language}' ({lang_code}). Defaulting to English."
            )
            lang_code = "en"

        self.lang_code: str = lang_code

        # Adaptive search
        self.adaptive_search: bool = adaptive_search

        # Date range initialization.
        # NOTE: the window is anchored to construction time; ``to_date`` is
        # not refreshed on later searches with the same instance.
        now = datetime.now(UTC)
        delta = TIME_PERIOD_DELTAS.get(time_period, timedelta(days=365))
        self.from_date: datetime = (
            now - delta if delta else datetime.min.replace(tzinfo=UTC)
        )
        self.to_date: datetime = now

        # Preserve original date range so adaptive search can restore it
        self._original_date_range = (self.from_date, self.to_date)

        # API base URL (``lang_code`` is substituted at request time)
        self.api_url: str = "https://{lang_code}.wikinews.org/w/api.php"

    @staticmethod
    def _parse_api_timestamp(raw: Any) -> Optional[datetime]:
        """Parse an ISO-8601 timestamp returned by the API.

        Args:
            raw (Any): Timestamp value as found in the API payload.

        Returns:
            A timezone-aware datetime, or None when ``raw`` is missing,
            not a string, or not parseable.
        """
        if not isinstance(raw, str) or not raw:
            return None

        try:
            # A trailing "Z" is rewritten to an explicit UTC offset so the
            # value is accepted by ``fromisoformat`` on all Python versions.
            parsed = datetime.fromisoformat(raw.replace("Z", "+00:00"))
        except ValueError:
            return None

        # from_date / to_date are timezone-aware; comparing them with a
        # naive datetime would raise TypeError, so assume UTC when no
        # offset is present.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=UTC)
        return parsed

    def _optimize_query_for_wikinews(self, query: str) -> str:
        """
        Optimize a natural language query for Wikinews search.
        Uses LLM to transform questions into effective news search queries.

        Args:
            query (str): Natural language query

        Returns:
            Optimized search query for Wikinews, or the original query when
            no LLM is configured or the LLM answer cannot be used.
        """
        if not self.llm:
            return query

        try:
            # Prompt for query optimization
            prompt = f"""You are a query condenser. Your task is to transform the user’s natural-language question into a very short Wikinews search query.

Input question:
"{query}"

STRICT OUTPUT REQUIREMENTS (follow ALL of them):
1. Return ONLY a JSON object with EXACTLY one field: {{"query": "<refined_query>"}}.
2. The JSON must be valid, minified, and contain no trailing text.
3. The refined query must be extremely short: MAXIMUM 3–4 words.
4. Include only the essential keywords (proper names, events, entities, places).
5. Remove filler words (e.g., "news", "latest", "about", "what", "how", "is").
6. DO NOT add Boolean operators (AND, OR).
7. DO NOT use quotes inside the query.
8. DO NOT add explanations or comments.

EXAMPLES:
- "What's the impact of rising interest rates on UK housing market?" → {{"query": "UK housing rates"}}
- "Latest developments in the Ukraine-Russia peace negotiations" → {{"query": "Ukraine Russia negotiations"}}
- "How are tech companies responding to AI regulation?" → {{"query": "tech AI regulation"}}
- "What is Donald Trump's current political activity?" → {{"query": "Trump political activity"}}

NOW RETURN ONLY THE JSON OBJECT.
"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)

            data = extract_json(response_text, expected_type=dict)

            if data is None:
                raise ValueError("No valid JSON found in response")

            # A non-string "query" value raises AttributeError on .strip(),
            # which is handled below like any other unusable answer.
            optimized_query = (data.get("query", "")).strip()

            if not optimized_query:
                raise ValueError("Query field missing or empty")

        except (
            ValueError,
            TypeError,
            AttributeError,
            json.JSONDecodeError,
        ) as e:
            logger.warning(
                f"Error optimizing query for Wikinews: {e}. Using original query."
            )
            return query

        logger.info(f"Original query: '{query}'")
        logger.info(f"Optimized for Wikinews: '{optimized_query}'")

        return optimized_query

    def _adapt_date_range_for_query(self, query: str) -> None:
        """
        Adapt the date range based on the query type (historical vs recent events).

        The range is always reset to the one computed at construction time
        first. It is then narrowed (CURRENT) or extended (HISTORICAL) only
        when adaptive search is enabled, an LLM is available and the query
        has more than four words.

        Args:
            query (str): The search query
        """
        # Reset to original date parameters first
        self.from_date, self.to_date = self._original_date_range

        if not self.adaptive_search or not self.llm:
            return

        # Do not adapt for very short queries (not enough context)
        if len(query.split()) <= 4:
            return

        try:
            prompt = f"""Classify this query based on temporal scope.

Query: "{query}"

Current date: {datetime.now(UTC).strftime("%Y-%m-%d")}
Cutoff: Events within the last {DEFAULT_RECENT_BACKWARD_DAYS} days are CURRENT

Classification rules:
- CURRENT: Recent events (last {DEFAULT_RECENT_BACKWARD_DAYS} days), ongoing situations, "latest", "recent", "today", "this week"
- HISTORICAL: Events before {DEFAULT_RECENT_BACKWARD_DAYS} days ago, timelines, chronologies, past tense ("what happened", "history of")
- UNCLEAR: Ambiguous temporal context

Respond with ONE WORD ONLY: CURRENT, HISTORICAL, or UNCLEAR"""
            # Get response from LLM
            response = self.llm.invoke(prompt)
            response_text = (
                getattr(response, "content", None)
                or getattr(response, "text", None)
                or str(response)
            )
            answer = remove_think_tags(response_text).upper()

            if "CURRENT" in answer:
                # For current events, focus on recent content
                logger.info(
                    f"Query '{query}' classified as CURRENT - focusing on recent content"
                )
                self.from_date = datetime.now(UTC) - timedelta(
                    days=DEFAULT_RECENT_BACKWARD_DAYS
                )
            elif "HISTORICAL" in answer:
                # For historical queries, go back as far as possible
                logger.info(
                    f"Query '{query}' classified as HISTORICAL - extending search timeframe"
                )
                self.from_date = datetime.min.replace(tzinfo=UTC)
            else:
                logger.info(
                    f"Query '{query}' classified as UNCLEAR - keeping original date range"
                )

        except (AttributeError, TypeError, ValueError, RuntimeError):
            # Keep original date parameters on error; logger.exception
            # already attaches the traceback, so the message itself does not
            # need to embed the exception text.
            logger.exception(
                "Error adapting date range for query. Keeping original date range."
            )

    def _fetch_search_results(
        self, query: str, sroffset: int
    ) -> List[Dict[str, Any]]:
        """Fetch search results from Wikinews API.

        The request is attempted up to MAX_RETRIES times; the rate limiter
        is consulted before every attempt.

        Args:
            query (str): The search query.
            sroffset (int): The result offset for pagination.

        Returns:
            List of search result items (empty when nothing was found or
            every attempt failed).
        """
        # Request parameters and URL do not change between attempts.
        params = {
            "action": "query",
            "list": "search",
            "srsearch": query,
            "srprop": "snippet|timestamp",
            "srlimit": 50,
            "sroffset": sroffset,
            "format": "json",
        }
        url = self.api_url.format(lang_code=self.lang_code)

        for _attempt in range(MAX_RETRIES):
            # Apply rate limiting before search request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            try:
                response = safe_get(
                    url,
                    params=params,
                    headers=HEADERS,
                    timeout=TIMEOUT,
                )
                response.raise_for_status()
                data = response.json()
                return data.get("query", {}).get("search", [])
            except (
                requests.exceptions.RequestException,
                json.JSONDecodeError,
            ) as e:
                logger.warning(
                    f"Error fetching search results: {e}, retrying..."
                )

        return []

    def _process_search_result(
        self, result: Dict[str, Any], query: str
    ) -> Optional[Dict[str, Any]]:
        """Process and filter a single search result.

        Args:
            result (Dict[str, Any]): A single search result item.
            query (str): The search query.

        Returns:
            Processed result or None if filtered out.
        """
        page_id = result.get("pageid")
        title = result.get("title", "")
        snippet = _clean_wikinews_snippet(result.get("snippet", ""))

        last_edit_date = self._parse_api_timestamp(result.get("timestamp"))
        if last_edit_date is None:
            logger.warning(
                f"Error parsing last edit date for page {page_id}, using current date as fallback."
            )
            last_edit_date = datetime.now(UTC)

        # First filter: last edit date must be after from_date
        if last_edit_date < self.from_date:
            # In this case we can skip fetching full content
            return None

        # Fetch full article content and extract actual publication date
        # Note: the Wikinews API does not expose the publication date in batched search results
        full_content, publication_date = self._fetch_full_content_and_pubdate(
            page_id, last_edit_date
        )

        # Second filter: publication date within range
        if publication_date < self.from_date or publication_date > self.to_date:
            return None

        # Third filter: check if all query words are in title or content
        # Note: Wikinews search returns false positives if query words are in the "related" articles section
        # Whole-token matching avoids substring hits (e.g., "is" matching "This").
        # Lookarounds are used instead of \b so that tokens starting or ending
        # with punctuation (e.g. "U.S.") can still match; for plain
        # alphanumeric tokens the result is identical to \b matching.
        combined_text = f"{title} {full_content}".lower()
        query_words = [
            w.lower() for w in query.split() if len(w) > 1
        ]  # Skip single chars
        if query_words and not all(
            re.search(rf"(?<!\w){re.escape(word)}(?!\w)", combined_text)
            for word in query_words
        ):
            return None

        # If only snippets are requested, we use snippet as full content
        if self.search_snippets_only:
            full_content = snippet

        article_url = f"https://{self.lang_code}.wikinews.org/?curid={page_id}"

        return {
            "id": page_id,
            "title": title,
            "snippet": snippet,
            "source": "wikinews",
            "url": article_url,  # Used by '_filter_for_relevance' function
            "link": article_url,  # Used by citation handler
            "content": full_content,
            "full_content": full_content,
            "publication_date": publication_date.isoformat(timespec="seconds"),
        }

    def _fetch_full_content_and_pubdate(
        self, page_id: int, fallback_date: datetime
    ) -> Tuple[str, datetime]:
        """Fetch full article content and publication date from Wikinews API.

        Args:
            page_id (int): The Wikinews page ID.
            fallback_date (datetime): Fallback date if publication date cannot be determined.

        Returns:
            Tuple of (full_content, publication_date). On a request or
            decoding error the tuple is ("", fallback_date).
        """
        try:
            content_params = {
                "action": "query",
                "prop": "revisions|extracts",
                "pageids": page_id,
                "rvprop": "timestamp",
                "rvdir": "newer",  # Older revisions first
                "rvlimit": 1,  # Get the first revision (i.e. publication)
                "explaintext": True,
                "format": "json",
            }

            # Apply rate limiting before content request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            content_resp = safe_get(
                self.api_url.format(lang_code=self.lang_code),
                params=content_params,
                headers=HEADERS,
                timeout=TIMEOUT,
            )
            content_resp.raise_for_status()
            content_data = content_resp.json()
        except (
            requests.exceptions.RequestException,
            json.JSONDecodeError,
        ) as e:
            logger.warning(f"Error fetching content for page {page_id}: {e}")
            return "", fallback_date

        page_data = (
            content_data.get("query", {})
            .get("pages", {})
            .get(str(page_id), {})
        )
        full_content = page_data.get("extract", "")
        revisions = page_data.get("revisions", [])

        if not revisions:
            logger.warning(
                f"No revisions found for page {page_id}, using fallback date."
            )
            return full_content, fallback_date

        # First revision timestamp is the publication date. A revision
        # entry without a usable "timestamp" falls back instead of raising.
        first_revision = revisions[0]
        raw_timestamp = (
            first_revision.get("timestamp")
            if isinstance(first_revision, dict)
            else None
        )
        publication_date = self._parse_api_timestamp(raw_timestamp)
        if publication_date is None:
            logger.warning(
                f"Error parsing publication date for page {page_id}, using fallback date."
            )
            publication_date = fallback_date

        return full_content, publication_date

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Retrieve article previews from Wikinews based on the query.

        Result pages are requested one after another until ``max_results``
        articles passed all filters or the API returns no further results.

        Args:
            query (str): The search query

        Returns:
            List of relevant article previews
        """
        # Adapt date range based on query and optimize query (if LLM is available)
        self._adapt_date_range_for_query(query)
        optimized_query = self._optimize_query_for_wikinews(query)

        articles = []
        sroffset = 0

        while len(articles) < self.max_results:
            search_results = self._fetch_search_results(
                optimized_query, sroffset
            )
            if not search_results:
                # No more results available (or multiple retries failed)
                break

            for result in search_results:
                article = self._process_search_result(result, optimized_query)
                if article:
                    articles.append(article)
                    if len(articles) >= self.max_results:
                        break

            sroffset += len(search_results)

        return articles

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Retrieve full content for relevant Wikinews articles.

        Args:
            relevant_items (List[Dict[str, Any]]): List of relevant article previews

        Returns:
            List of articles with full content
        """
        # Since full content is already fetched in _get_previews, just return relevant items
        return relevant_items

495

496

497def _clean_wikinews_snippet(snippet: str) -> str:

498 """

499 Clean a Wikinews search snippet.

500

501 Args:

502 snippet (str): Raw snippet from Wikinews API

503

504 Returns:

505 Clean human-readable text

506 """

507 if not snippet:

508 return ""

509

510 # Unescape HTML entities

511 unescaped = html.unescape(snippet)

512

513 # Remove HTML tags

514 clean_text = re.sub(r"<.*?>", "", unescaped)

515

516 # Normalize whitespace

517 clean_text = re.sub(r"\s+", " ", clean_text).strip()

518

519 return clean_text

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wikinews.py: 89%

158 statements