Coverage for src/local_deep_research/web_search_engines/engines/search_engine_wayback.py: 96%
198 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1import re
2from typing import Any, Dict, List, Optional, Tuple
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...config import search_config
8from ...research_library.downloaders.extraction import extract_content
9from ...security.safe_requests import safe_get
10from ..rate_limiting import RateLimitError
11from ..search_engine_base import BaseSearchEngine
class WaybackSearchEngine(BaseSearchEngine):
    """
    Internet Archive Wayback Machine search engine implementation
    Provides access to historical versions of web pages
    """

    # Mark as public search engine
    is_public = True

    def __init__(
        self,
        max_results: int = 10,
        max_snapshots_per_url: int = 3,
        llm: Optional[BaseLLM] = None,
        language: str = "English",
        max_filtered_results: Optional[int] = None,
        closest_only: bool = False,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the Wayback Machine search engine.

        Args:
            max_results: Maximum number of search results
            max_snapshots_per_url: Maximum snapshots to retrieve per URL
            llm: Language model for relevance filtering
            language: Language for content processing
            max_filtered_results: Maximum number of results to keep after filtering
            closest_only: If True, only retrieves the closest snapshot for each URL
            settings_snapshot: Settings snapshot forwarded to the base class
        """
        # Delegate LLM handling, filtering, and result limits to the base class.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )

        # Wayback Machine API endpoints
        self.available_api = "https://archive.org/wayback/available"
        self.cdx_api = "https://web.archive.org/cdx/search/cdx"

        # Engine-specific configuration
        self.max_snapshots_per_url = max_snapshots_per_url
        self.language = language
        self.closest_only = closest_only
59 def _extract_urls_from_query(self, query: str) -> List[str]:
60 """
61 Extract URLs from a query string or interpret as an URL if possible.
62 For non-URL queries, use a DuckDuckGo search to find relevant URLs.
64 Args:
65 query: The search query or URL
67 Returns:
68 List of URLs to search in the Wayback Machine
69 """
70 # Check if the query is already a URL
71 url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+")
72 urls = url_pattern.findall(query)
74 if urls:
75 logger.info(f"Found {len(urls)} URLs in query")
76 return urls
78 # Check if query is a domain without http prefix
79 domain_pattern = re.compile(r"^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$")
80 if domain_pattern.match(query):
81 logger.info(f"Query appears to be a domain: {query}")
82 return [f"http://{query}"]
84 # For non-URL queries, use DuckDuckGo to find relevant URLs
85 logger.info(
86 "Query is not a URL, using DuckDuckGo to find relevant URLs"
87 )
88 try:
89 # Import DuckDuckGo search engine
90 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper
92 # Use max_results from parent class, but limit to 5 for URL discovery
93 url_search_limit = min(5, self.max_results)
94 ddg = DuckDuckGoSearchAPIWrapper(max_results=url_search_limit)
95 # Pass max_results as a positional argument
96 results = ddg.results(query, url_search_limit)
98 # Extract URLs from results
99 ddg_urls = [
100 str(result.get("link"))
101 for result in results
102 if result.get("link")
103 ]
104 if ddg_urls:
105 logger.info(
106 f"Found {len(ddg_urls)} URLs from DuckDuckGo search"
107 )
108 return ddg_urls
109 except Exception:
110 logger.exception("Error using DuckDuckGo for URL discovery")
112 # Fallback: treat the query as a potential domain or path
113 if "/" in query and "." in query:
114 logger.info(f"Treating query as a partial URL: {query}")
115 return [f"http://{query}"]
116 if "." in query: 116 ↛ 117line 116 didn't jump to line 117 because the condition on line 116 was never true
117 logger.info(f"Treating query as a domain: {query}")
118 return [f"http://{query}"]
120 # Return empty list if nothing worked
121 logger.warning(f"Could not extract any URLs from query: {query}")
122 return []
124 def _format_timestamp(self, timestamp: str) -> str:
125 """Format Wayback Machine timestamp into readable date"""
126 if len(timestamp) < 14:
127 return timestamp
129 try:
130 year = timestamp[0:4]
131 month = timestamp[4:6]
132 day = timestamp[6:8]
133 hour = timestamp[8:10]
134 minute = timestamp[10:12]
135 second = timestamp[12:14]
136 return f"{year}-{month}-{day} {hour}:{minute}:{second}"
137 except Exception:
138 logger.debug("Timestamp formatting failed, returning original")
139 return timestamp
    def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
        """
        Get snapshots from the Wayback Machine for a specific URL.

        Uses the "available" API (one closest snapshot) when
        ``self.closest_only`` is set; otherwise queries the CDX API for
        multiple snapshots collapsed to roughly one per year.

        Args:
            url: URL to get snapshots for

        Returns:
            List of snapshot dictionaries (empty on any non-rate-limit error)

        Raises:
            RateLimitError: If either Wayback API responds with HTTP 429,
                so the base class retry logic can back off.
        """
        snapshots = []

        try:
            if self.closest_only:
                # Get only the closest snapshot
                response = safe_get(self.available_api, params={"url": url})

                # Check for rate limit
                if response.status_code == 429:
                    raise RateLimitError("Wayback Machine rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

                data = response.json()

                # The API nests the result under archived_snapshots.closest;
                # either key may be absent when nothing is archived.
                if (
                    "archived_snapshots" in data
                    and "closest" in data["archived_snapshots"]
                ):
                    snapshot = data["archived_snapshots"]["closest"]
                    snapshot_url = snapshot["url"]
                    timestamp = snapshot["timestamp"]

                    snapshots.append(
                        {
                            "timestamp": timestamp,
                            "formatted_date": self._format_timestamp(timestamp),
                            "url": snapshot_url,
                            "original_url": url,
                            # "available"/"status" may be missing from the
                            # response; default to a usable snapshot.
                            "available": snapshot.get("available", True),
                            "status": snapshot.get("status", "200"),
                        }
                    )
            else:
                # Get multiple snapshots using CDX API
                response = safe_get(
                    self.cdx_api,
                    params={
                        "url": url,
                        "output": "json",
                        "fl": "timestamp,original,statuscode,mimetype",
                        "collapse": "timestamp:4",  # Group by year
                        "limit": self.max_snapshots_per_url,
                    },
                )

                # Check for rate limit
                if response.status_code == 429:
                    raise RateLimitError(  # noqa: TRY301 — re-raised by except RateLimitError for base class retry
                        "Wayback Machine CDX API rate limit exceeded"
                    )

                # Check if response is valid JSON
                data = response.json()

                # First item is the header row of column names; rows after it
                # are the actual snapshot records.
                if len(data) > 1:
                    headers = data[0]
                    for item in data[1:]:
                        snapshot = dict(zip(headers, item, strict=False))
                        timestamp = snapshot.get("timestamp", "")

                        # CDX does not return a replay URL; build one from
                        # the timestamp and original URL.
                        wayback_url = (
                            f"https://web.archive.org/web/{timestamp}/{url}"
                        )

                        snapshots.append(
                            {
                                "timestamp": timestamp,
                                "formatted_date": self._format_timestamp(
                                    timestamp
                                ),
                                "url": wayback_url,
                                "original_url": url,
                                "available": True,
                                "status": snapshot.get("statuscode", "200"),
                            }
                        )

            # Limit to max snapshots per URL
            snapshots = snapshots[: self.max_snapshots_per_url]

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            # Best-effort: any other failure yields an empty/partial list.
            logger.exception(f"Error getting Wayback snapshots for {url}")

        return snapshots
239 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
240 """
241 Get preview information for Wayback Machine snapshots.
243 Args:
244 query: The search query
246 Returns:
247 List of preview dictionaries
248 """
249 logger.info(f"Getting Wayback Machine previews for query: {query}")
251 # Extract URLs from query
252 urls = self._extract_urls_from_query(query)
254 if not urls:
255 logger.warning(f"No URLs found in query: {query}")
256 return []
258 # Get snapshots for each URL
259 all_snapshots = []
260 for url in urls:
261 snapshots = self._get_wayback_snapshots(url)
262 all_snapshots.extend(snapshots)
264 # Apply rate limiting between requests
265 if len(urls) > 1: 265 ↛ 266line 265 didn't jump to line 266 because the condition on line 265 was never true
266 self.rate_tracker.apply_rate_limit(self.engine_type)
268 # Format as previews
269 previews = []
270 for snapshot in all_snapshots:
271 preview = {
272 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
273 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
274 "link": snapshot["url"],
275 "snippet": f"Archived version from {snapshot['formatted_date']}",
276 "original_url": snapshot["original_url"],
277 "timestamp": snapshot["timestamp"],
278 "formatted_date": snapshot["formatted_date"],
279 }
280 previews.append(preview)
282 logger.info(f"Found {len(previews)} Wayback Machine snapshots")
283 return previews
285 def _remove_boilerplate(self, html: str) -> str:
286 """Remove boilerplate using the shared extraction pipeline."""
287 if not html or not html.strip():
288 return ""
289 try:
290 return extract_content(html, language=self.language) or ""
291 except Exception:
292 logger.exception("Error removing boilerplate")
293 return html
295 def _get_wayback_content(self, url: str) -> Tuple[str, str]:
296 """
297 Retrieve content from a Wayback Machine URL.
299 Args:
300 url: Wayback Machine URL
302 Returns:
303 Tuple of (raw_html, cleaned_text)
304 """
305 try:
306 headers = {
307 "User-Agent": "Mozilla/5.0 (Local Deep Research Bot; research project)"
308 }
309 response = safe_get(url, headers=headers, timeout=10)
310 raw_html = response.text
312 # Clean the HTML
313 cleaned_text = self._remove_boilerplate(raw_html)
315 return raw_html, cleaned_text
316 except Exception as e:
317 logger.exception(f"Error retrieving content from {url}")
318 return "", f"Error retrieving content: {e!s}"
320 def _get_full_content(
321 self, relevant_items: List[Dict[str, Any]]
322 ) -> List[Dict[str, Any]]:
323 """
324 Get full content for the relevant Wayback Machine snapshots.
326 Args:
327 relevant_items: List of relevant preview dictionaries
329 Returns:
330 List of result dictionaries with full content
331 """
332 # Check if we should add full content
333 if (
334 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
335 and search_config.SEARCH_SNIPPETS_ONLY
336 ):
337 logger.info("Snippet-only mode, skipping full content retrieval")
338 return relevant_items
340 logger.info(
341 f"Getting full content for {len(relevant_items)} Wayback Machine snapshots"
342 )
344 results = []
345 for item in relevant_items:
346 wayback_url = item.get("link")
347 if not wayback_url:
348 results.append(item)
349 continue
351 logger.info(f"Retrieving content from {wayback_url}")
353 try:
354 # Retrieve content
355 raw_html, full_content = self._get_wayback_content(wayback_url)
357 # Add full content to the result
358 result = item.copy()
359 result["raw_html"] = raw_html
360 result["full_content"] = full_content
362 results.append(result)
364 # Apply rate limiting
365 self.rate_tracker.apply_rate_limit(self.engine_type)
366 except Exception:
367 logger.exception(f"Error processing {wayback_url}")
368 results.append(item)
370 return results
372 def search_by_url(
373 self, url: str, max_snapshots: int | None = None
374 ) -> List[Dict[str, Any]]:
375 """
376 Search for archived versions of a specific URL.
378 Args:
379 url: The URL to search for archives
380 max_snapshots: Maximum number of snapshots to return
382 Returns:
383 List of snapshot dictionaries
384 """
385 max_snapshots = max_snapshots or self.max_snapshots_per_url
387 snapshots = self._get_wayback_snapshots(url)
388 previews = []
390 for snapshot in snapshots[:max_snapshots]:
391 preview = {
392 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}",
393 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})",
394 "link": snapshot["url"],
395 "snippet": f"Archived version from {snapshot['formatted_date']}",
396 "original_url": snapshot["original_url"],
397 "timestamp": snapshot["timestamp"],
398 "formatted_date": snapshot["formatted_date"],
399 }
400 previews.append(preview)
402 # Get full content if not in snippets-only mode
403 if (
404 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
405 or not search_config.SEARCH_SNIPPETS_ONLY
406 ):
407 return self._get_full_content(previews)
409 return previews
411 def search_by_date_range(
412 self, url: str, start_date: str, end_date: str
413 ) -> List[Dict[str, Any]]:
414 """
415 Search for archived versions of a URL within a date range.
417 Args:
418 url: The URL to search for archives
419 start_date: Start date in format YYYYMMDD
420 end_date: End date in format YYYYMMDD
422 Returns:
423 List of snapshot dictionaries
424 """
425 try:
426 # Use CDX API with date range
427 response = safe_get(
428 self.cdx_api,
429 params={
430 "url": url,
431 "output": "json",
432 "fl": "timestamp,original,statuscode,mimetype",
433 "from": start_date,
434 "to": end_date,
435 "limit": self.max_snapshots_per_url,
436 },
437 )
439 # Process response
440 data = response.json()
442 # First item is the header
443 if len(data) <= 1:
444 return []
446 headers = data[0]
447 snapshots = []
449 for item in data[1:]:
450 snapshot = dict(zip(headers, item, strict=False))
451 timestamp = snapshot.get("timestamp", "")
453 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}"
455 snapshots.append(
456 {
457 "id": f"{timestamp}_{url}",
458 "title": f"Archive of {url} ({self._format_timestamp(timestamp)})",
459 "link": wayback_url,
460 "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
461 "original_url": url,
462 "timestamp": timestamp,
463 "formatted_date": self._format_timestamp(timestamp),
464 }
465 )
467 # Get full content if not in snippets-only mode
468 if ( 468 ↛ 472line 468 didn't jump to line 472 because the condition on line 468 was never true
469 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
470 or not search_config.SEARCH_SNIPPETS_ONLY
471 ):
472 return self._get_full_content(snapshots)
474 return snapshots
476 except Exception:
477 logger.exception(f"Error searching date range for {url}")
478 return []
480 def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]:
481 """
482 Get the most recent snapshot of a URL.
484 Args:
485 url: The URL to get the latest snapshot for
487 Returns:
488 Dictionary with snapshot information or None if not found
489 """
490 try:
491 response = safe_get(self.available_api, params={"url": url})
492 data = response.json()
494 if (
495 "archived_snapshots" in data
496 and "closest" in data["archived_snapshots"]
497 ):
498 snapshot = data["archived_snapshots"]["closest"]
499 timestamp = snapshot["timestamp"]
500 wayback_url = snapshot["url"]
502 result = {
503 "id": f"{timestamp}_{url}",
504 "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})",
505 "link": wayback_url,
506 "snippet": f"Archived version from {self._format_timestamp(timestamp)}",
507 "original_url": url,
508 "timestamp": timestamp,
509 "formatted_date": self._format_timestamp(timestamp),
510 }
512 # Get full content if not in snippets-only mode
513 if (
514 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
515 or not search_config.SEARCH_SNIPPETS_ONLY
516 ):
517 raw_html, full_content = self._get_wayback_content(
518 wayback_url
519 )
520 result["raw_html"] = raw_html
521 result["full_content"] = full_content
523 return result
525 return None
527 except Exception:
528 logger.exception(f"Error getting latest snapshot for {url}")
529 return None