Coverage for src / local_deep_research / web_search_engines / engines / search_engine_wayback.py: 96%

198 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1import re 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4from langchain_core.language_models import BaseLLM 

5from loguru import logger 

6 

7from ...config import search_config 

8from ...research_library.downloaders.extraction import extract_content 

9from ...security.safe_requests import safe_get 

10from ..rate_limiting import RateLimitError 

11from ..search_engine_base import BaseSearchEngine 

12 

13 

class WaybackSearchEngine(BaseSearchEngine):
    """
    Internet Archive Wayback Machine search engine implementation.

    Provides access to historical versions of web pages via two
    archive.org endpoints: the "available" API (closest snapshot for a
    URL) and the CDX index (multiple snapshots per URL).
    """

    # Mark as public search engine (queries archive.org; no API key needed)
    is_public = True

22 

23 def __init__( 

24 self, 

25 max_results: int = 10, 

26 max_snapshots_per_url: int = 3, 

27 llm: Optional[BaseLLM] = None, 

28 language: str = "English", 

29 max_filtered_results: Optional[int] = None, 

30 closest_only: bool = False, 

31 settings_snapshot: Optional[Dict[str, Any]] = None, 

32 ): 

33 """ 

34 Initialize the Wayback Machine search engine. 

35 

36 Args: 

37 max_results: Maximum number of search results 

38 max_snapshots_per_url: Maximum snapshots to retrieve per URL 

39 llm: Language model for relevance filtering 

40 language: Language for content processing 

41 max_filtered_results: Maximum number of results to keep after filtering 

42 closest_only: If True, only retrieves the closest snapshot for each URL 

43 """ 

44 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

45 super().__init__( 

46 llm=llm, 

47 max_filtered_results=max_filtered_results, 

48 max_results=max_results, 

49 settings_snapshot=settings_snapshot, 

50 ) 

51 self.max_snapshots_per_url = max_snapshots_per_url 

52 self.language = language 

53 self.closest_only = closest_only 

54 

55 # API endpoints 

56 self.available_api = "https://archive.org/wayback/available" 

57 self.cdx_api = "https://web.archive.org/cdx/search/cdx" 

58 

59 def _extract_urls_from_query(self, query: str) -> List[str]: 

60 """ 

61 Extract URLs from a query string or interpret as an URL if possible. 

62 For non-URL queries, use a DuckDuckGo search to find relevant URLs. 

63 

64 Args: 

65 query: The search query or URL 

66 

67 Returns: 

68 List of URLs to search in the Wayback Machine 

69 """ 

70 # Check if the query is already a URL 

71 url_pattern = re.compile(r"https?://(?:[-\w.]|(?:%[\da-fA-F]{2}))+") 

72 urls = url_pattern.findall(query) 

73 

74 if urls: 

75 logger.info(f"Found {len(urls)} URLs in query") 

76 return urls 

77 

78 # Check if query is a domain without http prefix 

79 domain_pattern = re.compile(r"^(?:[-\w.]|(?:%[\da-fA-F]{2}))+\.\w+$") 

80 if domain_pattern.match(query): 

81 logger.info(f"Query appears to be a domain: {query}") 

82 return [f"http://{query}"] 

83 

84 # For non-URL queries, use DuckDuckGo to find relevant URLs 

85 logger.info( 

86 "Query is not a URL, using DuckDuckGo to find relevant URLs" 

87 ) 

88 try: 

89 # Import DuckDuckGo search engine 

90 from langchain_community.utilities import DuckDuckGoSearchAPIWrapper 

91 

92 # Use max_results from parent class, but limit to 5 for URL discovery 

93 url_search_limit = min(5, self.max_results) 

94 ddg = DuckDuckGoSearchAPIWrapper(max_results=url_search_limit) 

95 # Pass max_results as a positional argument 

96 results = ddg.results(query, url_search_limit) 

97 

98 # Extract URLs from results 

99 ddg_urls = [ 

100 str(result.get("link")) 

101 for result in results 

102 if result.get("link") 

103 ] 

104 if ddg_urls: 

105 logger.info( 

106 f"Found {len(ddg_urls)} URLs from DuckDuckGo search" 

107 ) 

108 return ddg_urls 

109 except Exception: 

110 logger.exception("Error using DuckDuckGo for URL discovery") 

111 

112 # Fallback: treat the query as a potential domain or path 

113 if "/" in query and "." in query: 

114 logger.info(f"Treating query as a partial URL: {query}") 

115 return [f"http://{query}"] 

116 if "." in query: 116 ↛ 117line 116 didn't jump to line 117 because the condition on line 116 was never true

117 logger.info(f"Treating query as a domain: {query}") 

118 return [f"http://{query}"] 

119 

120 # Return empty list if nothing worked 

121 logger.warning(f"Could not extract any URLs from query: {query}") 

122 return [] 

123 

124 def _format_timestamp(self, timestamp: str) -> str: 

125 """Format Wayback Machine timestamp into readable date""" 

126 if len(timestamp) < 14: 

127 return timestamp 

128 

129 try: 

130 year = timestamp[0:4] 

131 month = timestamp[4:6] 

132 day = timestamp[6:8] 

133 hour = timestamp[8:10] 

134 minute = timestamp[10:12] 

135 second = timestamp[12:14] 

136 return f"{year}-{month}-{day} {hour}:{minute}:{second}" 

137 except Exception: 

138 logger.debug("Timestamp formatting failed, returning original") 

139 return timestamp 

140 

    def _get_wayback_snapshots(self, url: str) -> List[Dict[str, Any]]:
        """
        Get snapshots from the Wayback Machine for a specific URL.

        Uses the "available" API when ``closest_only`` is set (single
        closest snapshot); otherwise queries the CDX index, collapsed by
        year, for up to ``max_snapshots_per_url`` snapshots.

        Args:
            url: URL to get snapshots for

        Returns:
            List of snapshot dictionaries with keys: timestamp,
            formatted_date, url, original_url, available, status.
            Empty list on any error other than rate limiting.

        Raises:
            RateLimitError: When either API responds with HTTP 429; it is
                re-raised (not swallowed) so the base class can retry.
        """
        snapshots = []

        try:
            if self.closest_only:
                # Get only the closest snapshot via the "available" API
                response = safe_get(self.available_api, params={"url": url})

                # Check for rate limit before attempting to parse the body
                if response.status_code == 429:
                    raise RateLimitError("Wayback Machine rate limit exceeded")  # noqa: TRY301 — re-raised by except RateLimitError for base class retry

                data = response.json()

                if (
                    "archived_snapshots" in data
                    and "closest" in data["archived_snapshots"]
                ):
                    snapshot = data["archived_snapshots"]["closest"]
                    snapshot_url = snapshot["url"]
                    timestamp = snapshot["timestamp"]

                    snapshots.append(
                        {
                            "timestamp": timestamp,
                            "formatted_date": self._format_timestamp(timestamp),
                            "url": snapshot_url,
                            "original_url": url,
                            # "available"/"status" defaults mirror a normal hit
                            "available": snapshot.get("available", True),
                            "status": snapshot.get("status", "200"),
                        }
                    )
            else:
                # Get multiple snapshots using the CDX API
                response = safe_get(
                    self.cdx_api,
                    params={
                        "url": url,
                        "output": "json",
                        "fl": "timestamp,original,statuscode,mimetype",
                        "collapse": "timestamp:4",  # Group by year
                        "limit": self.max_snapshots_per_url,
                    },
                )

                # Check for rate limit before attempting to parse the body
                if response.status_code == 429:
                    raise RateLimitError(  # noqa: TRY301 — re-raised by except RateLimitError for base class retry
                        "Wayback Machine CDX API rate limit exceeded"
                    )

                # Check if response is valid JSON (raises on malformed body,
                # which falls through to the generic except below)
                data = response.json()

                # First item of the CDX JSON payload is the column header row
                if len(data) > 1:
                    headers = data[0]
                    for item in data[1:]:
                        snapshot = dict(zip(headers, item, strict=False))
                        timestamp = snapshot.get("timestamp", "")

                        # Canonical replay URL for this snapshot
                        wayback_url = (
                            f"https://web.archive.org/web/{timestamp}/{url}"
                        )

                        snapshots.append(
                            {
                                "timestamp": timestamp,
                                "formatted_date": self._format_timestamp(
                                    timestamp
                                ),
                                "url": wayback_url,
                                "original_url": url,
                                "available": True,
                                "status": snapshot.get("statuscode", "200"),
                            }
                        )

            # Limit to max snapshots per URL (defensive: the CDX "limit"
            # parameter should already cap this)
            snapshots = snapshots[: self.max_snapshots_per_url]

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            # Any other failure degrades to an empty result for this URL
            logger.exception(f"Error getting Wayback snapshots for {url}")

        return snapshots

238 

239 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

240 """ 

241 Get preview information for Wayback Machine snapshots. 

242 

243 Args: 

244 query: The search query 

245 

246 Returns: 

247 List of preview dictionaries 

248 """ 

249 logger.info(f"Getting Wayback Machine previews for query: {query}") 

250 

251 # Extract URLs from query 

252 urls = self._extract_urls_from_query(query) 

253 

254 if not urls: 

255 logger.warning(f"No URLs found in query: {query}") 

256 return [] 

257 

258 # Get snapshots for each URL 

259 all_snapshots = [] 

260 for url in urls: 

261 snapshots = self._get_wayback_snapshots(url) 

262 all_snapshots.extend(snapshots) 

263 

264 # Apply rate limiting between requests 

265 if len(urls) > 1: 265 ↛ 266line 265 didn't jump to line 266 because the condition on line 265 was never true

266 self.rate_tracker.apply_rate_limit(self.engine_type) 

267 

268 # Format as previews 

269 previews = [] 

270 for snapshot in all_snapshots: 

271 preview = { 

272 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}", 

273 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})", 

274 "link": snapshot["url"], 

275 "snippet": f"Archived version from {snapshot['formatted_date']}", 

276 "original_url": snapshot["original_url"], 

277 "timestamp": snapshot["timestamp"], 

278 "formatted_date": snapshot["formatted_date"], 

279 } 

280 previews.append(preview) 

281 

282 logger.info(f"Found {len(previews)} Wayback Machine snapshots") 

283 return previews 

284 

285 def _remove_boilerplate(self, html: str) -> str: 

286 """Remove boilerplate using the shared extraction pipeline.""" 

287 if not html or not html.strip(): 

288 return "" 

289 try: 

290 return extract_content(html, language=self.language) or "" 

291 except Exception: 

292 logger.exception("Error removing boilerplate") 

293 return html 

294 

295 def _get_wayback_content(self, url: str) -> Tuple[str, str]: 

296 """ 

297 Retrieve content from a Wayback Machine URL. 

298 

299 Args: 

300 url: Wayback Machine URL 

301 

302 Returns: 

303 Tuple of (raw_html, cleaned_text) 

304 """ 

305 try: 

306 headers = { 

307 "User-Agent": "Mozilla/5.0 (Local Deep Research Bot; research project)" 

308 } 

309 response = safe_get(url, headers=headers, timeout=10) 

310 raw_html = response.text 

311 

312 # Clean the HTML 

313 cleaned_text = self._remove_boilerplate(raw_html) 

314 

315 return raw_html, cleaned_text 

316 except Exception as e: 

317 logger.exception(f"Error retrieving content from {url}") 

318 return "", f"Error retrieving content: {e!s}" 

319 

320 def _get_full_content( 

321 self, relevant_items: List[Dict[str, Any]] 

322 ) -> List[Dict[str, Any]]: 

323 """ 

324 Get full content for the relevant Wayback Machine snapshots. 

325 

326 Args: 

327 relevant_items: List of relevant preview dictionaries 

328 

329 Returns: 

330 List of result dictionaries with full content 

331 """ 

332 # Check if we should add full content 

333 if ( 

334 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

335 and search_config.SEARCH_SNIPPETS_ONLY 

336 ): 

337 logger.info("Snippet-only mode, skipping full content retrieval") 

338 return relevant_items 

339 

340 logger.info( 

341 f"Getting full content for {len(relevant_items)} Wayback Machine snapshots" 

342 ) 

343 

344 results = [] 

345 for item in relevant_items: 

346 wayback_url = item.get("link") 

347 if not wayback_url: 

348 results.append(item) 

349 continue 

350 

351 logger.info(f"Retrieving content from {wayback_url}") 

352 

353 try: 

354 # Retrieve content 

355 raw_html, full_content = self._get_wayback_content(wayback_url) 

356 

357 # Add full content to the result 

358 result = item.copy() 

359 result["raw_html"] = raw_html 

360 result["full_content"] = full_content 

361 

362 results.append(result) 

363 

364 # Apply rate limiting 

365 self.rate_tracker.apply_rate_limit(self.engine_type) 

366 except Exception: 

367 logger.exception(f"Error processing {wayback_url}") 

368 results.append(item) 

369 

370 return results 

371 

372 def search_by_url( 

373 self, url: str, max_snapshots: int | None = None 

374 ) -> List[Dict[str, Any]]: 

375 """ 

376 Search for archived versions of a specific URL. 

377 

378 Args: 

379 url: The URL to search for archives 

380 max_snapshots: Maximum number of snapshots to return 

381 

382 Returns: 

383 List of snapshot dictionaries 

384 """ 

385 max_snapshots = max_snapshots or self.max_snapshots_per_url 

386 

387 snapshots = self._get_wayback_snapshots(url) 

388 previews = [] 

389 

390 for snapshot in snapshots[:max_snapshots]: 

391 preview = { 

392 "id": f"{snapshot['timestamp']}_{snapshot['original_url']}", 

393 "title": f"Archive of {snapshot['original_url']} ({snapshot['formatted_date']})", 

394 "link": snapshot["url"], 

395 "snippet": f"Archived version from {snapshot['formatted_date']}", 

396 "original_url": snapshot["original_url"], 

397 "timestamp": snapshot["timestamp"], 

398 "formatted_date": snapshot["formatted_date"], 

399 } 

400 previews.append(preview) 

401 

402 # Get full content if not in snippets-only mode 

403 if ( 

404 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

405 or not search_config.SEARCH_SNIPPETS_ONLY 

406 ): 

407 return self._get_full_content(previews) 

408 

409 return previews 

410 

411 def search_by_date_range( 

412 self, url: str, start_date: str, end_date: str 

413 ) -> List[Dict[str, Any]]: 

414 """ 

415 Search for archived versions of a URL within a date range. 

416 

417 Args: 

418 url: The URL to search for archives 

419 start_date: Start date in format YYYYMMDD 

420 end_date: End date in format YYYYMMDD 

421 

422 Returns: 

423 List of snapshot dictionaries 

424 """ 

425 try: 

426 # Use CDX API with date range 

427 response = safe_get( 

428 self.cdx_api, 

429 params={ 

430 "url": url, 

431 "output": "json", 

432 "fl": "timestamp,original,statuscode,mimetype", 

433 "from": start_date, 

434 "to": end_date, 

435 "limit": self.max_snapshots_per_url, 

436 }, 

437 ) 

438 

439 # Process response 

440 data = response.json() 

441 

442 # First item is the header 

443 if len(data) <= 1: 

444 return [] 

445 

446 headers = data[0] 

447 snapshots = [] 

448 

449 for item in data[1:]: 

450 snapshot = dict(zip(headers, item, strict=False)) 

451 timestamp = snapshot.get("timestamp", "") 

452 

453 wayback_url = f"https://web.archive.org/web/{timestamp}/{url}" 

454 

455 snapshots.append( 

456 { 

457 "id": f"{timestamp}_{url}", 

458 "title": f"Archive of {url} ({self._format_timestamp(timestamp)})", 

459 "link": wayback_url, 

460 "snippet": f"Archived version from {self._format_timestamp(timestamp)}", 

461 "original_url": url, 

462 "timestamp": timestamp, 

463 "formatted_date": self._format_timestamp(timestamp), 

464 } 

465 ) 

466 

467 # Get full content if not in snippets-only mode 

468 if ( 468 ↛ 472line 468 didn't jump to line 472 because the condition on line 468 was never true

469 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

470 or not search_config.SEARCH_SNIPPETS_ONLY 

471 ): 

472 return self._get_full_content(snapshots) 

473 

474 return snapshots 

475 

476 except Exception: 

477 logger.exception(f"Error searching date range for {url}") 

478 return [] 

479 

480 def get_latest_snapshot(self, url: str) -> Optional[Dict[str, Any]]: 

481 """ 

482 Get the most recent snapshot of a URL. 

483 

484 Args: 

485 url: The URL to get the latest snapshot for 

486 

487 Returns: 

488 Dictionary with snapshot information or None if not found 

489 """ 

490 try: 

491 response = safe_get(self.available_api, params={"url": url}) 

492 data = response.json() 

493 

494 if ( 

495 "archived_snapshots" in data 

496 and "closest" in data["archived_snapshots"] 

497 ): 

498 snapshot = data["archived_snapshots"]["closest"] 

499 timestamp = snapshot["timestamp"] 

500 wayback_url = snapshot["url"] 

501 

502 result = { 

503 "id": f"{timestamp}_{url}", 

504 "title": f"Latest archive of {url} ({self._format_timestamp(timestamp)})", 

505 "link": wayback_url, 

506 "snippet": f"Archived version from {self._format_timestamp(timestamp)}", 

507 "original_url": url, 

508 "timestamp": timestamp, 

509 "formatted_date": self._format_timestamp(timestamp), 

510 } 

511 

512 # Get full content if not in snippets-only mode 

513 if ( 

514 not hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

515 or not search_config.SEARCH_SNIPPETS_ONLY 

516 ): 

517 raw_html, full_content = self._get_wayback_content( 

518 wayback_url 

519 ) 

520 result["raw_html"] = raw_html 

521 result["full_content"] = full_content 

522 

523 return result 

524 

525 return None 

526 

527 except Exception: 

528 logger.exception(f"Error getting latest snapshot for {url}") 

529 return None