Coverage for src / local_deep_research / research_library / downloaders / playwright_html.py: 35%

192 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

"""
HTML Downloader with JavaScript rendering support.

Uses Crawl4AI (default) or plain Playwright for JS-rendered pages.
Crawl4AI adds: robots.txt checking, shadow DOM flattening, iframe
inlining, smart scrolling for lazy-loaded content, and caching.
Falls back to plain Playwright if Crawl4AI is not installed.

No stealth/anti-detection features are used — the browser identifies
honestly via BROWSER_USER_AGENT and respects robots.txt.
"""

12 

import asyncio
from typing import Optional
from urllib.parse import urlparse

from loguru import logger

from .html import HTMLDownloader
from ...constants import BROWSER_USER_AGENT

21 

22 

# Signals that a page is a JS-rendered SPA and needs browser rendering.
# Matched case-insensitively against the start of the raw HTML (see
# AutoHTMLDownloader._has_spa_signals, which checks the first 5000 chars).
SPA_SIGNALS = [
    'id="root"',  # common React mount point
    'id="app"',  # common Vue / generic SPA mount point
    'id="__next"',  # Next.js mount point
    "__NEXT_DATA__",  # Next.js serialized page data
    "data-reactroot",  # React root marker (pre-React 18)
    'ng-version="',  # Angular version attribute
    # noscript fallbacks emitted by common SPA scaffolds
    "<noscript>You need to enable JavaScript",
    "<noscript>Please enable JavaScript",
    "window.__INITIAL_STATE__",  # server-injected state (Redux/Vuex style)
]

35 

36 

37def _run_async(coro, timeout: float = None): 

38 """Run an async coroutine from synchronous code. 

39 

40 Handles the case where an event loop is already running 

41 (e.g. inside Jupyter or an async framework) by creating 

42 a new thread with its own loop. 

43 

44 Args: 

45 coro: The coroutine to run. 

46 timeout: Max seconds to wait for the result. Prevents 

47 indefinite hangs if the coroutine's internal timeout fails. 

48 """ 

49 try: 

50 loop = asyncio.get_running_loop() 

51 except RuntimeError: 

52 loop = None 

53 

54 if loop is None: 

55 return asyncio.run(coro) 

56 

57 # Already inside an event loop — run in a new thread 

58 import concurrent.futures 

59 

60 with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool: 

61 future = pool.submit(asyncio.run, coro) 

62 return future.result(timeout=timeout) 

63 

64 

class PlaywrightHTMLDownloader(HTMLDownloader):
    """HTML downloader with JS rendering via Crawl4AI or Playwright.

    Default: Crawl4AI (robots.txt, shadow DOM, iframes, caching).
    Fallback: plain Playwright if Crawl4AI is not installed.

    No stealth or anti-detection features are used.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        wait_until: str = "networkidle",
        block_resources: bool = True,
        **kwargs,
    ):
        """Initialize the downloader.

        Args:
            timeout: Per-page timeout in seconds.
            language: Content language hint, passed to the base class.
            wait_until: Page-load condition used by both engines
                (e.g. "load", "domcontentloaded", "networkidle").
            block_resources: If True, skip images/fonts/media/CSS to
                speed up rendering.
            **kwargs: Accepted and ignored for signature compatibility
                with sibling downloader classes.
        """
        super().__init__(timeout=timeout, language=language)
        self.wait_until = wait_until
        self.block_resources = block_resources
        # Plain Playwright fallback state: lazily initialized in
        # _fetch_with_playwright, reused across fetches, released in close().
        self._playwright = None
        self._browser = None

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML with JS rendering.

        Tries Crawl4AI first (with robots.txt, shadow DOM, iframes),
        falls back to plain Playwright.
        """
        # Try Crawl4AI first (richer features, robots.txt)
        html = self._fetch_with_crawl4ai(url)
        if html is not None:
            # Crawl4AI succeeded (non-empty) or intentionally blocked
            # by robots.txt (empty string). Either way, don't fall
            # through to Playwright.
            return html or None

        # Crawl4AI not installed or failed — fall back to Playwright
        return self._fetch_with_playwright(url)

    def _fetch_with_crawl4ai(self, url: str) -> Optional[str]:
        """Fetch HTML using Crawl4AI with ethical defaults.

        Returns:
            The page HTML on success, "" when blocked by robots.txt
            (signals an intentional skip to _fetch_html), or None when
            Crawl4AI is unavailable or the fetch failed.
        """
        domain = urlparse(url).netloc
        # Rate limits are tracked per target domain, not globally.
        engine_type = f"crawl4ai_download_{domain}"

        try:
            from crawl4ai import (
                AsyncWebCrawler,
                BrowserConfig,
                CrawlerRunConfig,
            )
        except ImportError:
            logger.debug("crawl4ai not installed — using Playwright")
            return None

        logger.debug(f"Crawl4AI fetch: {url}")
        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        browser_cfg = BrowserConfig(
            headless=True,
            verbose=False,
            user_agent=BROWSER_USER_AGENT,
        )
        run_cfg = CrawlerRunConfig(
            # Ethical: respect robots.txt
            check_robots_txt=True,
            # Better extraction: flatten modern web features
            flatten_shadow_dom=True,
            process_iframes=True,
            # Trigger lazy-loaded content
            scan_full_page=True,
            # Performance
            wait_until=self.wait_until,
            page_timeout=self.timeout * 1000,
            exclude_all_images=self.block_resources,
            # No stealth
            override_navigator=False,
            magic=False,
            simulate_user=False,
            verbose=False,
        )

        try:

            async def _crawl():
                async with AsyncWebCrawler(config=browser_cfg) as crawler:
                    return await crawler.arun(url=url, config=run_cfg)

            # 30s headroom over the page timeout: the outer watchdog
            # should only fire if Crawl4AI's own timeout fails.
            result = _run_async(_crawl(), timeout=self.timeout + 30)

            if result.success and result.html:
                html = result.html
                logger.debug(f"Crawl4AI: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            # Check if blocked by robots.txt
            error_msg = getattr(result, "error_message", "") or ""
            if "robots.txt" in error_msg.lower():
                logger.info(f"Crawl4AI: blocked by robots.txt for {url}")
                # Don't fall back to Playwright — respect the block
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=False,
                    retry_count=1,
                    error_type="robots_txt_blocked",
                )
                return ""  # Empty string signals intentional skip

            status = getattr(result, "status_code", "unknown")
            # Fix: the two f-string fragments previously concatenated
            # without a separator, logging "...{url}success=...".
            logger.debug(
                f"Crawl4AI: failed for {url}: "
                f"success={result.success}, status={status}"
            )
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=f"crawl4ai_status_{status}",
            )
            return None

        except Exception as e:
            logger.debug(f"Crawl4AI error for {url}: {e}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def _fetch_with_playwright(self, url: str) -> Optional[str]:
        """Fetch HTML using plain Playwright (fallback).

        Returns:
            The page HTML, or None on failure / empty response /
            playwright not installed.
        """
        logger.debug(f"Playwright fetch: {url}")
        domain = urlparse(url).netloc
        engine_type = f"playwright_download_{domain}"

        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            from playwright.sync_api import sync_playwright

            # Lazy-init browser (reuse across multiple fetches)
            if self._browser is None:
                logger.debug("Playwright: launching Chromium browser")
                pw = sync_playwright().start()
                try:
                    self._browser = pw.chromium.launch(headless=True)
                except Exception:
                    # Launch failed — don't leak the driver process.
                    pw.stop()
                    raise
                self._playwright = pw

            page = self._browser.new_page(
                user_agent=BROWSER_USER_AGENT,
            )
            try:
                # Block heavy resources to speed up rendering
                if self.block_resources:
                    page.route(
                        "**/*.{png,jpg,jpeg,gif,webp,svg,ico,woff,woff2,"
                        "ttf,eot,mp4,webm,mp3,ogg,css}",
                        lambda route: route.abort(),
                    )

                page.goto(
                    url,
                    wait_until=self.wait_until,
                    timeout=self.timeout * 1000,
                )
                html = page.content()
            finally:
                # Always release the page; the browser itself is reused.
                try:
                    page.close()
                except Exception:
                    logger.debug("Failed to close Playwright page")

            if html:
                logger.debug(f"Playwright: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            logger.debug(f"Playwright: empty response from {url}")
            return None

        except ImportError:
            logger.warning("playwright not installed — cannot use JS rendering")
            return None
        except Exception as e:
            logger.exception(f"Playwright error fetching {url}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def close(self):
        """Clean up Playwright browser and resources."""
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                logger.debug(
                    "Failed to close Playwright browser", exc_info=True
                )
            self._browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception:
                logger.debug("Failed to stop Playwright", exc_info=True)
            self._playwright = None
        super().close()

298 

299 

class AutoHTMLDownloader(HTMLDownloader):
    """HTML downloader that tries static fetch first, falls back to
    Crawl4AI/Playwright when the page needs JavaScript rendering.

    Detection heuristics:
    - Extracted content is too short (<200 chars)
    - Raw HTML contains SPA framework signals (React, Vue, Angular, Next.js)
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        min_content_length: int = 200,
        **kwargs,
    ):
        # **kwargs is accepted (and ignored) for signature compatibility
        # with sibling downloader classes.
        super().__init__(timeout=timeout, language=language)
        # Static results shorter than this are treated as "probably
        # JS-rendered" and trigger the browser fallback.
        self.min_content_length = min_content_length
        # Lazily created in _get_playwright_downloader on first fallback.
        self._playwright_downloader = None

    def _get_playwright_downloader(self) -> PlaywrightHTMLDownloader:
        """Lazy-init JS rendering downloader for fallback."""
        if self._playwright_downloader is None:
            self._playwright_downloader = PlaywrightHTMLDownloader(
                timeout=self.timeout,
                language=self.language,
            )
        return self._playwright_downloader

    @staticmethod
    def _has_spa_signals(html: str) -> bool:
        """Check if HTML contains signals of a JS-rendered SPA."""
        html_lower = html[:5000].lower()  # Only check head/early body
        # Case-insensitive substring match against the module-level list.
        return any(signal.lower() in html_lower for signal in SPA_SIGNALS)

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML statically, storing raw response for SPA detection.

        Note: _last_raw_html is instance state read by download()/download_with_result().
        This is safe because AutoHTMLDownloader instances are created per-request
        in fetch_and_extract/batch_fetch_and_extract — not shared across threads.
        """
        # Reset before each fetch so stale HTML from a previous URL
        # can't drive this request's SPA detection.
        self._last_raw_html = None
        # Try the normal static fetch
        html = super()._fetch_html(url)
        if html:
            self._last_raw_html = html
            return html

        # Static fetch failed (403, etc.) — try raw GET to check for
        # challenge pages / SPA signals even on non-200 responses
        try:
            response = self.session.get(
                url,
                timeout=self.timeout,
                allow_redirects=True,
            )
            self._last_raw_html = response.text
        except Exception:
            # Best-effort only: detection simply has no raw HTML to inspect.
            logger.debug("Failed to fetch raw HTML for SPA detection")
        return None

    def download(self, url: str, content_type=None):
        """Try static fetch, fall back to JS rendering if needed."""
        # Imported locally; presumably avoids a circular import — TODO confirm.
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download(url, content_type)

        # Long-enough static content: accept it, no browser needed.
        if result and len(result) >= self.min_content_length:
            logger.debug(
                f"Auto: static fetch succeeded ({len(result)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        # (_last_raw_html was set by _fetch_html during super().download()).
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = result is None or len(result) < self.min_content_length

        if needs_js or no_content:
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download(url, content_type)
            # Only prefer the rendered result if it is strictly larger
            # than the static one (b"" gives len 0 when result is None).
            if pw_result and len(pw_result) > len(result or b""):
                logger.info(
                    f"Auto: JS rendering succeeded ({len(pw_result)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def download_with_result(self, url: str, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Same flow as download(), but operates on result objects that
        carry success/content fields instead of bare content.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download_with_result(url, content_type)

        # Successful and long enough — accept the static result.
        if (
            result.is_success
            and result.content
            and len(result.content) >= self.min_content_length
        ):
            logger.debug(
                f"Auto: static fetch succeeded ({len(result.content)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = (
            not result.is_success
            or not result.content
            or len(result.content) < self.min_content_length
        )

        if needs_js or no_content:
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download_with_result(url, content_type)
            # Prefer the rendered result only when it succeeded and is
            # strictly larger than what the static fetch produced.
            if (
                pw_result.is_success
                and pw_result.content
                and len(pw_result.content) > len(result.content or b"")
            ):
                logger.info(
                    f"Auto: JS rendering succeeded "
                    f"({len(pw_result.content)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def close(self):
        """Clean up both static and JS rendering resources."""
        if self._playwright_downloader:
            self._playwright_downloader.close()
            self._playwright_downloader = None
        super().close()