Coverage for src/local_deep_research/research_library/downloaders/playwright_html.py: 55%

199 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1""" 

2HTML Downloader with JavaScript rendering support. 

3 

4Uses Crawl4AI (default) or plain Playwright for JS-rendered pages. 

5Crawl4AI adds: robots.txt checking, shadow DOM flattening, iframe 

6inlining, smart scrolling for lazy-loaded content, and caching. 

7Falls back to plain Playwright if Crawl4AI is not installed. 

8 

9No stealth/anti-detection features are used — the browser identifies 

10honestly via BROWSER_USER_AGENT and respects robots.txt. 

11""" 

12 

13import asyncio 

14from typing import Optional 

15from urllib.parse import urlparse 

16 

17from loguru import logger 

18 

19from .html import HTMLDownloader 

20from ...constants import BROWSER_USER_AGENT 

21 

22 

# Substrings searched for in raw HTML; a match is treated as a sign that the
# page is a JS-rendered SPA and needs browser rendering.
SPA_SIGNALS = [
    # Mount-point element ids
    'id="root"',
    'id="app"',
    'id="__next"',
    # Framework markers embedded in the markup
    "__NEXT_DATA__",
    "data-reactroot",
    'ng-version="',
    # <noscript> fallbacks asking the visitor to turn JavaScript on
    "<noscript>You need to enable JavaScript",
    "<noscript>Please enable JavaScript",
    # Client-side state bootstrap
    "window.__INITIAL_STATE__",
]

35 

36 

def _run_async(coro, timeout: Optional[float] = None):
    """Run an async coroutine from synchronous code.

    Handles the case where an event loop is already running
    (e.g. inside Jupyter or an async framework) by creating
    a new thread with its own loop.

    The timeout is enforced *inside* the event loop with
    ``asyncio.wait_for`` so that it applies on both code paths and
    cancels the coroutine when it expires. Bounding only
    ``future.result()`` is not enough: the executor's context manager
    waits for the worker thread on exit, so the caller would still block
    until the coroutine finished on its own.

    Args:
        coro: The coroutine to run.
        timeout: Max seconds to wait for the result. ``None`` (default)
            waits indefinitely. Prevents indefinite hangs if the
            coroutine's internal timeout fails.

    Returns:
        Whatever ``coro`` returns.

    Raises:
        asyncio.TimeoutError: If ``coro`` does not finish within
            ``timeout`` seconds.
        Exception: Anything raised by ``coro`` propagates unchanged.
    """

    async def _bounded():
        # wait_for(timeout=None) simply awaits the coroutine.
        return await asyncio.wait_for(coro, timeout=timeout)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so we can own one directly.
        return asyncio.run(_bounded())

    # Already inside an event loop — run in a new thread with its own loop
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        # No timeout on result(): _bounded() enforces it and cancels the
        # coroutine, which lets the worker thread finish promptly.
        return pool.submit(asyncio.run, _bounded()).result()

63 

64 

class PlaywrightHTMLDownloader(HTMLDownloader):
    """HTML downloader with JS rendering via Crawl4AI or Playwright.

    Default: Crawl4AI (robots.txt, shadow DOM, iframes, caching).
    Fallback: plain Playwright if Crawl4AI is not installed or its fetch
    fails for a reason other than a robots.txt block.

    No stealth or anti-detection features are used.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        wait_until: str = "networkidle",
        block_resources: bool = True,
        **kwargs,
    ):
        """Store rendering options; no browser is started here.

        Args:
            timeout: Seconds allowed per page load (converted to
                milliseconds when handed to the browser).
            language: Forwarded to the base downloader.
            wait_until: Navigation wait condition passed to
                Crawl4AI / ``page.goto``.
            block_resources: When true, images are excluded (Crawl4AI)
                or heavy static assets are aborted (Playwright).
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__(timeout=timeout, language=language)
        self.wait_until = wait_until
        self.block_resources = block_resources
        # Plain Playwright fallback state (populated lazily on first use)
        self._playwright = None
        self._browser = None

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML with JS rendering.

        Tries Crawl4AI first (with robots.txt, shadow DOM, iframes),
        falls back to plain Playwright.

        Returns:
            The rendered HTML, or ``None`` when nothing could be fetched
            or the fetch was blocked by robots.txt.
        """
        # Try Crawl4AI first (richer features, robots.txt)
        html = self._fetch_with_crawl4ai(url)
        if html is not None:
            # Crawl4AI succeeded (non-empty) or intentionally blocked
            # by robots.txt (empty string). Either way, don't fall
            # through to Playwright.
            return html or None

        # Crawl4AI not installed or failed — fall back to Playwright
        return self._fetch_with_playwright(url)

    def _fetch_with_crawl4ai(self, url: str) -> Optional[str]:
        """Fetch HTML using Crawl4AI with ethical defaults.

        Returns:
            Non-empty HTML on success; ``""`` when Crawl4AI reports a
            robots.txt block (caller must not fall back); ``None`` when
            Crawl4AI is not installed or the fetch failed.
        """
        domain = urlparse(url).netloc
        engine_type = f"crawl4ai_download_{domain}"

        try:
            from crawl4ai import (
                AsyncWebCrawler,
                BrowserConfig,
                CrawlerRunConfig,
            )
        except ImportError:
            logger.debug("crawl4ai not installed — using Playwright")
            return None

        logger.debug(f"Crawl4AI fetch: {url}")
        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        browser_cfg = BrowserConfig(
            headless=True,
            verbose=False,
            user_agent=BROWSER_USER_AGENT,
        )
        run_cfg = CrawlerRunConfig(
            # Ethical: respect robots.txt
            check_robots_txt=True,
            # Better extraction: flatten modern web features
            flatten_shadow_dom=True,
            process_iframes=True,
            # Trigger lazy-loaded content
            scan_full_page=True,
            # Performance
            wait_until=self.wait_until,
            page_timeout=self.timeout * 1000,
            exclude_all_images=self.block_resources,
            # No stealth
            override_navigator=False,
            magic=False,
            simulate_user=False,
            verbose=False,
        )

        try:

            async def _crawl():
                async with AsyncWebCrawler(config=browser_cfg) as crawler:
                    return await crawler.arun(url=url, config=run_cfg)

            # Outer guard is longer than page_timeout so the crawler's own
            # timeout normally fires first.
            result = _run_async(_crawl(), timeout=self.timeout + 30)

            if result.success and result.html:
                html = result.html
                logger.debug(f"Crawl4AI: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            # Check if blocked by robots.txt
            error_msg = getattr(result, "error_message", "") or ""
            if "robots.txt" in error_msg.lower():
                logger.info(f"Crawl4AI: blocked by robots.txt for {url}")
                # Don't fall back to Playwright — respect the block
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=False,
                    retry_count=1,
                    error_type="robots_txt_blocked",
                )
                return ""  # Empty string signals intentional skip

            status = getattr(result, "status_code", "unknown")
            # Separator after the URL keeps it from running into "success=".
            logger.debug(
                f"Crawl4AI: failed for {url}: "
                f"success={result.success}, status={status}"
            )
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=f"crawl4ai_status_{status}",
            )
            return None

        except Exception as e:
            logger.debug(f"Crawl4AI error for {url}: {e}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def _fetch_with_playwright(self, url: str) -> Optional[str]:
        """Fetch HTML using plain Playwright (fallback).

        The Chromium browser is launched on first use and kept on the
        instance for later fetches; ``close()`` releases it.

        Returns:
            The page HTML, or ``None`` when Playwright is not installed,
            the page is empty, or any error occurs.
        """
        logger.debug(f"Playwright fetch: {url}")

        # Check availability before rate limiting, mirroring the Crawl4AI
        # path: no point waiting on the limiter for a fetch that cannot run.
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            logger.warning("playwright not installed — cannot use JS rendering")
            return None

        domain = urlparse(url).netloc
        engine_type = f"playwright_download_{domain}"

        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            # Lazy-init browser (reuse across multiple fetches).
            # --no-sandbox: Chromium needs SYS_ADMIN to set up its user-namespace
            # sandbox; the production container drops that cap. Without this
            # flag, launch() crashes inside Docker. Crawl4AI's own arg list
            # already includes it; this fallback path was missing it.
            # --disable-dev-shm-usage: Docker's default /dev/shm is 64 MB,
            # which Chromium can blow through and OOM. Use /tmp instead.
            if self._browser is None:
                logger.debug("Playwright: launching Chromium browser")
                pw = sync_playwright().start()
                try:
                    self._browser = pw.chromium.launch(
                        headless=True,
                        args=["--no-sandbox", "--disable-dev-shm-usage"],
                    )
                except Exception:
                    # Don't leak the driver if the browser fails to launch.
                    pw.stop()
                    raise
                self._playwright = pw

            page = self._browser.new_page(
                user_agent=BROWSER_USER_AGENT,
            )
            try:
                # Block heavy resources to speed up rendering
                if self.block_resources:
                    page.route(
                        "**/*.{png,jpg,jpeg,gif,webp,svg,ico,woff,woff2,"
                        "ttf,eot,mp4,webm,mp3,ogg,css}",
                        lambda route: route.abort(),
                    )

                page.goto(
                    url,
                    wait_until=self.wait_until,
                    timeout=self.timeout * 1000,
                )
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    logger.opt(exception=True).debug(
                        "Failed to close Playwright page"
                    )

            if html:
                logger.debug(f"Playwright: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            logger.debug(f"Playwright: empty response from {url}")
            # Record the miss so every rate-limited attempt has an outcome.
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type="empty_response",
            )
            return None

        except Exception as e:
            logger.exception(f"Playwright error fetching {url}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def close(self):
        """Clean up Playwright browser and resources.

        Failures while closing are logged (with traceback) and swallowed
        so the remaining cleanup and the base-class ``close()`` still run.
        """
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                # loguru ignores stdlib-style exc_info=True; opt(exception=True)
                # is what attaches the traceback.
                logger.opt(exception=True).debug(
                    "Failed to close Playwright browser"
                )
            self._browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception:
                logger.opt(exception=True).debug("Failed to stop Playwright")
            self._playwright = None
        super().close()

307 

308 

class AutoHTMLDownloader(HTMLDownloader):
    """HTML downloader that tries static fetch first, falls back to
    Crawl4AI/Playwright when the page needs JavaScript rendering.

    Detection heuristics:
    - Extracted content is too short (<200 chars by default)
    - Raw HTML contains SPA framework signals (React, Vue, Angular, Next.js)
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        min_content_length: int = 200,
        # Disabled by default to match the production Docker image, which
        # ships without Chromium — every JS-rendering fallback attempt
        # would otherwise fail loudly (see issue #3826). Callers running
        # outside Docker with Chromium installed opt in via the
        # ``web.enable_javascript_rendering`` setting, or pass ``True``
        # explicitly when constructing the downloader.
        enable_js_rendering: bool = False,
        **kwargs,
    ):
        """Configure the static-first downloader.

        Args:
            timeout: Seconds allowed per request; also handed to the JS
                rendering downloader when it is created.
            language: Forwarded to the base downloader and to the JS
                rendering downloader.
            min_content_length: Results shorter than this are treated as
                "no content" and trigger the JS-rendering decision.
            enable_js_rendering: Whether the JS-rendering fallback may run.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__(timeout=timeout, language=language)
        self.min_content_length = min_content_length
        self.enable_js_rendering = enable_js_rendering
        self._playwright_downloader = None
        # Raw HTML of the most recent static fetch, used for SPA detection.
        self._last_raw_html = None

    def _get_playwright_downloader(self) -> PlaywrightHTMLDownloader:
        """Lazy-init JS rendering downloader for fallback."""
        if self._playwright_downloader is None:
            self._playwright_downloader = PlaywrightHTMLDownloader(
                timeout=self.timeout,
                language=self.language,
            )
        return self._playwright_downloader

    @staticmethod
    def _has_spa_signals(html: str) -> bool:
        """Check if HTML contains signals of a JS-rendered SPA.

        Only the first 5000 characters are inspected, case-insensitively.
        """
        html_lower = html[:5000].lower()  # Only check head/early body
        return any(signal.lower() in html_lower for signal in SPA_SIGNALS)

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML statically, storing raw response for SPA detection.

        Note: _last_raw_html is instance state read by download()/download_with_result().
        This is safe because AutoHTMLDownloader instances are created per-request
        in fetch_and_extract/batch_fetch_and_extract — not shared across threads.

        Returns:
            The HTML from the base-class fetch, or ``None`` if that fetch
            produced nothing (even when the follow-up raw GET succeeded —
            its body is only kept in ``_last_raw_html``).
        """
        self._last_raw_html = None
        # Try the normal static fetch
        html = super()._fetch_html(url)
        if html:
            self._last_raw_html = html
            return html

        # Static fetch failed (403, etc.) — try raw GET to check for
        # challenge pages / SPA signals even on non-200 responses
        try:
            response = self.session.get(
                url,
                timeout=self.timeout,
                allow_redirects=True,
            )
            self._last_raw_html = response.text
        except Exception:
            logger.debug("Failed to fetch raw HTML for SPA detection")
        return None

    def download(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Args:
            url: Page to download.
            content_type: Desired content type; defaults to
                ``ContentType.TEXT`` when ``None``.

        Returns:
            The static result when it is long enough or JS rendering is
            disabled / did not yield a longer result; otherwise the JS
            rendered result. May be ``None`` when nothing was fetched.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download(url, content_type)

        if result and len(result) >= self.min_content_length:
            logger.debug(
                f"Auto: static fetch succeeded ({len(result)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering.
        # getattr keeps instances built without __init__ working.
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = result is None or len(result) < self.min_content_length

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download(url, content_type)
            # Only prefer the rendered page when it is strictly longer.
            if pw_result and len(pw_result) > len(result or b""):
                logger.info(
                    f"Auto: JS rendering succeeded ({len(pw_result)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def download_with_result(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Same decision flow as ``download()``, but operates on result
        objects exposing ``is_success`` and ``content``.

        Args:
            url: Page to download.
            content_type: Desired content type; defaults to
                ``ContentType.TEXT`` when ``None``.

        Returns:
            The static result object, or the JS-rendered one when it
            succeeded with strictly longer content.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download_with_result(url, content_type)

        if (
            result.is_success
            and result.content
            and len(result.content) >= self.min_content_length
        ):
            logger.debug(
                f"Auto: static fetch succeeded ({len(result.content)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering.
        # getattr keeps instances built without __init__ working.
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = (
            not result.is_success
            or not result.content
            or len(result.content) < self.min_content_length
        )

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download_with_result(url, content_type)
            # Only prefer the rendered page when it is strictly longer.
            if (
                pw_result.is_success
                and pw_result.content
                and len(pw_result.content) > len(result.content or b"")
            ):
                logger.info(
                    f"Auto: JS rendering succeeded "
                    f"({len(pw_result.content)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def close(self):
        """Clean up both static and JS rendering resources.

        The base-class ``close()`` runs even if closing the JS rendering
        downloader raises, and the reference to that downloader is always
        dropped.
        """
        js_downloader = self._playwright_downloader
        self._playwright_downloader = None
        try:
            if js_downloader:
                js_downloader.close()
        finally:
            super().close()