Coverage for src/local_deep_research/research_library/downloaders/playwright_html.py: 55%
199 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""
HTML Downloader with JavaScript rendering support.

Uses Crawl4AI (default) or plain Playwright for JS-rendered pages.
Crawl4AI adds: robots.txt checking, shadow DOM flattening, iframe
inlining, smart scrolling for lazy-loaded content, and caching.
Falls back to plain Playwright if Crawl4AI is not installed.

No stealth/anti-detection features are used — the browser identifies
honestly via BROWSER_USER_AGENT and respects robots.txt.
11"""
13import asyncio
14from typing import Optional
15from urllib.parse import urlparse
17from loguru import logger
19from .html import HTMLDownloader
20from ...constants import BROWSER_USER_AGENT
# Substrings whose presence in a page's raw HTML indicates a JS-rendered
# single-page app, i.e. a page that needs browser rendering to yield content.
# Matching is done case-insensitively by the consumer in this module.
SPA_SIGNALS = [
    # Mount-point element ids
    'id="root"',
    'id="app"',
    'id="__next"',
    # Framework markers embedded in the served markup
    "__NEXT_DATA__",
    "data-reactroot",
    'ng-version="',
    # <noscript> notices shown when scripting is unavailable
    "<noscript>You need to enable JavaScript",
    "<noscript>Please enable JavaScript",
    # Client-side state bootstrap global
    "window.__INITIAL_STATE__",
]
37def _run_async(coro, timeout: float = None):
38 """Run an async coroutine from synchronous code.
40 Handles the case where an event loop is already running
41 (e.g. inside Jupyter or an async framework) by creating
42 a new thread with its own loop.
44 Args:
45 coro: The coroutine to run.
46 timeout: Max seconds to wait for the result. Prevents
47 indefinite hangs if the coroutine's internal timeout fails.
48 """
49 try:
50 loop = asyncio.get_running_loop()
51 except RuntimeError:
52 loop = None
54 if loop is None:
55 return asyncio.run(coro)
57 # Already inside an event loop — run in a new thread
58 import concurrent.futures
60 with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
61 future = pool.submit(asyncio.run, coro)
62 return future.result(timeout=timeout)
class PlaywrightHTMLDownloader(HTMLDownloader):
    """HTML downloader with JS rendering via Crawl4AI or Playwright.

    Default: Crawl4AI (robots.txt, shadow DOM, iframes, caching).
    Fallback: plain Playwright if Crawl4AI is not installed.

    No stealth or anti-detection features are used.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        wait_until: str = "networkidle",
        block_resources: bool = True,
        **kwargs,
    ):
        """Configure the downloader.

        Args:
            timeout: Per-page timeout in seconds (converted to milliseconds
                for both Crawl4AI and Playwright).
            language: Passed through to ``HTMLDownloader``.
            wait_until: Navigation wait condition handed to the browser.
            block_resources: When true, images are excluded (Crawl4AI) and
                heavy static assets are aborted (Playwright).
            **kwargs: Accepted for call compatibility; not forwarded.
        """
        super().__init__(timeout=timeout, language=language)
        self.wait_until = wait_until
        self.block_resources = block_resources
        # Plain Playwright fallback state (created lazily on first use)
        self._playwright = None
        self._browser = None

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML with JS rendering.

        Tries Crawl4AI first (with robots.txt, shadow DOM, iframes),
        falls back to plain Playwright.

        Returns:
            The rendered HTML, or ``None`` when nothing could be fetched or
            the fetch was blocked by robots.txt.
        """
        # Try Crawl4AI first (richer features, robots.txt)
        html = self._fetch_with_crawl4ai(url)
        if html is not None:
            # Crawl4AI succeeded (non-empty) or intentionally blocked
            # by robots.txt (empty string). Either way, don't fall
            # through to Playwright.
            return html or None

        # Crawl4AI not installed or failed — fall back to Playwright
        return self._fetch_with_playwright(url)

    def _fetch_with_crawl4ai(self, url: str) -> Optional[str]:
        """Fetch HTML using Crawl4AI with ethical defaults.

        Returns:
            Non-empty HTML on success, ``""`` when robots.txt blocked the
            fetch (the caller must not fall back), or ``None`` when Crawl4AI
            is not installed or the crawl failed.
        """
        domain = urlparse(url).netloc
        engine_type = f"crawl4ai_download_{domain}"

        try:
            from crawl4ai import (
                AsyncWebCrawler,
                BrowserConfig,
                CrawlerRunConfig,
            )
        except ImportError:
            logger.debug("crawl4ai not installed — using Playwright")
            return None

        logger.debug(f"Crawl4AI fetch: {url}")
        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        browser_cfg = BrowserConfig(
            headless=True,
            verbose=False,
            user_agent=BROWSER_USER_AGENT,
        )
        run_cfg = CrawlerRunConfig(
            # Ethical: respect robots.txt
            check_robots_txt=True,
            # Better extraction: flatten modern web features
            flatten_shadow_dom=True,
            process_iframes=True,
            # Trigger lazy-loaded content
            scan_full_page=True,
            # Performance
            wait_until=self.wait_until,
            page_timeout=self.timeout * 1000,
            exclude_all_images=self.block_resources,
            # No stealth
            override_navigator=False,
            magic=False,
            simulate_user=False,
            verbose=False,
        )

        try:

            async def _crawl():
                async with AsyncWebCrawler(config=browser_cfg) as crawler:
                    return await crawler.arun(url=url, config=run_cfg)

            # Outer limit is looser than page_timeout so the crawler's own
            # timeout normally fires first.
            result = _run_async(_crawl(), timeout=self.timeout + 30)

            if result.success and result.html:
                html = result.html
                logger.debug(f"Crawl4AI: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            # Check if blocked by robots.txt
            error_msg = getattr(result, "error_message", "") or ""
            if "robots.txt" in error_msg.lower():
                logger.info(f"Crawl4AI: blocked by robots.txt for {url}")
                # Don't fall back to Playwright — respect the block
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=False,
                    retry_count=1,
                    error_type="robots_txt_blocked",
                )
                return ""  # Empty string signals intentional skip

            status = getattr(result, "status_code", "unknown")
            logger.debug(
                f"Crawl4AI: failed for {url} — "
                f"success={result.success}, status={status}"
            )
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=f"crawl4ai_status_{status}",
            )
            return None

        except Exception as e:
            # Best effort: any crawler failure degrades to the fallback path.
            logger.debug(f"Crawl4AI error for {url}: {e}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def _fetch_with_playwright(self, url: str) -> Optional[str]:
        """Fetch HTML using plain Playwright (fallback).

        Returns:
            The rendered page HTML, or ``None`` if Playwright is not
            installed, the page came back empty, or any error occurred.
        """
        logger.debug(f"Playwright fetch: {url}")

        # Resolve the dependency before touching the rate limiter, as the
        # Crawl4AI path does, so a missing package costs no rate-limit wait.
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            logger.warning("playwright not installed — cannot use JS rendering")
            return None

        domain = urlparse(url).netloc
        engine_type = f"playwright_download_{domain}"

        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            # Lazy-init browser (reuse across multiple fetches).
            # --no-sandbox: Chromium needs SYS_ADMIN to set up its user-namespace
            # sandbox; the production container drops that cap. Without this
            # flag, launch() crashes inside Docker. Crawl4AI's own arg list
            # already includes it; this fallback path was missing it.
            # --disable-dev-shm-usage: Docker's default /dev/shm is 64 MB,
            # which Chromium can blow through and OOM. Use /tmp instead.
            if self._browser is None:
                logger.debug("Playwright: launching Chromium browser")
                pw = sync_playwright().start()
                try:
                    self._browser = pw.chromium.launch(
                        headless=True,
                        args=["--no-sandbox", "--disable-dev-shm-usage"],
                    )
                except Exception:
                    # Don't leak the driver process if the launch fails.
                    pw.stop()
                    raise
                self._playwright = pw

            page = self._browser.new_page(
                user_agent=BROWSER_USER_AGENT,
            )
            try:
                # Block heavy resources to speed up rendering
                if self.block_resources:
                    page.route(
                        "**/*.{png,jpg,jpeg,gif,webp,svg,ico,woff,woff2,"
                        "ttf,eot,mp4,webm,mp3,ogg,css}",
                        lambda route: route.abort(),
                    )

                page.goto(
                    url,
                    wait_until=self.wait_until,
                    timeout=self.timeout * 1000,
                )
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    logger.debug("Failed to close Playwright page")

            if html:
                logger.debug(f"Playwright: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            logger.debug(f"Playwright: empty response from {url}")
            # Record the miss so every rate-limited attempt has an outcome,
            # matching the other failure paths in this class.
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type="empty_response",
            )
            return None

        except Exception as e:
            logger.exception(f"Playwright error fetching {url}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def close(self):
        """Clean up Playwright browser and resources."""
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                logger.debug(
                    "Failed to close Playwright browser", exc_info=True
                )
            self._browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception:
                logger.debug("Failed to stop Playwright", exc_info=True)
            self._playwright = None
        super().close()
class AutoHTMLDownloader(HTMLDownloader):
    """HTML downloader that tries static fetch first, falls back to
    Crawl4AI/Playwright when the page needs JavaScript rendering.

    Detection heuristics:
    - Extracted content is too short (<200 chars)
    - Raw HTML contains SPA framework signals (React, Vue, Angular, Next.js)
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        min_content_length: int = 200,
        # Disabled by default to match the production Docker image, which
        # ships without Chromium — every JS-rendering fallback attempt
        # would otherwise fail loudly (see issue #3826). Callers running
        # outside Docker with Chromium installed opt in via the
        # ``web.enable_javascript_rendering`` setting, or pass ``True``
        # explicitly when constructing the downloader.
        enable_js_rendering: bool = False,
        **kwargs,
    ):
        """Configure the downloader.

        Args:
            timeout: Request timeout in seconds.
            language: Passed through to ``HTMLDownloader``.
            min_content_length: Results shorter than this count as
                "no content" and may trigger the JS fallback.
            enable_js_rendering: Whether the JS fallback may run at all.
            **kwargs: Accepted for call compatibility; not forwarded.
        """
        super().__init__(timeout=timeout, language=language)
        self.min_content_length = min_content_length
        self.enable_js_rendering = enable_js_rendering
        self._playwright_downloader: Optional[PlaywrightHTMLDownloader] = None
        # Raw HTML of the most recent static fetch, used for SPA detection.
        self._last_raw_html: Optional[str] = None

    def _get_playwright_downloader(self) -> PlaywrightHTMLDownloader:
        """Lazy-init JS rendering downloader for fallback."""
        if self._playwright_downloader is None:
            self._playwright_downloader = PlaywrightHTMLDownloader(
                timeout=self.timeout,
                language=self.language,
            )
        return self._playwright_downloader

    @staticmethod
    def _has_spa_signals(html: str) -> bool:
        """Check if HTML contains signals of a JS-rendered SPA."""
        html_lower = html[:5000].lower()  # Only check head/early body
        return any(signal.lower() in html_lower for signal in SPA_SIGNALS)

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML statically, storing raw response for SPA detection.

        Note: _last_raw_html is instance state read by download()/download_with_result().
        This is safe because AutoHTMLDownloader instances are created per-request
        in fetch_and_extract/batch_fetch_and_extract — not shared across threads.

        Returns:
            The statically fetched HTML, or ``None`` if the static fetch
            produced nothing (raw HTML may still have been captured).
        """
        self._last_raw_html = None
        # Try the normal static fetch
        html = super()._fetch_html(url)
        if html:
            self._last_raw_html = html
            return html

        # Static fetch failed (403, etc.) — try raw GET to check for
        # challenge pages / SPA signals even on non-200 responses
        try:
            response = self.session.get(
                url,
                timeout=self.timeout,
                allow_redirects=True,
            )
            self._last_raw_html = response.text
        except Exception:
            logger.debug("Failed to fetch raw HTML for SPA detection")
        return None

    def download(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Returns:
            The static result when it is long enough or JS rendering is
            disabled or no better; otherwise the JS-rendered result.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # Drop raw HTML left over from a previous URL so SPA detection can
        # never act on a stale page if the static path does not refresh it.
        self._last_raw_html = None

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download(url, content_type)

        if result and len(result) >= self.min_content_length:
            logger.debug(
                f"Auto: static fetch succeeded ({len(result)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = bool(raw_html) and self._has_spa_signals(raw_html)
        no_content = result is None or len(result) < self.min_content_length

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download(url, content_type)
            # Only prefer the rendered page when it actually yields more.
            if pw_result and len(pw_result) > len(result or b""):
                logger.info(
                    f"Auto: JS rendering succeeded ({len(pw_result)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def download_with_result(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Same decision logic as ``download()``, operating on result objects
        that expose ``is_success`` and ``content``.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # Drop raw HTML left over from a previous URL so SPA detection can
        # never act on a stale page if the static path does not refresh it.
        self._last_raw_html = None

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download_with_result(url, content_type)

        if (
            result.is_success
            and result.content
            and len(result.content) >= self.min_content_length
        ):
            logger.debug(
                f"Auto: static fetch succeeded ({len(result.content)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = bool(raw_html) and self._has_spa_signals(raw_html)
        no_content = (
            not result.is_success
            or not result.content
            or len(result.content) < self.min_content_length
        )

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download_with_result(url, content_type)
            # Only prefer the rendered page when it actually yields more.
            if (
                pw_result.is_success
                and pw_result.content
                and len(pw_result.content) > len(result.content or b"")
            ):
                logger.info(
                    f"Auto: JS rendering succeeded "
                    f"({len(pw_result.content)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def close(self):
        """Clean up both static and JS rendering resources."""
        try:
            if self._playwright_downloader:
                self._playwright_downloader.close()
        finally:
            # Release the static downloader's resources even if tearing
            # down the JS downloader raised.
            self._playwright_downloader = None
            super().close()