Coverage for src / local_deep_research / research_library / downloaders / playwright_html.py: 35%
192 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""
HTML Downloader with JavaScript rendering support.

Uses Crawl4AI (default) or plain Playwright for JS-rendered pages.
Crawl4AI adds: robots.txt checking, shadow DOM flattening, iframe
inlining, smart scrolling for lazy-loaded content, and caching.
Falls back to plain Playwright if Crawl4AI is not installed.

No stealth/anti-detection features are used — the browser identifies
honestly via BROWSER_USER_AGENT and respects robots.txt.
"""
import asyncio
from typing import Optional
from urllib.parse import urlparse

from loguru import logger

from .html import HTMLDownloader
from ...constants import BROWSER_USER_AGENT
# Signals that a page is a JS-rendered SPA and needs browser rendering.
# Matched case-insensitively against the first chunk of the raw HTML.
SPA_SIGNALS = [
    # Common empty mount points (React, Vue, Next.js)
    'id="root"',
    'id="app"',
    'id="__next"',
    # Framework bootstrap markers
    "__NEXT_DATA__",
    "data-reactroot",
    'ng-version="',
    # Explicit "JS required" fallbacks
    "<noscript>You need to enable JavaScript",
    "<noscript>Please enable JavaScript",
    # Client-side state hydration
    "window.__INITIAL_STATE__",
]
def _run_async(coro, timeout: Optional[float] = None):
    """Run an async coroutine from synchronous code.

    Handles the case where an event loop is already running
    (e.g. inside Jupyter or an async framework) by creating
    a new thread with its own loop.

    Args:
        coro: The coroutine to run.
        timeout: Max seconds to wait for the result. Prevents
            indefinite hangs if the coroutine's internal timeout fails.

    Returns:
        Whatever the coroutine returns.

    Raises:
        asyncio.TimeoutError: If the direct-run path exceeds `timeout`.
        concurrent.futures.TimeoutError: If the threaded path exceeds
            `timeout`.
    """
    try:
        loop = asyncio.get_running_loop()
    except RuntimeError:
        loop = None

    if loop is None:
        # No loop running — run directly. Wrap in wait_for so `timeout`
        # is honored on this path too (previously it was only enforced
        # on the already-in-a-loop path, so this path could hang).
        if timeout is not None:
            return asyncio.run(asyncio.wait_for(coro, timeout))
        return asyncio.run(coro)

    # Already inside an event loop — run in a new thread with its own loop
    import concurrent.futures

    with concurrent.futures.ThreadPoolExecutor(max_workers=1) as pool:
        future = pool.submit(asyncio.run, coro)
        # NOTE(review): if this times out, exiting the `with` still waits
        # for the worker thread to finish its coroutine before returning.
        return future.result(timeout=timeout)
class PlaywrightHTMLDownloader(HTMLDownloader):
    """HTML downloader with JS rendering via Crawl4AI or Playwright.

    Default: Crawl4AI (robots.txt, shadow DOM, iframes, caching).
    Fallback: plain Playwright if Crawl4AI is not installed.

    No stealth or anti-detection features are used.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        wait_until: str = "networkidle",
        block_resources: bool = True,
        **kwargs,
    ):
        """Initialize the downloader.

        Args:
            timeout: Per-page timeout in seconds.
            language: Extraction language, passed to the base class.
            wait_until: Navigation wait condition for both backends
                (e.g. "networkidle", "load", "domcontentloaded").
            block_resources: Skip heavy resources (images, fonts, media,
                CSS) to speed up rendering.
            **kwargs: Ignored; accepted for interface compatibility.
        """
        super().__init__(timeout=timeout, language=language)
        self.wait_until = wait_until
        self.block_resources = block_resources
        # Plain Playwright fallback state (lazy-initialized on first
        # use, reused across fetches, released in close())
        self._playwright = None
        self._browser = None

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML with JS rendering.

        Tries Crawl4AI first (with robots.txt, shadow DOM, iframes),
        falls back to plain Playwright.
        """
        # Try Crawl4AI first (richer features, robots.txt)
        html = self._fetch_with_crawl4ai(url)
        if html is not None:
            # Crawl4AI succeeded (non-empty) or intentionally blocked
            # by robots.txt (empty string). Either way, don't fall
            # through to Playwright.
            return html or None

        # Crawl4AI not installed or failed — fall back to Playwright
        return self._fetch_with_playwright(url)

    def _fetch_with_crawl4ai(self, url: str) -> Optional[str]:
        """Fetch HTML using Crawl4AI with ethical defaults.

        Returns:
            The page HTML on success, "" when blocked by robots.txt
            (signals an intentional skip to _fetch_html), or None when
            Crawl4AI is unavailable or the fetch failed.
        """
        domain = urlparse(url).netloc
        # Rate-limit per domain so one slow site doesn't throttle others
        engine_type = f"crawl4ai_download_{domain}"

        try:
            from crawl4ai import (
                AsyncWebCrawler,
                BrowserConfig,
                CrawlerRunConfig,
            )
        except ImportError:
            logger.debug("crawl4ai not installed — using Playwright")
            return None

        logger.debug(f"Crawl4AI fetch: {url}")
        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        browser_cfg = BrowserConfig(
            headless=True,
            verbose=False,
            user_agent=BROWSER_USER_AGENT,
        )
        run_cfg = CrawlerRunConfig(
            # Ethical: respect robots.txt
            check_robots_txt=True,
            # Better extraction: flatten modern web features
            flatten_shadow_dom=True,
            process_iframes=True,
            # Trigger lazy-loaded content
            scan_full_page=True,
            # Performance
            wait_until=self.wait_until,
            page_timeout=self.timeout * 1000,
            exclude_all_images=self.block_resources,
            # No stealth
            override_navigator=False,
            magic=False,
            simulate_user=False,
            verbose=False,
        )

        try:

            async def _crawl():
                async with AsyncWebCrawler(config=browser_cfg) as crawler:
                    return await crawler.arun(url=url, config=run_cfg)

            # Extra 30s margin over the page timeout so the outer guard
            # only fires if Crawl4AI's own timeout failed to trigger
            result = _run_async(_crawl(), timeout=self.timeout + 30)

            if result.success and result.html:
                html = result.html
                logger.debug(f"Crawl4AI: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            # Check if blocked by robots.txt
            error_msg = getattr(result, "error_message", "") or ""
            if "robots.txt" in error_msg.lower():
                logger.info(f"Crawl4AI: blocked by robots.txt for {url}")
                # Don't fall back to Playwright — respect the block
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=False,
                    retry_count=1,
                    error_type="robots_txt_blocked",
                )
                return ""  # Empty string signals intentional skip

            status = getattr(result, "status_code", "unknown")
            logger.debug(
                f"Crawl4AI: failed for {url} — "
                f"success={result.success}, status={status}"
            )
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=f"crawl4ai_status_{status}",
            )
            return None

        except Exception as e:
            logger.debug(f"Crawl4AI error for {url}: {e}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def _fetch_with_playwright(self, url: str) -> Optional[str]:
        """Fetch HTML using plain Playwright (fallback).

        Returns:
            The rendered page HTML, or None if Playwright is not
            installed, the fetch failed, or the page was empty.
        """
        logger.debug(f"Playwright fetch: {url}")
        domain = urlparse(url).netloc
        engine_type = f"playwright_download_{domain}"

        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            from playwright.sync_api import sync_playwright

            # Lazy-init browser (reuse across multiple fetches)
            if self._browser is None:
                logger.debug("Playwright: launching Chromium browser")
                pw = sync_playwright().start()
                try:
                    self._browser = pw.chromium.launch(headless=True)
                except Exception:
                    # Launch failed — stop the driver so we don't leak it
                    pw.stop()
                    raise
                self._playwright = pw

            page = self._browser.new_page(
                user_agent=BROWSER_USER_AGENT,
            )
            try:
                # Block heavy resources to speed up rendering
                if self.block_resources:
                    page.route(
                        "**/*.{png,jpg,jpeg,gif,webp,svg,ico,woff,woff2,"
                        "ttf,eot,mp4,webm,mp3,ogg,css}",
                        lambda route: route.abort(),
                    )

                page.goto(
                    url,
                    wait_until=self.wait_until,
                    timeout=self.timeout * 1000,
                )
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    logger.debug("Failed to close Playwright page")

            if html:
                logger.debug(f"Playwright: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            logger.debug(f"Playwright: empty response from {url}")
            # Record the failure so the rate tracker sees it, matching
            # how every failure path in _fetch_with_crawl4ai is recorded
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type="empty_response",
            )
            return None

        except ImportError:
            logger.warning("playwright not installed — cannot use JS rendering")
            return None
        except Exception as e:
            logger.exception(f"Playwright error fetching {url}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def close(self):
        """Clean up Playwright browser and resources."""
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                logger.debug(
                    "Failed to close Playwright browser", exc_info=True
                )
            self._browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception:
                logger.debug("Failed to stop Playwright", exc_info=True)
            self._playwright = None
        super().close()
class AutoHTMLDownloader(HTMLDownloader):
    """HTML downloader that tries static fetch first, falls back to
    Crawl4AI/Playwright when the page needs JavaScript rendering.

    Detection heuristics:
    - Extracted content is too short (<200 chars)
    - Raw HTML contains SPA framework signals (React, Vue, Angular, Next.js)
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        min_content_length: int = 200,
        **kwargs,
    ):
        """Initialize the downloader.

        Args:
            timeout: Per-request timeout in seconds.
            language: Extraction language, passed to the base class.
            min_content_length: Minimum extracted-content length below
                which the result is considered "too short" and the JS
                rendering fallback is attempted.
            **kwargs: Ignored; accepted for interface compatibility.
        """
        super().__init__(timeout=timeout, language=language)
        self.min_content_length = min_content_length
        # JS-rendering fallback, created lazily on first use
        self._playwright_downloader = None

    def _get_playwright_downloader(self) -> PlaywrightHTMLDownloader:
        """Lazy-init JS rendering downloader for fallback."""
        if self._playwright_downloader is None:
            self._playwright_downloader = PlaywrightHTMLDownloader(
                timeout=self.timeout,
                language=self.language,
            )
        return self._playwright_downloader

    @staticmethod
    def _has_spa_signals(html: str) -> bool:
        """Check if HTML contains signals of a JS-rendered SPA."""
        html_lower = html[:5000].lower()  # Only check head/early body
        return any(signal.lower() in html_lower for signal in SPA_SIGNALS)

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML statically, storing raw response for SPA detection.

        Note: _last_raw_html is instance state read by download()/download_with_result().
        This is safe because AutoHTMLDownloader instances are created per-request
        in fetch_and_extract/batch_fetch_and_extract — not shared across threads.
        """
        self._last_raw_html = None
        # Try the normal static fetch
        html = super()._fetch_html(url)
        if html:
            self._last_raw_html = html
            return html

        # Static fetch failed (403, etc.) — try raw GET to check for
        # challenge pages / SPA signals even on non-200 responses
        try:
            response = self.session.get(
                url,
                timeout=self.timeout,
                allow_redirects=True,
            )
            self._last_raw_html = response.text
        except Exception:
            logger.debug("Failed to fetch raw HTML for SPA detection")
        return None

    def download(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed."""
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download(url, content_type)

        if result and len(result) >= self.min_content_length:
            logger.debug(
                f"Auto: static fetch succeeded ({len(result)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = result is None or len(result) < self.min_content_length

        if needs_js or no_content:
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download(url, content_type)
            # Only prefer the JS result if it's strictly larger than
            # whatever the static fetch produced
            if pw_result and len(pw_result) > len(result or b""):
                logger.info(
                    f"Auto: JS rendering succeeded ({len(pw_result)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def download_with_result(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed."""
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download_with_result(url, content_type)

        if (
            result.is_success
            and result.content
            and len(result.content) >= self.min_content_length
        ):
            logger.debug(
                f"Auto: static fetch succeeded ({len(result.content)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        raw_html = getattr(self, "_last_raw_html", None)
        needs_js = raw_html and self._has_spa_signals(raw_html)
        no_content = (
            not result.is_success
            or not result.content
            or len(result.content) < self.min_content_length
        )

        if needs_js or no_content:
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download_with_result(url, content_type)
            # Only prefer the JS result if it's strictly larger than
            # whatever the static fetch produced
            if (
                pw_result.is_success
                and pw_result.content
                and len(pw_result.content) > len(result.content or b"")
            ):
                logger.info(
                    f"Auto: JS rendering succeeded "
                    f"({len(pw_result.content)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def close(self):
        """Clean up both static and JS rendering resources."""
        if self._playwright_downloader:
            self._playwright_downloader.close()
            self._playwright_downloader = None
        super().close()