Coverage for src/local_deep_research/research_library/downloaders/playwright

1"""

2HTML Downloader with JavaScript rendering support.

4Uses Crawl4AI (default) or plain Playwright for JS-rendered pages.

5Crawl4AI adds: robots.txt checking, shadow DOM flattening, iframe

6inlining, smart scrolling for lazy-loaded content, and caching.

7Falls back to plain Playwright if Crawl4AI is not installed.

9No stealth/anti-detection features are used — the browser identifies

10honestly via BROWSER_USER_AGENT and respects robots.txt.

11"""

13import asyncio

14from typing import Optional

15from urllib.parse import urlparse

17from loguru import logger

19from .html import HTMLDownloader

20from ...constants import BROWSER_USER_AGENT

# Signals that a page is a JS-rendered SPA and needs browser rendering.
# Each entry is a plain substring; AutoHTMLDownloader._has_spa_signals
# lower-cases both the entry and the first 5000 characters of the raw HTML
# before testing for containment, so matching is case-insensitive.
SPA_SIGNALS = [
    # Mount-point ids / bootstrap markers (presumably React, Vue, Next.js
    # and Angular conventions — verify before adding or removing entries).
    'id="root"',
    'id="app"',
    'id="__next"',
    "__NEXT_DATA__",
    "data-reactroot",
    'ng-version="',
    # <noscript> fallbacks shown when JavaScript is unavailable.
    "<noscript>You need to enable JavaScript",
    "<noscript>Please enable JavaScript",
    # Global state object embedded in the page for client-side use.
    "window.__INITIAL_STATE__",
]

def _run_async(coro, timeout: Optional[float] = None):
    """Run an async coroutine from synchronous code.

    Handles the case where an event loop is already running
    (e.g. inside Jupyter or an async framework) by creating
    a new thread with its own loop.

    Args:
        coro: The coroutine to run.
        timeout: Max seconds to wait for the result, or ``None`` to wait
            indefinitely. Prevents indefinite hangs if the coroutine's
            internal timeout fails. Enforced on both code paths.

    Returns:
        The value the coroutine returns.

    Raises:
        TimeoutError: ``asyncio.TimeoutError`` or
            ``concurrent.futures.TimeoutError`` (both are the builtin
            ``TimeoutError`` on Python 3.11+) when ``timeout`` elapses.
        Exception: Anything raised by the coroutine itself propagates.
    """
    import concurrent.futures

    async def _bounded():
        # wait_for cancels the coroutine on timeout, which lets its own
        # cleanup (e.g. ``async with`` exits) run instead of leaking it.
        if timeout is None:
            return await coro
        return await asyncio.wait_for(coro, timeout)

    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread, so we can drive one directly.
        return asyncio.run(_bounded())

    # Already inside an event loop — run in a new thread with its own loop.
    # The pool is deliberately NOT used as a context manager: its __exit__
    # calls shutdown(wait=True), which would block on a hung worker and
    # defeat the timeout passed to future.result().
    pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
    try:
        future = pool.submit(asyncio.run, _bounded())
        return future.result(timeout=timeout)
    finally:
        pool.shutdown(wait=False)

class PlaywrightHTMLDownloader(HTMLDownloader):
    """HTML downloader with JS rendering via Crawl4AI or Playwright.

    Default: Crawl4AI (robots.txt, shadow DOM, iframes, caching).
    Fallback: plain Playwright if Crawl4AI is not installed.

    No stealth or anti-detection features are used.
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        wait_until: str = "networkidle",
        block_resources: bool = True,
        **kwargs,
    ):
        """Configure the downloader.

        Args:
            timeout: Per-page timeout in seconds (converted to milliseconds
                when handed to Crawl4AI / Playwright).
            language: Forwarded unchanged to ``HTMLDownloader``.
            wait_until: Navigation readiness event passed to both backends.
            block_resources: When true, images are excluded under Crawl4AI
                and heavy static assets are aborted under Playwright.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__(timeout=timeout, language=language)
        self.wait_until = wait_until
        self.block_resources = block_resources
        # Plain Playwright fallback state (created lazily on first use)
        self._playwright = None
        self._browser = None

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML with JS rendering.

        Tries Crawl4AI first (with robots.txt, shadow DOM, iframes),
        falls back to plain Playwright.

        Returns:
            The rendered HTML, or ``None`` when nothing could be fetched
            or the fetch was blocked by robots.txt.
        """
        # Try Crawl4AI first (richer features, robots.txt)
        html = self._fetch_with_crawl4ai(url)
        if html is not None:
            # Crawl4AI succeeded (non-empty) or intentionally blocked
            # by robots.txt (empty string). Either way, don't fall
            # through to Playwright.
            return html or None

        # Crawl4AI not installed or failed — fall back to Playwright
        return self._fetch_with_playwright(url)

    def _fetch_with_crawl4ai(self, url: str) -> Optional[str]:
        """Fetch HTML using Crawl4AI with ethical defaults.

        Returns:
            The rendered HTML on success; ``""`` when Crawl4AI reports a
            robots.txt block (an intentional skip — the caller must not
            fall back); ``None`` when Crawl4AI is not installed or the
            crawl failed (the caller may fall back).
        """
        domain = urlparse(url).netloc
        engine_type = f"crawl4ai_download_{domain}"

        try:
            from crawl4ai import (
                AsyncWebCrawler,
                BrowserConfig,
                CrawlerRunConfig,
            )
        except ImportError:
            logger.debug("crawl4ai not installed — using Playwright")
            return None

        logger.debug(f"Crawl4AI fetch: {url}")
        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            # Config objects are built inside the try block so that an
            # installed Crawl4AI version rejecting one of these options is
            # handled like any other Crawl4AI failure (outcome recorded,
            # Playwright fallback) instead of escaping to the caller.
            browser_cfg = BrowserConfig(
                headless=True,
                verbose=False,
                user_agent=BROWSER_USER_AGENT,
            )
            run_cfg = CrawlerRunConfig(
                # Ethical: respect robots.txt
                check_robots_txt=True,
                # Better extraction: flatten modern web features
                flatten_shadow_dom=True,
                process_iframes=True,
                # Trigger lazy-loaded content
                scan_full_page=True,
                # Performance
                wait_until=self.wait_until,
                page_timeout=self.timeout * 1000,
                exclude_all_images=self.block_resources,
                # No stealth
                override_navigator=False,
                magic=False,
                simulate_user=False,
                verbose=False,
            )

            async def _crawl():
                async with AsyncWebCrawler(config=browser_cfg) as crawler:
                    return await crawler.arun(url=url, config=run_cfg)

            # Outer timeout is a backstop beyond Crawl4AI's own page_timeout.
            result = _run_async(_crawl(), timeout=self.timeout + 30)

            if result.success and result.html:
                html = result.html
                logger.debug(f"Crawl4AI: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            # Check if blocked by robots.txt
            error_msg = getattr(result, "error_message", "") or ""
            if "robots.txt" in error_msg.lower():
                logger.info(f"Crawl4AI: blocked by robots.txt for {url}")
                # Don't fall back to Playwright — respect the block
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=False,
                    retry_count=1,
                    error_type="robots_txt_blocked",
                )
                return ""  # Empty string signals intentional skip

            status = getattr(result, "status_code", "unknown")
            logger.debug(
                f"Crawl4AI: failed for {url} — "
                f"success={result.success}, status={status}"
            )
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=f"crawl4ai_status_{status}",
            )
            return None

        except Exception as e:
            # Best-effort backend: any failure here means "try Playwright".
            logger.debug(f"Crawl4AI error for {url}: {e}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def _fetch_with_playwright(self, url: str) -> Optional[str]:
        """Fetch HTML using plain Playwright (fallback).

        Returns:
            The page HTML, or ``None`` when Playwright is not installed,
            the page content is empty, or any error occurs.
        """
        logger.debug(f"Playwright fetch: {url}")

        # Check availability before touching the rate limiter, mirroring
        # the Crawl4AI path: a missing package should not cost a
        # rate-limit wait.
        try:
            from playwright.sync_api import sync_playwright
        except ImportError:
            logger.warning("playwright not installed — cannot use JS rendering")
            return None

        domain = urlparse(url).netloc
        engine_type = f"playwright_download_{domain}"

        wait_time = self.rate_tracker.apply_rate_limit(engine_type)

        try:
            # Lazy-init browser (reuse across multiple fetches).
            # --no-sandbox: Chromium needs SYS_ADMIN to set up its user-namespace
            # sandbox; the production container drops that cap. Without this
            # flag, launch() crashes inside Docker. Crawl4AI's own arg list
            # already includes it; this fallback path was missing it.
            # --disable-dev-shm-usage: Docker's default /dev/shm is 64 MB,
            # which Chromium can blow through and OOM. Use /tmp instead.
            if self._browser is None:
                logger.debug("Playwright: launching Chromium browser")
                pw = sync_playwright().start()
                try:
                    self._browser = pw.chromium.launch(
                        headless=True,
                        args=["--no-sandbox", "--disable-dev-shm-usage"],
                    )
                except Exception:
                    # Don't leak the driver process if launch fails.
                    pw.stop()
                    raise
                self._playwright = pw

            page = self._browser.new_page(
                user_agent=BROWSER_USER_AGENT,
            )
            try:
                # Block heavy resources to speed up rendering
                if self.block_resources:
                    page.route(
                        "**/*.{png,jpg,jpeg,gif,webp,svg,ico,woff,woff2,"
                        "ttf,eot,mp4,webm,mp3,ogg,css}",
                        lambda route: route.abort(),
                    )

                page.goto(
                    url,
                    wait_until=self.wait_until,
                    timeout=self.timeout * 1000,
                )
                html = page.content()
            finally:
                try:
                    page.close()
                except Exception:
                    logger.debug("Failed to close Playwright page")

            if html:
                logger.debug(f"Playwright: got {len(html)} bytes from {url}")
                self.rate_tracker.record_outcome(
                    engine_type=engine_type,
                    wait_time=wait_time,
                    success=True,
                    retry_count=1,
                    search_result_count=1,
                )
                return html

            logger.debug(f"Playwright: empty response from {url}")
            # Record the failure so every exit after apply_rate_limit
            # reports an outcome, as the Crawl4AI path already does.
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type="empty_response",
            )
            return None

        except Exception as e:
            logger.exception(f"Playwright error fetching {url}")
            self.rate_tracker.record_outcome(
                engine_type=engine_type,
                wait_time=wait_time,
                success=False,
                retry_count=1,
                error_type=type(e).__name__,
            )
            return None

    def close(self):
        """Clean up Playwright browser and resources."""
        if self._browser:
            try:
                self._browser.close()
            except Exception:
                # Loguru ignores stdlib-style ``exc_info=True``; opt() is
                # its way of attaching the active exception to a record.
                logger.opt(exception=True).debug(
                    "Failed to close Playwright browser"
                )
            self._browser = None
        if self._playwright:
            try:
                self._playwright.stop()
            except Exception:
                logger.opt(exception=True).debug("Failed to stop Playwright")
            self._playwright = None
        super().close()

307

308

class AutoHTMLDownloader(HTMLDownloader):
    """HTML downloader that tries static fetch first, falls back to
    Crawl4AI/Playwright when the page needs JavaScript rendering.

    Detection heuristics:
    - Extracted content is too short (<200 chars)
    - Raw HTML contains SPA framework signals (React, Vue, Angular, Next.js)
    """

    def __init__(
        self,
        timeout: int = 30,
        language: str = "English",
        min_content_length: int = 200,
        # Disabled by default to match the production Docker image, which
        # ships without Chromium — every JS-rendering fallback attempt
        # would otherwise fail loudly (see issue #3826). Callers running
        # outside Docker with Chromium installed opt in via the
        # ``web.enable_javascript_rendering`` setting, or pass ``True``
        # explicitly when constructing the downloader.
        enable_js_rendering: bool = False,
        **kwargs,
    ):
        """Configure the downloader.

        Args:
            timeout: Request timeout in seconds, also used for the JS
                rendering downloader when one is created.
            language: Forwarded unchanged to ``HTMLDownloader``.
            min_content_length: Results shorter than this are treated as
                "no content" and may trigger the JS rendering fallback.
            enable_js_rendering: Whether the fallback may actually run.
            **kwargs: Accepted for call-site compatibility and ignored.
        """
        super().__init__(timeout=timeout, language=language)
        self.min_content_length = min_content_length
        self.enable_js_rendering = enable_js_rendering
        self._playwright_downloader = None
        # Raw HTML of the most recent static fetch; consulted for SPA
        # detection by download()/download_with_result().
        self._last_raw_html = None

    def _get_playwright_downloader(self) -> PlaywrightHTMLDownloader:
        """Lazy-init JS rendering downloader for fallback."""
        if self._playwright_downloader is None:
            self._playwright_downloader = PlaywrightHTMLDownloader(
                timeout=self.timeout,
                language=self.language,
            )
        return self._playwright_downloader

    @staticmethod
    def _has_spa_signals(html: str) -> bool:
        """Check if HTML contains signals of a JS-rendered SPA."""
        html_lower = html[:5000].lower()  # Only check head/early body
        return any(signal.lower() in html_lower for signal in SPA_SIGNALS)

    def _spa_detected(self) -> bool:
        """Return True when the last raw HTML captured shows SPA signals."""
        raw_html = self._last_raw_html
        return bool(raw_html) and self._has_spa_signals(raw_html)

    def _fetch_html(self, url: str) -> Optional[str]:
        """Fetch HTML statically, storing raw response for SPA detection.

        Note: _last_raw_html is instance state read by download()/download_with_result().
        This is safe because AutoHTMLDownloader instances are created per-request
        in fetch_and_extract/batch_fetch_and_extract — not shared across threads.
        """
        self._last_raw_html = None
        # Try the normal static fetch
        html = super()._fetch_html(url)
        if html:
            self._last_raw_html = html
            return html

        # Static fetch failed (403, etc.) — try raw GET to check for
        # challenge pages / SPA signals even on non-200 responses
        try:
            response = self.session.get(
                url,
                timeout=self.timeout,
                allow_redirects=True,
            )
            self._last_raw_html = response.text
        except Exception:
            # Best effort: SPA detection simply has nothing to inspect.
            logger.debug("Failed to fetch raw HTML for SPA detection")
        return None

    def download(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Returns:
            The static result when it is long enough, when JS rendering is
            disabled, or when JS rendering did not yield a longer result;
            otherwise the JS-rendered result.
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # Clear SPA-detection state up front so that, if the base class
        # returns without calling _fetch_html, HTML captured for an
        # earlier URL cannot influence this decision.
        self._last_raw_html = None

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download(url, content_type)

        if result and len(result) >= self.min_content_length:
            logger.debug(
                f"Auto: static fetch succeeded ({len(result)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        needs_js = self._spa_detected()
        no_content = result is None or len(result) < self.min_content_length

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download(url, content_type)
            # Only prefer the rendered page when it yields more content.
            if pw_result and len(pw_result) > len(result or b""):
                logger.info(
                    f"Auto: JS rendering succeeded ({len(pw_result)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def download_with_result(self, url, content_type=None):
        """Try static fetch, fall back to JS rendering if needed.

        Same strategy as download(), but operates on the result objects
        returned by the base class (``is_success`` / ``content``).
        """
        from .base import ContentType

        if content_type is None:
            content_type = ContentType.TEXT

        # Clear SPA-detection state up front so that, if the base class
        # returns without calling _fetch_html, HTML captured for an
        # earlier URL cannot influence this decision.
        self._last_raw_html = None

        # First: try static fetch (fast)
        logger.debug(f"Auto: trying static fetch for {url}")
        result = super().download_with_result(url, content_type)

        if (
            result.is_success
            and result.content
            and len(result.content) >= self.min_content_length
        ):
            logger.debug(
                f"Auto: static fetch succeeded ({len(result.content)} bytes) for {url}"
            )
            return result

        # Check if we should retry with JS rendering
        needs_js = self._spa_detected()
        no_content = (
            not result.is_success
            or not result.content
            or len(result.content) < self.min_content_length
        )

        if needs_js or no_content:
            if not self.enable_js_rendering:
                logger.debug(
                    f"Auto: would fall back to JS rendering for {url}, "
                    "but JS rendering is disabled "
                    "(setting: web.enable_javascript_rendering)"
                )
                return result
            reason = "SPA signals" if needs_js else "no/short content"
            logger.info(
                f"Auto: {reason} for {url}, falling back to JS rendering"
            )
            pw_dl = self._get_playwright_downloader()
            pw_result = pw_dl.download_with_result(url, content_type)
            # Only prefer the rendered page when it yields more content.
            if (
                pw_result.is_success
                and pw_result.content
                and len(pw_result.content) > len(result.content or b"")
            ):
                logger.info(
                    f"Auto: JS rendering succeeded "
                    f"({len(pw_result.content)} bytes) for {url}"
                )
                return pw_result
            logger.debug(f"Auto: JS rendering did not improve result for {url}")

        return result

    def close(self):
        """Clean up both static and JS rendering resources."""
        try:
            if self._playwright_downloader:
                try:
                    self._playwright_downloader.close()
                finally:
                    # Drop the reference even if closing raised, so a
                    # second close() does not retry a broken downloader.
                    self._playwright_downloader = None
        finally:
            # Always release the static-fetch resources as well.
            super().close()

Coverage for src/local_deep_research/research_library/downloaders/playwright_html.py: 55%

199 statements