Coverage for src/local_deep_research/advanced_search_system/tools/fetch/__init_

"""Agent-facing ``fetch_content`` tool builders.

Public API:
    FETCH_MODES         — tuple of valid mode strings.
    build_fetch_tool()  — returns a LangChain ``@tool`` (or ``None`` when
                          mode == "disabled" so the caller can skip
                          registration).

Modes:
    disabled             — fetch tool is not registered with the agent.
    full                 — return the full extracted page text (legacy
                           behavior; can flood small-model context with
                           boilerplate / metadata enrichment).
    summary_focus        — LLM extracts only spans relevant to a focus
                           question the agent supplies per call.
    summary_focus_query  — same as above, but the prompt also includes
                           the original research query (passed in
                           programmatically by the strategy) so the
                           extractor can disambiguate vague focuses.

Each tool registers fetched URLs in the strategy's
``SearchResultsCollector`` for citation tracking, returning the result as
``[N] Title: ...\\nURL: ...\\n\\n<body>`` exactly like the original
in-strategy implementation, so downstream prompt formatting is unchanged.
"""

27from __future__ import annotations

29from typing import Any

31from langchain_core.language_models import BaseChatModel

32from langchain_core.tools import tool

33from loguru import logger

35from local_deep_research.utilities.js_rendering import (

36 read_js_rendering_setting as _read_js_rendering_setting,

37)

38from local_deep_research.security import (

39 redact_url_for_log,

40 sanitize_error_for_client,

41)

43from .prompts import SUMMARY_FOCUS_PROMPT, SUMMARY_FOCUS_QUERY_PROMPT

# Per-call timeout and size cap for the fetch tool. They live in this module
# instead of the strategy file because they describe the fetch tool itself,
# not agent orchestration.
CONTENT_FETCH_TIMEOUT: int = 30
CONTENT_MAX_LENGTH: int = 10_000

# Length cap applied to credential-scrubbed fetch-tool error strings. It is
# larger than the 200-char HTTP-client default because the agent reasons over
# these errors; credential scrubbing still runs first on the full string
# (#4633).
_TOOL_ERROR_MAX_LEN: int = 500

def _scrub_tool_error(message: str) -> str:
    """Return *message* scrubbed of credentials for the LLM/agent.

    Delegates to ``sanitize_error_for_client``, passing
    ``_TOOL_ERROR_MAX_LEN`` as the ``max_length`` cap.
    """
    scrubbed = sanitize_error_for_client(
        message,
        max_length=_TOOL_ERROR_MAX_LEN,
    )
    return scrubbed

# Mode strings recognised by ``build_fetch_tool``.
FETCH_MODES = ("disabled", "full", "summary_focus", "summary_focus_query")

def _register_in_collector(
    collector: Any,
    url: str,
    title: str,
    snippet_source: str,
) -> int:
    """Return the citation index for *url*, registering it when it is new.

    When ``collector.find_by_url(url)`` yields a non-``None`` index (the URL
    was tracked earlier, e.g. via a prior search hit), that index is returned
    unchanged so the agent sees a stable citation per URL.

    Otherwise a single result is added under engine name ``"fetch"``. Its
    snippet is the first 200 characters of *snippet_source*, stripped, with
    ``"..."`` appended when the source was longer than 200 characters. The
    returned index is the value from ``collector.add_results`` plus one.
    """
    known_idx = collector.find_by_url(url)
    if known_idx is not None:
        return known_idx

    was_truncated = len(snippet_source) > 200
    preview = snippet_source[:200].strip() + ("..." if was_truncated else "")
    entry = {"title": title, "link": url, "snippet": preview}
    offset = collector.add_results([entry], engine_name="fetch")
    return offset + 1

def _enforce_url_policy(url: str, egress_context: Any) -> None:
    """Gate *url* through the egress policy bound to *egress_context*.

    ``evaluate_url`` is consulted and ``PolicyDeniedError`` is raised when the
    resulting decision is not allowed.

    When ``egress_context`` is ``None`` this is a no-op: callers that run
    without policy enforcement (e.g. legacy non-LangGraph strategies) keep
    their previous, ungated behavior.
    """
    if egress_context is not None:
        # Imported lazily: only needed when a policy context was supplied.
        from local_deep_research.security.egress.policy import (
            PolicyDeniedError,
            evaluate_url,
        )

        verdict = evaluate_url(url, egress_context)
        if not verdict.allowed:
            raise PolicyDeniedError(verdict, target=url)

112

113

def _denial_reason(exc: Any) -> str:
    """Return ``exc.decision.reason`` when present, else ``"policy_denied"``.

    Best-effort lookup for agent-facing messages: an object without a
    ``decision`` attribute, or a decision without a ``reason`` attribute,
    yields the fallback code.
    """
    decision = getattr(exc, "decision", None)
    return getattr(decision, "reason", "policy_denied")

117

118

# Instruction appended to every per-URL egress denial that is handed back to
# the agent. It says WHY the fetch was refused and what to do instead, so the
# agent adapts (stays in-scope) instead of retrying the same out-of-scope URL.
_EGRESS_DENIAL_HINT = (
    "In this run only local collection/library documents "
    "can be fetched; skip external URLs."
)

126

127

def _make_full_fetch_tool(
    collector: Any,
    settings_snapshot: dict | None = None,
    egress_context: Any = None,
):
    """Build the ``full``-mode ``fetch_content`` tool.

    Args:
        collector: Results collector used for citation tracking; every
            successful fetch is passed to ``_register_in_collector``.
        settings_snapshot: Settings captured by the closure; read on each
            call via ``_read_js_rendering_setting`` to decide JS rendering.
        egress_context: Egress-policy context captured by the closure. It is
            used for the pre-fetch URL gate and is also handed to
            ``ContentFetcher``. ``None`` makes the pre-fetch gate a no-op.

    Returns:
        The ``@tool``-decorated ``fetch_content`` callable. On success the
        tool returns ``[N] Title: ...\\nURL: ...\\n\\n<content>``; fetch
        failures, egress denials and unexpected exceptions are returned as
        scrubbed message strings rather than raised.
    """
    mode_label = "full"

    @tool
    def fetch_content(url: str) -> str:
        """Download and read the full text content from a URL. Use when search snippets aren't detailed enough."""
        from local_deep_research.content_fetcher import ContentFetcher
        from local_deep_research.security.egress.policy import (
            PolicyDeniedError,
        )

        enable_js = _read_js_rendering_setting(settings_snapshot)
        try:
            # Per-URL egress gate (pre-fetch) + ContentFetcher's own
            # per-redirect gate both raise PolicyDeniedError on an
            # out-of-scope URL. Run the gate INSIDE the try so the denial is
            # returned as a recoverable tool message (see the except below).
            _enforce_url_policy(url, egress_context)
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=enable_js,
                egress_context=egress_context,
            ) as fetcher:
                result = fetcher.fetch(url, max_length=CONTENT_MAX_LENGTH)
                if result.get("status") == "success":
                    # ``or ""`` instead of a ``.get`` default: a key that is
                    # present but ``None`` would otherwise render as
                    # "Title: None" or break snippet slicing in
                    # ``_register_in_collector``. This matches how the
                    # summary-mode tool normalises the same fields.
                    title = result.get("title") or ""
                    content = result.get("content") or ""
                    cite_idx = _register_in_collector(
                        collector, url, title, content
                    )
                    return (
                        f"[{cite_idx}] Title: {title}\nURL: {url}\n\n{content}"
                    )
                # result['error'] comes from ContentFetcher, which returns a
                # raw str(exception) — scrub it (and the url) before this
                # reaches the agent/LLM and user-visible output (#4633).
                return _scrub_tool_error(
                    f"Failed to fetch {url}: "
                    f"{result.get('error', 'unknown error')}"
                )
        except PolicyDeniedError as exc:
            # An out-of-scope URL is a RECOVERABLE, per-call decision (the
            # agent picked one bad URL among many). Return it as a tool
            # message — like the transient-error path below — so the lead
            # agent and pooled subagents handle it identically and the agent
            # can adapt, instead of re-raising (which aborts a subagent and
            # depends on each agent's tool-error layer). The URL was already
            # NOT fetched; the policy already enforced — only the REPORTING
            # changes, not security.
            return _scrub_tool_error(
                f"Cannot fetch {url}: blocked by egress policy "
                f"({_denial_reason(exc)}). {_EGRESS_DENIAL_HINT}"
            )
        except Exception as exc:
            # Message carries the mode + a REDACTED scheme://host only (no
            # userinfo/path/query) so an operator can locate the failure
            # without the log line leaking credentials, query tokens, or page
            # content. The traceback follows the sink's diagnose setting (off
            # by default; see utilities/log_utils). The agent/user-facing
            # return is scrubbed separately below.
            logger.exception(
                "fetch_content tool error (mode={}, url={})",
                mode_label,
                redact_url_for_log(url),
            )
            return _scrub_tool_error(f"Error fetching {url}: {exc}")

    return fetch_content

199

200

def _make_summary_fetch_tool(
    collector: Any,
    model: BaseChatModel,
    overall_query: str | None,
    settings_snapshot: dict | None = None,
    egress_context: Any = None,
):
    """Build the summary-mode ``fetch_content`` tool.

    A falsy ``overall_query`` selects the focus-only prompt
    (``summary_focus`` mode); a non-empty one selects the focus +
    overall-query prompt (``summary_focus_query`` mode).

    The returned tool fetches the page, asks *model* to extract the spans
    relevant to the caller-supplied focus, and returns
    ``[N] Title: ...\\nURL: ...\\n\\n<summary>``. Fetch failures, egress
    denials, LLM errors and unexpected exceptions come back as scrubbed
    message strings; empty pages and empty summaries come back as
    ``NOT RELEVANT ...`` without touching the collector.
    """
    if overall_query:
        use_query = True
        template = SUMMARY_FOCUS_QUERY_PROMPT
        mode_label = "summary_focus_query"
    else:
        use_query = False
        template = SUMMARY_FOCUS_PROMPT
        mode_label = "summary_focus"

    @tool
    def fetch_content(url: str, focus: str) -> str:
        """Fetch a URL and return only the spans of text relevant to ``focus``.
        Pass the specific question or claim you want answered as ``focus`` — the
        tool will quote relevant facts verbatim and discard unrelated content.
        """
        from local_deep_research.content_fetcher import ContentFetcher
        from local_deep_research.security.egress.policy import (
            PolicyDeniedError,
        )

        js_enabled = _read_js_rendering_setting(settings_snapshot)
        try:
            # The per-URL egress gate runs INSIDE the try so that an
            # out-of-scope URL becomes a recoverable tool message (handled in
            # the PolicyDeniedError branch below) and is not re-raised.
            _enforce_url_policy(url, egress_context)
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=js_enabled,
                egress_context=egress_context,
            ) as fetcher:
                fetched = fetcher.fetch(url, max_length=CONTENT_MAX_LENGTH)
                if fetched.get("status") != "success":
                    # fetched['error'] originates from ContentFetcher as a
                    # raw str(exception) — scrub it (and the url) before it
                    # reaches the agent/LLM / user output (#4633).
                    return _scrub_tool_error(
                        f"Failed to fetch {url}: "
                        f"{fetched.get('error', 'unknown error')}"
                    )

                page_title = fetched.get("title") or ""
                page_text = fetched.get("content") or ""

                # Guard 1 — empty page content (paywalls, JS-only SPAs that
                # static fetch can't render, deleted pages with HTTP 200).
                # Returning here avoids an LLM round-trip that would
                # summarise nothing, and avoids registering an empty citation:
                # `_register_in_collector` caches by URL, so an empty snippet
                # would pin the URL as "already fetched, nothing here" and the
                # agent would never retry it under a different focus.
                if not page_text.strip():
                    logger.info(
                        f"[FETCH] mode={mode_label} url={url} — "
                        "empty page content, returning NOT RELEVANT without "
                        "LLM call or collector registration"
                    )
                    return f"NOT RELEVANT (no extractable content at {url})"

                prompt_fields = {
                    "focus": focus,
                    "title": page_title,
                    "url": url,
                    "content": page_text,
                }
                if use_query:
                    prompt_fields["overall_query"] = overall_query
                prompt = template.format(**prompt_fields)

                try:
                    reply = model.invoke(prompt)
                    summary = getattr(reply, "content", str(reply)).strip()
                except Exception as exc:
                    # Redacted scheme://host + mode only — no page content,
                    # focus, or credentials in the message. Traceback follows
                    # the sink's diagnose setting (off by default).
                    logger.exception(
                        "fetch_content summary LLM error (mode={}, url={})",
                        mode_label,
                        redact_url_for_log(url),
                    )
                    return _scrub_tool_error(f"Error summarizing {url}: {exc}")

                # Diagnostic log: per-fetch input/output for evaluating the
                # summariser. Emitted as one multi-line record so it is atomic
                # per call and easy to grep with
                # ``grep -A1000 "[FETCH] mode="``.
                report = [
                    f"[FETCH] mode={mode_label} url={url}",
                    f"[FETCH] focus: {focus}",
                ]
                if use_query:
                    report.append(f"[FETCH] overall_query: {overall_query}")
                report += [
                    f"[FETCH] title: {page_title}",
                    f"[FETCH] page_text ({len(page_text)} chars):",
                    page_text,
                    f"[FETCH] summary returned ({len(summary)} chars):",
                    summary or "(empty)",
                    "[FETCH] ---",
                ]
                logger.info("\n".join(report))

                # Guard 2 — empty LLM summary. The model decided nothing on
                # the page answers the focus (or it returned a
                # malformed/empty response). Treat as NOT RELEVANT and skip
                # collector registration so the agent stays free to re-fetch
                # the URL later with a different focus instead of seeing it
                # as already-cached with an empty body.
                if not summary:
                    return f"NOT RELEVANT (no spans matched focus at {url})"

                cite_idx = _register_in_collector(
                    collector, url, page_title, summary
                )
                return (
                    f"[{cite_idx}] Title: {page_title}\nURL: {url}\n\n{summary}"
                )
        except PolicyDeniedError as exc:
            # Recoverable per-URL denial — return a tool message so both the
            # lead agent and pooled subagents handle it identically and the
            # agent stays in-scope. The URL was already NOT fetched; only the
            # reporting changes, not security.
            return _scrub_tool_error(
                f"Cannot fetch {url}: blocked by egress policy "
                f"({_denial_reason(exc)}). {_EGRESS_DENIAL_HINT}"
            )
        except Exception as exc:
            # Message carries the mode + a REDACTED scheme://host only (no
            # userinfo/path/query) so an operator can locate the failure
            # without the log line leaking credentials, query tokens, or page
            # content. The traceback follows the sink's diagnose setting (off
            # by default; see utilities/log_utils). The agent/user-facing
            # return is scrubbed separately below.
            logger.exception(
                "fetch_content tool error (mode={}, url={})",
                mode_label,
                redact_url_for_log(url),
            )
            return _scrub_tool_error(f"Error fetching {url}: {exc}")

    return fetch_content

353

354

def build_fetch_tool(
    mode: str,
    collector: Any,
    *,
    model: BaseChatModel | None = None,
    overall_query: str = "",
    settings_snapshot: dict | None = None,
    egress_context: Any = None,
):
    """Return the agent-facing ``fetch_content`` tool for *mode*.

    ``mode == 'disabled'`` yields ``None``: the caller should then skip tool
    registration (and the system prompt should drop the matching instruction
    line so the agent isn't told to use a tool that doesn't exist).

    ``settings_snapshot`` is captured by the tool closure so the per-call
    JS-rendering toggle can be read on a worker thread (where
    ``threading.local`` context does not propagate).

    ``egress_context`` is likewise captured so each call's URL can be
    policy-gated; with ``None`` no policy enforcement runs (preserves legacy
    non-LangGraph callers).

    Raises:
        ValueError: if a summary mode is requested without ``model``, or if
            *mode* is not one of ``FETCH_MODES``.
    """
    if mode == "disabled":
        return None

    # Keyword arguments every builder receives unchanged.
    shared = {
        "settings_snapshot": settings_snapshot,
        "egress_context": egress_context,
    }

    if mode == "full":
        return _make_full_fetch_tool(collector, **shared)

    if mode == "summary_focus":
        if model is None:
            raise ValueError("summary_focus fetch mode requires a model")
        return _make_summary_fetch_tool(
            collector, model, overall_query=None, **shared
        )

    if mode == "summary_focus_query":
        if model is None:
            raise ValueError("summary_focus_query fetch mode requires a model")
        # An empty overall_query degrades to focus-only behaviour inside the
        # builder; it is normalised to None here.
        return _make_summary_fetch_tool(
            collector, model, overall_query=overall_query or None, **shared
        )

    raise ValueError(
        f"Unknown fetch mode {mode!r}; expected one of {FETCH_MODES}"
    )

412

413

# Public API of this module.
__all__ = [
    "FETCH_MODES",
    "build_fetch_tool",
]

Coverage for src/local_deep_research/advanced_search_system/tools/fetch/init.py: 91%

116 statements