Coverage for src/local_deep_research/advanced_search_system/tools/fetch/__init__.py: 90%
88 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Agent-facing ``fetch_content`` tool builders.
3Public API:
4 FETCH_MODES — tuple of valid mode strings.
5 build_fetch_tool() — returns a LangChain ``@tool`` (or ``None`` when
6 mode == "disabled" so the caller can skip
7 registration).
9Modes:
10 disabled — fetch tool is not registered with the agent.
11 full — return the full extracted page text (legacy
12 behavior; can flood small-model context with
13 boilerplate / metadata enrichment).
14 summary_focus — LLM extracts only spans relevant to a focus
15 question the agent supplies per call.
16 summary_focus_query — same as above, but the prompt also includes
17 the original research query (passed in
18 programmatically by the strategy) so the
19 extractor can disambiguate vague focuses.
21Each tool registers fetched URLs in the strategy's
22``SearchResultsCollector`` for citation tracking, returning the result as
23``[N] Title: ...\\nURL: ...\\n\\n<body>`` exactly like the original
24in-strategy implementation, so downstream prompt formatting is unchanged.
25"""
27from __future__ import annotations
29from typing import Any
31from langchain_core.language_models import BaseChatModel
32from langchain_core.tools import tool
33from loguru import logger
35from local_deep_research.utilities.js_rendering import (
36 read_js_rendering_setting as _read_js_rendering_setting,
37)
39from .prompts import SUMMARY_FOCUS_PROMPT, SUMMARY_FOCUS_QUERY_PROMPT
# Per-call timeouts and caps. Kept here rather than in the strategy file
# because they are properties of the fetch tool, not of agent
# orchestration.
# Passed as ``timeout=`` to ``ContentFetcher`` (presumably seconds — TODO
# confirm against ContentFetcher).
CONTENT_FETCH_TIMEOUT = 30
# Passed as ``max_length=`` to ``fetcher.fetch`` (presumably a character
# cap on the extracted text — TODO confirm).
CONTENT_MAX_LENGTH = 10_000

# Every mode string ``build_fetch_tool`` accepts; also quoted in the
# ``ValueError`` it raises for an unknown mode.
FETCH_MODES = (
    "disabled",
    "full",
    "summary_focus",
    "summary_focus_query",
)
56def _register_in_collector(
57 collector: Any,
58 url: str,
59 title: str,
60 snippet_source: str,
61) -> int:
62 """Register a fetched URL in the collector and return its 1-based citation index.
64 If the URL was already tracked (via a prior search hit) the existing
65 index is reused so the agent sees a stable citation per URL.
66 """
67 existing_idx = collector.find_by_url(url)
68 if existing_idx is not None:
69 return existing_idx
70 snippet = snippet_source[:200].strip()
71 if len(snippet_source) > 200:
72 snippet += "..."
73 start = collector.add_results(
74 [{"title": title, "link": url, "snippet": snippet}],
75 engine_name="fetch",
76 )
77 return start + 1
def _make_full_fetch_tool(
    collector: Any, settings_snapshot: dict | None = None
):
    """Build the ``full``-mode ``fetch_content`` tool.

    The returned tool fetches a URL, registers it with *collector* for
    citation tracking, and returns the page text prefixed with a
    ``[N] Title: ... / URL: ...`` header. *settings_snapshot* is captured
    by the closure and consulted on every call for the JS-rendering toggle.
    Failures are reported as plain strings rather than raised.
    """

    @tool
    def fetch_content(url: str) -> str:
        """Download and read the full text content from a URL. Use when search snippets aren't detailed enough."""
        from local_deep_research.content_fetcher import ContentFetcher

        js_enabled = _read_js_rendering_setting(settings_snapshot)
        try:
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=js_enabled,
            ) as page_fetcher:
                outcome = page_fetcher.fetch(
                    url, max_length=CONTENT_MAX_LENGTH
                )
                # Guard clause: anything but an explicit success is
                # reported back to the agent as a failure message.
                if outcome.get("status") != "success":
                    reason = outcome.get("error", "unknown error")
                    return f"Failed to fetch {url}: {reason}"

                page_title = outcome.get("title", "")
                page_text = outcome.get("content", "")
                citation = _register_in_collector(
                    collector, url, page_title, page_text
                )
                header = f"[{citation}] Title: {page_title}\nURL: {url}"
                return f"{header}\n\n{page_text}"
        except Exception as exc:
            # Tool boundary: log the traceback and return the error as
            # text instead of letting it propagate to the caller.
            logger.exception("fetch_content tool error")
            return f"Error fetching {url}: {exc}"

    return fetch_content
def _summary_text(message: Any) -> str:
    """Return the stripped plain text carried by an LLM response.

    Handles the shapes a chat model may hand back:

    * an object whose ``content`` is a string,
    * an object whose ``content`` is a list of content blocks (plain
      strings and/or ``{"type": "text", "text": ...}`` dicts) — the text
      parts are concatenated and non-text blocks are ignored,
    * an object without a ``content`` attribute (e.g. a bare string), which
      is stringified.

    ``content=None`` yields an empty string.
    """
    raw = getattr(message, "content", message)
    if raw is None:
        return ""
    if isinstance(raw, str):
        return raw.strip()
    if isinstance(raw, list):
        pieces = []
        for block in raw:
            if isinstance(block, str):
                pieces.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type", "text") == "text"
                and isinstance(block.get("text"), str)
            ):
                pieces.append(block["text"])
        return "".join(pieces).strip()
    return str(raw).strip()


def _make_summary_fetch_tool(
    collector: Any,
    model: BaseChatModel,
    overall_query: str | None,
    settings_snapshot: dict | None = None,
):
    """Build the summary-mode fetch tool.

    overall_query=None → focus-only prompt (``summary_focus`` mode).
    overall_query=str  → focus + overall-query prompt (``summary_focus_query``).

    An empty-string ``overall_query`` is falsy and therefore also selects
    the focus-only prompt and the ``summary_focus`` log label.

    The returned tool fetches the page, asks *model* to extract the spans
    relevant to the agent-supplied ``focus``, registers the URL with
    *collector*, and returns ``[N] Title: ...\\nURL: ...\\n\\n<summary>``.
    Fetch and LLM failures are returned as strings rather than raised.
    """
    use_query = bool(overall_query)
    template = SUMMARY_FOCUS_QUERY_PROMPT if use_query else SUMMARY_FOCUS_PROMPT

    mode_label = "summary_focus_query" if use_query else "summary_focus"

    @tool
    def fetch_content(url: str, focus: str) -> str:
        """Fetch a URL and return only the spans of text relevant to ``focus``.
        Pass the specific question or claim you want answered as ``focus`` — the
        tool will quote relevant facts verbatim and discard unrelated content.
        """
        from local_deep_research.content_fetcher import ContentFetcher

        enable_js = _read_js_rendering_setting(settings_snapshot)
        try:
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=enable_js,
            ) as fetcher:
                result = fetcher.fetch(url, max_length=CONTENT_MAX_LENGTH)
                if result.get("status") != "success":
                    return f"Failed to fetch {url}: {result.get('error', 'unknown error')}"

                title = result.get("title", "")
                content = result.get("content", "")

                fmt_kwargs = {
                    "focus": focus,
                    "title": title,
                    "url": url,
                    "content": content,
                }
                if use_query:
                    fmt_kwargs["overall_query"] = overall_query
                prompt = template.format(**fmt_kwargs)

                try:
                    summary_msg = model.invoke(prompt)
                    # Message content may be a plain string or a list of
                    # content blocks; normalise both to stripped text so a
                    # block-style response is not misreported as an error.
                    summary = _summary_text(summary_msg)
                except Exception as exc:
                    logger.exception("fetch_content summary LLM error")
                    return f"Error summarizing {url}: {exc}"

                # Diagnostic log: per-fetch input/output for evaluating the
                # summariser. Single multi-line block so it's atomic per call
                # and easy to grep with ``grep -F -A1000 "[FETCH] mode="``
                # (``-F`` because ``[FETCH]`` is otherwise a bracket
                # expression).
                log_lines = [
                    f"[FETCH] mode={mode_label} url={url}",
                    f"[FETCH] focus: {focus}",
                ]
                if use_query:
                    log_lines.append(f"[FETCH] overall_query: {overall_query}")
                log_lines.extend(
                    [
                        f"[FETCH] title: {title}",
                        f"[FETCH] page_text ({len(content)} chars):",
                        content,
                        f"[FETCH] summary returned ({len(summary)} chars):",
                        summary or "(empty)",
                        "[FETCH] ---",
                    ]
                )
                logger.info("\n".join(log_lines))

                # The collector snippet falls back to the raw page text when
                # the summariser returned nothing.
                cite_idx = _register_in_collector(
                    collector, url, title, summary or content
                )
                return f"[{cite_idx}] Title: {title}\nURL: {url}\n\n{summary}"
        except Exception as exc:
            # Tool boundary: log the traceback and return the error as text.
            logger.exception("fetch_content tool error")
            return f"Error fetching {url}: {exc}"

    return fetch_content
def build_fetch_tool(
    mode: str,
    collector: Any,
    *,
    model: BaseChatModel | None = None,
    overall_query: str = "",
    settings_snapshot: dict | None = None,
):
    """Return the agent-facing ``fetch_content`` tool matching *mode*.

    ``mode == 'disabled'`` yields ``None``: the caller is expected to skip
    registration (and to drop the matching instruction line from the
    system prompt so the agent isn't pointed at a tool that doesn't exist).

    ``settings_snapshot`` is handed to the tool builder and captured by the
    tool closure so the per-call JS-rendering toggle can be read on a
    worker thread (where ``threading.local`` context does not propagate).

    Raises:
        ValueError: if *mode* is not one of ``FETCH_MODES``, or if a
            summary mode is requested without a *model*.
    """
    if mode == "disabled":
        return None
    if mode == "full":
        return _make_full_fetch_tool(
            collector, settings_snapshot=settings_snapshot
        )

    wants_query = mode == "summary_focus_query"
    if not wants_query and mode != "summary_focus":
        raise ValueError(
            f"Unknown fetch mode {mode!r}; expected one of {FETCH_MODES}"
        )

    if model is None:
        label = "summary_focus_query" if wants_query else "summary_focus"
        raise ValueError(f"{label} fetch mode requires a model")

    # Only the *_query mode forwards the research query. An empty query is
    # forwarded as None, so the summary builder uses the focus-only prompt
    # and its logs carry the plain ``summary_focus`` label in that case.
    forwarded_query = (overall_query or None) if wants_query else None
    return _make_summary_fetch_tool(
        collector,
        model,
        overall_query=forwarded_query,
        settings_snapshot=settings_snapshot,
    )
250__all__ = ["FETCH_MODES", "build_fetch_tool"]