Coverage for src/local_deep_research/advanced_search_system/tools/fetch/__init__.py: 90%

88 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Agent-facing ``fetch_content`` tool builders. 

2 

3Public API: 

4 FETCH_MODES — tuple of valid mode strings. 

5 build_fetch_tool() — returns a LangChain ``@tool`` (or ``None`` when 

6 mode == "disabled" so the caller can skip 

7 registration). 

8 

9Modes: 

10 disabled — fetch tool is not registered with the agent. 

11 full — return the full extracted page text (legacy 

12 behavior; can flood small-model context with 

13 boilerplate / metadata enrichment). 

14 summary_focus — LLM extracts only spans relevant to a focus 

15 question the agent supplies per call. 

16 summary_focus_query — same as above, but the prompt also includes 

17 the original research query (passed in 

18 programmatically by the strategy) so the 

19 extractor can disambiguate vague focuses. 

20 

21Each tool registers fetched URLs in the strategy's 

22``SearchResultsCollector`` for citation tracking, returning the result as 

23``[N] Title: ...\\nURL: ...\\n\\n<body>`` exactly like the original 

24in-strategy implementation, so downstream prompt formatting is unchanged. 

25""" 

26 

27from __future__ import annotations 

28 

29from typing import Any 

30 

31from langchain_core.language_models import BaseChatModel 

32from langchain_core.tools import tool 

33from loguru import logger 

34 

35from local_deep_research.utilities.js_rendering import ( 

36 read_js_rendering_setting as _read_js_rendering_setting, 

37) 

38 

39from .prompts import SUMMARY_FOCUS_PROMPT, SUMMARY_FOCUS_QUERY_PROMPT 

40 

41 

# Per-call timeouts and caps. Kept here rather than in the strategy file
# because they are properties of the fetch tool, not of agent
# orchestration.

# Passed as ``timeout=`` to every ``ContentFetcher`` the tools create
# (presumably seconds — confirm against ContentFetcher).
CONTENT_FETCH_TIMEOUT = 30
# Passed as ``max_length=`` to ``fetcher.fetch`` (presumably a cap on the
# extracted text length — confirm against ContentFetcher).
CONTENT_MAX_LENGTH = 10_000

# Every ``mode`` string ``build_fetch_tool`` accepts; also quoted in its
# "unknown fetch mode" error message.
FETCH_MODES = (
    "disabled",
    "full",
    "summary_focus",
    "summary_focus_query",
)

54 

55 

def _register_in_collector(
    collector: Any,
    url: str,
    title: str,
    snippet_source: str,
) -> int:
    """Return the 1-based citation index for *url*, registering it if needed.

    A URL the collector already knows (``find_by_url`` returns something
    other than ``None``) keeps that index, so repeated fetches of one page
    are cited consistently. Otherwise the page is added as a single result
    under the ``"fetch"`` engine name; its snippet is the first 200
    characters of *snippet_source*, stripped, with ``...`` appended when
    the source was longer than 200 characters.
    """
    known_idx = collector.find_by_url(url)
    if known_idx is None:
        ellipsis = "..." if len(snippet_source) > 200 else ""
        entry = {
            "title": title,
            "link": url,
            "snippet": snippet_source[:200].strip() + ellipsis,
        }
        # ``add_results`` reports where the new entry starts; adding one
        # turns that into the 1-based citation number shown to the agent.
        return collector.add_results([entry], engine_name="fetch") + 1
    return known_idx

78 

79 

def _make_full_fetch_tool(
    collector: Any, settings_snapshot: dict | None = None
):
    """Build the ``full``-mode ``fetch_content`` tool.

    The returned tool downloads a page, registers it in *collector* and
    hands the extracted text back prefixed with its citation header
    (``[N] Title: ...`` / ``URL: ...``). Fetch failures and unexpected
    exceptions are reported to the agent as plain strings, not raised.
    *settings_snapshot* is passed to the JS-rendering setting reader on
    every call.
    """

    @tool
    def fetch_content(url: str) -> str:
        """Download and read the full text content from a URL. Use when search snippets aren't detailed enough."""
        from local_deep_research.content_fetcher import ContentFetcher

        js_enabled = _read_js_rendering_setting(settings_snapshot)
        try:
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=js_enabled,
            ) as fetcher:
                page = fetcher.fetch(url, max_length=CONTENT_MAX_LENGTH)

                # Guard clause: anything but an explicit success is
                # reported with the fetcher's own error text.
                if page.get("status") != "success":
                    return f"Failed to fetch {url}: {page.get('error', 'unknown error')}"

                page_title = page.get("title", "")
                body = page.get("content", "")
                citation = _register_in_collector(
                    collector, url, page_title, body
                )
                header = f"[{citation}] Title: {page_title}\nURL: {url}"
                return f"{header}\n\n{body}"
        except Exception as exc:
            # The agent gets a readable message; the traceback goes to
            # the log.
            logger.exception("fetch_content tool error")
            return f"Error fetching {url}: {exc}"

    return fetch_content

110 

111 

def _response_text(message: Any) -> str:
    """Flatten a chat-model response into plain text.

    LangChain message ``content`` may be a plain string or a list of
    content blocks (strings and/or dicts). Text blocks are concatenated;
    non-text blocks are skipped. An object without a ``content``
    attribute (e.g. a bare string) is stringified as a whole.
    """
    if not hasattr(message, "content"):
        return str(message)
    content = message.content
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for block in content:
            if isinstance(block, str):
                parts.append(block)
            elif (
                isinstance(block, dict)
                and block.get("type", "text") == "text"
                and isinstance(block.get("text"), str)
            ):
                parts.append(block["text"])
        return "".join(parts)
    return "" if content is None else str(content)


def _make_summary_fetch_tool(
    collector: Any,
    model: BaseChatModel,
    overall_query: str | None,
    settings_snapshot: dict | None = None,
):
    """Build the summary-mode fetch tool.

    overall_query=None → focus-only prompt (``summary_focus`` mode).
    overall_query=str  → focus + overall-query prompt (``summary_focus_query``).

    An empty ``overall_query`` is treated like ``None``; the prompt
    template and the diagnostic log label both follow its truthiness.

    The returned tool fetches the page, asks *model* to extract the spans
    relevant to the caller's ``focus``, registers the URL in *collector*
    and returns ``[N] Title: ...\\nURL: ...\\n\\n<summary>``. Fetch and
    LLM failures are returned as plain error strings, never raised.
    """
    use_query = bool(overall_query)
    template = SUMMARY_FOCUS_QUERY_PROMPT if use_query else SUMMARY_FOCUS_PROMPT

    mode_label = "summary_focus_query" if use_query else "summary_focus"

    @tool
    def fetch_content(url: str, focus: str) -> str:
        """Fetch a URL and return only the spans of text relevant to ``focus``.
        Pass the specific question or claim you want answered as ``focus`` — the
        tool will quote relevant facts verbatim and discard unrelated content.
        """
        from local_deep_research.content_fetcher import ContentFetcher

        enable_js = _read_js_rendering_setting(settings_snapshot)
        try:
            with ContentFetcher(
                timeout=CONTENT_FETCH_TIMEOUT,
                enable_js_rendering=enable_js,
            ) as fetcher:
                result = fetcher.fetch(url, max_length=CONTENT_MAX_LENGTH)
                if result.get("status") != "success":
                    return f"Failed to fetch {url}: {result.get('error', 'unknown error')}"

                # ``or ""`` also covers keys that are present but None,
                # which ``.get(key, "")`` would pass through unchanged.
                title = result.get("title") or ""
                content = result.get("content") or ""

                fmt_kwargs = {
                    "focus": focus,
                    "title": title,
                    "url": url,
                    "content": content,
                }
                if use_query:
                    fmt_kwargs["overall_query"] = overall_query
                prompt = template.format(**fmt_kwargs)

                try:
                    summary_msg = model.invoke(prompt)
                    # Handles both string and list-of-blocks content so a
                    # valid response is never misreported as an LLM error.
                    summary = _response_text(summary_msg).strip()
                except Exception as exc:
                    logger.exception("fetch_content summary LLM error")
                    return f"Error summarizing {url}: {exc}"

                # Diagnostic log: per-fetch input/output for evaluating the
                # summariser. Single multi-line block so it's atomic per call
                # and easy to grep with ``grep -A1000 "[FETCH] mode="``.
                log_lines = [
                    f"[FETCH] mode={mode_label} url={url}",
                    f"[FETCH] focus: {focus}",
                ]
                if use_query:
                    log_lines.append(f"[FETCH] overall_query: {overall_query}")
                log_lines.extend(
                    [
                        f"[FETCH] title: {title}",
                        f"[FETCH] page_text ({len(content)} chars):",
                        content,
                        f"[FETCH] summary returned ({len(summary)} chars):",
                        summary or "(empty)",
                        "[FETCH] ---",
                    ]
                )
                logger.info("\n".join(log_lines))

                # The citation snippet falls back to the raw page text when
                # the model returned nothing; the tool output itself still
                # carries only the (possibly empty) summary.
                cite_idx = _register_in_collector(
                    collector, url, title, summary or content
                )
                return f"[{cite_idx}] Title: {title}\nURL: {url}\n\n{summary}"
        except Exception as exc:
            logger.exception("fetch_content tool error")
            return f"Error fetching {url}: {exc}"

    return fetch_content

198 

199 

def build_fetch_tool(
    mode: str,
    collector: Any,
    *,
    model: BaseChatModel | None = None,
    overall_query: str = "",
    settings_snapshot: dict | None = None,
):
    """Build the agent-facing ``fetch_content`` tool for *mode*.

    Returns ``None`` when ``mode == 'disabled'``; the caller should not
    register the tool with the agent in that case (and the system prompt
    should also drop the corresponding instruction line so the agent
    isn't told to use a tool that doesn't exist).

    ``settings_snapshot`` is captured by the tool closure so the per-call
    JS-rendering toggle can be read on a worker thread (where
    ``threading.local`` context does not propagate).

    ``overall_query`` is only used by ``summary_focus_query``; when it is
    empty or whitespace-only the tool is built with the focus-only
    prompt instead.

    Raises:
        ValueError: if a summary mode is requested without ``model``, or
            if *mode* is not one of ``FETCH_MODES``.
    """
    if mode == "disabled":
        return None
    if mode == "full":
        return _make_full_fetch_tool(
            collector, settings_snapshot=settings_snapshot
        )
    if mode in ("summary_focus", "summary_focus_query"):
        if model is None:
            raise ValueError(f"{mode} fetch mode requires a model")
        # A blank query adds nothing to the prompt, so fall back to the
        # focus-only variant. The summary builder derives its log label
        # from the query as well, so such calls are logged as
        # ``summary_focus`` rather than ``summary_focus_query``.
        query = None
        if (
            mode == "summary_focus_query"
            and overall_query
            and overall_query.strip()
        ):
            query = overall_query
        return _make_summary_fetch_tool(
            collector,
            model,
            overall_query=query,
            settings_snapshot=settings_snapshot,
        )
    raise ValueError(
        f"Unknown fetch mode {mode!r}; expected one of {FETCH_MODES}"
    )

248 

249 

# Public API of this package; the underscore-prefixed tool builders and
# the per-call constants are deliberately left out.
__all__ = ["FETCH_MODES", "build_fetch_tool"]