Coverage for src/local_deep_research/web_search_engines/engines/full_search.py: 99%
97 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1from loguru import logger
2from datetime import datetime, UTC
3from typing import Any, Dict, List, Optional, Protocol, runtime_checkable
5from langchain_core.language_models import BaseLLM
7from ...config.search_config import QUALITY_CHECK_DDG_URLS
8from ...research_library.downloaders.extraction import (
9 batch_fetch_and_extract,
10)
11from ...security.ssrf_validator import validate_url
12from ...utilities.js_rendering import (
13 read_js_rendering_setting as _read_js_rendering_setting,
14)
15from ...utilities.json_utils import extract_json, get_llm_response_text
@runtime_checkable
class _Invokable(Protocol):
    """Structural type for objects that expose an ``invoke(query)`` method.

    ``FullSearchResults`` accepts any such object as its ``web_search``
    backend. Because the protocol is ``runtime_checkable``, ``isinstance``
    only verifies that an ``invoke`` attribute exists; it does not check the
    signature or the return type.
    """

    def invoke(self, query: str) -> Any:
        """Execute ``query`` and return the backend's result."""
        ...
class FullSearchResults:
    """Web search wrapper that attaches full page text to each result.

    ``run`` (also reachable through ``invoke`` and by calling the instance)
    queries ``web_search``, filters the hits with ``llm`` when
    ``QUALITY_CHECK_DDG_URLS`` is enabled, SSRF-validates the links and
    stores the extracted content under each result's ``"full_content"`` key.
    ``_get_full_content`` performs only the validate/fetch/attach step for
    items the caller already holds.
    """

    def __init__(
        self,
        llm: Optional[BaseLLM],
        web_search: _Invokable,
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: Optional[str] = "y",
        safesearch: str | int = "Moderate",
        settings_snapshot: Optional[Dict] = None,
    ):
        """Store the collaborators and search options on the instance.

        Args:
            llm: Model used by ``check_urls``; ``None`` disables LLM filtering.
            web_search: Object whose ``invoke(query)`` returns the raw results.
            output_format: Stored only; no method of this class reads it.
            language: Forwarded to ``batch_fetch_and_extract``.
            max_results: Stored only; no method of this class reads it.
            region: Stored only; no method of this class reads it.
            time: Stored only; no method of this class reads it.
            safesearch: Stored only; no method of this class reads it.
            settings_snapshot: Passed to the JS-rendering setting reader.
        """
        self.llm = llm
        self.web_search = web_search
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.settings_snapshot = settings_snapshot

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        """Ask the LLM which results are worth keeping and return only those.

        Args:
            results: Search result dicts; their ``repr`` is embedded verbatim
                in the prompt.
            query: The search query, used for the relevance criterion.

        Returns:
            ``results`` unchanged when it is empty, when no LLM is configured,
            or when the LLM call or response parsing raises. Otherwise the
            subset of ``results`` whose 0-based positions appear in the JSON
            array parsed from the reply; if ``extract_json`` yields ``None``
            the subset is empty.
        """
        # Nothing to filter, or nobody to ask: skip building the prompt.
        if not results or self.llm is None:
            return results

        current_time = datetime.now(UTC).strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
            1. Timeliness (today: {current_time})
            2. Factual accuracy (cross-reference major claims)
            3. Source reliability (prefer official company websites, established news outlets)
            4. Direct relevance to query: {query}

            URLs to evaluate:
            {results}

            Return a JSON array of indices (0-based) for sources that meet ALL criteria.
            ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
            Example response: \n[0, 2, 4]\n\n"""

        try:
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)
            good_indices = extract_json(response_text, expected_type=list)

            if good_indices is None:
                good_indices = []

            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            # Best-effort filter: any failure degrades to "keep everything".
            logger.exception("URL filtering error")
            logger.warning(
                "URL quality filter unavailable — returning {} unfiltered "
                "results as fallback",
                len(results),
            )
            return results  # Fall back to original results on LLM error

    def run(self, query: str):
        """Search for ``query`` and return results enriched with page content.

        Args:
            query: The search query handed to ``web_search.invoke``.

        Returns:
            The (possibly LLM-filtered) result dicts, each mutated in place to
            carry a ``"full_content"`` key. An empty list is returned when no
            result has a non-empty ``"link"``. When every link fails SSRF
            validation, the results are returned with ``full_content`` set to
            ``None``.

        Raises:
            ValueError: If ``web_search.invoke`` does not return a list.

        Note:
            Unlike ``_get_full_content``, exceptions raised by the fetch step
            are not caught here and propagate to the caller.
        """
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using LLM
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from filtered results (missing/empty links dropped)
        urls = [
            link
            for result in filtered_results
            if (link := result.get("link"))
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # SSRF-validate URLs
        safe_urls: List[str] = []
        for url in urls:
            if validate_url(url):
                safe_urls.append(url)
            else:
                logger.warning(
                    f"SSRF validation blocked URL from full content fetch: {url}. "
                    "If this is a trusted internal/private resource, note that "
                    "full content fetching currently only supports public URLs."
                )

        if not safe_urls:
            logger.warning(
                "All URLs were blocked by SSRF validation — returning results "
                "without full content. This can happen when search results "
                "point to internal/private network addresses."
            )
            return self._attach_content(filtered_results, None)

        url_to_content = self._fetch_pages(safe_urls)

        nr_full_text = sum(1 for v in url_to_content.values() if v)
        self._attach_content(filtered_results, url_to_content)

        logger.info(f"Full search: retrieved content from {nr_full_text} pages")
        return filtered_results

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """Fetch and attach full content to an existing list of items.

        Args:
            relevant_items: Result dicts; each is mutated in place.

        Returns:
            ``relevant_items`` with ``"full_content"`` set on every item. The
            value is ``None`` for items without a link, items whose link was
            rejected by SSRF validation or absent from the fetch result, and
            for all items when the fetch raises.
        """
        urls: List[str] = []
        for item in relevant_items:
            link = item.get("link")
            if not link:
                # Missing or empty link: nothing to fetch, and not an SSRF
                # event, so no warning is logged.
                continue
            if validate_url(link):
                urls.append(link)
            else:
                logger.warning(
                    f"SSRF validation blocked URL from full content fetch: {link}."
                )

        if not urls:
            return self._attach_content(relevant_items, None)

        try:
            url_to_content = self._fetch_pages(urls)
        except Exception:
            logger.exception("Error fetching full content")
            return self._attach_content(relevant_items, None)

        return self._attach_content(relevant_items, url_to_content)

    def _fetch_pages(self, urls: List[str]) -> Dict[str, Any]:
        """Fetch ``urls`` using this instance's language and JS-rendering setting."""
        # Fetch and extract all pages — specialized downloaders (arXiv,
        # PubMed, etc.) are tried first, with HTML crawling as fallback.
        return batch_fetch_and_extract(
            urls,
            language=self.language,
            enable_js_rendering=_read_js_rendering_setting(
                self.settings_snapshot
            ),
        )

    @staticmethod
    def _attach_content(
        items: List[Dict[str, Any]],
        url_to_content: Optional[Dict[str, Any]],
    ) -> List[Dict[str, Any]]:
        """Set ``"full_content"`` on every item; ``None`` mapping blanks them all."""
        if url_to_content is None:
            for item in items:
                item["full_content"] = None
            return items

        for item in items:
            link = item.get("link")
            item["full_content"] = url_to_content.get(link) if link else None
        return items

    def invoke(self, query: str) -> Any:
        """Alias for ``run`` so the instance satisfies the ``invoke`` protocol."""
        return self.run(query)

    def __call__(self, query: str) -> Any:
        """Allow the instance to be called directly; delegates to ``invoke``."""
        return self.invoke(query)