Coverage for src/local_deep_research/web_search_engines/engines/full_search.py: 99%

97 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1from loguru import logger 

2from datetime import datetime, UTC 

3from typing import Any, Dict, List, Optional, Protocol, runtime_checkable 

4 

5from langchain_core.language_models import BaseLLM 

6 

7from ...config.search_config import QUALITY_CHECK_DDG_URLS 

8from ...research_library.downloaders.extraction import ( 

9 batch_fetch_and_extract, 

10) 

11from ...security.ssrf_validator import validate_url 

12from ...utilities.js_rendering import ( 

13 read_js_rendering_setting as _read_js_rendering_setting, 

14) 

15from ...utilities.json_utils import extract_json, get_llm_response_text 

16 

17 

@runtime_checkable
class _Invokable(Protocol):
    """Structural type for a search backend exposing ``invoke(query)``.

    Decorated with ``runtime_checkable`` so ``isinstance`` checks against
    this protocol are permitted; such checks only verify that an ``invoke``
    attribute exists, not its signature.
    """

    def invoke(self, query: str) -> Any:
        """Execute ``query`` and return the backend's result."""
        ...

21 

22 

23class FullSearchResults: 

24 def __init__( 

25 self, 

26 llm: Optional[BaseLLM], 

27 web_search: _Invokable, 

28 output_format: str = "list", 

29 language: str = "English", 

30 max_results: int = 10, 

31 region: str = "wt-wt", 

32 time: Optional[str] = "y", 

33 safesearch: str | int = "Moderate", 

34 settings_snapshot: Optional[Dict] = None, 

35 ): 

36 self.llm = llm 

37 self.output_format = output_format 

38 self.language = language 

39 self.max_results = max_results 

40 self.region = region 

41 self.time = time 

42 self.safesearch = safesearch 

43 self.web_search = web_search 

44 self.settings_snapshot = settings_snapshot 

45 

46 def check_urls(self, results: List[Dict], query: str) -> List[Dict]: 

47 if not results: 

48 return results 

49 

50 now = datetime.now(UTC) 

51 current_time = now.strftime("%Y-%m-%d") 

52 prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for: 

53 1. Timeliness (today: {current_time}) 

54 2. Factual accuracy (cross-reference major claims) 

55 3. Source reliability (prefer official company websites, established news outlets) 

56 4. Direct relevance to query: {query} 

57 

58 URLs to evaluate: 

59 {results} 

60 

61 Return a JSON array of indices (0-based) for sources that meet ALL criteria. 

62 ONLY Return a JSON array of indices (0-based) and nothing else. No letters. 

63 Example response: \n[0, 2, 4]\n\n""" 

64 

65 try: 

66 if self.llm is None: 

67 return results 

68 response = self.llm.invoke(prompt) 

69 response_text = get_llm_response_text(response) 

70 good_indices = extract_json(response_text, expected_type=list) 

71 

72 if good_indices is None: 

73 good_indices = [] 

74 

75 return [r for i, r in enumerate(results) if i in good_indices] 

76 except Exception: 

77 logger.exception("URL filtering error") 

78 logger.warning( 

79 "URL quality filter unavailable — returning {} unfiltered " 

80 "results as fallback", 

81 len(results), 

82 ) 

83 return results # Fall back to original results on LLM error 

84 

85 def run(self, query: str): 

86 # Step 1: Get search results 

87 search_results = self.web_search.invoke(query) 

88 if not isinstance(search_results, list): 

89 raise ValueError("Expected the search results in list format.") 

90 

91 # Step 2: Filter URLs using LLM 

92 if QUALITY_CHECK_DDG_URLS: 

93 filtered_results = self.check_urls(search_results, query) 

94 else: 

95 filtered_results = search_results 

96 

97 # Extract URLs from filtered results 

98 urls = [ 

99 result.get("link") 

100 for result in filtered_results 

101 if result.get("link") 

102 ] 

103 

104 if not urls: 

105 logger.error("\n === NO VALID LINKS ===\n") 

106 return [] 

107 

108 # SSRF-validate URLs 

109 safe_urls: List[str] = [] 

110 for url in urls: 

111 if url is not None and validate_url(url): 

112 safe_urls.append(url) 

113 else: 

114 logger.warning( 

115 f"SSRF validation blocked URL from full content fetch: {url}. " 

116 "If this is a trusted internal/private resource, note that " 

117 "full content fetching currently only supports public URLs." 

118 ) 

119 

120 if not safe_urls: 

121 logger.warning( 

122 "All URLs were blocked by SSRF validation — returning results " 

123 "without full content. This can happen when search results " 

124 "point to internal/private network addresses." 

125 ) 

126 for result in filtered_results: 

127 result["full_content"] = None 

128 return filtered_results 

129 

130 # Fetch and extract all pages — specialized downloaders (arXiv, 

131 # PubMed, etc.) are tried first, with HTML crawling as fallback. 

132 url_to_content = batch_fetch_and_extract( 

133 safe_urls, 

134 language=self.language, 

135 enable_js_rendering=_read_js_rendering_setting( 

136 self.settings_snapshot 

137 ), 

138 ) 

139 

140 nr_full_text = sum(1 for v in url_to_content.values() if v) 

141 for result in filtered_results: 

142 link = result.get("link") 

143 result["full_content"] = url_to_content.get(link) if link else None 

144 

145 logger.info(f"Full search: retrieved content from {nr_full_text} pages") 

146 return filtered_results 

147 

148 def _get_full_content( 

149 self, relevant_items: List[Dict[str, Any]] 

150 ) -> List[Dict[str, Any]]: 

151 """Fetch and attach full content to an existing list of items.""" 

152 urls: List[str] = [] 

153 for item in relevant_items: 

154 link = item.get("link") 

155 if link is not None and validate_url(link): 

156 urls.append(link) 

157 elif link is not None: 

158 logger.warning( 

159 f"SSRF validation blocked URL from full content fetch: {link}." 

160 ) 

161 

162 if not urls: 

163 for item in relevant_items: 

164 item["full_content"] = None 

165 return relevant_items 

166 

167 try: 

168 url_to_content = batch_fetch_and_extract( 

169 urls, 

170 language=self.language, 

171 enable_js_rendering=_read_js_rendering_setting( 

172 self.settings_snapshot 

173 ), 

174 ) 

175 except Exception: 

176 logger.exception("Error fetching full content") 

177 for item in relevant_items: 

178 item["full_content"] = None 

179 return relevant_items 

180 

181 for item in relevant_items: 

182 link = item.get("link") 

183 item["full_content"] = url_to_content.get(link) if link else None 

184 

185 return relevant_items 

186 

187 def invoke(self, query: str) -> Any: 

188 return self.run(query) 

189 

190 def __call__(self, query: str) -> Any: 

191 return self.invoke(query)