Coverage for src / local_deep_research / web_search_engines / engines / full_search.py: 99%

75 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1from loguru import logger 

2from datetime import datetime, UTC 

3from typing import Dict, List 

4 

5import justext 

6from langchain_community.document_loaders import AsyncChromiumLoader 

7from langchain_community.document_transformers import BeautifulSoupTransformer 

8from langchain_core.language_models import BaseLLM 

9 

10from ...config.search_config import QUALITY_CHECK_DDG_URLS 

11from ...utilities.json_utils import extract_json, get_llm_response_text 

12 

13 

class FullSearchResults:
    """Run a web search, LLM-filter the result URLs, and attach cleaned
    full-page content to each surviving result.

    Pipeline (see ``run``): search -> optional LLM URL quality check ->
    headless Chromium page download -> tag extraction -> jusText
    boilerplate removal -> attach ``full_content`` to each result dict.
    """

    def __init__(
        self,
        llm: BaseLLM,  # Add LLM parameter
        web_search: list,
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str | int = "Moderate",
    ):
        """Store configuration; no I/O happens until ``run`` is called.

        Args:
            llm: Language model used by ``check_urls`` for URL filtering.
            web_search: Search tool exposing ``invoke(query) -> list``.
            output_format: Stored for callers; not consumed in this class.
            language: jusText stoplist language used by
                ``remove_boilerplate``.
            max_results: Stored search setting; not consumed here.
            region: Stored search region code; not consumed here.
            time: Stored time-range filter; not consumed here.
            safesearch: Stored safe-search level; not consumed here.
        """
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search

        self.bs_transformer = BeautifulSoupTransformer()
        # Only the text inside these tags is kept when transforming HTML.
        self.tags_to_extract = ["p", "div", "span"]

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        """Ask the LLM which results are timely, reliable, and relevant.

        Returns the subset of ``results`` whose 0-based indices appear in
        the JSON array the LLM returns. On any error (LLM failure,
        unparseable response) logs the exception and returns an empty
        list — best-effort filtering, never a crash.
        """
        if not results:
            return results

        now = datetime.now(UTC)
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{results}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get LLM's evaluation
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)
            good_indices = extract_json(response_text, expected_type=list)

            # A malformed/empty LLM reply filters out everything rather
            # than letting unvetted URLs through.
            if good_indices is None:
                good_indices = []

            # Return only the results with good URLs
            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            logger.exception("URL filtering error")
            return []

    def remove_boilerplate(self, html: str) -> str:
        """Strip navigation/ads/boilerplate from *html* using jusText.

        Returns "" for empty or whitespace-only input; otherwise the
        non-boilerplate paragraphs joined with newlines.
        """
        if not html or not html.strip():
            return ""
        paragraphs = justext.justext(html, justext.get_stoplist(self.language))
        cleaned = "\n".join(
            [p.text for p in paragraphs if not p.is_boilerplate]
        )
        return cleaned

    def run(self, query: str):
        """Execute the full pipeline for *query*.

        Returns the filtered search-result dicts, each with a
        ``full_content`` key holding the cleaned page text (or ``None``
        if the page could not be fetched). Returns ``[]`` when no valid
        links survive filtering.

        Raises:
            ValueError: If the underlying search tool does not return a
                list.
        """
        nr_full_text = 0
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using LLM (feature-flagged in config)
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from filtered results
        urls = [
            result.get("link")
            for result in filtered_results
            if result.get("link")
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # Step 3: Download the full HTML pages for filtered URLs
        loader = AsyncChromiumLoader(urls)
        html_docs = loader.load()

        # Step 4: Process the HTML using BeautifulSoupTransformer
        full_docs = self.bs_transformer.transform_documents(
            html_docs, tags_to_extract=self.tags_to_extract
        )

        # Step 5: Remove boilerplate from each document
        url_to_content = {}
        for doc in full_docs:
            nr_full_text = nr_full_text + 1
            source = doc.metadata.get("source")
            if source:
                cleaned_text = self.remove_boilerplate(doc.page_content)
                url_to_content[source] = cleaned_text

        # Attach the cleaned full content to each filtered result
        for result in filtered_results:
            link = result.get("link")
            result["full_content"] = url_to_content.get(link)

        logger.info("FULL SEARCH WITH FILTERED URLS")
        # BUG FIX: loguru formats the message with str.format(), so a
        # positional arg without a "{}" placeholder was silently dropped
        # and the count never appeared in the log.
        logger.info("Full text retrieved: {}", nr_full_text)
        return filtered_results

    def invoke(self, query: str):
        """LangChain-style alias for :meth:`run`."""
        return self.run(query)

    def __call__(self, query: str):
        """Make the instance directly callable: ``search(query)``."""
        return self.invoke(query)