Coverage for src/local_deep_research/web_search_engines/engines/full_search.py: 30%

73 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

import json
from datetime import datetime, UTC
from typing import Dict, List

import justext
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config.search_config import QUALITY_CHECK_DDG_URLS
from ...utilities.search_utilities import remove_think_tags


class FullSearchResults:
    """Runs a web search, filters the hits with an LLM, and attaches
    boilerplate-free full-page text to each surviving result."""

    def __init__(
        self,
        llm: BaseLLM,  # LLM used to vet result URLs in check_urls()
        web_search,  # search tool exposing .invoke(query) -> list of result dicts
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str | int = "Moderate",
    ):
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search

        self.bs_transformer = BeautifulSoupTransformer()
        self.tags_to_extract = ["p", "div", "span"]

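    # The transformer configured in __init__ keeps only text found inside
    # tags_to_extract; <p>, <div> and <span> capture most article bodies,
    # and remove_boilerplate() later strips the navigation and footer text
    # this broad net inevitably pulls in.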

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        if not results:
            return results

        now = datetime.now(UTC)
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{results}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get the LLM's evaluation; chat models return a message object
            # with a .content attribute, while plain LLMs return a bare string.
            response = self.llm.invoke(prompt)
            content = getattr(response, "content", response)
            good_indices = json.loads(remove_think_tags(content))

            # Keep only the results whose indices the LLM approved.
            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            logger.exception("URL filtering error")
            return []

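    # Illustrative round trip (hypothetical values): given three results, a
    # well-behaved model replies with a bare array such as "[0, 2]", which
    # json.loads() turns into the indices of the results worth keeping; any
    # reply that is not valid JSON lands in the except branch above and the
    # whole batch is dropped.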

    def remove_boilerplate(self, html: str) -> str:
        if not html or not html.strip():
            return ""
        paragraphs = justext.justext(html, justext.get_stoplist(self.language))
        cleaned = "\n".join(
            p.text for p in paragraphs if not p.is_boilerplate
        )
        return cleaned

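    # Note: justext.get_stoplist() expects a stoplist name such as "English"
    # and raises ValueError for languages it does not ship a stoplist for,
    # so self.language should be one of the names justext.get_stoplists()
    # reports.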

    def run(self, query: str):
        nr_full_text = 0
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using the LLM (when enabled in the config)
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from the filtered results
        urls = [
            result.get("link")
            for result in filtered_results
            if result.get("link")
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # Step 3: Download the full HTML pages for the filtered URLs
        loader = AsyncChromiumLoader(urls)
        html_docs = loader.load()

        # Step 4: Extract text from the HTML using BeautifulSoupTransformer
        full_docs = self.bs_transformer.transform_documents(
            html_docs, tags_to_extract=self.tags_to_extract
        )

        # Step 5: Remove boilerplate from each document, keyed by source URL
        url_to_content = {}
        for doc in full_docs:
            nr_full_text += 1
            source = doc.metadata.get("source")
            if source:
                cleaned_text = self.remove_boilerplate(doc.page_content)
                url_to_content[source] = cleaned_text

        # Attach the cleaned full content to each filtered result; the value
        # is None for any URL whose page could not be retrieved.
        for result in filtered_results:
            link = result.get("link")
            result["full_content"] = url_to_content.get(link)

        logger.info("FULL SEARCH WITH FILTERED URLS")
        logger.info("Full text retrieved: {}", nr_full_text)
        return filtered_results

    def invoke(self, query: str):
        return self.run(query)

    def __call__(self, query: str):
        return self.invoke(query)
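
A minimal usage sketch, not part of the covered module: it assumes recent langchain-community / langchain-openai builds, and both the chat model and the DuckDuckGo tool are arbitrary stand-ins; any LLM plus any search tool exposing .invoke(query) -> list of dicts with a "link" key should slot in the same way:

from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI

# Hypothetical wiring; swap in whatever model and search tool you use.
llm = ChatOpenAI(model="gpt-4o-mini")
web_search = DuckDuckGoSearchResults(output_format="list")

searcher = FullSearchResults(llm=llm, web_search=web_search)
results = searcher("latest developments in solid-state batteries")
for r in results:
    print(r.get("link"), len(r.get("full_content") or ""))

Note that AsyncChromiumLoader drives a Playwright-managed Chromium under the hood, so the Step 3 downloads only succeed where Playwright and its browser binaries are installed.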