Coverage for src/local_deep_research/web_search_engines/engines/full_search.py: 30%

73 statements  

coverage.py v7.12.0, created at 2026-01-11 00:51 +0000

import json
from datetime import datetime, UTC
from typing import Dict, List

import justext
from langchain_community.document_loaders import AsyncChromiumLoader
from langchain_community.document_transformers import BeautifulSoupTransformer
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config.search_config import QUALITY_CHECK_DDG_URLS
from ...utilities.search_utilities import remove_think_tags


class FullSearchResults:
    """Runs a web search, filters the hits with an LLM, and attaches
    boilerplate-free full-page text to each surviving result."""

    def __init__(
        self,
        llm: BaseLLM,  # LLM used to vet result URLs in check_urls()
        web_search,  # search tool exposing .invoke(query) -> list of result dicts
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str | int = "Moderate",
    ):
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search

        self.bs_transformer = BeautifulSoupTransformer()
        self.tags_to_extract = ["p", "div", "span"]

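    # The transformer configured in __init__ keeps only text found inside
    # tags_to_extract; <p>, <div> and <span> capture most article bodies,
    # and remove_boilerplate() later strips the navigation and footer text
    # this broad net inevitably pulls in.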

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        if not results:
            return results

        now = datetime.now(UTC)
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{results}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get the LLM's evaluation; chat models return a message object
            # with a .content attribute, while plain LLMs return a bare string.
            response = self.llm.invoke(prompt)
            content = getattr(response, "content", response)
            good_indices = json.loads(remove_think_tags(content))

            # Keep only the results whose indices the LLM approved.
            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            logger.exception("URL filtering error")
            return []

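    # Illustrative round trip (hypothetical values): given three results, a
    # well-behaved model replies with a bare array such as "[0, 2]", which
    # json.loads() turns into the indices of the results worth keeping; any
    # reply that is not valid JSON lands in the except branch above and the
    # whole batch is dropped.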

    def remove_boilerplate(self, html: str) -> str:
        if not html or not html.strip():
            return ""
        paragraphs = justext.justext(html, justext.get_stoplist(self.language))
        cleaned = "\n".join(
            p.text for p in paragraphs if not p.is_boilerplate
        )
        return cleaned

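    # Note: justext.get_stoplist() expects a stoplist name such as "English"
    # and raises ValueError for languages it does not ship a stoplist for,
    # so self.language should be one of the names justext.get_stoplists()
    # reports.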

    def run(self, query: str):
        nr_full_text = 0
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using the LLM (when enabled in the config)
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from the filtered results
        urls = [
            result.get("link")
            for result in filtered_results
            if result.get("link")
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # Step 3: Download the full HTML pages for the filtered URLs
        loader = AsyncChromiumLoader(urls)
        html_docs = loader.load()

        # Step 4: Extract text from the HTML using BeautifulSoupTransformer
        full_docs = self.bs_transformer.transform_documents(
            html_docs, tags_to_extract=self.tags_to_extract
        )

        # Step 5: Remove boilerplate from each document, keyed by source URL
        url_to_content = {}
        for doc in full_docs:
            nr_full_text += 1
            source = doc.metadata.get("source")
            if source:
                cleaned_text = self.remove_boilerplate(doc.page_content)
                url_to_content[source] = cleaned_text

        # Attach the cleaned full content to each filtered result; the value
        # is None for any URL whose page could not be retrieved.
        for result in filtered_results:
            link = result.get("link")
            result["full_content"] = url_to_content.get(link)

        logger.info("FULL SEARCH WITH FILTERED URLS")
        logger.info("Full text retrieved: {}", nr_full_text)
        return filtered_results

    def invoke(self, query: str):
        return self.run(query)

    def __call__(self, query: str):
        return self.invoke(query)
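
A minimal usage sketch, not part of the covered module: it assumes recent langchain-community / langchain-openai builds, and both the chat model and the DuckDuckGo tool are arbitrary stand-ins; any LLM plus any search tool exposing .invoke(query) -> list of dicts with a "link" key should slot in the same way:

from langchain_community.tools import DuckDuckGoSearchResults
from langchain_openai import ChatOpenAI

# Hypothetical wiring; swap in whatever model and search tool you use.
llm = ChatOpenAI(model="gpt-4o-mini")
web_search = DuckDuckGoSearchResults(output_format="list")

searcher = FullSearchResults(llm=llm, web_search=web_search)
results = searcher("latest developments in solid-state batteries")
for r in results:
    print(r.get("link"), len(r.get("full_content") or ""))

Note that AsyncChromiumLoader drives a Playwright-managed Chromium under the hood, so the Step 3 downloads only succeed where Playwright and its browser binaries are installed.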