Coverage for src / local_deep_research / web_search_engines / engines / full_search.py: 99%
75 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1from loguru import logger
2from datetime import datetime, UTC
3from typing import Dict, List
5import justext
6from langchain_community.document_loaders import AsyncChromiumLoader
7from langchain_community.document_transformers import BeautifulSoupTransformer
8from langchain_core.language_models import BaseLLM
10from ...config.search_config import QUALITY_CHECK_DDG_URLS
11from ...utilities.json_utils import extract_json, get_llm_response_text
class FullSearchResults:
    """Enrich web-search results with cleaned full-page text.

    Pipeline: run the wrapped ``web_search`` tool, optionally filter the
    returned URLs for quality/relevance with an LLM, download the full HTML
    of the surviving links, strip markup and boilerplate, and attach the
    cleaned text to each result under the ``"full_content"`` key.
    """

    def __init__(
        self,
        llm: BaseLLM,  # LLM used to judge URL quality in check_urls()
        web_search: list,
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str | int = "Moderate",
    ):
        """Store configuration and set up the HTML transformer.

        Args:
            llm: Language model invoked to rate search-result URLs.
            web_search: Search tool exposing ``invoke(query) -> list[dict]``.
            output_format: Desired output format label (stored, not used here).
            language: Language name passed to justext's stoplist lookup.
            max_results: Maximum number of results to request (stored).
            region: Search region code (stored).
            time: Time-range filter code (stored).
            safesearch: Safe-search level (stored).
        """
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search

        self.bs_transformer = BeautifulSoupTransformer()
        # Tags whose text content is kept when flattening downloaded HTML.
        self.tags_to_extract = ["p", "div", "span"]

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        """Ask the LLM which results are timely, reliable, and relevant.

        Args:
            results: Raw search results (dicts with at least a ``"link"`` key).
            query: The user query the results should be judged against.

        Returns:
            The subset of ``results`` whose 0-based indices the LLM approved.
            Returns ``results`` unchanged when empty, and ``[]`` when the LLM
            call or JSON extraction fails (logged, never raised).
        """
        if not results:
            return results

        now = datetime.now(UTC)
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{results}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get LLM's evaluation
            response = self.llm.invoke(prompt)
            response_text = get_llm_response_text(response)
            good_indices = extract_json(response_text, expected_type=list)

            if good_indices is None:
                good_indices = []

            # Return only the results with good URLs
            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            # Best-effort filter: on any failure, drop everything rather
            # than pass unvetted URLs downstream.
            logger.exception("URL filtering error")
            return []

    def remove_boilerplate(self, html: str) -> str:
        """Strip navigation/boilerplate from *html* using justext.

        Args:
            html: Raw HTML (or already-flattened text) of a page.

        Returns:
            Newline-joined non-boilerplate paragraphs; ``""`` for empty or
            whitespace-only input.
        """
        if not html or not html.strip():
            return ""
        paragraphs = justext.justext(html, justext.get_stoplist(self.language))
        cleaned = "\n".join(
            [p.text for p in paragraphs if not p.is_boilerplate]
        )
        return cleaned

    def run(self, query: str):
        """Execute the full search-and-enrich pipeline for *query*.

        Returns:
            The (possibly LLM-filtered) search results, each augmented with
            a ``"full_content"`` key holding cleaned page text (or ``None``
            when the page could not be fetched). Returns ``[]`` when no
            valid links survive filtering.

        Raises:
            ValueError: If the underlying search tool does not return a list.
        """
        nr_full_text = 0
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using LLM (feature-flagged)
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from filtered results
        urls = [
            result.get("link")
            for result in filtered_results
            if result.get("link")
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # Step 3: Download the full HTML pages for filtered URLs
        loader = AsyncChromiumLoader(urls)
        html_docs = loader.load()

        # Step 4: Process the HTML using BeautifulSoupTransformer
        full_docs = self.bs_transformer.transform_documents(
            html_docs, tags_to_extract=self.tags_to_extract
        )

        # Step 5: Remove boilerplate from each document, keyed by source URL
        url_to_content = {}
        for doc in full_docs:
            nr_full_text = nr_full_text + 1
            source = doc.metadata.get("source")
            if source:
                cleaned_text = self.remove_boilerplate(doc.page_content)
                url_to_content[source] = cleaned_text

        # Attach the cleaned full content to each filtered result.
        # Results whose page failed to download get full_content=None.
        for result in filtered_results:
            link = result.get("link")
            result["full_content"] = url_to_content.get(link)

        logger.info("FULL SEARCH WITH FILTERED URLS")
        # Bug fix: loguru formats with {} placeholders; the previous
        # logger.info("Full text retrieved: ", nr_full_text) silently
        # dropped the count from the log message.
        logger.info("Full text retrieved: {}", nr_full_text)
        return filtered_results

    def invoke(self, query: str):
        """LangChain-style alias for :meth:`run`."""
        return self.run(query)

    def __call__(self, query: str):
        """Allow the instance to be used as a plain callable."""
        return self.invoke(query)