Coverage for src / local_deep_research / web_search_engines / engines / full_search.py: 30%
73 statements
« prev ^ index » next coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1import json
2from loguru import logger
3from datetime import datetime, UTC
4from typing import Dict, List
6import justext
7from langchain_community.document_loaders import AsyncChromiumLoader
8from langchain_community.document_transformers import BeautifulSoupTransformer
9from langchain_core.language_models import BaseLLM
11from ...config.search_config import QUALITY_CHECK_DDG_URLS
12from ...utilities.search_utilities import remove_think_tags
class FullSearchResults:
    """Run a web search, LLM-filter the result URLs, and attach
    boilerplate-free full-page text to each surviving result.

    Pipeline (see :meth:`run`):
      1. invoke the wrapped ``web_search`` tool,
      2. optionally filter results via :meth:`check_urls` (LLM-based,
         gated by ``QUALITY_CHECK_DDG_URLS``),
      3. download the pages with ``AsyncChromiumLoader``,
      4. extract text with ``BeautifulSoupTransformer``,
      5. strip boilerplate with ``justext`` and store it under
         ``result["full_content"]``.
    """

    def __init__(
        self,
        llm: BaseLLM,  # LLM used by check_urls to rate result quality
        web_search: list,
        output_format: str = "list",
        language: str = "English",
        max_results: int = 10,
        region: str = "wt-wt",
        time: str = "y",
        safesearch: str | int = "Moderate",
    ):
        """
        Args:
            llm: Language model used to evaluate search-result URLs.
            web_search: Search tool exposing ``invoke(query) -> list[dict]``.
                NOTE(review): the ``list`` annotation looks wrong — ``run``
                calls ``self.web_search.invoke(...)`` — confirm intended type.
            output_format: Stored on the instance; not used in this class.
            language: justext stoplist language for boilerplate removal.
            max_results: Stored on the instance; not used in this class.
            region: Search region code (stored; not used in this class).
            time: Search time filter (stored; not used in this class).
            safesearch: Safesearch level (stored; not used in this class).
        """
        self.llm = llm
        self.output_format = output_format
        self.language = language
        self.max_results = max_results
        self.region = region
        self.time = time
        self.safesearch = safesearch
        self.web_search = web_search

        self.bs_transformer = BeautifulSoupTransformer()
        # HTML tags whose text content is kept during extraction.
        self.tags_to_extract = ["p", "div", "span"]

    def check_urls(self, results: List[Dict], query: str) -> List[Dict]:
        """Ask the LLM which results are timely, reliable, and relevant.

        Args:
            results: Raw search results (dicts, expected to carry a ``link``).
            query: The user query the results are judged against.

        Returns:
            The subset of ``results`` whose 0-based indices the LLM approved.
            Returns ``results`` unchanged when empty, and ``[]`` when the LLM
            response cannot be parsed as a JSON array of indices.
        """
        if not results:
            return results

        now = datetime.now(UTC)
        current_time = now.strftime("%Y-%m-%d")
        prompt = f"""ONLY Return a JSON array. The response contains no letters. Evaluate these URLs for:
1. Timeliness (today: {current_time})
2. Factual accuracy (cross-reference major claims)
3. Source reliability (prefer official company websites, established news outlets)
4. Direct relevance to query: {query}

URLs to evaluate:
{results}

Return a JSON array of indices (0-based) for sources that meet ALL criteria.
ONLY Return a JSON array of indices (0-based) and nothing else. No letters.
Example response: \n[0, 2, 4]\n\n"""

        try:
            # Get LLM's evaluation
            response = self.llm.invoke(prompt)
            good_indices = json.loads(remove_think_tags(response.content))
            # Guard against a malformed (non-list) JSON payload: previously a
            # JSON object would silently "filter" by its keys. The raise is
            # caught below and handled like any other filtering failure.
            if not isinstance(good_indices, list):
                raise ValueError(
                    f"Expected a JSON array of indices, got {type(good_indices).__name__}"
                )
            # Return only the results with good URLs
            return [r for i, r in enumerate(results) if i in good_indices]
        except Exception:
            logger.exception("URL filtering error")
            return []

    def remove_boilerplate(self, html: str) -> str:
        """Strip boilerplate (navigation, ads, footers) from raw HTML.

        Args:
            html: Raw HTML of a downloaded page.

        Returns:
            The newline-joined text of all non-boilerplate paragraphs as
            classified by justext, or ``""`` for empty/whitespace-only input.
        """
        if not html or not html.strip():
            return ""
        paragraphs = justext.justext(html, justext.get_stoplist(self.language))
        return "\n".join(p.text for p in paragraphs if not p.is_boilerplate)

    def run(self, query: str):
        """Execute the full search pipeline for ``query``.

        Returns:
            The (optionally LLM-filtered) search results, each augmented with
            a ``full_content`` key holding the cleaned page text — ``None``
            when the page could not be fetched — or ``[]`` when no result has
            a usable link.

        Raises:
            ValueError: If the underlying search tool does not return a list.
        """
        # Step 1: Get search results
        search_results = self.web_search.invoke(query)
        if not isinstance(search_results, list):
            raise ValueError("Expected the search results in list format.")

        # Step 2: Filter URLs using LLM (config-gated)
        if QUALITY_CHECK_DDG_URLS:
            filtered_results = self.check_urls(search_results, query)
        else:
            filtered_results = search_results

        # Extract URLs from filtered results
        urls = [
            result.get("link")
            for result in filtered_results
            if result.get("link")
        ]

        if not urls:
            logger.error("\n === NO VALID LINKS ===\n")
            return []

        # Step 3: Download the full HTML pages for filtered URLs
        loader = AsyncChromiumLoader(urls)
        html_docs = loader.load()

        # Step 4: Process the HTML using BeautifulSoupTransformer
        full_docs = self.bs_transformer.transform_documents(
            html_docs, tags_to_extract=self.tags_to_extract
        )

        # Step 5: Remove boilerplate from each document, keyed by source URL
        url_to_content = {}
        for doc in full_docs:
            source = doc.metadata.get("source")
            if source:
                url_to_content[source] = self.remove_boilerplate(
                    doc.page_content
                )

        # Attach the cleaned full content to each filtered result
        for result in filtered_results:
            result["full_content"] = url_to_content.get(result.get("link"))

        logger.info("FULL SEARCH WITH FILTERED URLS")
        # BUG FIX: loguru drops extra positional args when the message has no
        # {} placeholder, so the original call never logged the count.
        logger.info("Full text retrieved: {}", len(full_docs))
        return filtered_results

    def invoke(self, query: str):
        """LangChain-style alias for :meth:`run`."""
        return self.run(query)

    def __call__(self, query: str):
        """Allow the instance to be used directly as a callable tool."""
        return self.invoke(query)