Coverage for src/local_deep_research/web_search_engines/engines/search_engine_searxng.py: 86% (204 statements)

import enum
import time
from typing import Any, Dict, List, Optional

import requests
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ...security.safe_requests import safe_get
from ..search_engine_base import BaseSearchEngine
from .full_search import FullSearchResults


@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """
    Acceptable settings for safe search.
    """

    OFF = 0
    MODERATE = 1
    STRICT = 2
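
# A minimal usage sketch (assumed, mirroring the parsing done in
# SearXNGSearchEngine.__init__ below): IntEnum lets callers supply either the
# numeric level or the member name.
#
#     >>> SafeSearchSetting(2).name
#     'STRICT'
#     >>> SafeSearchSetting["MODERATE"].value
#     1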


class SearXNGSearchEngine(BaseSearchEngine):
    """
    SearXNG search engine implementation that requires an instance URL provided via
    environment variable or configuration. Designed for ethical usage with proper
    rate limiting and a single-instance approach.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    def _is_valid_search_result(self, url: str) -> bool:
        """
        Check if a parsed result is a valid search result vs an error page.

        When SearXNG's backend engines fail or get rate-limited, it returns
        error/stats pages that shouldn't be treated as search results.

        Returns False for:
        - Relative URLs (don't start with http:// or https://, case-insensitive)
        - URLs pointing to the SearXNG instance itself (catches /stats, /preferences, etc.)
        """
        # Must have an absolute URL (case-insensitive scheme check)
        if not url or not url.lower().startswith(("http://", "https://")):
            return False

        # Reject URLs pointing back to the SearXNG instance itself.
        # This catches all internal pages like /stats?engine=, /preferences, /about
        if url.startswith(self.instance_url):
            return False

        return True
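
    # Illustrative checks (hypothetical values, assuming
    # instance_url == "http://localhost:8080"):
    #     "https://example.com/page"          -> True
    #     "/stats?engine=google"              -> False (relative URL)
    #     "http://localhost:8080/preferences" -> False (instance-internal)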

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        **kwargs,
    ):  # In this engine's configuration, the "API key" slot carries the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results
            instance_url: URL of your SearXNG instance (preferably self-hosted)
            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
            engines: List of engines to use (google, bing, duckduckgo, etc.)
            language: Language code for search results
            safe_search: Safe search level, given as a SafeSearchSetting name or value (OFF=0, MODERATE=1, STRICT=2)
            time_range: Time range for results (day, week, month, year)
            delay_between_requests: Seconds to wait between requests
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            include_full_content: Whether to include full webpage content in results
        """

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            **kwargs,  # Pass through all other kwargs, including search_snippets_only
        )

        # Normalize the instance URL
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # Make sure it's accessible.
            # allow_private_ips=True since SearXNG is typically self-hosted on a local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException as e:
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}: {e!s}"
            )

        # Add debug logging for all parameters
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        self.categories = categories or ["general"]
        self.engines = engines
        self.language = language
        try:
            # Handle both string names and integer values
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        self.include_full_content = include_full_content

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

            self.full_search = FullSearchResults(
                llm=llm,
                web_search=self,
                language=language,
                max_results=max_results,
                region="wt-wt",
                time="y",
                safesearch=self.safe_search.value,
            )

        self.last_request_time = 0

    def _respect_rate_limit(self):
        """Apply self-imposed rate limiting between requests"""
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time

        if time_since_last_request < self.delay_between_requests:
            wait_time = self.delay_between_requests - time_since_last_request
            logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
            time.sleep(wait_time)

        self.last_request_time = time.time()
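
    # Timing sketch (illustrative numbers): with delay_between_requests=2.0 and
    # only 0.5 s elapsed since the last request, the method above sleeps for
    # 1.5 s; once 2.0 s or more have elapsed, it returns without waiting.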

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from SearXNG with ethical rate limiting.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            self._respect_rate_limit()

            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }
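
            # For illustration (hypothetical values): with query="python" and
            # the defaults above, the request sent below is roughly
            #     GET {instance_url}/search?q=python&categories=general
            #         &language=en&format=html&pageno=1&safesearch=0&count=15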

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code == 200:
                try:
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")
                    results = []

                    result_elements = soup.select(".result-item")

                    if not result_elements:
                        result_elements = soup.select(".result")

                    if not result_elements:
                        result_elements = soup.select("article")

                    if not result_elements:
                        logger.debug(
                            f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                        )
                        result_elements = soup.select('div[id^="result"]')

                    logger.info(
                        f"Found {len(result_elements)} search result elements"
                    )

                    for idx, result_element in enumerate(result_elements):
                        if idx >= self.max_results:
                            break

                        title_element = (
                            result_element.select_one(".result-title")
                            or result_element.select_one(".title")
                            or result_element.select_one("h3")
                            or result_element.select_one("a[href]")
                        )

                        url_element = (
                            result_element.select_one(".result-url")
                            or result_element.select_one(".url")
                            or result_element.select_one("a[href]")
                        )

                        content_element = (
                            result_element.select_one(".result-content")
                            or result_element.select_one(".content")
                            or result_element.select_one(".snippet")
                            or result_element.select_one("p")
                        )

                        title = (
                            title_element.get_text(strip=True)
                            if title_element
                            else ""
                        )

                        url = ""
                        if url_element and url_element.has_attr("href"):
                            url = url_element["href"]
                        elif url_element:
                            url = url_element.get_text(strip=True)

                        content = (
                            content_element.get_text(strip=True)
                            if content_element
                            else ""
                        )

                        if (
                            not url
                            and title_element
                            and title_element.has_attr("href")
                        ):
                            url = title_element["href"]

                        logger.debug(
                            f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                        )

                        # Add to results only if it's a valid search result
                        # (not an error page or internal SearXNG page)
                        if self._is_valid_search_result(url):
                            results.append(
                                {
                                    "title": title,
                                    "url": url,
                                    "content": content,
                                    "engine": "searxng",
                                    "category": "general",
                                }
                            )
                        else:
                            # Check if this is a backend engine failure
                            if url and "/stats?engine=" in url:
                                try:
                                    engine_name = url.split("/stats?engine=")[
                                        1
                                    ].split("&")[0]
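                                    # e.g. (hypothetical URL): for
                                    # ".../stats?engine=google&x=1" the two
                                    # splits above yield "google"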
                                    logger.warning(
                                        f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                    )
                                except (IndexError, AttributeError):
                                    pass  # Couldn't parse engine name
                            logger.debug(
                                f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                            )

                    if results:
                        logger.info(
                            f"SearXNG returned {len(results)} valid results from HTML parsing"
                        )
                    else:
                        logger.warning(
                            f"SearXNG returned no valid results for query: {query}. "
                            "This may indicate SearXNG backend engine issues or rate limiting."
                        )
                    return results

                except ImportError:
                    logger.exception(
                        "BeautifulSoup not available for HTML parsing"
                    )
                    return []
                except Exception:
                    logger.exception("Error parsing HTML results")
                    return []
            else:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for SearXNG search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        if not self.is_available:
            logger.warning(
                "SearXNG engine is disabled (no instance URL provided)"
            )
            return []

        logger.info(f"Getting SearXNG previews for query: {query}")

        results = self._get_search_results(query)

        if not results:
            logger.warning(f"No SearXNG results found for query: {query}")
            return []

        previews = []
        for i, result in enumerate(results):
            title = result.get("title", "")
            url = result.get("url", "")
            content = result.get("content", "")

            preview = {
                "id": url or f"searxng-result-{i}",
                "title": title,
                "link": url,
                "snippet": content,
                "engine": result.get("engine", ""),
                "category": result.get("category", ""),
            }

            previews.append(preview)

        return previews
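
    # Shape sketch (illustrative values only): each preview built above looks
    # like
    #     {"id": "https://example.com/", "title": "...",
    #      "link": "https://example.com/", "snippet": "...",
    #      "engine": "searxng", "category": "general"}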

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        if not self.is_available:
            return relevant_items

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Retrieving full webpage content")

        try:
            results_with_content = self.full_search._get_full_content(
                relevant_items
            )
            return results_with_content

        except Exception:
            logger.exception("Error retrieving full content")
            return relevant_items

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        return self.run(query)

    def results(
        self, query: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get search results in a format compatible with other search engines.

        Args:
            query: The search query
            max_results: Optional override for maximum results

        Returns:
            List of search result dictionaries
        """
        if not self.is_available:
            return []

        original_max_results = self.max_results

        try:
            if max_results is not None:
                self.max_results = max_results

            results = self._get_search_results(query)

            formatted_results = []
            for result in results:
                formatted_results.append(
                    {
                        "title": result.get("title", ""),
                        "link": result.get("url", ""),
                        "snippet": result.get("content", ""),
                    }
                )

            return formatted_results

        finally:
            self.max_results = original_max_results
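
    # Usage sketch (method names are real; values are illustrative and assume
    # a reachable local instance):
    #     engine = SearXNGSearchEngine(instance_url="http://localhost:8080")
    #     hits = engine.results("open source metasearch", max_results=3)
    #     # -> [{"title": ..., "link": ..., "snippet": ...}, ...]
    # The finally block restores max_results even if the search raises.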

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions
        """
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Override BaseSearchEngine run method to add SearXNG-specific error handling.
        """
        if not self.is_available:
            logger.error(
                "SearXNG run method called but engine is not available (missing instance URL)"
            )
            return []

        logger.info(f"SearXNG search engine running with query: '{query}'")
        logger.info(f"SearXNG instance URL: {self.instance_url}")

        try:
            # Call the parent class's run method
            results = super().run(query, research_context=research_context)
            logger.info(f"SearXNG search completed with {len(results)} results")
            return results
        except Exception:
            logger.exception("Error in SearXNG run method")
            # Return empty results on error
            return []
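

# Quick usage sketch (not part of the module's public API; values are
# illustrative and assume a local SearXNG instance at http://localhost:8080;
# the relative imports above mean this file is imported as part of its
# package, not run as a script):
#
#     from local_deep_research.web_search_engines.engines.search_engine_searxng import (
#         SearXNGSearchEngine,
#     )
#
#     engine = SearXNGSearchEngine(
#         max_results=5,
#         instance_url="http://localhost:8080",
#         safe_search="MODERATE",
#         delay_between_requests=1.0,
#     )
#     if engine.is_available:
#         for item in engine.run("privacy-preserving metasearch"):
#             print(item)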