Coverage for src / local_deep_research / web_search_engines / engines / search_engine_searxng.py: 87%
222 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1import enum
2import json
3import time
4from typing import Any, Dict, List, Optional
6import requests
7from langchain_core.language_models import BaseLLM
8from loguru import logger
10from ...config import search_config
11from ...security.safe_requests import safe_get
12from ..search_engine_base import BaseSearchEngine
13from .full_search import FullSearchResults
@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """Valid levels for SearXNG's ``safesearch`` query parameter."""

    OFF = 0  # no filtering
    MODERATE = 1  # filter the most explicit results
    STRICT = 2  # aggressive filtering
class SearXNGSearchEngine(BaseSearchEngine):
    """
    SearXNG search engine implementation that requires an instance URL provided via
    environment variable or configuration. Designed for ethical usage with proper
    rate limiting and single-instance approach.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True
39 @staticmethod
40 def _normalize_list(value):
41 """Ensure *value* is a ``list[str]`` or ``None``.
43 Settings saved via the web UI may arrive as raw JSON strings
44 (e.g. ``'[\\r\\n "general"\\r\\n]'``) instead of parsed lists.
45 This helper decodes such strings so that ``",".join()`` later
46 works on list items rather than individual characters (issue #1030).
47 """
48 if value is None:
49 return None
50 if isinstance(value, list):
51 return value
52 if isinstance(value, str): 52 ↛ 65line 52 didn't jump to line 65 because the condition on line 52 was always true
53 stripped = value.strip()
54 if stripped:
55 try:
56 parsed = json.loads(stripped)
57 if isinstance(parsed, list): 57 ↛ 62line 57 didn't jump to line 62 because the condition on line 57 was always true
58 return [str(item) for item in parsed]
59 except (json.JSONDecodeError, ValueError, RecursionError):
60 pass
61 # Comma-separated fallback
62 return [
63 item.strip() for item in stripped.split(",") if item.strip()
64 ]
65 return None
67 def _is_valid_search_result(self, url: str) -> bool:
68 """
69 Check if a parsed result is a valid search result vs an error page.
71 When SearXNG's backend engines fail or get rate-limited, it returns
72 error/stats pages that shouldn't be treated as search results.
74 Returns False for:
75 - Relative URLs (don't start with http:// or https://, case-insensitive)
76 - URLs pointing to the SearXNG instance itself (catches /stats, /preferences, etc.)
77 """
78 # Must have an absolute URL (case-insensitive scheme check)
79 if not url or not url.lower().startswith(("http://", "https://")):
80 return False
82 # Reject URLs pointing back to the SearXNG instance itself
83 # This catches all internal pages like /stats?engine=, /preferences, /about
84 if url.startswith(self.instance_url):
85 return False
87 return True
    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        **kwargs,
    ):  # API key is actually the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results
            instance_url: URL of your SearXNG instance (preferably self-hosted)
            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
            engines: List of engines to use (google, bing, duckduckgo, etc.)
            language: Language code for search results
            safe_search: Safe search level (0=off, 1=moderate, 2=strict)
            time_range: Time range for results (day, week, month, year)
            delay_between_requests: Seconds to wait between requests
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            include_full_content: Whether to include full webpage content in results
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            **kwargs,  # Pass through all other kwargs including search_snippets_only
        )

        # Validate and normalize the instance URL if provided
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # Make sure it's accessible.
            # allow_private_ips=True since SearXNG is typically self-hosted on local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException:
            # Network-level failure (refused, timeout, DNS) disables the engine.
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}"
            )

        # Add debug logging for all parameters
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        # Normalize list-ish settings that may arrive as raw JSON strings (issue #1030).
        self.categories = self._normalize_list(categories) or ["general"]
        self.engines = self._normalize_list(engines)
        self.language = language
        try:
            # Handle both string names and integer values
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            # Fall back to OFF rather than failing engine construction.
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        self.include_full_content = include_full_content

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

            # NOTE(review): search_url and full_search exist only when the
            # instance was reachable; _get_search_results guards on
            # is_available before using them — keep that invariant.
            self.full_search = FullSearchResults(
                llm=llm,
                web_search=self,
                language=language,
                max_results=max_results,
                region="wt-wt",
                time="y",
                safesearch=self.safe_search.value,
            )

        # Timestamp of the last outgoing request; read by _respect_rate_limit().
        self.last_request_time = 0
205 def _respect_rate_limit(self):
206 """Apply self-imposed rate limiting between requests"""
207 current_time = time.time()
208 time_since_last_request = current_time - self.last_request_time
210 if time_since_last_request < self.delay_between_requests:
211 wait_time = self.delay_between_requests - time_since_last_request
212 logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
213 time.sleep(wait_time)
215 self.last_request_time = time.time()
    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from SearXNG with ethical rate limiting.

        Requests the HTML results page (not the JSON API) and scrapes it,
        trying the CSS selectors of several SearXNG themes in turn.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG; empty list on any failure.
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            self._respect_rate_limit()

            # Browser-like headers for the cookie-priming request below.
            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            # Hit the instance root first to collect session cookies; the
            # search request is still attempted (with cookies=None) if this
            # priming request fails.
            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code == 200:
                try:
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")
                    results = []

                    # Try the selectors of several SearXNG themes, most
                    # specific first, until one of them matches.
                    result_elements = soup.select(".result-item")

                    if not result_elements:
                        result_elements = soup.select(".result")

                    if not result_elements:
                        result_elements = soup.select("article")

                    if not result_elements:
                        logger.debug(
                            f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                        )
                        result_elements = soup.select('div[id^="result"]')

                    logger.info(
                        f"Found {len(result_elements)} search result elements"
                    )

                    for idx, result_element in enumerate(result_elements):
                        # Stop once the configured result cap is reached.
                        if idx >= self.max_results:
                            break

                        # Each field also cascades through theme-specific
                        # selectors, falling back to generic tags.
                        title_element = (
                            result_element.select_one(".result-title")
                            or result_element.select_one(".title")
                            or result_element.select_one("h3")
                            or result_element.select_one("a[href]")
                        )

                        url_element = (
                            result_element.select_one(".result-url")
                            or result_element.select_one(".url")
                            or result_element.select_one("a[href]")
                        )

                        content_element = (
                            result_element.select_one(".result-content")
                            or result_element.select_one(".content")
                            or result_element.select_one(".snippet")
                            or result_element.select_one("p")
                        )

                        title = (
                            title_element.get_text(strip=True)
                            if title_element
                            else ""
                        )

                        # Prefer the href attribute; fall back to the
                        # element's visible text.
                        url = ""
                        if url_element and url_element.has_attr("href"):
                            url = url_element["href"]
                        elif url_element:
                            url = url_element.get_text(strip=True)

                        content = (
                            content_element.get_text(strip=True)
                            if content_element
                            else ""
                        )

                        # Last resort: the title anchor may carry the href.
                        if (
                            not url
                            and title_element
                            and title_element.has_attr("href")
                        ):
                            url = title_element["href"]

                        logger.debug(
                            f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                        )

                        # Add to results only if it's a valid search result
                        # (not an error page or internal SearXNG page)
                        if self._is_valid_search_result(url):
                            results.append(
                                {
                                    "title": title,
                                    "url": url,
                                    "content": content,
                                    "engine": "searxng",
                                    "category": "general",
                                }
                            )
                        else:
                            # Check if this is a backend engine failure
                            if url and "/stats?engine=" in url:
                                try:
                                    engine_name = url.split("/stats?engine=")[
                                        1
                                    ].split("&")[0]
                                    logger.warning(
                                        f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                    )
                                except (IndexError, AttributeError):
                                    pass  # Couldn't parse engine name
                            logger.debug(
                                f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                            )

                    if results:
                        logger.info(
                            f"SearXNG returned {len(results)} valid results from HTML parsing"
                        )
                    else:
                        logger.warning(
                            f"SearXNG returned no valid results for query: {query}. "
                            "This may indicate SearXNG backend engine issues or rate limiting."
                        )
                    return results

                except ImportError:
                    logger.exception(
                        "BeautifulSoup not available for HTML parsing"
                    )
                    return []
                except Exception:
                    logger.exception("Error parsing HTML results")
                    return []
            else:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []
429 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
430 """
431 Get preview information for SearXNG search results.
433 Args:
434 query: The search query
436 Returns:
437 List of preview dictionaries
438 """
439 if not self.is_available:
440 logger.warning(
441 "SearXNG engine is disabled (no instance URL provided)"
442 )
443 return []
445 logger.info(f"Getting SearXNG previews for query: {query}")
447 results = self._get_search_results(query)
449 if not results:
450 logger.warning(f"No SearXNG results found for query: {query}")
451 return []
453 previews = []
454 for i, result in enumerate(results):
455 title = result.get("title", "")
456 url = result.get("url", "")
457 content = result.get("content", "")
459 preview = {
460 "id": url or f"searxng-result-{i}",
461 "title": title,
462 "link": url,
463 "snippet": content,
464 "engine": result.get("engine", ""),
465 "category": result.get("category", ""),
466 }
468 previews.append(preview)
470 return previews
472 def _get_full_content(
473 self, relevant_items: List[Dict[str, Any]]
474 ) -> List[Dict[str, Any]]:
475 """
476 Get full content for the relevant search results.
478 Args:
479 relevant_items: List of relevant preview dictionaries
481 Returns:
482 List of result dictionaries with full content
483 """
484 if not self.is_available:
485 return relevant_items
487 if ( 487 ↛ 491line 487 didn't jump to line 491 because the condition on line 487 was never true
488 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
489 and search_config.SEARCH_SNIPPETS_ONLY
490 ):
491 logger.info("Snippet-only mode, skipping full content retrieval")
492 return relevant_items
494 logger.info("Retrieving full webpage content")
496 try:
497 results_with_content = self.full_search._get_full_content(
498 relevant_items
499 )
500 return results_with_content
502 except Exception:
503 logger.exception("Error retrieving full content")
504 return relevant_items
    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools; delegates to run()."""
        return self.run(query)
510 def results(
511 self, query: str, max_results: Optional[int] = None
512 ) -> List[Dict[str, Any]]:
513 """
514 Get search results in a format compatible with other search engines.
516 Args:
517 query: The search query
518 max_results: Optional override for maximum results
520 Returns:
521 List of search result dictionaries
522 """
523 if not self.is_available:
524 return []
526 original_max_results = self.max_results
528 try:
529 if max_results is not None:
530 self.max_results = max_results
532 results = self._get_search_results(query)
534 formatted_results = []
535 for result in results:
536 formatted_results.append(
537 {
538 "title": result.get("title", ""),
539 "link": result.get("url", ""),
540 "snippet": result.get("content", ""),
541 }
542 )
544 return formatted_results
546 finally:
547 self.max_results = original_max_results
    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions (Markdown)
        """
        # NOTE(review): the YAML indentation below was reconstructed from a
        # formatting-mangled copy — verify against the original file.
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""
604 def run(
605 self, query: str, research_context: Dict[str, Any] | None = None
606 ) -> List[Dict[str, Any]]:
607 """
608 Override BaseSearchEngine run method to add SearXNG-specific error handling.
609 """
610 if not self.is_available:
611 logger.error(
612 "SearXNG run method called but engine is not available (missing instance URL)"
613 )
614 return []
616 logger.info(f"SearXNG search engine running with query: '{query}'")
617 logger.info(f"SearXNG instance URL: {self.instance_url}")
619 try:
620 # Call the parent class's run method
621 results = super().run(query, research_context=research_context)
622 logger.info(f"SearXNG search completed with {len(results)} results")
623 return results
624 except Exception:
625 logger.exception("Error in SearXNG run method")
626 # Return empty results on error
627 return []