Coverage for src / local_deep_research / web_search_engines / engines / search_engine_searxng.py: 95%
220 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1import enum
2import json
3import time
4from typing import Any, Dict, List, Optional
6import requests
7from langchain_core.language_models import BaseLLM
8from loguru import logger
10from ...config import search_config
11from ...security.safe_requests import safe_get
12from ..search_engine_base import BaseSearchEngine
@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """Safe-search levels accepted by SearXNG's ``safesearch`` parameter."""

    # Values mirror SearXNG's own safesearch codes.
    OFF = 0  # no filtering
    MODERATE = 1  # filter explicit results where engines support it
    STRICT = 2  # strongest filtering
class SearXNGSearchEngine(BaseSearchEngine):
    """
    Search engine backed by a SearXNG instance.

    The instance URL must be supplied via environment variable or
    configuration.  The engine targets a single (preferably self-hosted)
    instance and applies a self-imposed delay between requests for
    ethical usage.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    @staticmethod
    def _normalize_list(value):
        """Coerce *value* into a ``list[str]`` (or ``None``).

        Settings persisted through the web UI can come back as raw JSON
        text (e.g. ``'[\\r\\n "general"\\r\\n]'``) rather than a parsed
        list.  Decoding here keeps the later ``",".join()`` calls working
        on list items instead of individual characters (issue #1030).
        """
        if value is None:
            return None
        if isinstance(value, list):
            return value
        if not isinstance(value, str):
            # Any other type is treated the same as "not configured".
            return None
        text = value.strip()
        if text:
            try:
                decoded = json.loads(text)
            except (json.JSONDecodeError, ValueError, RecursionError):
                pass
            else:
                if isinstance(decoded, list):
                    return [str(entry) for entry in decoded]
        # Fall back to interpreting the string as comma-separated values.
        return [part.strip() for part in text.split(",") if part.strip()]

    def _is_valid_search_result(self, url: str) -> bool:
        """Return ``True`` if *url* looks like a genuine external result.

        When SearXNG's backend engines fail or get rate-limited, the
        instance can render error/stats pages that must not be treated
        as search results.  Rejected:

        - relative URLs (scheme checked case-insensitively);
        - URLs pointing back at the SearXNG instance itself (covers
          internal pages such as ``/stats`` and ``/preferences``).
        """
        if not url:
            return False
        if not url.lower().startswith(("http://", "https://")):
            return False
        # Anything under the instance URL is an internal SearXNG page.
        return not url.startswith(self.instance_url)

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):  # API key is actually the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results.
            instance_url: URL of your SearXNG instance (preferably self-hosted).
            categories: SearXNG categories to search in (general, images, videos, news, etc.).
            engines: Engines to use (google, bing, duckduckgo, etc.).
            language: Language code for search results.
            safe_search: Safe search level (0=off, 1=moderate, 2=strict).
            time_range: Time range for results (day, week, month, year).
            delay_between_requests: Seconds to wait between requests.
            llm: Language model for relevance filtering.
            max_filtered_results: Maximum number of results to keep after filtering.
            include_full_content: Whether to include full webpage content in results.
        """
        # Let the base class wire up the LLM, filtering and snapshot state.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
            **kwargs,  # Pass through all other kwargs including search_snippets_only
        )

        # Normalise the instance URL (no trailing slash) and probe it.
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # allow_private_ips=True since SearXNG is typically self-hosted on local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException:
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}"
            )

        # Debug logging of the effective initialisation parameters.
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        self.categories = self._normalize_list(categories) or ["general"]
        self.engines = self._normalize_list(engines)
        self.language = language
        try:
            # Accept both numeric levels (0 / "0") and member names ("OFF").
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

        self._init_full_search(
            web_search=self,
            language=language,
            max_results=max_results,
            region="wt-wt",
            time_period="y",
            safe_search=self.safe_search.value,
        )

        # Timestamp of the most recent request; drives _respect_rate_limit().
        self.last_request_time: float = 0.0

    def _respect_rate_limit(self):
        """Sleep as needed to honour the self-imposed delay between requests."""
        now = time.time()
        elapsed = now - self.last_request_time

        if elapsed < self.delay_between_requests:
            wait_time = self.delay_between_requests - elapsed
            logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
            time.sleep(wait_time)

        self.last_request_time = time.time()

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Query the SearXNG instance (with rate limiting) and parse its HTML
        results page.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            self._respect_rate_limit()

            # Fetch the landing page first so the search request carries
            # session cookies like a regular browser would.
            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code != 200:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

            try:
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(response.text, "html.parser")
                results = []

                # SearXNG themes differ; try selectors from most to least
                # specific until one matches.
                result_elements = soup.select(".result-item")

                if not result_elements:
                    result_elements = soup.select(".result")

                if not result_elements:
                    result_elements = soup.select("article")

                if not result_elements:
                    logger.debug(
                        f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                    )
                    result_elements = soup.select('div[id^="result"]')

                logger.info(
                    f"Found {len(result_elements)} search result elements"
                )

                for idx, result_element in enumerate(result_elements):
                    if idx >= self.max_results:
                        break

                    title_element = (
                        result_element.select_one(".result-title")
                        or result_element.select_one(".title")
                        or result_element.select_one("h3")
                        or result_element.select_one("a[href]")
                    )

                    url_element = (
                        result_element.select_one(".result-url")
                        or result_element.select_one(".url")
                        or result_element.select_one("a[href]")
                    )

                    content_element = (
                        result_element.select_one(".result-content")
                        or result_element.select_one(".content")
                        or result_element.select_one(".snippet")
                        or result_element.select_one("p")
                    )

                    title = (
                        title_element.get_text(strip=True)
                        if title_element
                        else ""
                    )

                    url = ""
                    if url_element and url_element.has_attr("href"):
                        url = str(url_element["href"])
                    elif url_element:
                        url = url_element.get_text(strip=True)

                    content = (
                        content_element.get_text(strip=True)
                        if content_element
                        else ""
                    )

                    # Some themes only put the href on the title link.
                    if (
                        not url
                        and title_element
                        and title_element.has_attr("href")
                    ):
                        url = str(title_element["href"])

                    logger.debug(
                        f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                    )

                    # Keep only genuine external results; drop error pages
                    # and internal SearXNG pages.
                    if self._is_valid_search_result(url):
                        results.append(
                            {
                                "title": title,
                                "url": url,
                                "content": content,
                                "engine": "searxng",
                                "category": "general",
                            }
                        )
                    else:
                        # A /stats?engine= link signals that a backend
                        # engine failed or was rate-limited.
                        if url and "/stats?engine=" in url:
                            try:
                                engine_name = url.split("/stats?engine=")[
                                    1
                                ].split("&")[0]
                                logger.warning(
                                    f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                )
                            except (IndexError, AttributeError):
                                pass  # Couldn't parse engine name
                        logger.debug(
                            f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                        )

                if results:
                    logger.info(
                        f"SearXNG returned {len(results)} valid results from HTML parsing"
                    )
                else:
                    logger.warning(
                        f"SearXNG returned no valid results for query: {query}. "
                        "This may indicate SearXNG backend engine issues or rate limiting."
                    )
                return results

            except ImportError:
                logger.exception(
                    "BeautifulSoup not available for HTML parsing"
                )
                return []
            except Exception:
                logger.exception("Error parsing HTML results")
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Build preview dictionaries for SearXNG search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        if not self.is_available:
            logger.warning(
                "SearXNG engine is disabled (no instance URL provided)"
            )
            return []

        logger.info(f"Getting SearXNG previews for query: {query}")

        results = self._get_search_results(query)

        if not results:
            logger.warning(f"No SearXNG results found for query: {query}")
            return []

        previews = []
        for i, result in enumerate(results):
            url = result.get("url", "")
            previews.append(
                {
                    # Fall back to a positional id when the result had no URL.
                    "id": url or f"searxng-result-{i}",
                    "title": result.get("title", ""),
                    "link": url,
                    "snippet": result.get("content", ""),
                    "engine": result.get("engine", ""),
                    "category": result.get("category", ""),
                }
            )

        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Attach full webpage content to the relevant search results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        if not self.is_available:
            return relevant_items

        # Honour the global snippet-only switch.
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        # full_search is created by _init_full_search(); bail out if absent.
        if not hasattr(self, "full_search"):
            return relevant_items

        logger.info("Retrieving full webpage content")

        try:
            return self.full_search._get_full_content(relevant_items)
        except Exception:
            logger.exception("Error retrieving full content")
            return relevant_items

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        return self.run(query)

    def results(
        self, query: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get search results in a format compatible with other search engines.

        Args:
            query: The search query
            max_results: Optional override for maximum results

        Returns:
            List of search result dictionaries
        """
        if not self.is_available:
            return []

        # Temporarily override max_results; always restore it afterwards.
        original_max_results = self.max_results

        try:
            if max_results is not None:
                self.max_results = max_results

            return [
                {
                    "title": result.get("title", ""),
                    "link": result.get("url", ""),
                    "snippet": result.get("content", ""),
                }
                for result in self._get_search_results(query)
            ]

        finally:
            self.max_results = original_max_results

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions
        """
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Override BaseSearchEngine run method to add SearXNG-specific error handling.
        """
        if not self.is_available:
            logger.error(
                "SearXNG run method called but engine is not available (missing instance URL)"
            )
            return []

        logger.info(f"SearXNG instance URL: {self.instance_url}")

        try:
            # Call the parent class's run method
            results = super().run(query, research_context=research_context)
            logger.info(f"SearXNG search completed with {len(results)} results")
            return results
        except Exception:
            logger.exception("Error in SearXNG run method")
            # Return empty results on error
            return []