Coverage for src / local_deep_research / web_search_engines / engines / search_engine_searxng.py: 87%

222 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1import enum 

2import json 

3import time 

4from typing import Any, Dict, List, Optional 

5 

6import requests 

7from langchain_core.language_models import BaseLLM 

8from loguru import logger 

9 

10from ...config import search_config 

11from ...security.safe_requests import safe_get 

12from ..search_engine_base import BaseSearchEngine 

13from .full_search import FullSearchResults 

14 

15 

@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """Valid safe-search levels accepted by a SearXNG instance.

    The integer values mirror SearXNG's ``safesearch`` query parameter.
    """

    OFF = 0
    MODERATE = 1
    STRICT = 2

25 

26 

class SearXNGSearchEngine(BaseSearchEngine):
    """
    SearXNG search engine implementation that requires an instance URL provided via
    environment variable or configuration. Designed for ethical usage with proper
    rate limiting and single-instance approach.
    """

    # Mark as public search engine
    is_public: bool = True
    # Mark as generic search engine (general web search)
    is_generic: bool = True

38 

39 @staticmethod 

40 def _normalize_list(value): 

41 """Ensure *value* is a ``list[str]`` or ``None``. 

42 

43 Settings saved via the web UI may arrive as raw JSON strings 

44 (e.g. ``'[\\r\\n "general"\\r\\n]'``) instead of parsed lists. 

45 This helper decodes such strings so that ``",".join()`` later 

46 works on list items rather than individual characters (issue #1030). 

47 """ 

48 if value is None: 

49 return None 

50 if isinstance(value, list): 

51 return value 

52 if isinstance(value, str): 52 ↛ 65line 52 didn't jump to line 65 because the condition on line 52 was always true

53 stripped = value.strip() 

54 if stripped: 

55 try: 

56 parsed = json.loads(stripped) 

57 if isinstance(parsed, list): 57 ↛ 62line 57 didn't jump to line 62 because the condition on line 57 was always true

58 return [str(item) for item in parsed] 

59 except (json.JSONDecodeError, ValueError, RecursionError): 

60 pass 

61 # Comma-separated fallback 

62 return [ 

63 item.strip() for item in stripped.split(",") if item.strip() 

64 ] 

65 return None 

66 

67 def _is_valid_search_result(self, url: str) -> bool: 

68 """ 

69 Check if a parsed result is a valid search result vs an error page. 

70 

71 When SearXNG's backend engines fail or get rate-limited, it returns 

72 error/stats pages that shouldn't be treated as search results. 

73 

74 Returns False for: 

75 - Relative URLs (don't start with http:// or https://, case-insensitive) 

76 - URLs pointing to the SearXNG instance itself (catches /stats, /preferences, etc.) 

77 """ 

78 # Must have an absolute URL (case-insensitive scheme check) 

79 if not url or not url.lower().startswith(("http://", "https://")): 

80 return False 

81 

82 # Reject URLs pointing back to the SearXNG instance itself 

83 # This catches all internal pages like /stats?engine=, /preferences, /about 

84 if url.startswith(self.instance_url): 

85 return False 

86 

87 return True 

88 

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        **kwargs,
    ):  # API key is actually the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results
            instance_url: URL of your SearXNG instance (preferably self-hosted)
            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
            engines: List of engines to use (google, bing, duckduckgo, etc.)
            language: Language code for search results
            safe_search: Safe search level (0=off, 1=moderate, 2=strict)
            time_range: Time range for results (day, week, month, year)
            delay_between_requests: Seconds to wait between requests
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            include_full_content: Whether to include full webpage content in results
        """

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            **kwargs,  # Pass through all other kwargs including search_snippets_only
        )

        # Validate and normalize the instance URL if provided
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # Make sure it's accessible.
            # allow_private_ips=True since SearXNG is typically self-hosted on local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException:
            # Network-level failure (connection refused, timeout, DNS, ...):
            # mark the engine unavailable instead of raising at construction.
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}"
            )

        # Add debug logging for all parameters
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        # Settings may arrive as JSON-encoded strings from the web UI;
        # _normalize_list converts them to real lists (issue #1030).
        self.categories = self._normalize_list(categories) or ["general"]
        self.engines = self._normalize_list(engines)
        self.language = language
        try:
            # Handle both string names and integer values
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        self.include_full_content = include_full_content

        if self.is_available:
            # NOTE: search_url and full_search are only defined when the
            # instance responded; all entry points guard on is_available
            # before touching them.
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

            self.full_search = FullSearchResults(
                llm=llm,
                web_search=self,
                language=language,
                max_results=max_results,
                region="wt-wt",
                time="y",
                safesearch=self.safe_search.value,
            )

        # Timestamp of the most recent request, used by _respect_rate_limit.
        self.last_request_time = 0

204 

205 def _respect_rate_limit(self): 

206 """Apply self-imposed rate limiting between requests""" 

207 current_time = time.time() 

208 time_since_last_request = current_time - self.last_request_time 

209 

210 if time_since_last_request < self.delay_between_requests: 

211 wait_time = self.delay_between_requests - time_since_last_request 

212 logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds") 

213 time.sleep(wait_time) 

214 

215 self.last_request_time = time.time() 

216 

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from SearXNG with ethical rate limiting.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            # Honour the self-imposed delay before touching the instance.
            self._respect_rate_limit()

            # Browser-like headers for the cookie-priming request below.
            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            try:
                # Visit the instance root first to pick up any session
                # cookies before hitting /search. Best effort only.
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code == 200:
                try:
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")
                    results = []

                    # Try progressively more generic CSS selectors, since
                    # markup differs between SearXNG themes/versions.
                    result_elements = soup.select(".result-item")

                    if not result_elements:
                        result_elements = soup.select(".result")

                    if not result_elements:
                        result_elements = soup.select("article")

                    if not result_elements:
                        logger.debug(
                            f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                        )
                        result_elements = soup.select('div[id^="result"]')

                    logger.info(
                        f"Found {len(result_elements)} search result elements"
                    )

                    for idx, result_element in enumerate(result_elements):
                        if idx >= self.max_results:
                            break

                        # Same theme-agnostic fallback chains for the
                        # individual parts of each result.
                        title_element = (
                            result_element.select_one(".result-title")
                            or result_element.select_one(".title")
                            or result_element.select_one("h3")
                            or result_element.select_one("a[href]")
                        )

                        url_element = (
                            result_element.select_one(".result-url")
                            or result_element.select_one(".url")
                            or result_element.select_one("a[href]")
                        )

                        content_element = (
                            result_element.select_one(".result-content")
                            or result_element.select_one(".content")
                            or result_element.select_one(".snippet")
                            or result_element.select_one("p")
                        )

                        title = (
                            title_element.get_text(strip=True)
                            if title_element
                            else ""
                        )

                        # Prefer the href attribute; fall back to the
                        # element's visible text.
                        url = ""
                        if url_element and url_element.has_attr("href"):
                            url = url_element["href"]
                        elif url_element:
                            url = url_element.get_text(strip=True)

                        content = (
                            content_element.get_text(strip=True)
                            if content_element
                            else ""
                        )

                        # Last resort: a linked title element may carry the URL.
                        if (
                            not url
                            and title_element
                            and title_element.has_attr("href")
                        ):
                            url = title_element["href"]

                        logger.debug(
                            f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                        )

                        # Add to results only if it's a valid search result
                        # (not an error page or internal SearXNG page)
                        if self._is_valid_search_result(url):
                            results.append(
                                {
                                    "title": title,
                                    "url": url,
                                    "content": content,
                                    "engine": "searxng",
                                    "category": "general",
                                }
                            )
                        else:
                            # Check if this is a backend engine failure
                            if url and "/stats?engine=" in url:
                                try:
                                    engine_name = url.split("/stats?engine=")[
                                        1
                                    ].split("&")[0]
                                    logger.warning(
                                        f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                    )
                                except (IndexError, AttributeError):
                                    pass  # Couldn't parse engine name
                            logger.debug(
                                f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                            )

                    if results:
                        logger.info(
                            f"SearXNG returned {len(results)} valid results from HTML parsing"
                        )
                    else:
                        logger.warning(
                            f"SearXNG returned no valid results for query: {query}. "
                            "This may indicate SearXNG backend engine issues or rate limiting."
                        )
                    return results

                except ImportError:
                    logger.exception(
                        "BeautifulSoup not available for HTML parsing"
                    )
                    return []
                except Exception:
                    logger.exception("Error parsing HTML results")
                    return []
            else:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []

428 

429 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

430 """ 

431 Get preview information for SearXNG search results. 

432 

433 Args: 

434 query: The search query 

435 

436 Returns: 

437 List of preview dictionaries 

438 """ 

439 if not self.is_available: 

440 logger.warning( 

441 "SearXNG engine is disabled (no instance URL provided)" 

442 ) 

443 return [] 

444 

445 logger.info(f"Getting SearXNG previews for query: {query}") 

446 

447 results = self._get_search_results(query) 

448 

449 if not results: 

450 logger.warning(f"No SearXNG results found for query: {query}") 

451 return [] 

452 

453 previews = [] 

454 for i, result in enumerate(results): 

455 title = result.get("title", "") 

456 url = result.get("url", "") 

457 content = result.get("content", "") 

458 

459 preview = { 

460 "id": url or f"searxng-result-{i}", 

461 "title": title, 

462 "link": url, 

463 "snippet": content, 

464 "engine": result.get("engine", ""), 

465 "category": result.get("category", ""), 

466 } 

467 

468 previews.append(preview) 

469 

470 return previews 

471 

472 def _get_full_content( 

473 self, relevant_items: List[Dict[str, Any]] 

474 ) -> List[Dict[str, Any]]: 

475 """ 

476 Get full content for the relevant search results. 

477 

478 Args: 

479 relevant_items: List of relevant preview dictionaries 

480 

481 Returns: 

482 List of result dictionaries with full content 

483 """ 

484 if not self.is_available: 

485 return relevant_items 

486 

487 if ( 487 ↛ 491line 487 didn't jump to line 491 because the condition on line 487 was never true

488 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

489 and search_config.SEARCH_SNIPPETS_ONLY 

490 ): 

491 logger.info("Snippet-only mode, skipping full content retrieval") 

492 return relevant_items 

493 

494 logger.info("Retrieving full webpage content") 

495 

496 try: 

497 results_with_content = self.full_search._get_full_content( 

498 relevant_items 

499 ) 

500 return results_with_content 

501 

502 except Exception: 

503 logger.exception("Error retrieving full content") 

504 return relevant_items 

505 

506 def invoke(self, query: str) -> List[Dict[str, Any]]: 

507 """Compatibility method for LangChain tools""" 

508 return self.run(query) 

509 

510 def results( 

511 self, query: str, max_results: Optional[int] = None 

512 ) -> List[Dict[str, Any]]: 

513 """ 

514 Get search results in a format compatible with other search engines. 

515 

516 Args: 

517 query: The search query 

518 max_results: Optional override for maximum results 

519 

520 Returns: 

521 List of search result dictionaries 

522 """ 

523 if not self.is_available: 

524 return [] 

525 

526 original_max_results = self.max_results 

527 

528 try: 

529 if max_results is not None: 

530 self.max_results = max_results 

531 

532 results = self._get_search_results(query) 

533 

534 formatted_results = [] 

535 for result in results: 

536 formatted_results.append( 

537 { 

538 "title": result.get("title", ""), 

539 "link": result.get("url", ""), 

540 "snippet": result.get("content", ""), 

541 } 

542 ) 

543 

544 return formatted_results 

545 

546 finally: 

547 self.max_results = original_max_results 

548 

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions
        """
        # Markdown text returned verbatim to the caller.
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

603 

604 def run( 

605 self, query: str, research_context: Dict[str, Any] | None = None 

606 ) -> List[Dict[str, Any]]: 

607 """ 

608 Override BaseSearchEngine run method to add SearXNG-specific error handling. 

609 """ 

610 if not self.is_available: 

611 logger.error( 

612 "SearXNG run method called but engine is not available (missing instance URL)" 

613 ) 

614 return [] 

615 

616 logger.info(f"SearXNG search engine running with query: '{query}'") 

617 logger.info(f"SearXNG instance URL: {self.instance_url}") 

618 

619 try: 

620 # Call the parent class's run method 

621 results = super().run(query, research_context=research_context) 

622 logger.info(f"SearXNG search completed with {len(results)} results") 

623 return results 

624 except Exception: 

625 logger.exception("Error in SearXNG run method") 

626 # Return empty results on error 

627 return []