Coverage for src / local_deep_research / web_search_engines / engines / search_engine_searxng.py: 95%

220 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1import enum 

2import json 

3import time 

4from typing import Any, Dict, List, Optional 

5 

6import requests 

7from langchain_core.language_models import BaseLLM 

8from loguru import logger 

9 

10from ...config import search_config 

11from ...security.safe_requests import safe_get 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """
    Acceptable settings for safe search.

    The integer values map directly onto SearXNG's ``safesearch``
    query parameter (see the ``params`` dict in ``_get_search_results``).
    """

    OFF = 0  # No safe-search filtering.
    MODERATE = 1  # Intermediate filtering level.
    STRICT = 2  # Strictest filtering level.

24 

25 

class SearXNGSearchEngine(BaseSearchEngine):
    """
    SearXNG search engine implementation that requires an instance URL provided via
    environment variable or configuration. Designed for ethical usage with proper
    rate limiting and single-instance approach.
    """

    # Mark as public search engine
    # (class-level flag, presumably read by the engine registry — confirm in BaseSearchEngine)
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

37 

38 @staticmethod 

39 def _normalize_list(value): 

40 """Ensure *value* is a ``list[str]`` or ``None``. 

41 

42 Settings saved via the web UI may arrive as raw JSON strings 

43 (e.g. ``'[\\r\\n "general"\\r\\n]'``) instead of parsed lists. 

44 This helper decodes such strings so that ``",".join()`` later 

45 works on list items rather than individual characters (issue #1030). 

46 """ 

47 if value is None: 

48 return None 

49 if isinstance(value, list): 

50 return value 

51 if isinstance(value, str): 

52 stripped = value.strip() 

53 if stripped: 

54 try: 

55 parsed = json.loads(stripped) 

56 if isinstance(parsed, list): 

57 return [str(item) for item in parsed] 

58 except (json.JSONDecodeError, ValueError, RecursionError): 

59 pass 

60 # Comma-separated fallback 

61 return [ 

62 item.strip() for item in stripped.split(",") if item.strip() 

63 ] 

64 return None 

65 

66 def _is_valid_search_result(self, url: str) -> bool: 

67 """ 

68 Check if a parsed result is a valid search result vs an error page. 

69 

70 When SearXNG's backend engines fail or get rate-limited, it returns 

71 error/stats pages that shouldn't be treated as search results. 

72 

73 Returns False for: 

74 - Relative URLs (don't start with http:// or https://, case-insensitive) 

75 - URLs pointing to the SearXNG instance itself (catches /stats, /preferences, etc.) 

76 """ 

77 # Must have an absolute URL (case-insensitive scheme check) 

78 if not url or not url.lower().startswith(("http://", "https://")): 

79 return False 

80 

81 # Reject URLs pointing back to the SearXNG instance itself 

82 # This catches all internal pages like /stats?engine=, /preferences, /about 

83 if url.startswith(self.instance_url): 

84 return False 

85 

86 return True 

87 

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):  # API key is actually the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Performs a synchronous reachability probe of ``instance_url`` and sets
        ``self.is_available`` accordingly; a failed probe disables the engine
        rather than raising.

        Args:
            max_results: Maximum number of search results
            instance_url: URL of your SearXNG instance (preferably self-hosted)
            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
            engines: List of engines to use (google, bing, duckduckgo, etc.)
            language: Language code for search results
            safe_search: Safe search level (0=off, 1=moderate, 2=strict);
                accepts a ``SafeSearchSetting`` name, an int, or a digit string
            time_range: Time range for results (day, week, month, year)
            delay_between_requests: Seconds to wait between requests
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            include_full_content: Whether to include full webpage content in results
            settings_snapshot: Optional settings snapshot forwarded to the base engine
        """

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
            **kwargs,  # Pass through all other kwargs including search_snippets_only
        )

        # Validate and normalize the instance URL if provided
        # (trailing slash stripped so path joins below don't double up).
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # Make sure it's accessible.
            # allow_private_ips=True since SearXNG is typically self-hosted on local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException:
            # Network failure disables the engine instead of propagating.
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}"
            )

        # Add debug logging for all parameters
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        # _normalize_list decodes JSON/comma-separated strings saved by the
        # web UI (issue #1030); fall back to the "general" category.
        self.categories = self._normalize_list(categories) or ["general"]
        self.engines = self._normalize_list(engines)
        self.language = language
        try:
            # Handle both string names and integer values
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            # Unknown setting: log and fail open to OFF rather than raising.
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

        # Base-class helper; presumably sets up self.full_search used by
        # _get_full_content — confirm in BaseSearchEngine.
        self._init_full_search(
            web_search=self,
            language=language,
            max_results=max_results,
            region="wt-wt",
            time_period="y",
            safe_search=self.safe_search.value,
        )

        # 0.0 means "no request made yet" for _respect_rate_limit.
        self.last_request_time: float = 0.0

203 

204 def _respect_rate_limit(self): 

205 """Apply self-imposed rate limiting between requests""" 

206 current_time = time.time() 

207 time_since_last_request = current_time - self.last_request_time 

208 

209 if time_since_last_request < self.delay_between_requests: 

210 wait_time = self.delay_between_requests - time_since_last_request 

211 logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds") 

212 time.sleep(wait_time) 

213 

214 self.last_request_time = time.time() 

215 

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from SearXNG with ethical rate limiting.

        Fetches the instance root first (to collect session cookies), then
        requests the HTML results page and scrapes it with BeautifulSoup,
        filtering out SearXNG-internal/error pages. All failures are logged
        and degrade to an empty list rather than raising.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            # Honor the self-imposed inter-request delay before any network I/O.
            self._respect_rate_limit()

            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            # Hit the instance root first so any session cookies can be
            # replayed on the /search request; cookie failure is non-fatal.
            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code == 200:
                try:
                    # Imported lazily so the engine still loads when bs4 is absent.
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")
                    results = []

                    # Try progressively more generic selectors: SearXNG themes
                    # differ in how they mark up individual results.
                    result_elements = soup.select(".result-item")

                    if not result_elements:
                        result_elements = soup.select(".result")

                    if not result_elements:
                        result_elements = soup.select("article")

                    if not result_elements:
                        # Last resort: dump class names seen, then match by id prefix.
                        logger.debug(
                            f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                        )
                        result_elements = soup.select('div[id^="result"]')

                    logger.info(
                        f"Found {len(result_elements)} search result elements"
                    )

                    for idx, result_element in enumerate(result_elements):
                        if idx >= self.max_results:
                            break

                        # Each field falls back through theme-specific selectors.
                        title_element = (
                            result_element.select_one(".result-title")
                            or result_element.select_one(".title")
                            or result_element.select_one("h3")
                            or result_element.select_one("a[href]")
                        )

                        url_element = (
                            result_element.select_one(".result-url")
                            or result_element.select_one(".url")
                            or result_element.select_one("a[href]")
                        )

                        content_element = (
                            result_element.select_one(".result-content")
                            or result_element.select_one(".content")
                            or result_element.select_one(".snippet")
                            or result_element.select_one("p")
                        )

                        title = (
                            title_element.get_text(strip=True)
                            if title_element
                            else ""
                        )

                        # Prefer the href attribute; fall back to visible text.
                        url = ""
                        if url_element and url_element.has_attr("href"):
                            url = str(url_element["href"])
                        elif url_element:
                            url = url_element.get_text(strip=True)

                        content = (
                            content_element.get_text(strip=True)
                            if content_element
                            else ""
                        )

                        # Last fallback: the title anchor may carry the link.
                        if (
                            not url
                            and title_element
                            and title_element.has_attr("href")
                        ):
                            url = str(title_element["href"])

                        logger.debug(
                            f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                        )

                        # Add to results only if it's a valid search result
                        # (not an error page or internal SearXNG page)
                        if self._is_valid_search_result(url):
                            results.append(
                                {
                                    "title": title,
                                    "url": url,
                                    "content": content,
                                    "engine": "searxng",
                                    "category": "general",
                                }
                            )
                        else:
                            # Check if this is a backend engine failure
                            if url and "/stats?engine=" in url:
                                try:
                                    engine_name = url.split("/stats?engine=")[
                                        1
                                    ].split("&")[0]
                                    logger.warning(
                                        f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                    )
                                except (IndexError, AttributeError):
                                    pass  # Couldn't parse engine name
                            logger.debug(
                                f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                            )

                    if results:
                        logger.info(
                            f"SearXNG returned {len(results)} valid results from HTML parsing"
                        )
                    else:
                        logger.warning(
                            f"SearXNG returned no valid results for query: {query}. "
                            "This may indicate SearXNG backend engine issues or rate limiting."
                        )
                    return results

                except ImportError:
                    logger.exception(
                        "BeautifulSoup not available for HTML parsing"
                    )
                    return []
                except Exception:
                    logger.exception("Error parsing HTML results")
                    return []
            else:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

        except Exception:
            # Catch-all boundary: any unexpected failure yields no results.
            logger.exception("Error getting SearXNG results")
            return []

427 

428 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

429 """ 

430 Get preview information for SearXNG search results. 

431 

432 Args: 

433 query: The search query 

434 

435 Returns: 

436 List of preview dictionaries 

437 """ 

438 if not self.is_available: 

439 logger.warning( 

440 "SearXNG engine is disabled (no instance URL provided)" 

441 ) 

442 return [] 

443 

444 logger.info(f"Getting SearXNG previews for query: {query}") 

445 

446 results = self._get_search_results(query) 

447 

448 if not results: 

449 logger.warning(f"No SearXNG results found for query: {query}") 

450 return [] 

451 

452 previews = [] 

453 for i, result in enumerate(results): 

454 title = result.get("title", "") 

455 url = result.get("url", "") 

456 content = result.get("content", "") 

457 

458 preview = { 

459 "id": url or f"searxng-result-{i}", 

460 "title": title, 

461 "link": url, 

462 "snippet": content, 

463 "engine": result.get("engine", ""), 

464 "category": result.get("category", ""), 

465 } 

466 

467 previews.append(preview) 

468 

469 return previews 

470 

471 def _get_full_content( 

472 self, relevant_items: List[Dict[str, Any]] 

473 ) -> List[Dict[str, Any]]: 

474 """ 

475 Get full content for the relevant search results. 

476 

477 Args: 

478 relevant_items: List of relevant preview dictionaries 

479 

480 Returns: 

481 List of result dictionaries with full content 

482 """ 

483 if not self.is_available: 

484 return relevant_items 

485 

486 if ( 

487 hasattr(search_config, "SEARCH_SNIPPETS_ONLY") 

488 and search_config.SEARCH_SNIPPETS_ONLY 

489 ): 

490 logger.info("Snippet-only mode, skipping full content retrieval") 

491 return relevant_items 

492 

493 if not hasattr(self, "full_search"): 493 ↛ 494line 493 didn't jump to line 494 because the condition on line 493 was never true

494 return relevant_items 

495 

496 logger.info("Retrieving full webpage content") 

497 

498 try: 

499 return self.full_search._get_full_content(relevant_items) 

500 

501 except Exception: 

502 logger.exception("Error retrieving full content") 

503 return relevant_items 

504 

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        # Simple delegate so the engine satisfies the tool-style invoke() API.
        return self.run(query)

508 

509 def results( 

510 self, query: str, max_results: Optional[int] = None 

511 ) -> List[Dict[str, Any]]: 

512 """ 

513 Get search results in a format compatible with other search engines. 

514 

515 Args: 

516 query: The search query 

517 max_results: Optional override for maximum results 

518 

519 Returns: 

520 List of search result dictionaries 

521 """ 

522 if not self.is_available: 

523 return [] 

524 

525 original_max_results = self.max_results 

526 

527 try: 

528 if max_results is not None: 

529 self.max_results = max_results 

530 

531 results = self._get_search_results(query) 

532 

533 formatted_results = [] 

534 for result in results: 

535 formatted_results.append( 

536 { 

537 "title": result.get("title", ""), 

538 "link": result.get("url", ""), 

539 "snippet": result.get("content", ""), 

540 } 

541 ) 

542 

543 return formatted_results 

544 

545 finally: 

546 self.max_results = original_max_results 

547 

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions (Markdown-formatted)
        """
        # NOTE: the returned text is user-facing Markdown; keep it verbatim.
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

602 

603 def run( 

604 self, query: str, research_context: Dict[str, Any] | None = None 

605 ) -> List[Dict[str, Any]]: 

606 """ 

607 Override BaseSearchEngine run method to add SearXNG-specific error handling. 

608 """ 

609 if not self.is_available: 

610 logger.error( 

611 "SearXNG run method called but engine is not available (missing instance URL)" 

612 ) 

613 return [] 

614 

615 logger.info(f"SearXNG instance URL: {self.instance_url}") 

616 

617 try: 

618 # Call the parent class's run method 

619 results = super().run(query, research_context=research_context) 

620 logger.info(f"SearXNG search completed with {len(results)} results") 

621 return results 

622 except Exception: 

623 logger.exception("Error in SearXNG run method") 

624 # Return empty results on error 

625 return []