Coverage for src/local_deep_research/web_search_engines/engines/search_engine_searxng.py: 86%

204 statements  


import enum
import time
from typing import Any, Dict, List, Optional

import requests
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ...security.safe_requests import safe_get
from ..search_engine_base import BaseSearchEngine
from .full_search import FullSearchResults


@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """
    Acceptable settings for safe search.
    """

    OFF = 0
    MODERATE = 1
    STRICT = 2


class SearXNGSearchEngine(BaseSearchEngine):
    """
    SearXNG search engine implementation that requires an instance URL provided via
    environment variable or configuration. Designed for ethical usage with proper
    rate limiting and a single-instance approach.
    """

    # Mark as a public search engine
    is_public = True
    # Mark as a generic search engine (general web search)
    is_generic = True

    def _is_valid_search_result(self, url: str) -> bool:
        """
        Check if a parsed result is a valid search result vs an error page.

        When SearXNG's backend engines fail or get rate-limited, it returns
        error/stats pages that shouldn't be treated as search results.

        Returns False for:
        - Relative URLs (don't start with http:// or https://, case-insensitive)
        - URLs pointing to the SearXNG instance itself (catches /stats, /preferences, etc.)
        """
        # Must have an absolute URL (case-insensitive scheme check)
        if not url or not url.lower().startswith(("http://", "https://")):
            return False

        # Reject URLs pointing back to the SearXNG instance itself.
        # This catches all internal pages like /stats?engine=, /preferences, /about.
        if url.startswith(self.instance_url):
            return False

        return True
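
    # Illustrative checks (assumed instance_url == "http://localhost:8080",
    # not taken from a real run):
    #   _is_valid_search_result("https://example.com/page")         -> True
    #   _is_valid_search_result("about:blank")                      -> False (no http/https scheme)
    #   _is_valid_search_result("http://localhost:8080/stats?e=x")  -> False (internal SearXNG page)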

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        **kwargs,
    ):
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results
            instance_url: URL of your SearXNG instance (preferably self-hosted)
            categories: List of SearXNG categories to search in (general, images, videos, news, etc.)
            engines: List of engines to use (google, bing, duckduckgo, etc.)
            language: Language code for search results
            safe_search: Safe search level, as a SafeSearchSetting name or value
                (0=off, 1=moderate, 2=strict)
            time_range: Time range for results (day, week, month, year)
            delay_between_requests: Seconds to wait between requests
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            include_full_content: Whether to include full webpage content in results
        """

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            **kwargs,  # Pass through all other kwargs, including search_snippets_only
        )

        # Validate and normalize the instance URL
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # Make sure the instance is accessible.
            # allow_private_ips=True since SearXNG is typically self-hosted on a local network.
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException as e:
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}: {e!s}"
            )

        # Debug logging for key init parameters
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        self.categories = categories or ["general"]
        self.engines = engines
        self.language = language
        try:
            # Handle both string names and integer values
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search.",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
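        # Illustrative parsing outcomes for the block above (assumed inputs):
        #   safe_search="STRICT"  -> SafeSearchSetting.STRICT   (name lookup)
        #   safe_search="1" or 1  -> SafeSearchSetting.MODERATE (value lookup)
        #   safe_search="bogus"   -> SafeSearchSetting.OFF      (fallback on KeyError)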

        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        self.include_full_content = include_full_content

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

        self.full_search = FullSearchResults(
            llm=llm,
            web_search=self,
            language=language,
            max_results=max_results,
            region="wt-wt",
            time="y",
            safesearch=self.safe_search.value,
        )

        self.last_request_time = 0

    def _respect_rate_limit(self):
        """Apply self-imposed rate limiting between requests."""
        current_time = time.time()
        time_since_last_request = current_time - self.last_request_time

        if time_since_last_request < self.delay_between_requests:
            wait_time = self.delay_between_requests - time_since_last_request
            logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
            time.sleep(wait_time)

        self.last_request_time = time.time()
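
    # Worked example (illustrative numbers): with delay_between_requests=2.0
    # and the last request made 0.5 s ago, time_since_last_request == 0.5, so
    # the call sleeps for wait_time = 2.0 - 0.5 = 1.5 seconds before updating
    # last_request_time.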

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Get search results from SearXNG with ethical rate limiting.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            self._respect_rate_limit()

            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range
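            # Illustrative resulting request with the class defaults (assumed
            # values, not a logged request): a GET to
            #   http://localhost:8080/search?q=<query>&categories=general
            #       &language=en&format=html&pageno=1&safesearch=0&count=15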

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code == 200:
                try:
                    from bs4 import BeautifulSoup

                    soup = BeautifulSoup(response.text, "html.parser")
                    results = []

                    result_elements = soup.select(".result-item")

                    if not result_elements:
                        result_elements = soup.select(".result")

                    if not result_elements:
                        result_elements = soup.select("article")

                    if not result_elements:
                        logger.debug(
                            f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                        )
                        result_elements = soup.select('div[id^="result"]')

                    logger.info(
                        f"Found {len(result_elements)} search result elements"
                    )

                    for idx, result_element in enumerate(result_elements):
                        if idx >= self.max_results:
                            break

                        title_element = (
                            result_element.select_one(".result-title")
                            or result_element.select_one(".title")
                            or result_element.select_one("h3")
                            or result_element.select_one("a[href]")
                        )

                        url_element = (
                            result_element.select_one(".result-url")
                            or result_element.select_one(".url")
                            or result_element.select_one("a[href]")
                        )

                        content_element = (
                            result_element.select_one(".result-content")
                            or result_element.select_one(".content")
                            or result_element.select_one(".snippet")
                            or result_element.select_one("p")
                        )

                        title = (
                            title_element.get_text(strip=True)
                            if title_element
                            else ""
                        )

                        url = ""
                        if url_element and url_element.has_attr("href"):
                            url = url_element["href"]
                        elif url_element:
                            url = url_element.get_text(strip=True)

                        content = (
                            content_element.get_text(strip=True)
                            if content_element
                            else ""
                        )

                        if (
                            not url
                            and title_element
                            and title_element.has_attr("href")
                        ):
                            url = title_element["href"]

                        logger.debug(
                            f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                        )

                        # Add to results only if it's a valid search result
                        # (not an error page or internal SearXNG page)
                        if self._is_valid_search_result(url):
                            results.append(
                                {
                                    "title": title,
                                    "url": url,
                                    "content": content,
                                    "engine": "searxng",
                                    "category": "general",
                                }
                            )
                        else:
                            # Check whether this is a backend engine failure
                            if url and "/stats?engine=" in url:
                                try:
                                    engine_name = url.split("/stats?engine=")[1].split("&")[0]
                                    logger.warning(
                                        f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                    )
                                except (IndexError, AttributeError):
                                    pass  # Couldn't parse engine name
                            logger.debug(
                                f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                            )

                    if results:
                        logger.info(
                            f"SearXNG returned {len(results)} valid results from HTML parsing"
                        )
                    else:
                        logger.warning(
                            f"SearXNG returned no valid results for query: {query}. "
                            "This may indicate SearXNG backend engine issues or rate limiting."
                        )
                    return results

                except ImportError:
                    logger.exception(
                        "BeautifulSoup not available for HTML parsing"
                    )
                    return []
                except Exception:
                    logger.exception("Error parsing HTML results")
                    return []
            else:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for SearXNG search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        if not self.is_available:
            logger.warning(
                "SearXNG engine is disabled (no instance URL provided)"
            )
            return []

        logger.info(f"Getting SearXNG previews for query: {query}")

        results = self._get_search_results(query)

        if not results:
            logger.warning(f"No SearXNG results found for query: {query}")
            return []

        previews = []
        for i, result in enumerate(results):
            title = result.get("title", "")
            url = result.get("url", "")
            content = result.get("content", "")

            preview = {
                "id": url or f"searxng-result-{i}",
                "title": title,
                "link": url,
                "snippet": content,
                "engine": result.get("engine", ""),
                "category": result.get("category", ""),
            }

            previews.append(preview)

        return previews
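
    # Example preview dict produced above (values illustrative, not from a
    # real query):
    #   {"id": "https://example.com/", "title": "Example Domain",
    #    "link": "https://example.com/", "snippet": "Example snippet text",
    #    "engine": "searxng", "category": "general"}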

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant search results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        if not self.is_available:
            return relevant_items

        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Retrieving full webpage content")

        try:
            results_with_content = self.full_search._get_full_content(
                relevant_items
            )
            return results_with_content

        except Exception:
            logger.exception("Error retrieving full content")
            return relevant_items

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools."""
        return self.run(query)

    def results(
        self, query: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get search results in a format compatible with other search engines.

        Args:
            query: The search query
            max_results: Optional override for maximum results

        Returns:
            List of search result dictionaries
        """
        if not self.is_available:
            return []

        original_max_results = self.max_results

        try:
            if max_results is not None:
                self.max_results = max_results

            results = self._get_search_results(query)

            formatted_results = []
            for result in results:
                formatted_results.append(
                    {
                        "title": result.get("title", ""),
                        "link": result.get("url", ""),
                        "snippet": result.get("content", ""),
                    }
                )

            return formatted_results

        finally:
            self.max_results = original_max_results

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions
        """
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Override BaseSearchEngine run method to add SearXNG-specific error handling.
        """
        if not self.is_available:
            logger.error(
                "SearXNG run method called but engine is not available (missing instance URL)"
            )
            return []

        logger.info(f"SearXNG search engine running with query: '{query}'")
        logger.info(f"SearXNG instance URL: {self.instance_url}")

        try:
            # Call the parent class's run method
            results = super().run(query, research_context=research_context)
            logger.info(f"SearXNG search completed with {len(results)} results")
            return results
        except Exception:
            logger.exception("Error in SearXNG run method")
            # Return empty results on error
            return []
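

# Minimal usage sketch (illustrative only; assumes a reachable SearXNG
# instance at http://localhost:8080 and that BaseSearchEngine accepts these
# arguments as wired above):
#
#     engine = SearXNGSearchEngine(
#         max_results=10,
#         instance_url="http://localhost:8080",
#         safe_search="MODERATE",
#         delay_between_requests=1.0,
#     )
#     hits = engine.results("open source metasearch")
#     for hit in hits:
#         print(hit["title"], hit["link"])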