Coverage for src / local_deep_research / web_search_engines / engines / search_engine_searxng.py: 95%
220 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1import enum
2import json
3import time
4from typing import Any, Dict, List, Optional
6import requests
7from langchain_core.language_models import BaseLLM
8from loguru import logger
10from ...config import search_config
11from ...security.safe_requests import safe_get
12from ..search_engine_base import BaseSearchEngine
@enum.unique
class SafeSearchSetting(enum.IntEnum):
    """Safe-search levels accepted by SearXNG's ``safesearch`` parameter."""

    # Values mirror SearXNG's own safesearch codes.
    OFF = 0  # no filtering
    MODERATE = 1  # filter explicit results where engines support it
    STRICT = 2  # strongest filtering
class SearXNGSearchEngine(BaseSearchEngine):
    """
    Search engine backed by a SearXNG instance.

    The instance URL must be supplied via environment variable or
    configuration.  The engine targets a single (preferably self-hosted)
    instance and applies a self-imposed delay between requests for
    ethical usage.
    """

    # Mark as public search engine
    is_public = True
    # Mark as generic search engine (general web search)
    is_generic = True

    @staticmethod
    def _normalize_list(value):
        """Coerce *value* into a ``list[str]`` (or ``None``).

        Settings persisted through the web UI can come back as raw JSON
        text (e.g. ``'[\\r\\n "general"\\r\\n]'``) rather than a parsed
        list.  Decoding here keeps the later ``",".join()`` calls working
        on list items instead of individual characters (issue #1030).
        """
        if value is None:
            return None
        if isinstance(value, list):
            return value
        if not isinstance(value, str):
            # Any other type is treated the same as "not configured".
            return None
        text = value.strip()
        if text:
            try:
                decoded = json.loads(text)
            except (json.JSONDecodeError, ValueError, RecursionError):
                pass
            else:
                if isinstance(decoded, list):
                    return [str(entry) for entry in decoded]
        # Fall back to interpreting the string as comma-separated values.
        return [part.strip() for part in text.split(",") if part.strip()]

    def _is_valid_search_result(self, url: str) -> bool:
        """Return ``True`` if *url* looks like a genuine external result.

        When SearXNG's backend engines fail or get rate-limited, the
        instance can render error/stats pages that must not be treated
        as search results.  Rejected:

        - relative URLs (scheme checked case-insensitively);
        - URLs pointing back at the SearXNG instance itself (covers
          internal pages such as ``/stats`` and ``/preferences``).
        """
        if not url:
            return False
        if not url.lower().startswith(("http://", "https://")):
            return False
        # Anything under the instance URL is an internal SearXNG page.
        return not url.startswith(self.instance_url)

    def __init__(
        self,
        max_results: int = 15,
        instance_url: str = "http://localhost:8080",
        categories: Optional[List[str]] = None,
        engines: Optional[List[str]] = None,
        language: str = "en",
        safe_search: str = SafeSearchSetting.OFF.name,
        time_range: Optional[str] = None,
        delay_between_requests: float = 0.0,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        include_full_content: bool = True,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):  # API key is actually the instance URL
        """
        Initialize the SearXNG search engine with ethical usage patterns.

        Args:
            max_results: Maximum number of search results.
            instance_url: URL of your SearXNG instance (preferably self-hosted).
            categories: SearXNG categories to search in (general, images, videos, news, etc.).
            engines: Engines to use (google, bing, duckduckgo, etc.).
            language: Language code for search results.
            safe_search: Safe search level (0=off, 1=moderate, 2=strict).
            time_range: Time range for results (day, week, month, year).
            delay_between_requests: Seconds to wait between requests.
            llm: Language model for relevance filtering.
            max_filtered_results: Maximum number of results to keep after filtering.
            include_full_content: Whether to include full webpage content in results.
        """
        # Let the base class wire up the LLM, filtering and snapshot state.
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            include_full_content=include_full_content,
            settings_snapshot=settings_snapshot,
            **kwargs,  # Pass through all other kwargs including search_snippets_only
        )

        # Normalise the instance URL (no trailing slash) and probe it.
        self.instance_url = instance_url.rstrip("/")
        logger.info(
            f"SearXNG initialized with instance URL: {self.instance_url}"
        )
        try:
            # allow_private_ips=True since SearXNG is typically self-hosted on local network
            response = safe_get(
                self.instance_url, timeout=5, allow_private_ips=True
            )
            if response.status_code == 200:
                logger.info("SearXNG instance is accessible.")
                self.is_available = True
            else:
                self.is_available = False
                logger.error(
                    f"Failed to access SearXNG instance at {self.instance_url}. Status code: {response.status_code}"
                )
        except requests.RequestException:
            self.is_available = False
            logger.exception(
                f"Error while trying to access SearXNG instance at {self.instance_url}"
            )

        # Debug logging of the effective initialisation parameters.
        logger.info(
            f"SearXNG init params: max_results={max_results}, language={language}, "
            f"max_filtered_results={max_filtered_results}, is_available={self.is_available}"
        )

        self.max_results = max_results
        self.categories = self._normalize_list(categories) or ["general"]
        self.engines = self._normalize_list(engines)
        self.language = language
        try:
            # Accept both numeric levels (0 / "0") and member names ("OFF").
            if isinstance(safe_search, int) or (
                isinstance(safe_search, str) and str(safe_search).isdigit()
            ):
                self.safe_search = SafeSearchSetting(int(safe_search))
            else:
                self.safe_search = SafeSearchSetting[safe_search]
        except (ValueError, KeyError):
            logger.exception(
                "'{}' is not a valid safe search setting. Disabling safe search",
                safe_search,
            )
            self.safe_search = SafeSearchSetting.OFF
        self.time_range = time_range

        self.delay_between_requests = float(delay_between_requests)

        if self.is_available:
            self.search_url = f"{self.instance_url}/search"
            logger.info(
                f"SearXNG engine initialized with instance: {self.instance_url}"
            )
            logger.info(
                f"Rate limiting set to {self.delay_between_requests} seconds between requests"
            )

        self._init_full_search(
            web_search=self,
            language=language,
            max_results=max_results,
            region="wt-wt",
            time_period="y",
            safe_search=self.safe_search.value,
        )

        # Timestamp of the most recent request; drives _respect_rate_limit().
        self.last_request_time: float = 0.0

    def _respect_rate_limit(self):
        """Sleep as needed to honour the self-imposed delay between requests."""
        now = time.time()
        elapsed = now - self.last_request_time

        if elapsed < self.delay_between_requests:
            wait_time = self.delay_between_requests - elapsed
            logger.info(f"Rate limiting: waiting {wait_time:.2f} seconds")
            time.sleep(wait_time)

        self.last_request_time = time.time()

    def _get_search_results(self, query: str) -> List[Dict[str, Any]]:
        """
        Query the SearXNG instance (with rate limiting) and parse its HTML
        results page.

        Args:
            query: The search query

        Returns:
            List of search results from SearXNG
        """
        if not self.is_available:
            logger.error(
                "SearXNG engine is disabled (no instance URL provided) - cannot run search"
            )
            return []

        logger.info(f"SearXNG running search for query: {query}")

        try:
            self._respect_rate_limit()

            # Fetch the landing page first so the search request carries
            # session cookies like a regular browser would.
            initial_headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
            }

            try:
                initial_response = safe_get(
                    self.instance_url,
                    headers=initial_headers,
                    timeout=10,
                    allow_private_ips=True,
                )
                cookies = initial_response.cookies
            except Exception:
                logger.exception("Failed to get initial cookies")
                cookies = None

            params = {
                "q": query,
                "categories": ",".join(self.categories),
                "language": self.language,
                "format": "html",  # Use HTML format instead of JSON
                "pageno": 1,
                "safesearch": self.safe_search.value,
                "count": self.max_results,
            }

            if self.engines:
                params["engines"] = ",".join(self.engines)

            if self.time_range:
                params["time_range"] = self.time_range

            # Browser-like headers
            headers = {
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.9",
                "Referer": self.instance_url + "/",
                "Connection": "keep-alive",
                "Upgrade-Insecure-Requests": "1",
            }

            logger.info(
                f"Sending request to SearXNG instance at {self.instance_url}"
            )
            response = safe_get(
                self.search_url,
                params=params,
                headers=headers,
                cookies=cookies,
                timeout=15,
                allow_private_ips=True,
            )

            if response.status_code != 200:
                logger.error(
                    f"SearXNG returned status code {response.status_code}"
                )
                return []

            try:
                from bs4 import BeautifulSoup

                soup = BeautifulSoup(response.text, "html.parser")
                results = []

                # SearXNG themes differ; try selectors from most to least
                # specific until one matches.
                result_elements = soup.select(".result-item")

                if not result_elements:
                    result_elements = soup.select(".result")

                if not result_elements:
                    result_elements = soup.select("article")

                if not result_elements:
                    logger.debug(
                        f"Classes found in HTML: {[c['class'] for c in soup.select('[class]') if 'class' in c.attrs][:10]}"
                    )
                    result_elements = soup.select('div[id^="result"]')

                logger.info(
                    f"Found {len(result_elements)} search result elements"
                )

                for idx, result_element in enumerate(result_elements):
                    if idx >= self.max_results:
                        break

                    title_element = (
                        result_element.select_one(".result-title")
                        or result_element.select_one(".title")
                        or result_element.select_one("h3")
                        or result_element.select_one("a[href]")
                    )

                    url_element = (
                        result_element.select_one(".result-url")
                        or result_element.select_one(".url")
                        or result_element.select_one("a[href]")
                    )

                    content_element = (
                        result_element.select_one(".result-content")
                        or result_element.select_one(".content")
                        or result_element.select_one(".snippet")
                        or result_element.select_one("p")
                    )

                    title = (
                        title_element.get_text(strip=True)
                        if title_element
                        else ""
                    )

                    url = ""
                    if url_element and url_element.has_attr("href"):
                        url = str(url_element["href"])
                    elif url_element:
                        url = url_element.get_text(strip=True)

                    content = (
                        content_element.get_text(strip=True)
                        if content_element
                        else ""
                    )

                    # Some themes only put the href on the title link.
                    if (
                        not url
                        and title_element
                        and title_element.has_attr("href")
                    ):
                        url = str(title_element["href"])

                    logger.debug(
                        f"Extracted result {idx}: title={title[:30]}..., url={url[:30]}..., content={content[:30]}..."
                    )

                    # Keep only genuine external results; drop error pages
                    # and internal SearXNG pages.
                    if self._is_valid_search_result(url):
                        results.append(
                            {
                                "title": title,
                                "url": url,
                                "content": content,
                                "engine": "searxng",
                                "category": "general",
                            }
                        )
                    else:
                        # A /stats?engine= link signals that a backend
                        # engine failed or was rate-limited.
                        if url and "/stats?engine=" in url:
                            try:
                                engine_name = url.split("/stats?engine=")[
                                    1
                                ].split("&")[0]
                                logger.warning(
                                    f"SearXNG backend engine failed or rate-limited: {engine_name}"
                                )
                            except (IndexError, AttributeError):
                                pass  # Couldn't parse engine name
                        logger.debug(
                            f"Filtered invalid SearXNG result: title={title!r}, url={url!r}"
                        )

                if results:
                    logger.info(
                        f"SearXNG returned {len(results)} valid results from HTML parsing"
                    )
                else:
                    logger.warning(
                        f"SearXNG returned no valid results for query: {query}. "
                        "This may indicate SearXNG backend engine issues or rate limiting."
                    )
                return results

            except ImportError:
                logger.exception(
                    "BeautifulSoup not available for HTML parsing"
                )
                return []
            except Exception:
                logger.exception("Error parsing HTML results")
                return []

        except Exception:
            logger.exception("Error getting SearXNG results")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Build preview dictionaries for SearXNG search results.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        if not self.is_available:
            logger.warning(
                "SearXNG engine is disabled (no instance URL provided)"
            )
            return []

        logger.info(f"Getting SearXNG previews for query: {query}")

        results = self._get_search_results(query)

        if not results:
            logger.warning(f"No SearXNG results found for query: {query}")
            return []

        previews = []
        for i, result in enumerate(results):
            url = result.get("url", "")
            previews.append(
                {
                    # Fall back to a positional id when the result had no URL.
                    "id": url or f"searxng-result-{i}",
                    "title": result.get("title", ""),
                    "link": url,
                    "snippet": result.get("content", ""),
                    "engine": result.get("engine", ""),
                    "category": result.get("category", ""),
                }
            )

        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Attach full webpage content to the relevant search results.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        if not self.is_available:
            return relevant_items

        # Honour the global snippet-only switch.
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        # full_search is created by _init_full_search(); bail out if absent.
        if not hasattr(self, "full_search"):
            return relevant_items

        logger.info("Retrieving full webpage content")

        try:
            return self.full_search._get_full_content(relevant_items)
        except Exception:
            logger.exception("Error retrieving full content")
            return relevant_items

    def invoke(self, query: str) -> List[Dict[str, Any]]:
        """Compatibility method for LangChain tools"""
        return self.run(query)

    def results(
        self, query: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Get search results in a format compatible with other search engines.

        Args:
            query: The search query
            max_results: Optional override for maximum results

        Returns:
            List of search result dictionaries
        """
        if not self.is_available:
            return []

        # Temporarily override max_results; always restore it afterwards.
        original_max_results = self.max_results

        try:
            if max_results is not None:
                self.max_results = max_results

            return [
                {
                    "title": result.get("title", ""),
                    "link": result.get("url", ""),
                    "snippet": result.get("content", ""),
                }
                for result in self._get_search_results(query)
            ]

        finally:
            self.max_results = original_max_results

    @staticmethod
    def get_self_hosting_instructions() -> str:
        """
        Get instructions for self-hosting a SearXNG instance.

        Returns:
            String with installation instructions
        """
        return """
# SearXNG Self-Hosting Instructions

The most ethical way to use SearXNG is to host your own instance. Here's how:

## Using Docker (easiest method)

1. Install Docker if you don't have it already
2. Run these commands:

```bash
# Pull the SearXNG Docker image
docker pull searxng/searxng

# Run SearXNG (will be available at http://localhost:8080)
docker run -d -p 8080:8080 --name searxng searxng/searxng
```

## Using Docker Compose (recommended for production)

1. Create a file named `docker-compose.yml` with the following content:

```yaml
version: '3'
services:
  searxng:
    container_name: searxng
    image: searxng/searxng
    ports:
      - "8080:8080"
    volumes:
      - ./searxng:/etc/searxng
    environment:
      - SEARXNG_BASE_URL=http://localhost:8080/
    restart: unless-stopped
```

2. Run with Docker Compose:

```bash
docker-compose up -d
```

For more detailed instructions and configuration options, visit:
https://searxng.github.io/searxng/admin/installation.html
"""

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Override BaseSearchEngine run method to add SearXNG-specific error handling.
        """
        if not self.is_available:
            logger.error(
                "SearXNG run method called but engine is not available (missing instance URL)"
            )
            return []

        logger.info(f"SearXNG instance URL: {self.instance_url}")

        try:
            # Call the parent class's run method
            results = super().run(query, research_context=research_context)
            logger.info(f"SearXNG search completed with {len(results)} results")
            return results
        except Exception:
            logger.exception("Error in SearXNG run method")
            # Return empty results on error
            return []