Coverage for src/local_deep_research/web_search_engines/engines/search_engine

1"""Stack Exchange search engine for Q&A content."""

3import html

4import re

5import time

6from typing import Any, Dict, List, Optional

8import requests

9from langchain_core.language_models import BaseLLM

11from ...constants import USER_AGENT

12from ...security.safe_requests import safe_get

13from ...security.secure_logging import logger

14from ..rate_limiting import RateLimitError

15from ..search_engine_base import BaseSearchEngine, Exposure, Sensitivity

class StackExchangeSearchEngine(BaseSearchEngine):
    """
    Search engine backed by the Stack Exchange API (Q&A content).

    Covers Stack Overflow and the other Stack Exchange sites listed in
    ``SITES``. No authentication required (300 requests/day without key).
    """

    # Engine classification flags.
    # NOTE(review): presumably consumed by BaseSearchEngine / engine
    # selection code that is not visible here -- confirm before changing.
    is_public = True
    egress_sensitivity = Sensitivity.NON_SENSITIVE
    egress_exposure = Exposure.EXPOSING
    is_generic = False
    is_scientific = False
    is_code = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # API site key -> human-readable site name. Also the set of values
    # that ``__init__`` accepts for its ``site`` argument.
    SITES = {
        "stackoverflow": "Stack Overflow",
        "serverfault": "Server Fault",
        "superuser": "Super User",
        "askubuntu": "Ask Ubuntu",
        "unix": "Unix & Linux",
        "math": "Mathematics",
        "physics": "Physics",
        "stats": "Cross Validated",
        "security": "Information Security",
        "dba": "Database Administrators",
    }

    # Sites served from their own .com domain (not *.stackexchange.com);
    # used when a fallback question link has to be built.
    SITE_DOMAINS = {
        site_key: f"{site_key}.com"
        for site_key in (
            "stackoverflow",
            "serverfault",
            "superuser",
            "askubuntu",
        )
    }

def __init__(
    self,
    max_results: int = 10,
    site: str = "stackoverflow",
    sort: str = "relevance",
    accepted_only: bool = False,
    has_answers: bool = False,
    min_score: Optional[int] = None,
    tagged: Optional[str] = None,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    settings_snapshot: Optional[Dict[str, Any]] = None,
    **kwargs,
):
    """
    Set up a Stack Exchange search engine instance.

    Args:
        max_results: Upper bound on the number of search results
        site: Key of the Stack Exchange site to query; must be in ``SITES``
        sort: One of relevance, votes, creation, activity
        accepted_only: Restrict to questions that have an accepted answer
        has_answers: Restrict to questions that have at least one answer
        min_score: Value sent as the API ``min`` parameter
        tagged: Tag filter (semicolon separated)
        llm: Language model for relevance filtering
        max_filtered_results: Maximum results after filtering
        settings_snapshot: Settings snapshot for thread context
        **kwargs: Forwarded unchanged to the base class initializer

    Raises:
        ValueError: If ``site`` or ``sort`` is not a supported value, or
            if ``min_score`` is combined with ``sort="relevance"``.
    """
    super().__init__(
        llm=llm,
        max_filtered_results=max_filtered_results,
        max_results=max_results,
        settings_snapshot=settings_snapshot,
        **kwargs,
    )

    # Reject unknown sites up front so a typo fails here, not at request time.
    if site not in self.SITES:
        site_choices = ", ".join(self.SITES)
        raise ValueError(
            f"Invalid site: '{site}'. Must be one of: {site_choices}"
        )

    sort_choices = ("relevance", "votes", "creation", "activity")
    if sort not in sort_choices:
        sort_list = ", ".join(sort_choices)
        raise ValueError(
            f"Invalid sort: '{sort}'. Must be one of: {sort_list}"
        )

    # The API's "min" parameter is not accepted together with
    # sort="relevance".
    # NOTE(review): per the Stack Exchange docs "min" bounds whichever
    # field "sort" selects, so with creation/activity it would be compared
    # against dates rather than scores -- confirm that is intended.
    wants_min_score = min_score is not None
    if wants_min_score and sort == "relevance":
        raise ValueError(
            "min_score requires a numeric sort order (votes, creation, or activity). "
            "sort='relevance' does not support the 'min' parameter."
        )

    # Search options, stored verbatim for _build_query_params().
    self.site = site
    self.sort = sort
    self.accepted_only = accepted_only
    self.has_answers = has_answers
    self.min_score = min_score
    self.tagged = tagged

    # API endpoints.
    self.base_url = "https://api.stackexchange.com/2.3"
    self.search_url = self.base_url + "/search/advanced"

    # Headers sent with every API request.
    self.headers = {
        "User-Agent": USER_AGENT,
        "Accept-Encoding": "gzip, deflate",
    }

    # Epoch timestamp before which no request should be sent; 0 = no
    # backoff pending. Written by _handle_backoff(), consumed by
    # _apply_backoff().
    self._backoff_until: float = 0

134

135 def _apply_backoff(self) -> None:

136 """Apply backoff if required by previous API response."""

137 if self._backoff_until > 0:

138 wait_time = self._backoff_until - time.time()

139 if wait_time > 0: 139 ↛ 140line 139 didn't jump to line 140 because the condition on line 139 was never true

140 logger.info(

141 f"Stack Exchange backoff: waiting {wait_time:.1f} seconds"

142 )

143 time.sleep(wait_time)

144 self._backoff_until = 0

145

146 def _handle_backoff(self, data: Dict[str, Any]) -> None:

147 """Handle backoff field in API response."""

148 backoff = data.get("backoff")

149 if backoff:

150 self._backoff_until = time.time() + min(int(backoff), 300)

151 logger.warning(

152 f"Stack Exchange API requested backoff of {backoff} seconds"

153 )

154

155 def _build_query_params(self, query: str) -> Dict[str, Any]:

156 """Build query parameters for the API request."""

157 params = {

158 "q": query,

159 "site": self.site,

160 "order": "desc",

161 "sort": self.sort,

162 "pagesize": min(self.max_results, 100),

163 "filter": "withbody", # Include question body

164 }

165

166 if self.accepted_only:

167 params["accepted"] = "True"

168

169 if self.has_answers: 169 ↛ 170line 169 didn't jump to line 170 because the condition on line 169 was never true

170 params["answers"] = "1"

171

172 if self.min_score is not None:

173 params["min"] = self.min_score

174

175 if self.tagged:

176 params["tagged"] = self.tagged

177

178 return params

179

180 def _decode_html(self, text: str) -> str:

181 """Decode HTML entities in text."""

182 return html.unescape(text)

183

184 def _get_site_name(self) -> str:

185 """Get human-readable site name."""

186 return self.SITES.get(self.site, self.site.title())

187

def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Get preview information for Stack Exchange questions.

    Applies the rate limiter and any pending API backoff, runs the search
    request, and converts each returned question into a preview dict.
    Questions that fail to parse are logged and skipped.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries; empty when the request fails, the
        body is not a JSON object, or the API reports an error.

    Raises:
        Whatever ``_raise_if_rate_limit`` raises (presumably
        ``RateLimitError`` -- defined outside this block, confirm).
    """
    logger.info(
        f"Getting Stack Exchange previews for query: {query} on {self.site}"
    )

    # Apply rate limiting
    self._last_wait_time = self.rate_tracker.apply_rate_limit(
        self.engine_type
    )

    # Apply backoff if required by previous API response
    self._apply_backoff()

    try:
        params = self._build_query_params(query)

        response = safe_get(
            self.search_url,
            params=params,
            headers=self.headers,
            timeout=30,
        )

        self._raise_if_rate_limit(response.status_code)

        response.raise_for_status()
        data = response.json()

        # Everything below assumes a JSON object; bail out cleanly on
        # anything else instead of failing with AttributeError.
        if not isinstance(data, dict):
            logger.error(
                f"Unexpected Stack Exchange response type: {type(data).__name__}"
            )
            return []

        # Handle backoff if present in response
        self._handle_backoff(data)

        # Check for API errors
        if "error_id" in data:
            error_msg = data.get("error_message", "Unknown error")
            logger.error(f"Stack Exchange API error: {error_msg}")
            return []

        results = data.get("items") or []

        # Only a quota value the API actually reported may trigger the
        # low-quota warning; a missing field must not read as "0 left".
        quota_remaining = data.get("quota_remaining")
        quota_label = "unknown" if quota_remaining is None else quota_remaining
        logger.info(
            f"Found {len(results)} Stack Exchange results, quota remaining: {quota_label}"
        )
        if isinstance(quota_remaining, (int, float)) and quota_remaining < 10:
            logger.warning(f"Stack Exchange quota low: {quota_remaining}")

        previews = []
        for question in results[: self.max_results]:
            try:
                previews.append(self._question_to_preview(question))
            except Exception as e:
                # One malformed question must not discard the others.
                safe_msg = self._scrub_error(e)
                logger.exception(
                    f"Error parsing Stack Exchange question ({type(e).__name__}): {safe_msg}"
                )

        return previews

    except (requests.RequestException, ValueError) as e:
        safe_msg = self._scrub_error(e)
        logger.exception(
            f"Stack Exchange API request failed ({type(e).__name__}): {safe_msg}"
        )
        self._raise_if_rate_limit(e)
        return []

def _question_to_preview(self, question: Dict[str, Any]) -> Dict[str, Any]:
    """Convert one question object from the API into a preview dict.

    Fields that are absent or JSON ``null`` fall back to neutral defaults
    so that a sparse question is still returned.
    """
    question_id = question.get("question_id")
    title = self._decode_html(question.get("title") or "Untitled")

    # Owner info ("owner" may be absent or null)
    owner = question.get("owner") or {}
    author = self._decode_html(owner.get("display_name") or "Unknown")
    author_link = owner.get("link", "")
    author_reputation = owner.get("reputation", 0)

    # Question stats
    score = question.get("score", 0)
    view_count = question.get("view_count", 0)
    answer_count = question.get("answer_count", 0)
    is_answered = question.get("is_answered", False)
    accepted_answer_id = question.get("accepted_answer_id")

    tags = question.get("tags") or []

    # Status prefix placed in front of the body text in the snippet
    plural = "s" if answer_count != 1 else ""
    status_parts = []
    if is_answered:
        status = f"Answered ({answer_count} answer{plural}"
        if accepted_answer_id:
            status += ", accepted"
        status += ")"
        status_parts.append(status)
    elif answer_count > 0:
        status_parts.append(f"{answer_count} answer{plural}")
    if tags:
        status_parts.append(f"Tags: {', '.join(tags[:4])}")
    prefix = " | ".join(status_parts)

    # Body -> plain-text snippet: drop tags, decode entities, collapse
    # whitespace, keep the first 1000 characters.
    body = question.get("body") or ""
    body_text = html.unescape(re.sub(r"<[^>]+>", " ", body))
    body_text = " ".join(body_text.split())[:1000]
    snippet = f"{prefix} | {body_text}" if prefix else body_text

    # Prefer the link supplied by the API; otherwise build one from the
    # site's domain.
    fallback_domain = self.SITE_DOMAINS.get(
        self.site, f"{self.site}.stackexchange.com"
    )
    link = (
        question.get("link")
        or f"https://{fallback_domain}/questions/{question_id}"
    )

    return {
        "id": str(question_id),
        "title": title,
        "link": link,
        "snippet": snippet,
        "author": author,
        "author_link": author_link,
        "author_reputation": author_reputation,
        "score": score,
        "view_count": view_count,
        "answer_count": answer_count,
        "is_answered": is_answered,
        "has_accepted_answer": accepted_answer_id is not None,
        "tags": tags,
        "creation_date": question.get("creation_date", 0),
        "last_activity_date": question.get("last_activity_date", 0),
        "site": self.site,
        "source": self._get_site_name(),
        # Raw API object, kept for _get_full_content()
        "_raw": question,
    }

342

def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Expand previews into full results (question text plus top answers).

    For every preview that still carries its raw API object under
    ``_raw``, a ``content`` string is built from the title, tags, cleaned
    question body and up to three top-voted answers fetched from the API.
    The ``_raw`` entry is removed from every returned item.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content
    """
    logger.info(
        f"Getting full content for {len(relevant_items)} Stack Exchange questions"
    )

    tag_pattern = r"<[^>]+>"
    enriched: List[Dict[str, Any]] = []

    for preview in relevant_items:
        entry = preview.copy()
        raw_question = preview.get("_raw", {})

        if raw_question:
            # Question body as plain text: tags out, entities decoded,
            # whitespace collapsed.
            question_text = html.unescape(
                re.sub(tag_pattern, " ", raw_question.get("body", ""))
            )
            question_text = " ".join(question_text.split())

            sections = [f"Question: {entry.get('title', 'Untitled')}"]
            if entry.get("tags"):
                sections.append(f"Tags: {', '.join(entry['tags'])}")
            sections.append(f"\n{question_text}")

            # Answers are only fetched for a usable integer question id.
            qid = raw_question.get("question_id")
            if qid:
                try:
                    qid = int(qid)
                except (TypeError, ValueError):
                    qid = None

            top_answers = (
                self._fetch_top_answers(qid, max_answers=3) if qid else None
            )
            if top_answers:
                sections.append(
                    f"\n--- Top Answers ({len(top_answers)}) ---"
                )
                for answer in top_answers:
                    answer_text = html.unescape(
                        re.sub(tag_pattern, " ", answer.get("body", ""))
                    )
                    # Each answer is limited to 3000 characters.
                    answer_text = " ".join(answer_text.split())[:3000]
                    marker = (
                        ", Accepted"
                        if answer.get("is_accepted", False)
                        else ""
                    )
                    header = f"[Score: {answer.get('score', 0)}{marker}]"
                    sections.append(f"\n{header}\n{answer_text}")

            entry["content"] = "\n".join(sections)

        # Internal field; not part of the returned result.
        entry.pop("_raw", None)
        enriched.append(entry)

    return enriched

418

419 def _fetch_top_answers(

420 self, question_id: int, max_answers: int = 3

421 ) -> List[Dict[str, Any]]:

422 """Fetch top answers for a question, sorted by votes."""

423 try:

424 self._apply_backoff()

425 url = f"{self.base_url}/questions/{question_id}/answers"

426 params = {

427 "site": self.site,

428 "order": "desc",

429 "sort": "votes",

430 "pagesize": max_answers,

431 "filter": "withbody",

432 }

433 response = safe_get(

434 url, params=params, headers=self.headers, timeout=30

435 )

436 self._raise_if_rate_limit(response.status_code)

437 response.raise_for_status()

438 data = response.json()

439 self._handle_backoff(data)

440

441 if "error_id" in data:

442 logger.warning(

443 f"Stack Exchange API error fetching answers for "

444 f"{question_id}: {data.get('error_message', 'Unknown')}"

445 )

446 return []

447

448 quota_remaining = data.get("quota_remaining")

449 if quota_remaining is not None and quota_remaining < 10:

450 logger.warning(f"Stack Exchange quota low: {quota_remaining}")

451

452 return data.get("items", []) # type: ignore[no-any-return]

453 except (RateLimitError, ValueError):

454 raise

455 except Exception:

456 logger.warning(

457 f"Failed to fetch answers for question {question_id}"

458 )

459 return []

460

def get_question(self, question_id: int) -> Optional[Dict[str, Any]]:
    """
    Get a specific question by ID.

    Args:
        question_id: The Stack Exchange question ID

    Returns:
        Question dictionary, or None when the question is not found, the
        API reports an error, or the request fails

    Raises:
        RateLimitError: Propagated so the caller can react to throttling.
    """
    try:
        # Honour a backoff requested by an earlier response, as the other
        # request paths of this engine do.
        self._apply_backoff()
        url = f"{self.base_url}/questions/{question_id}"
        params = {"site": self.site, "filter": "withbody"}
        response = safe_get(
            url, params=params, headers=self.headers, timeout=30
        )
        self._raise_if_rate_limit(response.status_code)
        response.raise_for_status()
        data = response.json()
        self._handle_backoff(data)

        # Surface API-level errors instead of silently returning None.
        if "error_id" in data:
            logger.warning(
                f"Stack Exchange API error fetching question "
                f"{question_id}: {data.get('error_message', 'Unknown')}"
            )
            return None

        items = data.get("items", [])
        return items[0] if items else None
    except RateLimitError:
        raise
    except Exception as e:
        safe_msg = self._scrub_error(e)
        logger.exception(
            f"Error fetching Stack Exchange question {question_id} ({type(e).__name__}): {safe_msg}"
        )
        return None

491

def get_answers(self, question_id: int) -> List[Dict[str, Any]]:
    """
    Get answers for a specific question, highest-voted first.

    Args:
        question_id: The Stack Exchange question ID

    Returns:
        List of answer dictionaries; empty when the API reports an error
        or the request fails

    Raises:
        RateLimitError: Propagated so the caller can react to throttling.
    """
    try:
        # Honour a backoff requested by an earlier response, as the other
        # request paths of this engine do.
        self._apply_backoff()
        url = f"{self.base_url}/questions/{question_id}/answers"
        params = {
            "site": self.site,
            "order": "desc",
            "sort": "votes",
            "filter": "withbody",
        }
        response = safe_get(
            url, params=params, headers=self.headers, timeout=30
        )
        self._raise_if_rate_limit(response.status_code)
        response.raise_for_status()
        data = response.json()
        self._handle_backoff(data)

        # Surface API-level errors instead of silently returning nothing.
        if "error_id" in data:
            logger.warning(
                f"Stack Exchange API error fetching answers for "
                f"{question_id}: {data.get('error_message', 'Unknown')}"
            )
            return []

        return data.get("items", [])  # type: ignore[no-any-return]
    except RateLimitError:
        raise
    except Exception as e:
        safe_msg = self._scrub_error(e)
        logger.exception(
            f"Error fetching answers for question {question_id} ({type(e).__name__}): {safe_msg}"
        )
        return []

526

def search_by_tag(self, tag: str, query: str = "") -> List[Dict[str, Any]]:
    """
    Run a search restricted to a tag.

    The instance's ``tagged`` filter is replaced by ``tag`` for the
    duration of the call and restored afterwards, even if the search
    raises.

    Args:
        tag: The tag to filter by
        query: Optional search query

    Returns:
        List of matching questions
    """
    previous_tag_filter = self.tagged
    self.tagged = tag
    try:
        matches = self.run(query)
    finally:
        self.tagged = previous_tag_filter
    return matches

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_stackexchange.py: 95%

237 statements