Coverage for src/local_deep_research/web_search_engines/engines/search_engine_gutenberg.py

1"""Project Gutenberg search engine via Gutendex API."""

3from typing import Any, Dict, List, Optional

5import requests

6from langchain_core.language_models import BaseLLM

8from ...constants import USER_AGENT

9from ...security.safe_requests import safe_get

10from ...security.secure_logging import logger

11from ..rate_limiting import RateLimitError

12from ..search_engine_base import BaseSearchEngine, Exposure, Sensitivity

class GutenbergSearchEngine(BaseSearchEngine):
    """
    Project Gutenberg search engine via Gutendex API.

    Provides access to 70,000+ free public domain books with full text.
    No authentication required.
    """

    # Class-level engine flags. Nothing in this class reads them directly;
    # presumably they are consumed by BaseSearchEngine or the engine
    # registry -- TODO confirm against the base class.
    is_public = True
    egress_sensitivity = Sensitivity.NON_SENSITIVE
    egress_exposure = Exposure.EXPOSING
    is_generic = False
    is_scientific = False
    is_books = True
    is_lexical = True
    needs_llm_relevance_filter = True

32 def __init__(

33 self,

34 max_results: int = 10,

35 languages: Optional[str] = None,

36 topic: Optional[str] = None,

37 author_year_start: Optional[int] = None,

38 author_year_end: Optional[int] = None,

39 copyright_filter: Optional[bool] = None,

40 sort: str = "popular",

41 max_content_chars: int = 50000,

42 llm: Optional[BaseLLM] = None,

43 max_filtered_results: Optional[int] = None,

44 settings_snapshot: Optional[Dict[str, Any]] = None,

45 **kwargs,

46 ):

47 """

48 Initialize the Project Gutenberg search engine.

50 Args:

51 max_results: Maximum number of search results

52 languages: Filter by language codes (e.g., 'en', 'fr,de')

53 topic: Filter by subject/bookshelf topic

54 author_year_start: Filter authors born after this year

55 author_year_end: Filter authors born before this year

56 copyright_filter: Filter by copyright status (True/False/None)

57 sort: Sort order ('popular', 'ascending', 'descending')

58 max_content_chars: Maximum characters of book text to retrieve

59 llm: Language model for relevance filtering

60 max_filtered_results: Maximum results after filtering

61 settings_snapshot: Settings snapshot for thread context

62 """

63 super().__init__(

64 llm=llm,

65 max_filtered_results=max_filtered_results,

66 max_results=max_results,

67 settings_snapshot=settings_snapshot,

68 **kwargs,

69 )

71 self.languages = languages

72 self.topic = topic

73 self.author_year_start = author_year_start

74 self.author_year_end = author_year_end

75 self.copyright_filter = copyright_filter

76 self.sort = sort

77 self.max_content_chars = max_content_chars

79 self.base_url = "https://gutendex.com"

80 self.search_url = f"{self.base_url}/books/"

82 # User-Agent header for API requests

83 self.headers = {"User-Agent": USER_AGENT}

85 def _build_query_params(self, query: str) -> Dict[str, Any]:

86 """Build query parameters for the API request."""

87 params: Dict[str, Any] = {}

88 if query: 88 ↛ 91line 88 didn't jump to line 91 because the condition on line 88 was always true

89 params["search"] = query

91 if self.languages:

92 params["languages"] = self.languages

94 if self.topic:

95 params["topic"] = self.topic

97 if self.author_year_start is not None: 97 ↛ 98line 97 didn't jump to line 98 because the condition on line 97 was never true

98 params["author_year_start"] = self.author_year_start

100 if self.author_year_end is not None: 100 ↛ 101line 100 didn't jump to line 101 because the condition on line 100 was never true

101 params["author_year_end"] = self.author_year_end

102

103 if self.copyright_filter is not None: 103 ↛ 104line 103 didn't jump to line 104 because the condition on line 103 was never true

104 params["copyright"] = str(self.copyright_filter).lower()

105

106 if self.sort and self.sort != "popular": 106 ↛ 107line 106 didn't jump to line 107 because the condition on line 106 was never true

107 params["sort"] = self.sort

108

109 return params

110

111 def _get_best_format_url(self, formats: Dict[str, str]) -> Optional[str]:

112 """Get the best available format URL for reading."""

113 # Priority order for reading formats

114 priority = [

115 "text/html",

116 "text/html; charset=utf-8",

117 "text/plain; charset=utf-8",

118 "text/plain",

119 "application/epub+zip",

120 "application/x-mobipocket-ebook",

121 "application/pdf",

122 ]

123

124 for mime_type in priority:

125 if mime_type in formats:

126 return formats[mime_type]

127

128 # Return first available if no priority match

129 if formats:

130 return next(iter(formats.values()))

131 return None

132

133 def _get_text_url(self, formats: Dict[str, str]) -> Optional[str]:

134 """Get the plain text URL for content retrieval."""

135 for mime_type in [

136 "text/plain; charset=utf-8",

137 "text/plain; charset=us-ascii",

138 "text/plain",

139 ]:

140 if mime_type in formats:

141 return formats[mime_type]

142 return None

143

144 def _fetch_book_text(self, text_url: str) -> Optional[str]:

145 """Fetch and return the plain text content of a book."""

146 try:

147 response = safe_get(text_url, headers=self.headers, timeout=30)

148 self._raise_if_rate_limit(response.status_code)

149 response.raise_for_status()

150

151 text = response.text

152 if not text:

153 return None

154

155 # Strip the Project Gutenberg header/footer boilerplate

156 start_markers = [

157 "*** START OF THE PROJECT GUTENBERG EBOOK",

158 "*** START OF THIS PROJECT GUTENBERG EBOOK",

159 "***START OF THE PROJECT GUTENBERG EBOOK",

160 ]

161 end_markers = [

162 "*** END OF THE PROJECT GUTENBERG EBOOK",

163 "*** END OF THIS PROJECT GUTENBERG EBOOK",

164 "***END OF THE PROJECT GUTENBERG EBOOK",

165 ]

166

167 for marker in start_markers: 167 ↛ 176line 167 didn't jump to line 176 because the loop on line 167 didn't complete

168 idx = text.find(marker)

169 if idx != -1:

170 # Skip past the marker line

171 newline = text.find("\n", idx)

172 if newline != -1: 172 ↛ 174line 172 didn't jump to line 174 because the condition on line 172 was always true

173 text = text[newline + 1 :]

174 break

175

176 for marker in end_markers: 176 ↛ 182line 176 didn't jump to line 182 because the loop on line 176 didn't complete

177 idx = text.find(marker)

178 if idx != -1:

179 text = text[:idx]

180 break

181

182 text = text.strip()

183

184 # Truncate to max_content_chars

185 if len(text) > self.max_content_chars:

186 text = (

187 text[: self.max_content_chars] + "\n\n[... truncated ...]"

188 )

189

190 return text

191

192 except (RateLimitError, ValueError):

193 raise

194 except Exception:

195 logger.warning(f"Failed to fetch book text from {text_url}")

196 return None

197

198 def _parse_authors(self, authors: List[Dict]) -> List[str]:

199 """Parse author information."""

200 result = []

201 for author in authors[:5]:

202 name = author.get("name", "")

203 if name: 203 ↛ 201line 203 didn't jump to line 201 because the condition on line 203 was always true

204 # Format: "Last, First" -> "First Last"

205 if ", " in name:

206 parts = name.split(", ", 1)

207 name = f"{parts[1]} {parts[0]}"

208 result.append(name)

209 return result

210

211 def _get_previews(self, query: str) -> List[Dict[str, Any]]:

212 """

213 Get preview information for Project Gutenberg books.

214

215 Args:

216 query: The search query

217

218 Returns:

219 List of preview dictionaries

220 """

221 logger.info(f"Getting Gutenberg previews for query: {query}")

222

223 # Apply rate limiting

224 self._last_wait_time = self.rate_tracker.apply_rate_limit(

225 self.engine_type

226 )

227

228 try:

229 params = self._build_query_params(query)

230 response = safe_get(

231 self.search_url,

232 params=params,

233 headers=self.headers,

234 timeout=30,

235 )

236

237 self._raise_if_rate_limit(response.status_code)

238

239 response.raise_for_status()

240 data = response.json()

241

242 results = data.get("results", [])

243 total = data.get("count", 0)

244 logger.info(

245 f"Found {total} Gutenberg results, returning {len(results)}"

246 )

247

248 previews = []

249 for book in results[: self.max_results]:

250 try:

251 book_id = book.get("id")

252 title = book.get("title", "Untitled")

253

254 # Get authors

255 authors = self._parse_authors(book.get("authors", []))

256

257 # Get subjects and bookshelves

258 subjects = book.get("subjects", [])[:5]

259 bookshelves = book.get("bookshelves", [])[:3]

260

261 # Get languages

262 languages = book.get("languages", [])

263

264 # Get formats

265 formats = book.get("formats", {})

266 read_url = self._get_best_format_url(formats)

267

268 # Build Gutenberg URL

269 gutenberg_url = (

270 f"https://www.gutenberg.org/ebooks/{book_id}"

271 )

272

273 # Get summaries if available

274 summaries = book.get("summaries", [])

275 summary_text = ""

276 if summaries and isinstance(summaries, list): 276 ↛ 278line 276 didn't jump to line 278 because the condition on line 276 was never true

277 # Use the first summary, strip whitespace

278 first_summary = summaries[0] if summaries else ""

279 if isinstance(first_summary, str):

280 summary_text = first_summary.strip()[:300]

281

282 # Build snippet with summary for richer content

283 snippet_parts = []

284 if summary_text: 284 ↛ 285line 284 didn't jump to line 285 because the condition on line 284 was never true

285 snippet_parts.append(summary_text)

286 if authors:

287 snippet_parts.append(f"By {', '.join(authors[:2])}")

288 if subjects and not summary_text:

289 snippet_parts.append(

290 f"Subjects: {', '.join(subjects[:3])}"

291 )

292 if bookshelves and not summary_text:

293 snippet_parts.append(

294 f"Bookshelves: {', '.join(bookshelves[:2])}"

295 )

296 snippet = ". ".join(snippet_parts)

297

298 # Check for cover image

299 cover_url = formats.get("image/jpeg")

300

301 preview = {

302 "id": str(book_id),

303 "title": title,

304 "link": gutenberg_url,

305 "snippet": snippet,

306 "authors": authors,

307 "subjects": subjects,

308 "bookshelves": bookshelves,

309 "languages": languages,

310 "download_count": book.get("download_count", 0),

311 "read_url": read_url,

312 "cover_url": cover_url,

313 "formats": list(formats.keys()),

314 "copyright": book.get("copyright", False),

315 "source": "Project Gutenberg",

316 "_raw": book,

317 }

318

319 previews.append(preview)

320

321 except Exception as e:

322 safe_msg = self._scrub_error(e)

323 logger.exception(

324 f"Error parsing Gutenberg book ({type(e).__name__}): {safe_msg}"

325 )

326 continue

327

328 return previews

329

330 except (requests.RequestException, ValueError) as e:

331 safe_msg = self._scrub_error(e)

332 logger.exception(

333 f"Gutendex API request failed ({type(e).__name__}): {safe_msg}"

334 )

335 self._raise_if_rate_limit(e)

336 return []

337

338 def _get_full_content(

339 self, relevant_items: List[Dict[str, Any]]

340 ) -> List[Dict[str, Any]]:

341 """

342 Get full content for the relevant Gutenberg books.

343

344 Fetches the actual plain text of each book from Project Gutenberg.

345

346 Args:

347 relevant_items: List of relevant preview dictionaries

348

349 Returns:

350 List of result dictionaries with full content

351 """

352 logger.info(

353 f"Getting full content for {len(relevant_items)} Gutenberg books"

354 )

355

356 results = []

357 for item in relevant_items:

358 result = item.copy()

359

360 raw = item.get("_raw", {})

361 if raw:

362 # Get all subjects

363 result["subjects"] = raw.get("subjects", [])

364

365 # Get all bookshelves

366 result["bookshelves"] = raw.get("bookshelves", [])

367

368 # Get translators

369 translators = raw.get("translators", [])

370 result["translators"] = self._parse_authors(translators)

371

372 # Fetch actual book text

373 formats = raw.get("formats", {})

374 text_url = self._get_text_url(formats)

375

376 book_text = None

377 if text_url and text_url.startswith(

378 "https://www.gutenberg.org/"

379 ):

380 logger.info(

381 f"Fetching book text for '{result.get('title')}' from {text_url}"

382 )

383 book_text = self._fetch_book_text(text_url)

384 elif text_url:

385 logger.warning(

386 f"Skipping text_url with unexpected origin: {text_url}"

387 )

388

389 # Build content with metadata header + actual text

390 content_parts = []

391 if result.get("authors"):

392 content_parts.append(

393 f"Authors: {', '.join(result['authors'])}"

394 )

395 if result.get("subjects"):

396 content_parts.append(

397 f"Subjects: {', '.join(result['subjects'][:5])}"

398 )

399

400 if book_text:

401 content_parts.append("")

402 content_parts.append(book_text)

403 logger.info(

404 f"Retrieved {len(book_text)} chars of text for '{result.get('title')}'"

405 )

406 else:

407 if result.get("bookshelves"):

408 content_parts.append(

409 f"Bookshelves: {', '.join(result['bookshelves'])}"

410 )

411 if result.get("download_count"):

412 content_parts.append(

413 f"Downloads: {result['download_count']}"

414 )

415 if result.get("read_url"):

416 content_parts.append(

417 f"Read online: {result['read_url']}"

418 )

419 logger.warning(

420 f"Could not fetch text for '{result.get('title')}', using metadata only"

421 )

422

423 result["content"] = "\n".join(content_parts)

424

425 # Clean up internal fields

426 if "_raw" in result:

427 del result["_raw"]

428

429 results.append(result)

430

431 return results

432

433 def get_book(self, book_id: int) -> Optional[Dict[str, Any]]:

434 """

435 Get a specific book by Gutenberg ID.

436

437 Args:

438 book_id: The Project Gutenberg book ID

439

440 Returns:

441 Book dictionary or None

442 """

443 try:

444 url = f"{self.base_url}/books/{book_id}"

445 response = safe_get(url, headers=self.headers, timeout=30)

446 self._raise_if_rate_limit(response.status_code)

447 response.raise_for_status()

448 return response.json() # type: ignore[no-any-return]

449 except RateLimitError:

450 raise

451 except Exception as e:

452 safe_msg = self._scrub_error(e)

453 logger.exception(

454 f"Error fetching Gutenberg book {book_id} ({type(e).__name__}): {safe_msg}"

455 )

456 return None

457

458 def search_by_topic(self, topic: str) -> List[Dict[str, Any]]:

459 """

460 Search books by topic/subject.

461

462 Args:

463 topic: The topic to search for

464

465 Returns:

466 List of matching books

467 """

468 original_topic = self.topic

469 try:

470 self.topic = topic

471 return self.run("")

472 finally:

473 self.topic = original_topic

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_gutenberg.py: 91%

212 statements