Coverage for src / local_deep_research / web_search_engines / engines / search_engine_gutenberg.py: 91%
206 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""Project Gutenberg search engine via Gutendex API."""
3from typing import Any, Dict, List, Optional
5import requests
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...security.safe_requests import safe_get
10from ..rate_limiting import RateLimitError
11from ..search_engine_base import BaseSearchEngine
class GutenbergSearchEngine(BaseSearchEngine):
    """
    Project Gutenberg search engine via Gutendex API.

    Provides access to 70,000+ free public domain books with full text.
    No authentication required.
    """

    # Class-level capability flags.  Presumably consumed by the engine
    # registry / selection logic in the base framework — confirm against
    # search_engine_base.
    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        languages: Optional[str] = None,
        topic: Optional[str] = None,
        author_year_start: Optional[int] = None,
        author_year_end: Optional[int] = None,
        copyright_filter: Optional[bool] = None,
        sort: str = "popular",
        max_content_chars: int = 50000,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Project Gutenberg search engine.

        Args:
            max_results: Maximum number of search results
            languages: Filter by language codes (e.g., 'en', 'fr,de')
            topic: Filter by subject/bookshelf topic
            author_year_start: Filter authors born after this year
            author_year_end: Filter authors born before this year
            copyright_filter: Filter by copyright status (True/False/None)
            sort: Sort order ('popular', 'ascending', 'descending')
            max_content_chars: Maximum characters of book text to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        # Query-filter options, forwarded to the Gutendex API by
        # _build_query_params().
        self.languages = languages
        self.topic = topic
        self.author_year_start = author_year_start
        self.author_year_end = author_year_end
        self.copyright_filter = copyright_filter
        self.sort = sort
        # Cap on how much book text _fetch_book_text() keeps per book.
        self.max_content_chars = max_content_chars

        # Gutendex is a public JSON front-end for the Gutenberg catalog.
        self.base_url = "https://gutendex.com"
        self.search_url = f"{self.base_url}/books/"

        # User-Agent header for API requests
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request.

        Only filters that were actually configured are included; each key
        maps one-to-one onto a Gutendex query-string parameter.
        """
        params: Dict[str, Any] = {}
        if query:
            params["search"] = query

        if self.languages:
            params["languages"] = self.languages

        if self.topic:
            params["topic"] = self.topic

        if self.author_year_start is not None:
            params["author_year_start"] = self.author_year_start

        if self.author_year_end is not None:
            params["author_year_end"] = self.author_year_end

        if self.copyright_filter is not None:
            # Gutendex expects a lowercase string ("true"/"false"), not a
            # Python bool.
            params["copyright"] = str(self.copyright_filter).lower()

        if self.sort and self.sort != "popular":
            # 'popular' is omitted — presumably the API default; confirm
            # against the Gutendex documentation.
            params["sort"] = self.sort

        return params

    def _get_best_format_url(self, formats: Dict[str, str]) -> Optional[str]:
        """Get the best available format URL for reading.

        Scans the book's format map (MIME type -> URL) in a fixed
        preference order (HTML first, then plain text, then e-book
        binaries) and falls back to any available format.
        """
        # Priority order for reading formats
        priority = [
            "text/html",
            "text/html; charset=utf-8",
            "text/plain; charset=utf-8",
            "text/plain",
            "application/epub+zip",
            "application/x-mobipocket-ebook",
            "application/pdf",
        ]

        for mime_type in priority:
            if mime_type in formats:
                return formats[mime_type]

        # Return first available if no priority match
        if formats:
            return next(iter(formats.values()))
        return None

    def _get_text_url(self, formats: Dict[str, str]) -> Optional[str]:
        """Get the plain text URL for content retrieval.

        Returns None when the book exposes no plain-text variant (in that
        case _get_full_content() falls back to metadata only).
        """
        for mime_type in [
            "text/plain; charset=utf-8",
            "text/plain; charset=us-ascii",
            "text/plain",
        ]:
            if mime_type in formats:
                return formats[mime_type]
        return None

    def _fetch_book_text(self, text_url: str) -> Optional[str]:
        """Fetch and return the plain text content of a book.

        Strips the Project Gutenberg license header/footer boilerplate and
        truncates the remainder to ``max_content_chars``.  Returns None on
        any fetch/parse failure other than rate limiting.

        Raises:
            RateLimitError: propagated so the caller's rate tracking works.
            ValueError: re-raised deliberately — presumably safe_get's URL
                validation error; confirm against security.safe_requests.
        """
        try:
            response = safe_get(text_url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()

            text = response.text
            if not text:
                return None

            # Strip the Project Gutenberg header/footer boilerplate.
            # Several marker spellings exist across the corpus, hence the
            # lists of variants.
            start_markers = [
                "*** START OF THE PROJECT GUTENBERG EBOOK",
                "*** START OF THIS PROJECT GUTENBERG EBOOK",
                "***START OF THE PROJECT GUTENBERG EBOOK",
            ]
            end_markers = [
                "*** END OF THE PROJECT GUTENBERG EBOOK",
                "*** END OF THIS PROJECT GUTENBERG EBOOK",
                "***END OF THE PROJECT GUTENBERG EBOOK",
            ]

            for marker in start_markers:
                idx = text.find(marker)
                if idx != -1:
                    # Skip past the marker line
                    newline = text.find("\n", idx)
                    if newline != -1:
                        text = text[newline + 1 :]
                    break

            for marker in end_markers:
                idx = text.find(marker)
                if idx != -1:
                    text = text[:idx]
                    break

            text = text.strip()

            # Truncate to max_content_chars
            if len(text) > self.max_content_chars:
                text = (
                    text[: self.max_content_chars] + "\n\n[... truncated ...]"
                )

            return text

        except (RateLimitError, ValueError):
            raise
        except Exception:
            # Best-effort: a single unreadable book should not abort the
            # whole result set.
            logger.warning(f"Failed to fetch book text from {text_url}")
            return None

    def _parse_authors(self, authors: List[Dict]) -> List[str]:
        """Parse author information.

        Takes the Gutendex ``authors`` list (dicts with a ``name`` key in
        "Last, First" form) and returns up to five display names.
        """
        result = []
        for author in authors[:5]:
            name = author.get("name", "")
            if name:
                # Format: "Last, First" -> "First Last"
                if ", " in name:
                    parts = name.split(", ", 1)
                    name = f"{parts[1]} {parts[0]}"
                result.append(name)
        return result

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Project Gutenberg books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Gutenberg previews for query: {query}")

        # Apply rate limiting
        # (rate_tracker / engine_type come from BaseSearchEngine —
        # presumably the shared per-engine throttle; confirm there.)
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            results = data.get("results", [])
            total = data.get("count", 0)
            logger.info(
                f"Found {total} Gutenberg results, returning {len(results)}"
            )

            previews = []
            for book in results[: self.max_results]:
                try:
                    book_id = book.get("id")
                    title = book.get("title", "Untitled")

                    # Get authors
                    authors = self._parse_authors(book.get("authors", []))

                    # Get subjects and bookshelves (trimmed for preview;
                    # full lists are restored in _get_full_content).
                    subjects = book.get("subjects", [])[:5]
                    bookshelves = book.get("bookshelves", [])[:3]

                    # Get languages
                    languages = book.get("languages", [])

                    # Get formats
                    formats = book.get("formats", {})
                    read_url = self._get_best_format_url(formats)

                    # Build Gutenberg URL
                    gutenberg_url = (
                        f"https://www.gutenberg.org/ebooks/{book_id}"
                    )

                    # Get summaries if available
                    summaries = book.get("summaries", [])
                    summary_text = ""
                    if summaries and isinstance(summaries, list):
                        # Use the first summary, strip whitespace
                        first_summary = summaries[0] if summaries else ""
                        if isinstance(first_summary, str):
                            summary_text = first_summary.strip()[:300]

                    # Build snippet with summary for richer content;
                    # subjects/bookshelves only pad the snippet when no
                    # summary exists.
                    snippet_parts = []
                    if summary_text:
                        snippet_parts.append(summary_text)
                    if authors:
                        snippet_parts.append(f"By {', '.join(authors[:2])}")
                    if subjects and not summary_text:
                        snippet_parts.append(
                            f"Subjects: {', '.join(subjects[:3])}"
                        )
                    if bookshelves and not summary_text:
                        snippet_parts.append(
                            f"Bookshelves: {', '.join(bookshelves[:2])}"
                        )
                    snippet = ". ".join(snippet_parts)

                    # Check for cover image
                    cover_url = formats.get("image/jpeg")

                    preview = {
                        "id": str(book_id),
                        "title": title,
                        "link": gutenberg_url,
                        "snippet": snippet,
                        "authors": authors,
                        "subjects": subjects,
                        "bookshelves": bookshelves,
                        "languages": languages,
                        "download_count": book.get("download_count", 0),
                        "read_url": read_url,
                        "cover_url": cover_url,
                        "formats": list(formats.keys()),
                        "copyright": book.get("copyright", False),
                        # Raw API record, kept for _get_full_content and
                        # removed from the final result there.
                        "_raw": book,
                    }

                    previews.append(preview)

                except Exception:
                    # One malformed record must not sink the whole page.
                    logger.exception("Error parsing Gutenberg book")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Gutendex API request failed")
            # NOTE(review): here the exception object is passed where other
            # call sites pass a status code — presumably _raise_if_rate_limit
            # accepts both; confirm against BaseSearchEngine.
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Gutenberg books.

        Fetches the actual plain text of each book from Project Gutenberg.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Gutenberg books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw", {})
            if raw:
                # Restore the untrimmed lists from the raw API record
                # (previews truncated them).
                # Get all subjects
                result["subjects"] = raw.get("subjects", [])

                # Get all bookshelves
                result["bookshelves"] = raw.get("bookshelves", [])

                # Get translators
                translators = raw.get("translators", [])
                result["translators"] = self._parse_authors(translators)

                # Fetch actual book text
                formats = raw.get("formats", {})
                text_url = self._get_text_url(formats)

                book_text = None
                # Only fetch from the expected gutenberg.org origin; any
                # other host in the format map is skipped and logged.
                if text_url and text_url.startswith(
                    "https://www.gutenberg.org/"
                ):
                    logger.info(
                        f"Fetching book text for '{result.get('title')}' from {text_url}"
                    )
                    book_text = self._fetch_book_text(text_url)
                elif text_url:
                    logger.warning(
                        f"Skipping text_url with unexpected origin: {text_url}"
                    )

                # Build content with metadata header + actual text
                content_parts = []
                if result.get("authors"):
                    content_parts.append(
                        f"Authors: {', '.join(result['authors'])}"
                    )
                if result.get("subjects"):
                    content_parts.append(
                        f"Subjects: {', '.join(result['subjects'][:5])}"
                    )

                if book_text:
                    content_parts.append("")
                    content_parts.append(book_text)
                    logger.info(
                        f"Retrieved {len(book_text)} chars of text for '{result.get('title')}'"
                    )
                else:
                    # Fall back to a metadata-only content body when the
                    # text could not be retrieved.
                    if result.get("bookshelves"):
                        content_parts.append(
                            f"Bookshelves: {', '.join(result['bookshelves'])}"
                        )
                    if result.get("download_count"):
                        content_parts.append(
                            f"Downloads: {result['download_count']}"
                        )
                    if result.get("read_url"):
                        content_parts.append(
                            f"Read online: {result['read_url']}"
                        )
                    logger.warning(
                        f"Could not fetch text for '{result.get('title')}', using metadata only"
                    )

                result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            if "_raw" in result:
                del result["_raw"]

            results.append(result)

        return results

    def get_book(self, book_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by Gutenberg ID.

        Args:
            book_id: The Project Gutenberg book ID

        Returns:
            Book dictionary or None

        Raises:
            RateLimitError: propagated; every other failure is logged and
                collapsed to None.
        """
        try:
            url = f"{self.base_url}/books/{book_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Gutenberg book {book_id}")
            return None

    def search_by_topic(self, topic: str) -> List[Dict[str, Any]]:
        """
        Search books by topic/subject.

        Temporarily overrides ``self.topic`` for the duration of the call
        (restored in the finally clause) and runs an empty-query search so
        only the topic filter applies.
        NOTE(review): this mutates instance state, so concurrent calls on a
        shared instance could interleave — confirm instances are not shared
        across threads.

        Args:
            topic: The topic to search for

        Returns:
            List of matching books
        """
        original_topic = self.topic
        try:
            self.topic = topic
            return self.run("")
        finally:
            self.topic = original_topic