Coverage for src/local_deep_research/web_search_engines/engines/search_engine_gutenberg.py: 91%
207 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Project Gutenberg search engine via Gutendex API."""
3from typing import Any, Dict, List, Optional
5import requests
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...constants import USER_AGENT
10from ...security.safe_requests import safe_get
11from ..rate_limiting import RateLimitError
12from ..search_engine_base import BaseSearchEngine
class GutenbergSearchEngine(BaseSearchEngine):
    """
    Project Gutenberg search engine via Gutendex API.

    Provides access to 70,000+ free public domain books with full text.
    No authentication required.

    Previews come from a single request to the Gutendex ``/books/``
    endpoint (no pagination is followed). Full content is the book's plain
    text, downloaded only from ``https://www.gutenberg.org/`` URLs, with
    the Project Gutenberg header/footer removed and the body truncated to
    ``max_content_chars``.
    """

    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # Plain-text format keys, most preferred first. Used for full-text
    # retrieval and shared with the reading-link priority below so the two
    # lookups cannot drift apart.
    _TEXT_MIME_TYPES = (
        "text/plain; charset=utf-8",
        "text/plain; charset=us-ascii",
        "text/plain",
    )

    # Format keys tried, in order, when picking a link for a human reader.
    _READ_MIME_PRIORITY = (
        "text/html",
        "text/html; charset=utf-8",
        "text/plain; charset=utf-8",
        "text/plain; charset=us-ascii",
        "text/plain",
        "application/epub+zip",
        "application/x-mobipocket-ebook",
        "application/pdf",
    )

    # Lines that open / close the book body inside a Gutenberg text file.
    _START_MARKERS = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    _END_MARKERS = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
    )

    # Book text is only downloaded from URLs under this prefix. The trailing
    # slash matters: it stops look-alike hosts such as
    # "https://www.gutenberg.org.example.com" from matching.
    _TEXT_ORIGIN = "https://www.gutenberg.org/"

    def __init__(
        self,
        max_results: int = 10,
        languages: Optional[str] = None,
        topic: Optional[str] = None,
        author_year_start: Optional[int] = None,
        author_year_end: Optional[int] = None,
        copyright_filter: Optional[bool] = None,
        sort: str = "popular",
        max_content_chars: int = 50000,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Project Gutenberg search engine.

        Args:
            max_results: Maximum number of search results
            languages: Filter by language codes (e.g., 'en', 'fr,de')
            topic: Filter by subject/bookshelf topic
            author_year_start: Filter authors born after this year
            author_year_end: Filter authors born before this year
            copyright_filter: Filter by copyright status (True/False/None)
            sort: Sort order ('popular', 'ascending', 'descending')
            max_content_chars: Maximum characters of book text to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class initializer
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.languages = languages
        self.topic = topic
        self.author_year_start = author_year_start
        self.author_year_end = author_year_end
        self.copyright_filter = copyright_filter
        self.sort = sort
        self.max_content_chars = max_content_chars

        self.base_url = "https://gutendex.com"
        self.search_url = f"{self.base_url}/books/"

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """
        Build query parameters for the API request.

        Only filters that are actually set are included. ``sort`` is sent
        only when it differs from "popular".

        Args:
            query: Free-text search string; omitted from the params if empty

        Returns:
            Dictionary of query-string parameters
        """
        params: Dict[str, Any] = {}
        if query:
            params["search"] = query

        if self.languages:
            params["languages"] = self.languages

        if self.topic:
            params["topic"] = self.topic

        # Year filters use "is not None" so that year 0 is still sent.
        if self.author_year_start is not None:
            params["author_year_start"] = self.author_year_start

        if self.author_year_end is not None:
            params["author_year_end"] = self.author_year_end

        if self.copyright_filter is not None:
            # Sent as the lowercase strings "true" / "false".
            params["copyright"] = str(self.copyright_filter).lower()

        if self.sort and self.sort != "popular":
            params["sort"] = self.sort

        return params

    def _get_best_format_url(self, formats: Dict[str, str]) -> Optional[str]:
        """
        Get the best available format URL for reading.

        Tries the keys in ``_READ_MIME_PRIORITY`` in order, then falls back
        to whatever format is listed first.

        Args:
            formats: Mapping of MIME type to download URL

        Returns:
            The chosen URL, or None when ``formats`` is empty
        """
        for mime_type in self._READ_MIME_PRIORITY:
            if mime_type in formats:
                return formats[mime_type]

        # Return first available if no priority match
        if formats:
            return next(iter(formats.values()))
        return None

    def _get_text_url(self, formats: Dict[str, str]) -> Optional[str]:
        """
        Get the plain text URL for content retrieval.

        Args:
            formats: Mapping of MIME type to download URL

        Returns:
            URL of the first matching plain-text format, or None
        """
        for mime_type in self._TEXT_MIME_TYPES:
            if mime_type in formats:
                return formats[mime_type]
        return None

    @classmethod
    def _strip_boilerplate(cls, text: str) -> str:
        """
        Remove the Project Gutenberg header and footer from a book text.

        Everything up to and including the first start-marker line is
        dropped, then everything from the first end marker onwards. Text
        without markers is returned unchanged apart from whitespace
        trimming.

        Args:
            text: Raw text of the downloaded file

        Returns:
            The text between the markers, stripped of surrounding whitespace
        """
        for marker in cls._START_MARKERS:
            idx = text.find(marker)
            if idx != -1:
                # Skip past the marker line (it also carries the title).
                newline = text.find("\n", idx)
                if newline != -1:
                    text = text[newline + 1 :]
                break

        for marker in cls._END_MARKERS:
            idx = text.find(marker)
            if idx != -1:
                text = text[:idx]
                break

        return text.strip()

    def _fetch_book_text(self, text_url: str) -> Optional[str]:
        """
        Fetch and return the plain text content of a book.

        Args:
            text_url: URL of the plain-text file

        Returns:
            The book body with boilerplate removed and truncated to
            ``max_content_chars``, or None if the download failed or the
            response was empty

        Raises:
            RateLimitError: Re-raised so the caller can back off
            ValueError: Re-raised rather than swallowed (presumably raised
                by ``safe_get`` for a rejected URL; confirm against that
                helper)
        """
        try:
            response = safe_get(text_url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()

            text = response.text
            if not text:
                return None

            text = self._strip_boilerplate(text)

            # Truncate to max_content_chars
            if len(text) > self.max_content_chars:
                text = (
                    text[: self.max_content_chars] + "\n\n[... truncated ...]"
                )

            return text

        except (RateLimitError, ValueError):
            raise
        except Exception:
            # Best effort: a failed download degrades to metadata-only.
            logger.warning(f"Failed to fetch book text from {text_url}")
            return None

    def _parse_authors(
        self, authors: Optional[List[Dict[str, Any]]]
    ) -> List[str]:
        """
        Parse author information.

        At most the first five entries are considered. Names of the form
        "Last, First" are rewritten as "First Last". Entries that are not
        dictionaries, or that lack a non-empty string name, are skipped.

        Args:
            authors: List of person dictionaries with a "name" key; None is
                treated as an empty list

        Returns:
            List of display names
        """
        names: List[str] = []
        # "or []" guards against a JSON null, which would otherwise raise
        # on slicing and abort the caller.
        for author in (authors or [])[:5]:
            if not isinstance(author, dict):
                continue
            name = author.get("name")
            if not name or not isinstance(name, str):
                continue
            # Format: "Last, First" -> "First Last"
            if ", " in name:
                last, first = name.split(", ", 1)
                name = f"{first} {last}"
            names.append(name)
        return names

    def _build_preview(self, book: Dict[str, Any]) -> Optional[Dict[str, Any]]:
        """
        Convert one Gutendex book record into a preview dictionary.

        Args:
            book: A single entry from the API's "results" list

        Returns:
            The preview dictionary, or None when the record has no id (no
            valid Gutenberg link can be built for it)
        """
        book_id = book.get("id")
        if book_id is None:
            logger.warning("Skipping Gutenberg book without an id")
            return None

        # "or" rather than a .get() default: the default only applies when
        # the key is absent, not when the value is null.
        title = book.get("title") or "Untitled"
        authors = self._parse_authors(book.get("authors"))
        subjects = (book.get("subjects") or [])[:5]
        bookshelves = (book.get("bookshelves") or [])[:3]
        languages = book.get("languages") or []
        formats = book.get("formats") or {}

        # Use the first summary, strip whitespace
        summaries = book.get("summaries")
        summary_text = ""
        if (
            summaries
            and isinstance(summaries, list)
            and isinstance(summaries[0], str)
        ):
            summary_text = summaries[0].strip()[:300]

        # Build snippet with summary for richer content. Subjects and
        # bookshelves are only used when there is no summary.
        snippet_parts = []
        if summary_text:
            snippet_parts.append(summary_text)
        if authors:
            snippet_parts.append(f"By {', '.join(authors[:2])}")
        if subjects and not summary_text:
            snippet_parts.append(f"Subjects: {', '.join(subjects[:3])}")
        if bookshelves and not summary_text:
            snippet_parts.append(f"Bookshelves: {', '.join(bookshelves[:2])}")

        return {
            "id": str(book_id),
            "title": title,
            "link": f"https://www.gutenberg.org/ebooks/{book_id}",
            "snippet": ". ".join(snippet_parts),
            "authors": authors,
            "subjects": subjects,
            "bookshelves": bookshelves,
            "languages": languages,
            "download_count": book.get("download_count", 0),
            "read_url": self._get_best_format_url(formats),
            "cover_url": formats.get("image/jpeg"),
            "formats": list(formats.keys()),
            "copyright": book.get("copyright", False),
            "source": "Project Gutenberg",
            # Kept so _get_full_content can reach the untruncated record.
            "_raw": book,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Project Gutenberg books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty if the request fails)

        Raises:
            RateLimitError: When the response status, or the request error,
                is recognised as rate limiting by ``_raise_if_rate_limit``
        """
        logger.info(f"Getting Gutenberg previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Only the request and decode can raise the errors handled here.
        # Per-book parsing failures are handled inside the loop below.
        try:
            response = safe_get(
                self.search_url,
                params=self._build_query_params(query),
                headers=self.headers,
                timeout=30,
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            logger.exception("Gutendex API request failed")
            self._raise_if_rate_limit(e)
            return []

        if not isinstance(data, dict):
            logger.warning("Unexpected Gutendex payload: not a JSON object")
            return []

        results = data.get("results") or []
        total = data.get("count", 0)
        selected = results[: self.max_results]
        logger.info(
            f"Found {total} Gutenberg results, returning {len(selected)}"
        )

        previews = []
        for book in selected:
            try:
                preview = self._build_preview(book)
            except Exception:
                # One malformed record must not discard the whole page.
                logger.exception("Error parsing Gutenberg book")
                continue
            if preview is not None:
                previews.append(preview)

        return previews

    @staticmethod
    def _compose_content(
        result: Dict[str, Any], book_text: Optional[str]
    ) -> str:
        """
        Build the "content" string for one result.

        The content starts with author and subject lines. If book text is
        available it follows after a blank line; otherwise bookshelves,
        download count and reading link are listed instead.

        Args:
            result: The result dictionary being assembled
            book_text: Cleaned book text, or None/empty if unavailable

        Returns:
            Newline-joined content string
        """
        parts: List[str] = []
        if result.get("authors"):
            parts.append(f"Authors: {', '.join(result['authors'])}")
        if result.get("subjects"):
            parts.append(f"Subjects: {', '.join(result['subjects'][:5])}")

        if book_text:
            parts.append("")
            parts.append(book_text)
        else:
            if result.get("bookshelves"):
                parts.append(
                    f"Bookshelves: {', '.join(result['bookshelves'])}"
                )
            if result.get("download_count"):
                parts.append(f"Downloads: {result['download_count']}")
            if result.get("read_url"):
                parts.append(f"Read online: {result['read_url']}")

        return "\n".join(parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Gutenberg books.

        Fetches the actual plain text of each book from Project Gutenberg.
        Items without a "_raw" record are copied through without a
        "content" key.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content; the internal
            "_raw" field is removed from each

        Raises:
            RateLimitError: Propagated from the text download
            ValueError: Propagated from the text download
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Gutenberg books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw", {})
            if raw:
                # Replace the truncated preview lists with the full ones.
                result["subjects"] = raw.get("subjects", [])
                result["bookshelves"] = raw.get("bookshelves", [])
                result["translators"] = self._parse_authors(
                    raw.get("translators")
                )

                # Fetch actual book text
                text_url = self._get_text_url(raw.get("formats") or {})

                book_text = None
                if text_url and text_url.startswith(self._TEXT_ORIGIN):
                    logger.info(
                        f"Fetching book text for '{result.get('title')}' from {text_url}"
                    )
                    book_text = self._fetch_book_text(text_url)
                elif text_url:
                    logger.warning(
                        f"Skipping text_url with unexpected origin: {text_url}"
                    )

                if book_text:
                    logger.info(
                        f"Retrieved {len(book_text)} chars of text for '{result.get('title')}'"
                    )
                else:
                    logger.warning(
                        f"Could not fetch text for '{result.get('title')}', using metadata only"
                    )

                # Build content with metadata header + actual text
                result["content"] = self._compose_content(result, book_text)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_book(self, book_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by Gutenberg ID.

        Args:
            book_id: The Project Gutenberg book ID

        Returns:
            Book dictionary or None

        Raises:
            RateLimitError: Re-raised; all other errors are logged and
                reported as None
        """
        try:
            url = f"{self.base_url}/books/{book_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Gutenberg book {book_id}")
            return None

    def search_by_topic(self, topic: str) -> List[Dict[str, Any]]:
        """
        Search books by topic/subject.

        Temporarily replaces ``self.topic`` and runs an empty-query search,
        restoring the previous topic afterwards even if the search raises.
        Other calls on the same instance made while this runs will see the
        temporary topic.

        Args:
            topic: The topic to search for

        Returns:
            List of matching books
        """
        original_topic = self.topic
        try:
            self.topic = topic
            return self.run("")
        finally:
            self.topic = original_topic