Coverage for src/local_deep_research/web_search_engines/engines/search_engine_gutenberg.py: 91%

207 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Project Gutenberg search engine via Gutendex API.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5import requests 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...constants import USER_AGENT 

10from ...security.safe_requests import safe_get 

11from ..rate_limiting import RateLimitError 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

class GutenbergSearchEngine(BaseSearchEngine):
    """
    Project Gutenberg search engine via Gutendex API.

    Provides access to 70,000+ free public domain books with full text.
    No authentication required.

    Book metadata is queried from the Gutendex ``/books/`` endpoint; when
    full content is requested, the plain-text edition is downloaded from
    ``https://www.gutenberg.org/`` and trimmed to ``max_content_chars``.
    """

    # Engine classification flags.
    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True
    is_lexical = True
    needs_llm_relevance_filter = True

    # Exact format keys tried first, in order, when picking a reading URL.
    _READ_FORMAT_PRIORITY = (
        "text/html",
        "text/html; charset=utf-8",
        "text/plain; charset=utf-8",
        "text/plain",
        "application/epub+zip",
        "application/x-mobipocket-ebook",
        "application/pdf",
    )

    # Base MIME types tried next (parameters such as charset are ignored).
    _READ_BASE_TYPE_PRIORITY = (
        "text/html",
        "text/plain",
        "application/epub+zip",
        "application/x-mobipocket-ebook",
        "application/pdf",
    )

    # Exact format keys tried first when looking for the plain-text edition.
    _TEXT_FORMAT_PRIORITY = (
        "text/plain; charset=utf-8",
        "text/plain; charset=us-ascii",
        "text/plain",
    )

    # Boilerplate delimiters that surround the actual book text.
    _START_MARKERS = (
        "*** START OF THE PROJECT GUTENBERG EBOOK",
        "*** START OF THIS PROJECT GUTENBERG EBOOK",
        "***START OF THE PROJECT GUTENBERG EBOOK",
    )
    _END_MARKERS = (
        "*** END OF THE PROJECT GUTENBERG EBOOK",
        "*** END OF THIS PROJECT GUTENBERG EBOOK",
        "***END OF THE PROJECT GUTENBERG EBOOK",
    )

    def __init__(
        self,
        max_results: int = 10,
        languages: Optional[str] = None,
        topic: Optional[str] = None,
        author_year_start: Optional[int] = None,
        author_year_end: Optional[int] = None,
        copyright_filter: Optional[bool] = None,
        sort: str = "popular",
        max_content_chars: int = 50000,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Project Gutenberg search engine.

        Args:
            max_results: Maximum number of search results
            languages: Filter by language codes (e.g., 'en', 'fr,de')
            topic: Filter by subject/bookshelf topic
            author_year_start: Start of the year range in which at least
                one author must have been alive (sent as the Gutendex
                ``author_year_start`` parameter)
            author_year_end: End of the year range in which at least one
                author must have been alive (sent as the Gutendex
                ``author_year_end`` parameter)
            copyright_filter: Filter by copyright status (True/False/None);
                None sends no copyright filter
            sort: Sort order ('popular', 'ascending', 'descending')
            max_content_chars: Maximum characters of book text to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.languages = languages
        self.topic = topic
        self.author_year_start = author_year_start
        self.author_year_end = author_year_end
        self.copyright_filter = copyright_filter
        self.sort = sort
        self.max_content_chars = max_content_chars

        self.base_url = "https://gutendex.com"
        self.search_url = f"{self.base_url}/books/"

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    @staticmethod
    def _base_mime_type(mime_type: str) -> str:
        """Return a MIME type lower-cased and without its parameters."""
        return mime_type.split(";", 1)[0].strip().lower()

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """
        Build query parameters for the API request.

        Only filters that are actually configured are included, and the
        default sort order ('popular') is never sent explicitly.

        Args:
            query: Free-text search string; omitted when empty

        Returns:
            Dictionary of query-string parameters
        """
        params: Dict[str, Any] = {}
        if query:
            params["search"] = query

        if self.languages:
            params["languages"] = self.languages

        if self.topic:
            params["topic"] = self.topic

        # Year bounds are compared against None so that year 0 is kept.
        if self.author_year_start is not None:
            params["author_year_start"] = self.author_year_start

        if self.author_year_end is not None:
            params["author_year_end"] = self.author_year_end

        if self.copyright_filter is not None:
            # Sent as the lowercase strings "true" / "false".
            params["copyright"] = str(self.copyright_filter).lower()

        if self.sort and self.sort != "popular":
            params["sort"] = self.sort

        return params

    def _get_best_format_url(self, formats: Dict[str, str]) -> Optional[str]:
        """
        Get the best available format URL for reading.

        Lookup order:
          1. exact keys from the priority list,
          2. any key whose base MIME type (ignoring parameters such as
             ``charset``) is a readable type, in priority order,
          3. the first available format.

        Step 2 keeps a charset variant that is not in the exact list
        (e.g. ``text/plain; charset=us-ascii``) from losing out to an
        unrelated entry such as the cover image.

        Args:
            formats: Mapping of MIME type to download URL

        Returns:
            The chosen URL, or None when no formats are available
        """
        if not formats:
            return None

        for mime_type in self._READ_FORMAT_PRIORITY:
            if mime_type in formats:
                return formats[mime_type]

        for wanted in self._READ_BASE_TYPE_PRIORITY:
            for mime_type, url in formats.items():
                if self._base_mime_type(mime_type) == wanted:
                    return url

        # Return first available if no priority match
        return next(iter(formats.values()))

    def _get_text_url(self, formats: Dict[str, str]) -> Optional[str]:
        """
        Get the plain text URL for content retrieval.

        The known ``text/plain`` keys are tried first; after that any other
        ``text/plain`` variant (different charset parameter) is accepted.

        Args:
            formats: Mapping of MIME type to download URL

        Returns:
            URL of a plain-text edition, or None when there is none
        """
        if not formats:
            return None

        for mime_type in self._TEXT_FORMAT_PRIORITY:
            if mime_type in formats:
                return formats[mime_type]

        for mime_type, url in formats.items():
            if self._base_mime_type(mime_type) == "text/plain":
                return url
        return None

    def _fetch_book_text(self, text_url: str) -> Optional[str]:
        """
        Fetch and return the plain text content of a book.

        The Project Gutenberg header/footer boilerplate is removed and the
        result is truncated to ``max_content_chars``.

        Args:
            text_url: URL of the plain-text edition

        Returns:
            The cleaned book text, or None when the response body is empty
            or the download fails

        Raises:
            RateLimitError: Propagated unchanged
            ValueError: Propagated unchanged
        """
        try:
            response = safe_get(text_url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()

            text = response.text
            if not text:
                return None

            # Strip the Project Gutenberg header boilerplate: everything up
            # to and including the line holding the first start marker.
            for marker in self._START_MARKERS:
                idx = text.find(marker)
                if idx != -1:
                    # Skip past the marker line
                    newline = text.find("\n", idx)
                    if newline != -1:
                        text = text[newline + 1 :]
                    break

            # Strip the footer: everything from the first end marker on.
            for marker in self._END_MARKERS:
                idx = text.find(marker)
                if idx != -1:
                    text = text[:idx]
                    break

            text = text.strip()

            # Truncate to max_content_chars
            if len(text) > self.max_content_chars:
                text = (
                    text[: self.max_content_chars] + "\n\n[... truncated ...]"
                )

            return text

        except (RateLimitError, ValueError):
            raise
        except Exception:
            # Best effort: the caller falls back to metadata-only content.
            logger.warning(f"Failed to fetch book text from {text_url}")
            return None

    def _parse_authors(self, authors: List[Dict]) -> List[str]:
        """
        Parse author information.

        At most the first five entries are used. Names in "Last, First"
        form are rewritten as "First Last"; entries without a name are
        skipped.

        Args:
            authors: List of person dictionaries with a "name" key; a
                missing (None) list is treated as empty

        Returns:
            List of display names
        """
        result = []
        for author in (authors or [])[:5]:
            name = author.get("name", "")
            if name:
                # Format: "Last, First" -> "First Last"
                if ", " in name:
                    parts = name.split(", ", 1)
                    name = f"{parts[1]} {parts[0]}"
                result.append(name)
        return result

    def _book_to_preview(self, book: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one Gutendex book record into a preview dictionary."""
        book_id = book.get("id")
        title = book.get("title", "Untitled")

        # Null fields in the API response are treated as empty so that one
        # missing field does not discard the whole book.
        authors = self._parse_authors(book.get("authors"))
        subjects = (book.get("subjects") or [])[:5]
        bookshelves = (book.get("bookshelves") or [])[:3]
        languages = book.get("languages") or []
        formats = book.get("formats") or {}
        read_url = self._get_best_format_url(formats)

        # Build Gutenberg URL
        gutenberg_url = f"https://www.gutenberg.org/ebooks/{book_id}"

        # Use the first summary if available, capped at 300 characters
        summaries = book.get("summaries", [])
        summary_text = ""
        if summaries and isinstance(summaries, list):
            first_summary = summaries[0]
            if isinstance(first_summary, str):
                summary_text = first_summary.strip()[:300]

        # Build snippet; subjects/bookshelves are only used as a fallback
        # when there is no summary.
        snippet_parts = []
        if summary_text:
            snippet_parts.append(summary_text)
        if authors:
            snippet_parts.append(f"By {', '.join(authors[:2])}")
        if subjects and not summary_text:
            snippet_parts.append(f"Subjects: {', '.join(subjects[:3])}")
        if bookshelves and not summary_text:
            snippet_parts.append(f"Bookshelves: {', '.join(bookshelves[:2])}")
        snippet = ". ".join(snippet_parts)

        return {
            "id": str(book_id),
            "title": title,
            "link": gutenberg_url,
            "snippet": snippet,
            "authors": authors,
            "subjects": subjects,
            "bookshelves": bookshelves,
            "languages": languages,
            "download_count": book.get("download_count", 0),
            "read_url": read_url,
            "cover_url": formats.get("image/jpeg"),
            "formats": list(formats),
            "copyright": book.get("copyright", False),
            "source": "Project Gutenberg",
            # Kept for _get_full_content, which removes it again.
            "_raw": book,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Project Gutenberg books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (at most ``max_results``); an
            empty list when the request fails
        """
        logger.info(f"Getting Gutenberg previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            results = data.get("results") or []
            total = data.get("count", 0)
            selected = results[: self.max_results]
            logger.info(
                f"Found {total} Gutenberg results, returning {len(selected)}"
            )

            previews = []
            for book in selected:
                try:
                    previews.append(self._book_to_preview(book))
                except Exception:
                    # One malformed record must not lose the other results.
                    logger.exception("Error parsing Gutenberg book")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Gutendex API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Gutenberg books.

        Fetches the actual plain text of each book from Project Gutenberg.
        When the text cannot be fetched, the content falls back to a
        metadata summary. Items without a ``_raw`` record are passed
        through without a ``content`` field being added.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content and without the
            internal ``_raw`` field
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Gutenberg books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw", {})
            if raw:
                # Replace the truncated preview lists with the full ones;
                # null fields are treated as empty.
                result["subjects"] = raw.get("subjects") or []
                result["bookshelves"] = raw.get("bookshelves") or []
                result["translators"] = self._parse_authors(
                    raw.get("translators")
                )

                # Fetch actual book text
                text_url = self._get_text_url(raw.get("formats") or {})

                book_text = None
                # Only download from the expected origin.
                if text_url and text_url.startswith(
                    "https://www.gutenberg.org/"
                ):
                    logger.info(
                        f"Fetching book text for '{result.get('title')}' from {text_url}"
                    )
                    book_text = self._fetch_book_text(text_url)
                elif text_url:
                    logger.warning(
                        f"Skipping text_url with unexpected origin: {text_url}"
                    )

                # Build content with metadata header + actual text
                content_parts = []
                if result.get("authors"):
                    content_parts.append(
                        f"Authors: {', '.join(result['authors'])}"
                    )
                if result.get("subjects"):
                    content_parts.append(
                        f"Subjects: {', '.join(result['subjects'][:5])}"
                    )

                if book_text:
                    # Blank line separates the metadata header from the text
                    content_parts.append("")
                    content_parts.append(book_text)
                    logger.info(
                        f"Retrieved {len(book_text)} chars of text for '{result.get('title')}'"
                    )
                else:
                    if result.get("bookshelves"):
                        content_parts.append(
                            f"Bookshelves: {', '.join(result['bookshelves'])}"
                        )
                    if result.get("download_count"):
                        content_parts.append(
                            f"Downloads: {result['download_count']}"
                        )
                    if result.get("read_url"):
                        content_parts.append(
                            f"Read online: {result['read_url']}"
                        )
                    logger.warning(
                        f"Could not fetch text for '{result.get('title')}', using metadata only"
                    )

                result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_book(self, book_id: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by Gutenberg ID.

        Args:
            book_id: The Project Gutenberg book ID

        Returns:
            Book dictionary or None if the request fails

        Raises:
            RateLimitError: Propagated unchanged
        """
        try:
            url = f"{self.base_url}/books/{book_id}"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching Gutenberg book {book_id}")
            return None

    def search_by_topic(self, topic: str) -> List[Dict[str, Any]]:
        """
        Search books by topic/subject.

        Temporarily replaces ``self.topic`` for the duration of the search
        and restores the previous value afterwards, even on error.

        Args:
            topic: The topic to search for

        Returns:
            List of matching books
        """
        original_topic = self.topic
        try:
            self.topic = topic
            # Empty query: the topic filter alone selects the books.
            return self.run("")
        finally:
            self.topic = original_topic