Coverage for src / local_deep_research / web_search_engines / engines / search_engine_gutenberg.py: 91%

206 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Project Gutenberg search engine via Gutendex API.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5import requests 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...security.safe_requests import safe_get 

10from ..rate_limiting import RateLimitError 

11from ..search_engine_base import BaseSearchEngine 

12 

13 

class GutenbergSearchEngine(BaseSearchEngine):
    """
    Project Gutenberg search engine via Gutendex API.

    Provides access to 70,000+ free public domain books with full text.
    No authentication required.
    """

    # Engine classification flags (presumably read by the engine
    # registry/selection logic in BaseSearchEngine — confirm there).
    is_public = True  # no API key or authentication required
    is_generic = False  # domain-specific (books), not a general web engine
    is_scientific = False
    is_books = True
    is_lexical = True  # keyword search, not semantic
    needs_llm_relevance_filter = True  # results should be LLM-filtered for relevance


29 def __init__( 

30 self, 

31 max_results: int = 10, 

32 languages: Optional[str] = None, 

33 topic: Optional[str] = None, 

34 author_year_start: Optional[int] = None, 

35 author_year_end: Optional[int] = None, 

36 copyright_filter: Optional[bool] = None, 

37 sort: str = "popular", 

38 max_content_chars: int = 50000, 

39 llm: Optional[BaseLLM] = None, 

40 max_filtered_results: Optional[int] = None, 

41 settings_snapshot: Optional[Dict[str, Any]] = None, 

42 **kwargs, 

43 ): 

44 """ 

45 Initialize the Project Gutenberg search engine. 

46 

47 Args: 

48 max_results: Maximum number of search results 

49 languages: Filter by language codes (e.g., 'en', 'fr,de') 

50 topic: Filter by subject/bookshelf topic 

51 author_year_start: Filter authors born after this year 

52 author_year_end: Filter authors born before this year 

53 copyright_filter: Filter by copyright status (True/False/None) 

54 sort: Sort order ('popular', 'ascending', 'descending') 

55 max_content_chars: Maximum characters of book text to retrieve 

56 llm: Language model for relevance filtering 

57 max_filtered_results: Maximum results after filtering 

58 settings_snapshot: Settings snapshot for thread context 

59 """ 

60 super().__init__( 

61 llm=llm, 

62 max_filtered_results=max_filtered_results, 

63 max_results=max_results, 

64 settings_snapshot=settings_snapshot, 

65 **kwargs, 

66 ) 

67 

68 self.languages = languages 

69 self.topic = topic 

70 self.author_year_start = author_year_start 

71 self.author_year_end = author_year_end 

72 self.copyright_filter = copyright_filter 

73 self.sort = sort 

74 self.max_content_chars = max_content_chars 

75 

76 self.base_url = "https://gutendex.com" 

77 self.search_url = f"{self.base_url}/books/" 

78 

79 # User-Agent header for API requests 

80 self.headers = { 

81 "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)" 

82 } 

83 

84 def _build_query_params(self, query: str) -> Dict[str, Any]: 

85 """Build query parameters for the API request.""" 

86 params: Dict[str, Any] = {} 

87 if query: 87 ↛ 90line 87 didn't jump to line 90 because the condition on line 87 was always true

88 params["search"] = query 

89 

90 if self.languages: 

91 params["languages"] = self.languages 

92 

93 if self.topic: 

94 params["topic"] = self.topic 

95 

96 if self.author_year_start is not None: 96 ↛ 97line 96 didn't jump to line 97 because the condition on line 96 was never true

97 params["author_year_start"] = self.author_year_start 

98 

99 if self.author_year_end is not None: 99 ↛ 100line 99 didn't jump to line 100 because the condition on line 99 was never true

100 params["author_year_end"] = self.author_year_end 

101 

102 if self.copyright_filter is not None: 102 ↛ 103line 102 didn't jump to line 103 because the condition on line 102 was never true

103 params["copyright"] = str(self.copyright_filter).lower() 

104 

105 if self.sort and self.sort != "popular": 105 ↛ 106line 105 didn't jump to line 106 because the condition on line 105 was never true

106 params["sort"] = self.sort 

107 

108 return params 

109 

110 def _get_best_format_url(self, formats: Dict[str, str]) -> Optional[str]: 

111 """Get the best available format URL for reading.""" 

112 # Priority order for reading formats 

113 priority = [ 

114 "text/html", 

115 "text/html; charset=utf-8", 

116 "text/plain; charset=utf-8", 

117 "text/plain", 

118 "application/epub+zip", 

119 "application/x-mobipocket-ebook", 

120 "application/pdf", 

121 ] 

122 

123 for mime_type in priority: 

124 if mime_type in formats: 

125 return formats[mime_type] 

126 

127 # Return first available if no priority match 

128 if formats: 

129 return next(iter(formats.values())) 

130 return None 

131 

132 def _get_text_url(self, formats: Dict[str, str]) -> Optional[str]: 

133 """Get the plain text URL for content retrieval.""" 

134 for mime_type in [ 

135 "text/plain; charset=utf-8", 

136 "text/plain; charset=us-ascii", 

137 "text/plain", 

138 ]: 

139 if mime_type in formats: 

140 return formats[mime_type] 

141 return None 

142 

143 def _fetch_book_text(self, text_url: str) -> Optional[str]: 

144 """Fetch and return the plain text content of a book.""" 

145 try: 

146 response = safe_get(text_url, headers=self.headers, timeout=30) 

147 self._raise_if_rate_limit(response.status_code) 

148 response.raise_for_status() 

149 

150 text = response.text 

151 if not text: 

152 return None 

153 

154 # Strip the Project Gutenberg header/footer boilerplate 

155 start_markers = [ 

156 "*** START OF THE PROJECT GUTENBERG EBOOK", 

157 "*** START OF THIS PROJECT GUTENBERG EBOOK", 

158 "***START OF THE PROJECT GUTENBERG EBOOK", 

159 ] 

160 end_markers = [ 

161 "*** END OF THE PROJECT GUTENBERG EBOOK", 

162 "*** END OF THIS PROJECT GUTENBERG EBOOK", 

163 "***END OF THE PROJECT GUTENBERG EBOOK", 

164 ] 

165 

166 for marker in start_markers: 166 ↛ 175line 166 didn't jump to line 175 because the loop on line 166 didn't complete

167 idx = text.find(marker) 

168 if idx != -1: 

169 # Skip past the marker line 

170 newline = text.find("\n", idx) 

171 if newline != -1: 171 ↛ 173line 171 didn't jump to line 173 because the condition on line 171 was always true

172 text = text[newline + 1 :] 

173 break 

174 

175 for marker in end_markers: 175 ↛ 181line 175 didn't jump to line 181 because the loop on line 175 didn't complete

176 idx = text.find(marker) 

177 if idx != -1: 

178 text = text[:idx] 

179 break 

180 

181 text = text.strip() 

182 

183 # Truncate to max_content_chars 

184 if len(text) > self.max_content_chars: 

185 text = ( 

186 text[: self.max_content_chars] + "\n\n[... truncated ...]" 

187 ) 

188 

189 return text 

190 

191 except (RateLimitError, ValueError): 

192 raise 

193 except Exception: 

194 logger.warning(f"Failed to fetch book text from {text_url}") 

195 return None 

196 

197 def _parse_authors(self, authors: List[Dict]) -> List[str]: 

198 """Parse author information.""" 

199 result = [] 

200 for author in authors[:5]: 

201 name = author.get("name", "") 

202 if name: 202 ↛ 200line 202 didn't jump to line 200 because the condition on line 202 was always true

203 # Format: "Last, First" -> "First Last" 

204 if ", " in name: 

205 parts = name.split(", ", 1) 

206 name = f"{parts[1]} {parts[0]}" 

207 result.append(name) 

208 return result 

209 

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Project Gutenberg books.

        Queries the Gutendex /books/ endpoint with the configured filters
        and converts up to ``self.max_results`` records into lightweight
        preview dictionaries. The full API record is kept under "_raw"
        for later expansion by _get_full_content.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty list on request failure)
        """
        logger.info(f"Getting Gutenberg previews for query: {query}")

        # Apply rate limiting before hitting the API.
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            results = data.get("results", [])
            total = data.get("count", 0)
            logger.info(
                f"Found {total} Gutenberg results, returning {len(results)}"
            )

            previews = []
            for book in results[: self.max_results]:
                # Each record is parsed defensively; one malformed book
                # must not discard the whole result set.
                try:
                    book_id = book.get("id")
                    title = book.get("title", "Untitled")

                    # Authors, normalized to "First Last" display names.
                    authors = self._parse_authors(book.get("authors", []))

                    # Subjects and bookshelves, truncated for preview size.
                    subjects = book.get("subjects", [])[:5]
                    bookshelves = book.get("bookshelves", [])[:3]

                    # Language codes as reported by the API.
                    languages = book.get("languages", [])

                    # Available formats; pick the most readable link.
                    formats = book.get("formats", {})
                    read_url = self._get_best_format_url(formats)

                    # Canonical Project Gutenberg landing page for the book.
                    gutenberg_url = (
                        f"https://www.gutenberg.org/ebooks/{book_id}"
                    )

                    # Use the first summary (capped at 300 chars) if present.
                    summaries = book.get("summaries", [])
                    summary_text = ""
                    if summaries and isinstance(summaries, list):
                        first_summary = summaries[0] if summaries else ""
                        if isinstance(first_summary, str):
                            summary_text = first_summary.strip()[:300]

                    # Snippet: prefer the summary; otherwise fall back to
                    # subject/bookshelf metadata so the snippet is never bare.
                    snippet_parts = []
                    if summary_text:
                        snippet_parts.append(summary_text)
                    if authors:
                        snippet_parts.append(f"By {', '.join(authors[:2])}")
                    if subjects and not summary_text:
                        snippet_parts.append(
                            f"Subjects: {', '.join(subjects[:3])}"
                        )
                    if bookshelves and not summary_text:
                        snippet_parts.append(
                            f"Bookshelves: {', '.join(bookshelves[:2])}"
                        )
                    snippet = ". ".join(snippet_parts)

                    # Cover image, when the API provides one.
                    cover_url = formats.get("image/jpeg")

                    preview = {
                        "id": str(book_id),
                        "title": title,
                        "link": gutenberg_url,
                        "snippet": snippet,
                        "authors": authors,
                        "subjects": subjects,
                        "bookshelves": bookshelves,
                        "languages": languages,
                        "download_count": book.get("download_count", 0),
                        "read_url": read_url,
                        "cover_url": cover_url,
                        "formats": list(formats.keys()),
                        "copyright": book.get("copyright", False),
                        "source": "Project Gutenberg",
                        # Full API record, consumed (and removed) later by
                        # _get_full_content.
                        "_raw": book,
                    }

                    previews.append(preview)

                except Exception:
                    logger.exception("Error parsing Gutenberg book")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Gutendex API request failed")
            # Re-raise as RateLimitError if the failure was rate limiting.
            self._raise_if_rate_limit(e)
            return []

330 

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Gutenberg books.

        Fetches the actual plain text of each book from Project Gutenberg.
        When the text cannot be fetched (or only non-gutenberg.org URLs
        are offered), the result falls back to a metadata-only summary.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Gutenberg books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            # "_raw" is the full Gutendex record stashed by _get_previews;
            # missing/empty means metadata enrichment is skipped.
            raw = item.get("_raw", {})
            if raw:
                # Replace the truncated preview lists with the full ones.
                result["subjects"] = raw.get("subjects", [])
                result["bookshelves"] = raw.get("bookshelves", [])

                # Translators share the author record format.
                translators = raw.get("translators", [])
                result["translators"] = self._parse_authors(translators)

            # Locate a plain-text download URL for the book.
            formats = raw.get("formats", {})
            text_url = self._get_text_url(formats)

            book_text = None
            # Only fetch from the official gutenberg.org origin; other
            # origins are skipped (logged) rather than fetched.
            if text_url and text_url.startswith(
                "https://www.gutenberg.org/"
            ):
                logger.info(
                    f"Fetching book text for '{result.get('title')}' from {text_url}"
                )
                book_text = self._fetch_book_text(text_url)
            elif text_url:
                logger.warning(
                    f"Skipping text_url with unexpected origin: {text_url}"
                )

            # Build content: metadata header first, then the book text.
            content_parts = []
            if result.get("authors"):
                content_parts.append(
                    f"Authors: {', '.join(result['authors'])}"
                )
            if result.get("subjects"):
                content_parts.append(
                    f"Subjects: {', '.join(result['subjects'][:5])}"
                )

            if book_text:
                # Blank line separates the metadata header from the text.
                content_parts.append("")
                content_parts.append(book_text)
                logger.info(
                    f"Retrieved {len(book_text)} chars of text for '{result.get('title')}'"
                )
            else:
                # Metadata-only fallback when no text could be retrieved.
                if result.get("bookshelves"):
                    content_parts.append(
                        f"Bookshelves: {', '.join(result['bookshelves'])}"
                    )
                if result.get("download_count"):
                    content_parts.append(
                        f"Downloads: {result['download_count']}"
                    )
                if result.get("read_url"):
                    content_parts.append(
                        f"Read online: {result['read_url']}"
                    )
                logger.warning(
                    f"Could not fetch text for '{result.get('title')}', using metadata only"
                )

            result["content"] = "\n".join(content_parts)

            # Drop the internal "_raw" field before returning results.
            if "_raw" in result:
                del result["_raw"]

            results.append(result)

        return results

425 

426 def get_book(self, book_id: int) -> Optional[Dict[str, Any]]: 

427 """ 

428 Get a specific book by Gutenberg ID. 

429 

430 Args: 

431 book_id: The Project Gutenberg book ID 

432 

433 Returns: 

434 Book dictionary or None 

435 """ 

436 try: 

437 url = f"{self.base_url}/books/{book_id}" 

438 response = safe_get(url, headers=self.headers, timeout=30) 

439 self._raise_if_rate_limit(response.status_code) 

440 response.raise_for_status() 

441 return response.json() # type: ignore[no-any-return] 

442 except RateLimitError: 

443 raise 

444 except Exception: 

445 logger.exception(f"Error fetching Gutenberg book {book_id}") 

446 return None 

447 

448 def search_by_topic(self, topic: str) -> List[Dict[str, Any]]: 

449 """ 

450 Search books by topic/subject. 

451 

452 Args: 

453 topic: The topic to search for 

454 

455 Returns: 

456 List of matching books 

457 """ 

458 original_topic = self.topic 

459 try: 

460 self.topic = topic 

461 return self.run("") 

462 finally: 

463 self.topic = original_topic