Coverage for src / local_deep_research / web_search_engines / engines / search_engine_openlibrary.py: 89%

213 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""Open Library search engine for books and literature.""" 

2 

3import html 

4from typing import Any, Dict, List, Optional 

5 

6import requests 

7from langchain_core.language_models import BaseLLM 

8from loguru import logger 

9 

10from ...security.safe_requests import safe_get 

11from ..rate_limiting import RateLimitError 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

class OpenLibrarySearchEngine(BaseSearchEngine):
    """
    Open Library search engine for books and literature.

    Provides access to 2M+ books with metadata, covers, and reading lists.
    No authentication required. Part of the Internet Archive.
    """

    # Engine classification flags (consumed by the surrounding search
    # framework — presumably BaseSearchEngine machinery; verify there).
    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True  # New category for book search
    is_lexical = True
    # Keyword results benefit from an LLM relevance pass (see the `llm`
    # constructor parameter).
    needs_llm_relevance_filter = True

30 def __init__( 

31 self, 

32 max_results: int = 10, 

33 sort: str = "relevance", 

34 language: Optional[str] = None, 

35 search_field: Optional[str] = None, 

36 llm: Optional[BaseLLM] = None, 

37 max_filtered_results: Optional[int] = None, 

38 settings_snapshot: Optional[Dict[str, Any]] = None, 

39 **kwargs, 

40 ): 

41 """ 

42 Initialize the Open Library search engine. 

43 

44 Args: 

45 max_results: Maximum number of search results 

46 sort: Sort order ('relevance', 'new', 'old', 'random') 

47 language: Filter by language code (e.g., 'eng', 'fre', 'ger') 

48 search_field: Search in specific field ('title', 'author', 'subject') 

49 llm: Language model for relevance filtering 

50 max_filtered_results: Maximum results after filtering 

51 settings_snapshot: Settings snapshot for thread context 

52 """ 

53 super().__init__( 

54 llm=llm, 

55 max_filtered_results=max_filtered_results, 

56 max_results=max_results, 

57 settings_snapshot=settings_snapshot, 

58 **kwargs, 

59 ) 

60 

61 self.sort = sort 

62 self.language = language 

63 self.search_field = search_field 

64 

65 self.base_url = "https://openlibrary.org" 

66 self.search_url = f"{self.base_url}/search.json" 

67 

68 # User-Agent header is important for Open Library API 

69 # They may block requests without a proper User-Agent 

70 self.headers = { 

71 "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)" 

72 } 

73 

74 def _build_query_params(self, query: str) -> Dict[str, Any]: 

75 """Build query parameters for the API request.""" 

76 params = { 

77 "limit": min(self.max_results, 100), 

78 "fields": "key,title,author_name,author_key,first_publish_year," 

79 "publisher,language,subject,isbn,cover_i,edition_count," 

80 "ebook_access,has_fulltext,ia,description", 

81 } 

82 

83 # Build query based on search field 

84 if self.search_field == "title": 

85 params["title"] = query 

86 elif self.search_field == "author": 

87 params["author"] = query 

88 elif self.search_field == "subject": 88 ↛ 89line 88 didn't jump to line 89 because the condition on line 88 was never true

89 params["subject"] = query 

90 else: 

91 params["q"] = query 

92 

93 # Add sort if not relevance (default) 

94 if self.sort and self.sort != "relevance": 

95 params["sort"] = self.sort 

96 

97 # Add language filter 

98 if self.language: 

99 params["language"] = self.language 

100 

101 return params 

102 

103 def _get_cover_url( 

104 self, cover_id: Optional[int], size: str = "M" 

105 ) -> Optional[str]: 

106 """Get cover image URL for a book.""" 

107 if not cover_id: 

108 return None 

109 return f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg" 

110 

111 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

112 """ 

113 Get preview information for Open Library books. 

114 

115 Args: 

116 query: The search query 

117 

118 Returns: 

119 List of preview dictionaries 

120 """ 

121 logger.info(f"Getting Open Library previews for query: {query}") 

122 

123 # Apply rate limiting 

124 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

125 self.engine_type 

126 ) 

127 

128 try: 

129 params = self._build_query_params(query) 

130 response = safe_get( 

131 self.search_url, 

132 params=params, 

133 headers=self.headers, 

134 timeout=30, 

135 ) 

136 

137 self._raise_if_rate_limit(response.status_code) 

138 

139 response.raise_for_status() 

140 data = response.json() 

141 

142 docs = data.get("docs", []) 

143 total_found = data.get("num_found", 0) 

144 logger.info( 

145 f"Found {total_found} Open Library results, returning {len(docs)}" 

146 ) 

147 

148 previews = [] 

149 for doc in docs: 

150 try: 

151 # Get work key and build URL 

152 work_key = doc.get("key", "") 

153 link = f"{self.base_url}{work_key}" if work_key else "" 

154 

155 # Get title (decode HTML entities) 

156 title = html.unescape(doc.get("title", "Untitled")) 

157 

158 # Get authors 

159 authors = doc.get("author_name", []) 

160 if isinstance(authors, str): 

161 authors = [authors] 

162 authors = authors[:5] # Limit to 5 authors 

163 

164 # Get first publish year 

165 first_publish_year = doc.get("first_publish_year") 

166 

167 # Get publishers 

168 publishers = doc.get("publisher", []) 

169 if isinstance(publishers, str): 169 ↛ 170line 169 didn't jump to line 170 because the condition on line 169 was never true

170 publishers = [publishers] 

171 publisher = publishers[0] if publishers else "" 

172 

173 # Get subjects 

174 subjects = doc.get("subject", []) 

175 if isinstance(subjects, str): 175 ↛ 176line 175 didn't jump to line 176 because the condition on line 175 was never true

176 subjects = [subjects] 

177 subjects = subjects[:5] # Limit to 5 subjects 

178 

179 # Get ISBNs 

180 isbns = doc.get("isbn", []) 

181 if isinstance(isbns, str): 181 ↛ 182line 181 didn't jump to line 182 because the condition on line 181 was never true

182 isbns = [isbns] 

183 isbn = isbns[0] if isbns else None 

184 

185 # Get cover 

186 cover_id = doc.get("cover_i") 

187 cover_url = self._get_cover_url(cover_id) 

188 

189 # Get description if available 

190 description = doc.get("description", "") 

191 # Description can be a string or a dict with "value" key 

192 if isinstance(description, dict): 192 ↛ 193line 192 didn't jump to line 193 because the condition on line 192 was never true

193 description = description.get("value", "") 

194 elif isinstance(description, list): 194 ↛ 195line 194 didn't jump to line 195 because the condition on line 194 was never true

195 description = ( 

196 " ".join(str(d) for d in description) 

197 if description 

198 else "" 

199 ) 

200 

201 # Build snippet with description for richer content 

202 snippet_parts = [] 

203 if description: 203 ↛ 204line 203 didn't jump to line 204 because the condition on line 203 was never true

204 snippet_parts.append(description[:800]) 

205 if authors: 

206 snippet_parts.append(f"By {', '.join(authors[:3])}") 

207 if first_publish_year: 

208 snippet_parts.append( 

209 f"First published: {first_publish_year}" 

210 ) 

211 if subjects: 

212 snippet_parts.append( 

213 f"Subjects: {', '.join(subjects[:5])}" 

214 ) 

215 snippet = ". ".join(snippet_parts) 

216 

217 # Check availability 

218 has_fulltext = doc.get("has_fulltext", False) 

219 ebook_access = doc.get("ebook_access", "no_ebook") 

220 ia_ids = doc.get("ia", []) 

221 if isinstance(ia_ids, str): 221 ↛ 222line 221 didn't jump to line 222 because the condition on line 221 was never true

222 ia_ids = [ia_ids] 

223 

224 preview = { 

225 "id": work_key, 

226 "title": title, 

227 "link": link, 

228 "snippet": snippet, 

229 "authors": authors, 

230 "first_publish_year": first_publish_year, 

231 "publisher": publisher, 

232 "subjects": subjects, 

233 "isbn": isbn, 

234 "cover_url": cover_url, 

235 "edition_count": doc.get("edition_count", 0), 

236 "has_fulltext": has_fulltext, 

237 "ebook_access": ebook_access, 

238 "internet_archive_ids": ia_ids[:3] if ia_ids else [], 

239 "source": "Open Library", 

240 "_raw": doc, 

241 } 

242 

243 previews.append(preview) 

244 

245 except Exception: 

246 logger.exception("Error parsing Open Library item") 

247 continue 

248 

249 return previews 

250 

251 except (requests.RequestException, ValueError) as e: 

252 logger.exception("Open Library API request failed") 

253 self._raise_if_rate_limit(e) 

254 return [] 

255 

256 def _get_full_content( 

257 self, relevant_items: List[Dict[str, Any]] 

258 ) -> List[Dict[str, Any]]: 

259 """ 

260 Get full content for the relevant Open Library books. 

261 

262 Fetches detailed information from the Works API including 

263 full descriptions and excerpts. 

264 

265 Args: 

266 relevant_items: List of relevant preview dictionaries 

267 

268 Returns: 

269 List of result dictionaries with full content 

270 """ 

271 logger.info( 

272 f"Getting full content for {len(relevant_items)} Open Library books" 

273 ) 

274 

275 results = [] 

276 for item in relevant_items: 

277 result = item.copy() 

278 

279 raw = item.get("_raw", {}) 

280 if raw: 

281 # Get all languages 

282 languages = raw.get("language", []) 

283 if isinstance(languages, str): 

284 languages = [languages] 

285 result["languages"] = languages 

286 

287 # Get all subjects 

288 result["subjects"] = raw.get("subject", []) 

289 if isinstance(result["subjects"], str): 

290 result["subjects"] = [result["subjects"]] 

291 

292 # Get all publishers 

293 result["publishers"] = raw.get("publisher", []) 

294 if isinstance(result["publishers"], str): 

295 result["publishers"] = [result["publishers"]] 

296 

297 # Fetch detailed info from Works API 

298 work_key = item.get("id", "") 

299 work_data = self._fetch_work_details(work_key) 

300 

301 # Build content with metadata + description + excerpts 

302 content_parts = [] 

303 if result.get("authors"): 

304 content_parts.append( 

305 f"Authors: {', '.join(result['authors'])}" 

306 ) 

307 if result.get("first_publish_year"): 

308 content_parts.append( 

309 f"First published: {result['first_publish_year']}" 

310 ) 

311 if result.get("subjects"): 

312 subjects = result["subjects"] 

313 if isinstance(subjects, list): 313 ↛ 319line 313 didn't jump to line 319 because the condition on line 313 was always true

314 content_parts.append( 

315 f"Subjects: {', '.join(subjects[:10])}" 

316 ) 

317 

318 # Use full description from Works API if available 

319 description = "" 

320 if work_data: 

321 desc = work_data.get("description", "") 

322 if isinstance(desc, dict): 

323 desc = desc.get("value", "") 

324 elif isinstance(desc, list): 324 ↛ 325line 324 didn't jump to line 325 because the condition on line 324 was never true

325 desc = " ".join(str(d) for d in desc) 

326 if isinstance(desc, str) and desc: 326 ↛ 328line 326 didn't jump to line 328 because the condition on line 326 was always true

327 description = desc 

328 if not description: 

329 desc = raw.get("description", "") 

330 if isinstance(desc, dict): 330 ↛ 331line 330 didn't jump to line 331 because the condition on line 330 was never true

331 desc = desc.get("value", "") 

332 elif isinstance(desc, list): 332 ↛ 333line 332 didn't jump to line 333 because the condition on line 332 was never true

333 desc = " ".join(str(d) for d in desc) 

334 if isinstance(desc, str) and desc: 

335 description = desc 

336 if description: 

337 content_parts.append(f"\n{description}") 

338 

339 # Add excerpts from Works API 

340 if work_data: 

341 excerpts = work_data.get("excerpts", []) 

342 if excerpts: 

343 content_parts.append("\nExcerpts:") 

344 for exc in excerpts[:5]: 

345 text = exc.get("excerpt", "") 

346 if text: 346 ↛ 344line 346 didn't jump to line 344 because the condition on line 346 was always true

347 content_parts.append(f' "{text}"') 

348 

349 if result.get("has_fulltext"): 

350 content_parts.append( 

351 "\nFull text available on Internet Archive" 

352 ) 

353 

354 result["content"] = "\n".join(content_parts) 

355 

356 # Clean up internal fields 

357 if "_raw" in result: 

358 del result["_raw"] 

359 

360 results.append(result) 

361 

362 return results 

363 

364 def _fetch_work_details(self, work_key: str) -> Optional[Dict[str, Any]]: 

365 """Fetch detailed work information from the Works API.""" 

366 if not work_key or not work_key.startswith("/works/"): 

367 if work_key: 

368 logger.warning( 

369 "Invalid work_key format: expected '/works/...' prefix" 

370 ) 

371 return None 

372 try: 

373 url = f"{self.base_url}{work_key}.json" 

374 response = safe_get(url, headers=self.headers, timeout=15) 

375 self._raise_if_rate_limit(response.status_code) 

376 response.raise_for_status() 

377 return response.json() # type: ignore[no-any-return] 

378 except (RateLimitError, ValueError): 

379 raise 

380 except Exception: 

381 logger.warning(f"Failed to fetch work details for {work_key}") 

382 return None 

383 

384 def get_book_by_isbn(self, isbn: str) -> Optional[Dict[str, Any]]: 

385 """ 

386 Get a specific book by ISBN. 

387 

388 Args: 

389 isbn: The book ISBN (10 or 13 digit) 

390 

391 Returns: 

392 Book dictionary or None 

393 """ 

394 try: 

395 url = f"{self.base_url}/isbn/{isbn}.json" 

396 response = safe_get(url, headers=self.headers, timeout=30) 

397 self._raise_if_rate_limit(response.status_code) 

398 response.raise_for_status() 

399 return response.json() # type: ignore[no-any-return] 

400 except RateLimitError: 

401 raise 

402 except Exception: 

403 logger.exception(f"Error fetching book by ISBN {isbn}") 

404 return None 

405 

406 def get_author(self, author_key: str) -> Optional[Dict[str, Any]]: 

407 """ 

408 Get author information. 

409 

410 Args: 

411 author_key: The author key (e.g., '/authors/OL23919A') 

412 

413 Returns: 

414 Author dictionary or None 

415 """ 

416 try: 

417 if not author_key or not author_key.startswith("/authors/"): 417 ↛ 418line 417 didn't jump to line 418 because the condition on line 417 was never true

418 logger.warning( 

419 "Invalid author_key format: expected '/authors/...' prefix" 

420 ) 

421 return None 

422 url = f"{self.base_url}{author_key}.json" 

423 response = safe_get(url, headers=self.headers, timeout=30) 

424 self._raise_if_rate_limit(response.status_code) 

425 response.raise_for_status() 

426 return response.json() # type: ignore[no-any-return] 

427 except RateLimitError: 

428 raise 

429 except Exception: 

430 logger.exception(f"Error fetching author {author_key}") 

431 return None