Coverage for src/local_deep_research/web_search_engines/engines/search_engine_openlibrary.py: 89%

214 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""Open Library search engine for books and literature.""" 

2 

3import html 

4from typing import Any, Dict, List, Optional 

5 

6import requests 

7from langchain_core.language_models import BaseLLM 

8from loguru import logger 

9 

10from ...constants import USER_AGENT 

11from ...security.safe_requests import safe_get 

12from ..rate_limiting import RateLimitError 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

class OpenLibrarySearchEngine(BaseSearchEngine):
    """
    Open Library search engine for books and literature.

    Provides access to 2M+ books with metadata, covers, and reading lists.
    No authentication required. Part of the Internet Archive.
    """

    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True  # New category for book search
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        sort: str = "relevance",
        language: Optional[str] = None,
        search_field: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Open Library search engine.

        Args:
            max_results: Maximum number of search results
            sort: Sort order ('relevance', 'new', 'old', 'random')
            language: Filter by language code (e.g., 'eng', 'fre', 'ger')
            search_field: Search in specific field ('title', 'author', 'subject')
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class initializer
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort = sort
        self.language = language
        self.search_field = search_field

        self.base_url = "https://openlibrary.org"
        self.search_url = f"{self.base_url}/search.json"

        # User-Agent header is important for Open Library API
        # They may block requests without a proper User-Agent
        self.headers = {"User-Agent": USER_AGENT}

    # ------------------------------------------------------------------
    # Small normalization helpers shared by the parsing methods
    # ------------------------------------------------------------------

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Coerce a field that may be missing, null, scalar, or a list into a list.

        ``None`` becomes ``[]``, a list is returned as-is, a tuple is copied
        into a list, and any other value (typically a bare string) is wrapped
        in a one-element list.
        """
        if value is None:
            return []
        if isinstance(value, list):
            return value
        if isinstance(value, tuple):
            return list(value)
        return [value]

    @staticmethod
    def _text_value(value: Any) -> str:
        """Flatten a text field into a plain string.

        Handles the three shapes this class already expected for
        descriptions: a plain string, a dict carrying the text under
        ``"value"``, or a list of fragments (joined with spaces). Anything
        that does not end up as a string yields ``""``.
        """
        if isinstance(value, dict):
            value = value.get("value", "")
        elif isinstance(value, list):
            value = " ".join(str(part) for part in value)
        return value if isinstance(value, str) else ""

    @staticmethod
    def _normalize_isbn(isbn: Any) -> Optional[str]:
        """Return *isbn* without hyphens/spaces, or None if it is not ISBN-shaped.

        Accepts 13 ASCII digits, or 9 ASCII digits followed by a digit or
        ``X`` (ISBN-10). Only the shape is checked, not the check digit; the
        purpose is to keep arbitrary text out of the request path.
        """
        if not isinstance(isbn, str):
            return None
        cleaned = isbn.replace("-", "").replace(" ", "").upper()
        if not cleaned.isascii():
            # str.isdigit() also accepts non-ASCII digits; reject them here.
            return None
        if len(cleaned) == 13 and cleaned.isdigit():
            return cleaned
        if (
            len(cleaned) == 10
            and cleaned[:9].isdigit()
            and (cleaned[9].isdigit() or cleaned[9] == "X")
        ):
            return cleaned
        return None

    # ------------------------------------------------------------------
    # Search
    # ------------------------------------------------------------------

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request.

        Args:
            query: The search query

        Returns:
            Parameter dict containing the result limit, the requested field
            list, the query under ``title``/``author``/``subject``/``q``
            (depending on ``self.search_field``), and optional ``sort`` and
            ``language`` entries.
        """
        params: Dict[str, Any] = {
            "limit": min(self.max_results, 100),
            "fields": "key,title,author_name,author_key,first_publish_year,"
            "publisher,language,subject,isbn,cover_i,edition_count,"
            "ebook_access,has_fulltext,ia,description",
        }

        # A recognised search_field becomes the parameter name itself;
        # anything else falls back to the general "q" parameter.
        if self.search_field in ("title", "author", "subject"):
            params[self.search_field] = query
        else:
            params["q"] = query

        # Add sort if not relevance (default)
        if self.sort and self.sort != "relevance":
            params["sort"] = self.sort

        # Add language filter
        if self.language:
            params["language"] = self.language

        return params

    def _get_cover_url(
        self, cover_id: Optional[int], size: str = "M"
    ) -> Optional[str]:
        """Get cover image URL for a book.

        Args:
            cover_id: Cover identifier from the search result (``cover_i``)
            size: Size suffix placed in the file name (default ``"M"``)

        Returns:
            The cover URL, or None when *cover_id* is missing or falsy.
        """
        if not cover_id:
            return None
        return f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg"

    def _doc_to_preview(self, doc: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one search result document into a preview dictionary.

        Fields that are absent or null are treated as empty rather than
        raising, so a single null value no longer discards the whole record.
        """
        # Work key doubles as the preview id and as the URL path
        work_key = doc.get("key") or ""
        link = f"{self.base_url}{work_key}" if work_key else ""

        # Title (decode HTML entities); fall back when missing or null
        title = html.unescape(str(doc.get("title") or "Untitled"))

        authors = self._as_list(doc.get("author_name"))[:5]  # Limit to 5
        first_publish_year = doc.get("first_publish_year")

        publishers = self._as_list(doc.get("publisher"))
        publisher = publishers[0] if publishers else ""

        subjects = self._as_list(doc.get("subject"))[:5]  # Limit to 5

        isbns = self._as_list(doc.get("isbn"))
        isbn = isbns[0] if isbns else None

        cover_url = self._get_cover_url(doc.get("cover_i"))

        # Description can be a string, a dict with "value" key, or a list
        description = self._text_value(doc.get("description"))

        # Build snippet with description for richer content
        snippet_parts = []
        if description:
            snippet_parts.append(description[:800])
        if authors:
            snippet_parts.append(f"By {', '.join(authors[:3])}")
        if first_publish_year:
            snippet_parts.append(f"First published: {first_publish_year}")
        if subjects:
            snippet_parts.append(f"Subjects: {', '.join(subjects[:5])}")
        snippet = ". ".join(snippet_parts)

        # Availability information
        ia_ids = self._as_list(doc.get("ia"))

        return {
            "id": work_key,
            "title": title,
            "link": link,
            "snippet": snippet,
            "authors": authors,
            "first_publish_year": first_publish_year,
            "publisher": publisher,
            "subjects": subjects,
            "isbn": isbn,
            "cover_url": cover_url,
            "edition_count": doc.get("edition_count", 0),
            "has_fulltext": doc.get("has_fulltext", False),
            "ebook_access": doc.get("ebook_access", "no_ebook"),
            "internet_archive_ids": ia_ids[:3],
            "source": "Open Library",
            # Kept so _get_full_content can read the untruncated fields;
            # it is removed again before results are returned from there.
            "_raw": doc,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Open Library books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. Returns an empty list when the
            request fails with a ``requests`` error or a ``ValueError``,
            unless ``_raise_if_rate_limit`` raises for that error.
        """
        logger.info(f"Getting Open Library previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            # "docs": null is treated the same as a missing key
            docs = data.get("docs") or []
            total_found = data.get("num_found", 0)
            logger.info(
                f"Found {total_found} Open Library results, returning {len(docs)}"
            )

            previews = []
            for doc in docs:
                # One malformed record must not discard the rest of the page
                try:
                    previews.append(self._doc_to_preview(doc))
                except Exception:
                    logger.exception("Error parsing Open Library item")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Open Library API request failed")
            self._raise_if_rate_limit(e)
            return []

    # ------------------------------------------------------------------
    # Full content
    # ------------------------------------------------------------------

    def _build_content(
        self,
        result: Dict[str, Any],
        raw: Dict[str, Any],
        work_data: Dict[str, Any],
    ) -> str:
        """Assemble the text content (metadata, description, excerpts) for one book."""
        content_parts = []
        if result.get("authors"):
            content_parts.append(f"Authors: {', '.join(result['authors'])}")
        if result.get("first_publish_year"):
            content_parts.append(
                f"First published: {result['first_publish_year']}"
            )
        subjects = result.get("subjects")
        if subjects and isinstance(subjects, list):
            content_parts.append(f"Subjects: {', '.join(subjects[:10])}")

        # Prefer the description from the Works API; fall back to the one
        # that came with the search result.
        description = self._text_value(work_data.get("description"))
        if not description:
            description = self._text_value(raw.get("description"))
        if description:
            content_parts.append(f"\n{description}")

        # Excerpts from the Works API. The excerpt text is flattened with the
        # same helper as descriptions so a {"value": ...} object is unwrapped
        # instead of being rendered as a dict repr.
        excerpt_texts = []
        for exc in self._as_list(work_data.get("excerpts"))[:5]:
            raw_text = exc.get("excerpt", "") if isinstance(exc, dict) else exc
            text = self._text_value(raw_text)
            if text:
                excerpt_texts.append(text)
        if excerpt_texts:
            # Header is only emitted when there is something to list under it
            content_parts.append("\nExcerpts:")
            content_parts.extend(f' "{text}"' for text in excerpt_texts)

        if result.get("has_fulltext"):
            content_parts.append("\nFull text available on Internet Archive")

        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Open Library books.

        Fetches detailed information from the Works API including
        full descriptions and excerpts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. Each is a shallow
            copy of the preview with a ``content`` string added and the
            internal ``_raw`` field removed.

        Raises:
            RateLimitError, ValueError: propagated from
                ``_fetch_work_details``, which re-raises both.
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Open Library books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            # Tolerate a missing, null, or non-dict "_raw"
            raw = item.get("_raw")
            if not isinstance(raw, dict):
                raw = {}

            if raw:
                # Replace the truncated preview lists with the complete ones
                result["languages"] = self._as_list(raw.get("language"))
                result["subjects"] = self._as_list(raw.get("subject"))
                result["publishers"] = self._as_list(raw.get("publisher"))

            # Fetch detailed info from Works API
            work_key = item.get("id", "")
            work_data = self._fetch_work_details(work_key)
            if not isinstance(work_data, dict):
                # None on failure; also guards against a non-object payload
                work_data = {}

            result["content"] = self._build_content(result, raw, work_data)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def _fetch_work_details(self, work_key: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed work information from the Works API.

        Args:
            work_key: Work key; must start with ``/works/``

        Returns:
            The decoded JSON response, or None when the key is empty or
            malformed or the request fails.

        Raises:
            RateLimitError, ValueError: re-raised rather than swallowed.
        """
        if not work_key or not work_key.startswith("/works/"):
            if work_key:
                logger.warning(
                    "Invalid work_key format: expected '/works/...' prefix"
                )
            return None
        try:
            url = f"{self.base_url}{work_key}.json"
            response = safe_get(url, headers=self.headers, timeout=15)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except (RateLimitError, ValueError):
            raise
        except Exception:
            # Best effort: callers fall back to search-result metadata
            logger.warning(f"Failed to fetch work details for {work_key}")
            return None

    # ------------------------------------------------------------------
    # Direct lookups
    # ------------------------------------------------------------------

    def get_book_by_isbn(self, isbn: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by ISBN.

        Args:
            isbn: The book ISBN (10 or 13 digit); hyphens and spaces are
                ignored

        Returns:
            Book dictionary or None. None is returned without issuing a
            request when *isbn* is not shaped like an ISBN-10 or ISBN-13.

        Raises:
            RateLimitError: re-raised rather than swallowed.
        """
        # Validate before building the URL, in line with the key checks in
        # _fetch_work_details and get_author, so arbitrary text never
        # reaches the request path.
        normalized = self._normalize_isbn(isbn)
        if normalized is None:
            logger.warning(
                "Invalid ISBN format: expected 10 or 13 characters"
            )
            return None
        try:
            url = f"{self.base_url}/isbn/{normalized}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching book by ISBN {isbn}")
            return None

    def get_author(self, author_key: str) -> Optional[Dict[str, Any]]:
        """
        Get author information.

        Args:
            author_key: The author key (e.g., '/authors/OL23919A')

        Returns:
            Author dictionary or None (also when the key is empty or lacks
            the '/authors/' prefix, or the request fails).

        Raises:
            RateLimitError: re-raised rather than swallowed.
        """
        try:
            if not author_key or not author_key.startswith("/authors/"):
                logger.warning(
                    "Invalid author_key format: expected '/authors/...' prefix"
                )
                return None
            url = f"{self.base_url}{author_key}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching author {author_key}")
            return None