Coverage for src / local_deep_research / web_search_engines / engines / search_engine_openlibrary.py: 89%
213 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
"""Open Library search engine for books and literature."""

import html
from typing import Any, Dict, List, Optional

import requests
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...security.safe_requests import safe_get
from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine


class OpenLibrarySearchEngine(BaseSearchEngine):
    """
    Open Library search engine for books and literature.

    Provides access to 2M+ books with metadata, covers, and reading lists.
    No authentication required. Part of the Internet Archive.
    """

    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True  # New category for book search
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        sort: str = "relevance",
        language: Optional[str] = None,
        search_field: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Open Library search engine.

        Args:
            max_results: Maximum number of search results
            sort: Sort order ('relevance', 'new', 'old', 'random')
            language: Filter by language code (e.g., 'eng', 'fre', 'ger')
            search_field: Search in specific field ('title', 'author', 'subject')
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort = sort
        self.language = language
        self.search_field = search_field

        self.base_url = "https://openlibrary.org"
        self.search_url = f"{self.base_url}/search.json"

        # User-Agent header is important for Open Library API
        # They may block requests without a proper User-Agent
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request."""
        params = {
            "limit": min(self.max_results, 100),
            "fields": "key,title,author_name,author_key,first_publish_year,"
            "publisher,language,subject,isbn,cover_i,edition_count,"
            "ebook_access,has_fulltext,ia,description",
        }

        # Build query based on search field
        if self.search_field == "title":
            params["title"] = query
        elif self.search_field == "author":
            params["author"] = query
        elif self.search_field == "subject":
            params["subject"] = query
        else:
            params["q"] = query

        # Add sort if not relevance (default)
        if self.sort and self.sort != "relevance":
            params["sort"] = self.sort

        # Add language filter
        if self.language:
            params["language"] = self.language

        return params

    def _get_cover_url(
        self, cover_id: Optional[int], size: str = "M"
    ) -> Optional[str]:
        """Get cover image URL for a book."""
        if not cover_id:
            return None
        return f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg"

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Open Library books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting Open Library previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            docs = data.get("docs", [])
            total_found = data.get("num_found", 0)
            logger.info(
                f"Found {total_found} Open Library results, returning {len(docs)}"
            )

            previews = []
            for doc in docs:
                try:
                    # Get work key and build URL
                    work_key = doc.get("key", "")
                    link = f"{self.base_url}{work_key}" if work_key else ""

                    # Get title (decode HTML entities)
                    title = html.unescape(doc.get("title", "Untitled"))

                    # Get authors
                    authors = doc.get("author_name", [])
                    if isinstance(authors, str):
                        authors = [authors]
                    authors = authors[:5]  # Limit to 5 authors

                    # Get first publish year
                    first_publish_year = doc.get("first_publish_year")

                    # Get publishers
                    publishers = doc.get("publisher", [])
                    if isinstance(publishers, str):
                        publishers = [publishers]
                    publisher = publishers[0] if publishers else ""

                    # Get subjects
                    subjects = doc.get("subject", [])
                    if isinstance(subjects, str):
                        subjects = [subjects]
                    subjects = subjects[:5]  # Limit to 5 subjects

                    # Get ISBNs
                    isbns = doc.get("isbn", [])
                    if isinstance(isbns, str):
                        isbns = [isbns]
                    isbn = isbns[0] if isbns else None

                    # Get cover
                    cover_id = doc.get("cover_i")
                    cover_url = self._get_cover_url(cover_id)

                    # Get description if available
                    description = doc.get("description", "")
                    # Description can be a string or a dict with "value" key
                    if isinstance(description, dict):
                        description = description.get("value", "")
                    elif isinstance(description, list):
                        description = (
                            " ".join(str(d) for d in description)
                            if description
                            else ""
                        )

                    # Build snippet with description for richer content
                    snippet_parts = []
                    if description:
                        snippet_parts.append(description[:800])
                    if authors:
                        snippet_parts.append(f"By {', '.join(authors[:3])}")
                    if first_publish_year:
                        snippet_parts.append(
                            f"First published: {first_publish_year}"
                        )
                    if subjects:
                        snippet_parts.append(
                            f"Subjects: {', '.join(subjects[:5])}"
                        )
                    snippet = ". ".join(snippet_parts)

                    # Check availability
                    has_fulltext = doc.get("has_fulltext", False)
                    ebook_access = doc.get("ebook_access", "no_ebook")
                    ia_ids = doc.get("ia", [])
                    if isinstance(ia_ids, str):
                        ia_ids = [ia_ids]

                    preview = {
                        "id": work_key,
                        "title": title,
                        "link": link,
                        "snippet": snippet,
                        "authors": authors,
                        "first_publish_year": first_publish_year,
                        "publisher": publisher,
                        "subjects": subjects,
                        "isbn": isbn,
                        "cover_url": cover_url,
                        "edition_count": doc.get("edition_count", 0),
                        "has_fulltext": has_fulltext,
                        "ebook_access": ebook_access,
                        "internet_archive_ids": ia_ids[:3] if ia_ids else [],
                        "source": "Open Library",
                        "_raw": doc,
                    }

                    previews.append(preview)

                except Exception:
                    logger.exception("Error parsing Open Library item")
                    continue

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Open Library API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Open Library books.

        Fetches detailed information from the Works API including
        full descriptions and excerpts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Open Library books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw", {})
            if raw:
                # Get all languages
                languages = raw.get("language", [])
                if isinstance(languages, str):
                    languages = [languages]
                result["languages"] = languages

                # Get all subjects
                result["subjects"] = raw.get("subject", [])
                if isinstance(result["subjects"], str):
                    result["subjects"] = [result["subjects"]]

                # Get all publishers
                result["publishers"] = raw.get("publisher", [])
                if isinstance(result["publishers"], str):
                    result["publishers"] = [result["publishers"]]

            # Fetch detailed info from Works API
            work_key = item.get("id", "")
            work_data = self._fetch_work_details(work_key)

            # Build content with metadata + description + excerpts
            content_parts = []
            if result.get("authors"):
                content_parts.append(
                    f"Authors: {', '.join(result['authors'])}"
                )
            if result.get("first_publish_year"):
                content_parts.append(
                    f"First published: {result['first_publish_year']}"
                )
            if result.get("subjects"):
                subjects = result["subjects"]
                if isinstance(subjects, list):
                    content_parts.append(
                        f"Subjects: {', '.join(subjects[:10])}"
                    )

            # Use full description from Works API if available
            description = ""
            if work_data:
                desc = work_data.get("description", "")
                if isinstance(desc, dict):
                    desc = desc.get("value", "")
                elif isinstance(desc, list):
                    desc = " ".join(str(d) for d in desc)
                if isinstance(desc, str) and desc:
                    description = desc
            if not description:
                # Fall back to the description from the search result
                desc = raw.get("description", "")
                if isinstance(desc, dict):
                    desc = desc.get("value", "")
                elif isinstance(desc, list):
                    desc = " ".join(str(d) for d in desc)
                if isinstance(desc, str) and desc:
                    description = desc
            if description:
                content_parts.append(f"\n{description}")

            # Add excerpts from Works API
            if work_data:
                excerpts = work_data.get("excerpts", [])
                if excerpts:
                    content_parts.append("\nExcerpts:")
                    for exc in excerpts[:5]:
                        text = exc.get("excerpt", "")
                        if text:
                            content_parts.append(f'  "{text}"')

            if result.get("has_fulltext"):
                content_parts.append(
                    "\nFull text available on Internet Archive"
                )

            result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            if "_raw" in result:
                del result["_raw"]

            results.append(result)

        return results

    def _fetch_work_details(self, work_key: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed work information from the Works API."""
        if not work_key or not work_key.startswith("/works/"):
            if work_key:
                logger.warning(
                    "Invalid work_key format: expected '/works/...' prefix"
                )
            return None
        try:
            url = f"{self.base_url}{work_key}.json"
            response = safe_get(url, headers=self.headers, timeout=15)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except (RateLimitError, ValueError):
            # Rate limits and JSON decode errors propagate to the caller
            raise
        except Exception:
            logger.warning(f"Failed to fetch work details for {work_key}")
            return None

    def get_book_by_isbn(self, isbn: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by ISBN.

        Args:
            isbn: The book ISBN (10 or 13 digit)

        Returns:
            Book dictionary or None
        """
        try:
            url = f"{self.base_url}/isbn/{isbn}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching book by ISBN {isbn}")
            return None

    def get_author(self, author_key: str) -> Optional[Dict[str, Any]]:
        """
        Get author information.

        Args:
            author_key: The author key (e.g., '/authors/OL23919A')

        Returns:
            Author dictionary or None
        """
        try:
            if not author_key or not author_key.startswith("/authors/"):
                logger.warning(
                    "Invalid author_key format: expected '/authors/...' prefix"
                )
                return None
            url = f"{self.base_url}{author_key}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching author {author_key}")
            return None