Coverage for src/local_deep_research/web_search_engines/engines/search_engine_openlibrary.py: 89%
214 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""Open Library search engine for books and literature."""
3import html
4from typing import Any, Dict, List, Optional
6import requests
7from langchain_core.language_models import BaseLLM
8from loguru import logger
10from ...constants import USER_AGENT
11from ...security.safe_requests import safe_get
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
class OpenLibrarySearchEngine(BaseSearchEngine):
    """
    Open Library search engine for books and literature.

    Provides access to 2M+ books with metadata, covers, and reading lists.
    No authentication required. Part of the Internet Archive.
    """

    is_public = True
    is_generic = False
    is_scientific = False
    is_books = True  # New category for book search
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        sort: str = "relevance",
        language: Optional[str] = None,
        search_field: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Open Library search engine.

        Args:
            max_results: Maximum number of search results
            sort: Sort order ('relevance', 'new', 'old', 'random')
            language: Filter by language code (e.g., 'eng', 'fre', 'ger')
            search_field: Search in specific field ('title', 'author', 'subject')
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort = sort
        self.language = language
        self.search_field = search_field

        self.base_url = "https://openlibrary.org"
        self.search_url = f"{self.base_url}/search.json"

        # User-Agent header is important for Open Library API
        # They may block requests without a proper User-Agent
        self.headers = {"User-Agent": USER_AGENT}

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Coerce a field that may be missing, a scalar, or a list into a list.

        ``None`` becomes ``[]``, a string becomes a one-element list, and
        lists/tuples are copied. Any other scalar is wrapped in a list.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _text_value(value: Any) -> str:
        """Flatten a text field into a plain string.

        The field can be a string, a dict with a "value" key, or a list of
        parts (joined with spaces). Anything that does not end up as a
        string yields ``""``.
        """
        if isinstance(value, dict):
            value = value.get("value", "")
        elif isinstance(value, list):
            value = " ".join(str(part) for part in value)
        return value if isinstance(value, str) else ""

    @staticmethod
    def _normalize_isbn(isbn: Any) -> Optional[str]:
        """Return the ISBN without hyphens/spaces, or None if it is not valid.

        Only the shape is checked (13 digits, or 9 digits followed by a digit
        or 'X'); the check digit itself is not verified.
        """
        if isbn is None:
            return None
        compact = str(isbn).replace("-", "").replace(" ", "").upper()
        # str.isdigit() accepts non-ASCII digits, so restrict to ASCII first.
        if not compact.isascii():
            return None
        if len(compact) == 13 and compact.isdigit():
            return compact
        if (
            len(compact) == 10
            and compact[:9].isdigit()
            and (compact[9].isdigit() or compact[9] == "X")
        ):
            return compact
        return None

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request."""
        params: Dict[str, Any] = {
            "limit": min(self.max_results, 100),
            "fields": "key,title,author_name,author_key,first_publish_year,"
            "publisher,language,subject,isbn,cover_i,edition_count,"
            "ebook_access,has_fulltext,ia,description",
        }

        # A recognised search field is sent under its own parameter name;
        # anything else falls back to the general "q" parameter.
        if self.search_field in ("title", "author", "subject"):
            params[self.search_field] = query
        else:
            params["q"] = query

        # Add sort if not relevance (default)
        if self.sort and self.sort != "relevance":
            params["sort"] = self.sort

        # Add language filter
        if self.language:
            params["language"] = self.language

        return params

    def _get_cover_url(
        self, cover_id: Optional[int], size: str = "M"
    ) -> Optional[str]:
        """Get cover image URL for a book, or None when there is no cover id."""
        if not cover_id:
            return None
        return f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg"

    def _parse_doc(self, doc: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one search result document into a preview dictionary.

        May raise on malformed documents; the caller is expected to catch
        and skip those.
        """
        # Get work key and build URL
        work_key = doc.get("key", "")
        link = f"{self.base_url}{work_key}" if work_key else ""

        # Get title (decode HTML entities); missing/null/empty -> "Untitled"
        title = html.unescape(doc.get("title") or "Untitled")

        authors = self._as_list(doc.get("author_name"))[:5]  # Limit to 5
        first_publish_year = doc.get("first_publish_year")

        publishers = self._as_list(doc.get("publisher"))
        publisher = publishers[0] if publishers else ""

        subjects = self._as_list(doc.get("subject"))[:5]  # Limit to 5

        isbns = self._as_list(doc.get("isbn"))
        isbn = isbns[0] if isbns else None

        cover_url = self._get_cover_url(doc.get("cover_i"))

        # Description can be a string, a dict with "value" key, or a list
        description = self._text_value(doc.get("description"))

        # Build snippet with description for richer content
        snippet_parts = []
        if description:
            snippet_parts.append(description[:800])
        if authors:
            snippet_parts.append(f"By {', '.join(authors[:3])}")
        if first_publish_year:
            snippet_parts.append(f"First published: {first_publish_year}")
        if subjects:
            snippet_parts.append(f"Subjects: {', '.join(subjects[:5])}")
        snippet = ". ".join(snippet_parts)

        # Check availability
        ia_ids = self._as_list(doc.get("ia"))

        return {
            "id": work_key,
            "title": title,
            "link": link,
            "snippet": snippet,
            "authors": authors,
            "first_publish_year": first_publish_year,
            "publisher": publisher,
            "subjects": subjects,
            "isbn": isbn,
            "cover_url": cover_url,
            "edition_count": doc.get("edition_count", 0),
            "has_fulltext": doc.get("has_fulltext", False),
            "ebook_access": doc.get("ebook_access", "no_ebook"),
            "internet_archive_ids": ia_ids[:3],
            "source": "Open Library",
            "_raw": doc,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Open Library books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. Documents that fail to parse are
            logged and skipped; a failed request yields an empty list unless
            the rate-limit check raises.
        """
        logger.info(f"Getting Open Library previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        try:
            params = self._build_query_params(query)
            response = safe_get(
                self.search_url,
                params=params,
                headers=self.headers,
                timeout=30,
            )

            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            # "docs" may be absent or null; treat both as no results.
            docs = data.get("docs") or []
            total_found = data.get("num_found", 0)
            logger.info(
                f"Found {total_found} Open Library results, returning {len(docs)}"
            )

            previews = []
            for doc in docs:
                try:
                    previews.append(self._parse_doc(doc))
                except Exception:
                    # One malformed document must not discard the others.
                    logger.exception("Error parsing Open Library item")

            return previews

        except (requests.RequestException, ValueError) as e:
            logger.exception("Open Library API request failed")
            self._raise_if_rate_limit(e)
            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Open Library books.

        Fetches detailed information from the Works API including
        full descriptions and excerpts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content (the internal
            "_raw" field is removed from each result)
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Open Library books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            # "_raw" may be missing or explicitly None; both mean "no raw doc".
            raw = item.get("_raw") or {}
            if raw:
                # Replace the truncated preview lists with the full ones
                result["languages"] = self._as_list(raw.get("language"))
                result["subjects"] = self._as_list(raw.get("subject"))
                result["publishers"] = self._as_list(raw.get("publisher"))

            # Fetch detailed info from Works API
            work_data = self._fetch_work_details(item.get("id", ""))

            # Build content with metadata + description + excerpts
            content_parts = []
            if result.get("authors"):
                content_parts.append(
                    f"Authors: {', '.join(map(str, result['authors']))}"
                )
            if result.get("first_publish_year"):
                content_parts.append(
                    f"First published: {result['first_publish_year']}"
                )
            subjects = result.get("subjects")
            if subjects and isinstance(subjects, list):
                # map(str, ...) so a non-string subject cannot abort the batch
                content_parts.append(
                    f"Subjects: {', '.join(map(str, subjects[:10]))}"
                )

            # Use full description from Works API if available, otherwise
            # fall back to the one in the search document.
            description = (
                self._text_value(work_data.get("description"))
                if work_data
                else ""
            )
            if not description:
                description = self._text_value(raw.get("description"))
            if description:
                content_parts.append(f"\n{description}")

            # Add excerpts from Works API
            if work_data:
                excerpts = work_data.get("excerpts", [])
                if excerpts and isinstance(excerpts, list):
                    content_parts.append("\nExcerpts:")
                    for exc in excerpts[:5]:
                        if not isinstance(exc, dict):
                            continue
                        text = exc.get("excerpt", "")
                        # Handled defensively: the excerpt text may be
                        # wrapped in a dict with a "value" key, like
                        # descriptions are.
                        if isinstance(text, dict):
                            text = text.get("value", "")
                        if text:
                            content_parts.append(f' "{text}"')

            if result.get("has_fulltext"):
                content_parts.append(
                    "\nFull text available on Internet Archive"
                )

            result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def _fetch_work_details(self, work_key: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed work information from the Works API.

        Returns None when the key is empty, does not start with '/works/',
        or the request fails. RateLimitError and ValueError are re-raised.
        """
        if not work_key or not work_key.startswith("/works/"):
            if work_key:
                logger.warning(
                    "Invalid work_key format: expected '/works/...' prefix"
                )
            return None
        try:
            url = f"{self.base_url}{work_key}.json"
            response = safe_get(url, headers=self.headers, timeout=15)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except (RateLimitError, ValueError):
            raise
        except Exception:
            logger.warning(f"Failed to fetch work details for {work_key}")
            return None

    def get_book_by_isbn(self, isbn: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by ISBN.

        Args:
            isbn: The book ISBN (10 or 13 digit); hyphens and spaces are
                ignored

        Returns:
            Book dictionary, or None if the ISBN is malformed or the
            request fails

        Raises:
            RateLimitError: re-raised when the rate-limit check triggers
        """
        try:
            # Validate before building the URL so arbitrary input cannot
            # alter the request path.
            normalized = self._normalize_isbn(isbn)
            if normalized is None:
                logger.warning(
                    "Invalid ISBN format: expected 10 or 13 digits"
                )
                return None
            url = f"{self.base_url}/isbn/{normalized}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching book by ISBN {isbn}")
            return None

    def get_author(self, author_key: str) -> Optional[Dict[str, Any]]:
        """
        Get author information.

        Args:
            author_key: The author key (e.g., '/authors/OL23919A')

        Returns:
            Author dictionary, or None if the key is malformed or the
            request fails

        Raises:
            RateLimitError: re-raised when the rate-limit check triggers
        """
        try:
            if not author_key or not author_key.startswith("/authors/"):
                logger.warning(
                    "Invalid author_key format: expected '/authors/...' prefix"
                )
                return None
            url = f"{self.base_url}{author_key}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching author {author_key}")
            return None