Coverage for src/local_deep_research/web_search_engines/engines/search_engine_openlibrary.py

1"""Open Library search engine for books and literature."""

3import html

4from typing import Any, Dict, List, Optional

6import requests

7from langchain_core.language_models import BaseLLM

9from ...constants import USER_AGENT

10from ...security.safe_requests import safe_get

11from ...security.secure_logging import logger

12from ..rate_limiting import RateLimitError

13from ..search_engine_base import BaseSearchEngine, Exposure, Sensitivity

class OpenLibrarySearchEngine(BaseSearchEngine):
    """
    Open Library search engine for books and literature.

    Provides access to 2M+ books with metadata, covers, and reading lists.
    No authentication required. Part of the Internet Archive.

    The engine works in two phases: ``_get_previews`` queries the search
    endpoint and builds lightweight preview dictionaries, and
    ``_get_full_content`` enriches the selected previews with data from the
    Works API (full description and excerpts).
    """

    is_public = True
    egress_sensitivity = Sensitivity.NON_SENSITIVE
    egress_exposure = Exposure.EXPOSING
    is_generic = False
    is_scientific = False
    is_books = True  # New category for book search
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        sort: str = "relevance",
        language: Optional[str] = None,
        search_field: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the Open Library search engine.

        Args:
            max_results: Maximum number of search results
            sort: Sort order ('relevance', 'new', 'old', 'random')
            language: Filter by language code (e.g., 'eng', 'fre', 'ger')
            search_field: Search in specific field ('title', 'author', 'subject')
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort = sort
        self.language = language
        self.search_field = search_field

        self.base_url = "https://openlibrary.org"
        self.search_url = f"{self.base_url}/search.json"

        # User-Agent header is important for Open Library API
        # They may block requests without a proper User-Agent
        self.headers = {"User-Agent": USER_AGENT}

    # ------------------------------------------------------------------
    # Small normalisation helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _as_list(value: Any) -> List[Any]:
        """Return *value* as a list.

        The API fields handled here are read as either a list or a bare
        string, and may be missing or ``null``. ``None`` becomes ``[]``, a
        string becomes a one-element list, lists/tuples are copied, and any
        other scalar is wrapped.
        """
        if value is None:
            return []
        if isinstance(value, str):
            return [value]
        if isinstance(value, (list, tuple)):
            return list(value)
        return [value]

    @staticmethod
    def _coerce_text(value: Any) -> str:
        """Return a text field as a plain string.

        Handles the three shapes the description code has to cope with: a
        string, a dict carrying the text under ``"value"``, or a list of
        fragments (joined with spaces). Anything else yields ``""``.
        """
        if isinstance(value, dict):
            value = value.get("value", "")
        elif isinstance(value, list):
            return " ".join(str(part) for part in value)
        return value if isinstance(value, str) else ""

    @staticmethod
    def _normalize_isbn(isbn: Any) -> Optional[str]:
        """Strip separators from *isbn* and validate its shape.

        Returns the cleaned ISBN (10 characters, optionally ending in ``X``,
        or 13 digits), or ``None`` when the input cannot be an ISBN. This
        keeps arbitrary caller-supplied text out of the request URL path.
        """
        if isbn is None:
            return None
        cleaned = str(isbn).replace("-", "").replace(" ", "").upper()
        if not cleaned.isascii():
            return None
        if len(cleaned) == 13 and cleaned.isdigit():
            return cleaned
        if (
            len(cleaned) == 10
            and cleaned[:9].isdigit()
            and (cleaned[9].isdigit() or cleaned[9] == "X")
        ):
            return cleaned
        return None

    # ------------------------------------------------------------------
    # Search phase
    # ------------------------------------------------------------------

    def _build_query_params(self, query: str) -> Dict[str, Any]:
        """Build query parameters for the API request.

        Args:
            query: The user's search query.

        Returns:
            Parameter dict for the search endpoint. The query is sent under
            ``title``/``author``/``subject`` when ``search_field`` names one
            of those, otherwise under the generic ``q`` parameter.
        """
        params: Dict[str, Any] = {
            "limit": min(self.max_results, 100),
            "fields": "key,title,author_name,author_key,first_publish_year,"
            "publisher,language,subject,isbn,cover_i,edition_count,"
            "ebook_access,has_fulltext,ia,description",
        }

        # Build query based on search field
        if self.search_field in ("title", "author", "subject"):
            params[self.search_field] = query
        else:
            params["q"] = query

        # Add sort if not relevance (default)
        if self.sort and self.sort != "relevance":
            params["sort"] = self.sort

        # Add language filter
        if self.language:
            params["language"] = self.language

        return params

    def _get_cover_url(
        self, cover_id: Optional[int], size: str = "M"
    ) -> Optional[str]:
        """Get cover image URL for a book.

        Args:
            cover_id: Cover identifier from the search result, if any.
            size: Size code inserted into the URL (defaults to ``"M"``).

        Returns:
            The cover URL, or ``None`` when no cover id is available.
        """
        if not cover_id:
            return None
        return f"https://covers.openlibrary.org/b/id/{cover_id}-{size}.jpg"

    def _doc_to_preview(self, doc: Dict[str, Any]) -> Dict[str, Any]:
        """Convert one search result document into a preview dictionary."""
        # Get work key and build URL
        work_key = doc.get("key") or ""
        link = f"{self.base_url}{work_key}" if work_key else ""

        # Get title (decode HTML entities); a null title falls back too
        title = html.unescape(str(doc.get("title") or "Untitled"))

        authors = self._as_list(doc.get("author_name"))[:5]
        first_publish_year = doc.get("first_publish_year")
        publishers = self._as_list(doc.get("publisher"))
        subjects = self._as_list(doc.get("subject"))[:5]
        isbns = self._as_list(doc.get("isbn"))
        ia_ids = self._as_list(doc.get("ia"))
        description = self._coerce_text(doc.get("description", ""))

        # Build snippet with description for richer content
        snippet_parts = []
        if description:
            snippet_parts.append(description[:800])
        if authors:
            snippet_parts.append(
                f"By {', '.join(str(a) for a in authors[:3])}"
            )
        if first_publish_year:
            snippet_parts.append(f"First published: {first_publish_year}")
        if subjects:
            snippet_parts.append(
                f"Subjects: {', '.join(str(s) for s in subjects)}"
            )

        return {
            "id": work_key,
            "title": title,
            "link": link,
            "snippet": ". ".join(snippet_parts),
            "authors": authors,
            "first_publish_year": first_publish_year,
            "publisher": publishers[0] if publishers else "",
            "subjects": subjects,
            "isbn": isbns[0] if isbns else None,
            "cover_url": self._get_cover_url(doc.get("cover_i")),
            "edition_count": doc.get("edition_count", 0),
            "has_fulltext": doc.get("has_fulltext", False),
            "ebook_access": doc.get("ebook_access", "no_ebook"),
            "internet_archive_ids": ia_ids[:3],
            "source": "Open Library",
            # Kept so _get_full_content can read the untruncated fields.
            "_raw": doc,
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for Open Library books.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries. Empty when the request fails or
            the response body is not a JSON object. Individual documents
            that cannot be parsed are logged and skipped.
        """
        logger.info(f"Getting Open Library previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Only the network round-trip and JSON decoding are guarded here;
        # per-document parsing has its own handler below.
        try:
            response = safe_get(
                self.search_url,
                params=self._build_query_params(query),
                headers=self.headers,
                timeout=30,
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            safe_msg = self._scrub_error(e)
            logger.exception(
                f"Open Library API request failed ({type(e).__name__}): {safe_msg}"
            )
            self._raise_if_rate_limit(e)
            return []

        if not isinstance(data, dict):
            logger.warning(
                "Unexpected Open Library response: expected a JSON object"
            )
            return []

        docs = self._as_list(data.get("docs"))
        total_found = data.get("num_found", 0)
        logger.info(
            f"Found {total_found} Open Library results, returning {len(docs)}"
        )

        previews = []
        for doc in docs:
            try:
                previews.append(self._doc_to_preview(doc))
            except Exception as e:
                # Best effort: one malformed document must not discard
                # the rest of the result set.
                safe_msg = self._scrub_error(e)
                logger.exception(
                    f"Error parsing Open Library item ({type(e).__name__}): {safe_msg}"
                )
        return previews

    # ------------------------------------------------------------------
    # Full-content phase
    # ------------------------------------------------------------------

    def _build_content(
        self,
        result: Dict[str, Any],
        raw: Dict[str, Any],
        work_data: Optional[Dict[str, Any]],
    ) -> str:
        """Assemble the text content (metadata, description, excerpts)."""
        content_parts = []

        authors = self._as_list(result.get("authors"))
        if authors:
            content_parts.append(
                f"Authors: {', '.join(str(a) for a in authors)}"
            )
        if result.get("first_publish_year"):
            content_parts.append(
                f"First published: {result['first_publish_year']}"
            )
        subjects = self._as_list(result.get("subjects"))
        if subjects:
            content_parts.append(
                f"Subjects: {', '.join(str(s) for s in subjects[:10])}"
            )

        # Use full description from Works API if available, otherwise fall
        # back to whatever the search result carried.
        description = ""
        if work_data:
            description = self._coerce_text(work_data.get("description", ""))
        if not description:
            description = self._coerce_text(raw.get("description", ""))
        if description:
            content_parts.append(f"\n{description}")

        # Add excerpts from Works API; the header is only emitted when at
        # least one excerpt actually has text.
        excerpt_texts = []
        if work_data:
            for entry in self._as_list(work_data.get("excerpts"))[:5]:
                source = entry.get("excerpt", "") if isinstance(entry, dict) else entry
                text = self._coerce_text(source)
                if text:
                    excerpt_texts.append(text)
        if excerpt_texts:
            content_parts.append("\nExcerpts:")
            content_parts.extend(f' "{text}"' for text in excerpt_texts)

        if result.get("has_fulltext"):
            content_parts.append("\nFull text available on Internet Archive")

        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant Open Library books.

        Fetches detailed information from the Works API including
        full descriptions and excerpts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. Each is a copy of
            its preview with ``languages``/``subjects``/``publishers`` filled
            from the raw search document (when present), a ``content`` string
            added, and the internal ``_raw`` field removed.

        Raises:
            Whatever ``_fetch_work_details`` propagates (``RateLimitError``
            and ``ValueError``).
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} Open Library books"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            raw = item.get("_raw")
            if not isinstance(raw, dict):
                raw = {}
            if raw:
                # The preview truncates these lists; expose them in full.
                result["languages"] = self._as_list(raw.get("language"))
                result["subjects"] = self._as_list(raw.get("subject"))
                result["publishers"] = self._as_list(raw.get("publisher"))

            # Fetch detailed info from Works API
            work_data = self._fetch_work_details(item.get("id", ""))

            result["content"] = self._build_content(result, raw, work_data)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def _fetch_work_details(self, work_key: str) -> Optional[Dict[str, Any]]:
        """Fetch detailed work information from the Works API.

        Args:
            work_key: Work key with a ``/works/`` prefix.

        Returns:
            The decoded work record, or ``None`` when the key is empty or
            malformed, the request fails, or the body is not a JSON object.

        Raises:
            RateLimitError: Propagated so the caller can back off.
            ValueError: Propagated unchanged.
        """
        if not work_key or not work_key.startswith("/works/"):
            if work_key:
                logger.warning(
                    "Invalid work_key format: expected '/works/...' prefix"
                )
            return None
        try:
            url = f"{self.base_url}{work_key}.json"
            response = safe_get(url, headers=self.headers, timeout=15)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()
        except (RateLimitError, ValueError):
            # NOTE(review): ValueError is deliberately not swallowed here,
            # presumably so request-validation failures stay visible; this
            # also lets JSON decode errors through - confirm that is intended.
            raise
        except Exception as e:
            safe_msg = self._scrub_error(e)
            logger.warning(
                f"Failed to fetch work details for {work_key} "
                f"({type(e).__name__}): {safe_msg}"
            )
            return None
        return data if isinstance(data, dict) else None

    # ------------------------------------------------------------------
    # Direct lookups
    # ------------------------------------------------------------------

    def get_book_by_isbn(self, isbn: str) -> Optional[Dict[str, Any]]:
        """
        Get a specific book by ISBN.

        Args:
            isbn: The book ISBN (10 or 13 digit); hyphens and spaces are
                ignored.

        Returns:
            Book dictionary or None (also None when *isbn* is not a
            well-formed ISBN, in which case no request is made)

        Raises:
            RateLimitError: Propagated so the caller can back off.
        """
        cleaned = self._normalize_isbn(isbn)
        if cleaned is None:
            logger.warning(
                "Invalid ISBN format: expected 10 or 13 digit ISBN"
            )
            return None
        try:
            url = f"{self.base_url}/isbn/{cleaned}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception as e:
            safe_msg = self._scrub_error(e)
            logger.exception(
                f"Error fetching book by ISBN {isbn} ({type(e).__name__}): {safe_msg}"
            )
            return None

    def get_author(self, author_key: str) -> Optional[Dict[str, Any]]:
        """
        Get author information.

        Args:
            author_key: The author key (e.g., '/authors/OL23919A')

        Returns:
            Author dictionary or None

        Raises:
            RateLimitError: Propagated so the caller can back off.
        """
        try:
            if not author_key or not author_key.startswith("/authors/"):
                logger.warning(
                    "Invalid author_key format: expected '/authors/...' prefix"
                )
                return None
            url = f"{self.base_url}{author_key}.json"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            return response.json()  # type: ignore[no-any-return]
        except RateLimitError:
            raise
        except Exception as e:
            safe_msg = self._scrub_error(e)
            logger.exception(
                f"Error fetching author {author_key} ({type(e).__name__}): {safe_msg}"
            )
            return None

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_openlibrary.py: 89%

220 statements