Coverage for src / local_deep_research / web_search_engines / engines / search_engine_openalex.py: 87%

158 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1"""OpenAlex search engine implementation for academic papers and research.""" 

2 

3from typing import Any, Dict, List, Optional 

4 

5from langchain_core.language_models import BaseLLM 

6from loguru import logger 

7 

8from ...constants import SNIPPET_LENGTH_LONG 

9from ...advanced_search_system.filters.journal_reputation_filter import ( 

10 JournalReputationFilter, 

11) 

12from ...security.safe_requests import safe_get 

13from ..rate_limiting import RateLimitError 

14from ..search_engine_base import BaseSearchEngine 

15 

16 

class OpenAlexSearchEngine(BaseSearchEngine):
    """OpenAlex search engine implementation with natural language query support."""

    # Mark as public search engine
    is_public = True
    # Scientific/academic search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 25,
        email: Optional[str] = None,
        sort_by: str = "relevance",
        filter_open_access: bool = False,
        min_citations: int = 0,
        from_publication_date: Optional[str] = None,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the OpenAlex search engine.

        Args:
            max_results: Maximum number of search results
            email: Email for polite pool (gets faster response) - optional
            sort_by: Sort order ('relevance', 'cited_by_count', 'publication_date')
            filter_open_access: Only return open access papers
            min_citations: Minimum citation count filter
            from_publication_date: Filter papers from this date (YYYY-MM-DD)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for configuration
            **kwargs: Additional parameters to pass to parent class
        """
        # Initialize journal reputation filter if needed
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm,
            engine_name="openalex",
            settings_snapshot=settings_snapshot,
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.sort_by = sort_by
        self.filter_open_access = filter_open_access
        self.min_citations = min_citations
        # Only set from_publication_date if it's not empty or the string
        # "False" (settings snapshots may serialize unset values that way).
        self.from_publication_date = (
            from_publication_date
            if from_publication_date and from_publication_date != "False"
            else None
        )

        # Get email from settings if not provided
        if not email and settings_snapshot:
            from ...config.search_config import get_setting_from_snapshot

            try:
                email = get_setting_from_snapshot(
                    "search.engine.web.openalex.email",
                    settings_snapshot=settings_snapshot,
                )
            except Exception:
                # Best-effort lookup: a missing setting simply means no
                # polite-pool email, not an error.
                pass

        # Handle "False" string for email
        self.email = email if email and email != "False" else None

        # API configuration.  Use the sanitized self.email here so a literal
        # "False" setting never leaks into the User-Agent header.
        self.api_base = "https://api.openalex.org"
        self.headers = {
            "User-Agent": f"Local-Deep-Research-Agent{f' ({self.email})' if self.email else ''}",
            "Accept": "application/json",
        }

        if self.email:
            # Email allows access to polite pool with faster response times
            logger.info(f"Using OpenAlex polite pool with email: {self.email}")
        else:
            logger.info(
                "Using OpenAlex without email (consider adding email for faster responses)"
            )

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for OpenAlex search results.

        Args:
            query: The search query (natural language supported!)

        Returns:
            List of preview dictionaries (empty on API error)

        Raises:
            RateLimitError: When OpenAlex returns HTTP 429, so the base
                class retry logic can back off and retry.
        """
        logger.info(f"Searching OpenAlex for: {query}")

        # Build the search URL with parameters
        params = {
            "search": query,  # OpenAlex handles natural language beautifully
            "per_page": min(self.max_results, 200),  # OpenAlex allows up to 200
            "page": 1,
            # Request specific fields including abstract for snippets
            "select": "id,display_name,publication_year,publication_date,doi,primary_location,authorships,cited_by_count,open_access,best_oa_location,abstract_inverted_index",
        }

        # Add optional filters
        filters = []

        if self.filter_open_access:
            filters.append("is_oa:true")

        if self.min_citations > 0:
            filters.append(f"cited_by_count:>{self.min_citations}")

        # self.from_publication_date is sanitized in __init__ (None if empty
        # or "False"), so a truthiness check is sufficient here.
        if self.from_publication_date:
            filters.append(
                f"from_publication_date:{self.from_publication_date}"
            )

        if filters:
            params["filter"] = ",".join(filters)

        # Add sorting; unknown sort_by values fall back to relevance.
        sort_map = {
            "relevance": "relevance_score:desc",
            "cited_by_count": "cited_by_count:desc",
            "publication_date": "publication_date:desc",
        }
        params["sort"] = sort_map.get(self.sort_by, "relevance_score:desc")

        # Add email to params for polite pool (self.email is already sanitized)
        if self.email:
            params["mailto"] = self.email

        try:
            # Apply rate limiting before making the request (simple like PubMed)
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Make the API request
            logger.info(f"Making OpenAlex API request with params: {params}")
            response = safe_get(
                f"{self.api_base}/works",
                params=params,
                headers=self.headers,
                timeout=30,
            )
            logger.info(f"OpenAlex API response status: {response.status_code}")

            # Log rate limit info if available
            if "x-ratelimit-remaining" in response.headers:
                remaining = response.headers.get("x-ratelimit-remaining")
                limit = response.headers.get("x-ratelimit-limit", "unknown")
                logger.debug(
                    f"OpenAlex rate limit: {remaining}/{limit} requests remaining"
                )

            if response.status_code == 200:
                data = response.json()
                results = data.get("results", [])
                meta = data.get("meta", {})
                total_count = meta.get("count", 0)

                logger.info(
                    f"OpenAlex returned {len(results)} results (total available: {total_count:,})"
                )

                # Log first result structure for debugging
                if results:
                    first_result = results[0]
                    logger.debug(
                        f"First result keys: {list(first_result.keys())}"
                    )
                    logger.debug(
                        f"First result has abstract: {'abstract_inverted_index' in first_result}"
                    )
                    if "open_access" in first_result:
                        logger.debug(
                            f"Open access structure: {first_result['open_access']}"
                        )

                # Format results as previews
                previews = []
                for i, work in enumerate(results):
                    logger.debug(
                        f"Formatting work {i + 1}/{len(results)}: {work.get('display_name', 'Unknown')[:50]}"
                    )
                    preview = self._format_work_preview(work)
                    if preview:
                        previews.append(preview)
                        logger.debug(
                            f"Preview created with snippet: {preview.get('snippet', '')[:100]}..."
                        )
                    else:
                        logger.warning(f"Failed to format work {i + 1}")

                logger.info(
                    f"Successfully formatted {len(previews)} previews from {len(results)} results"
                )
                return previews

            elif response.status_code == 429:
                # Rate limited (very rare with OpenAlex)
                logger.warning("OpenAlex rate limit reached")
                raise RateLimitError("OpenAlex rate limit exceeded")

            else:
                logger.error(
                    f"OpenAlex API error: {response.status_code} - {response.text[:200]}"
                )
                return []

        except RateLimitError:
            # Re-raise rate limit errors for base class retry handling
            raise
        except Exception:
            logger.exception("Error searching OpenAlex")
            return []

    def _format_work_preview(
        self, work: Dict[str, Any]
    ) -> Optional[Dict[str, Any]]:
        """
        Format an OpenAlex work as a preview dictionary.

        Args:
            work: OpenAlex work object

        Returns:
            Formatted preview dictionary or None if formatting fails
        """
        try:
            # Extract basic information
            work_id = work.get("id", "")
            title = work.get("display_name", "No title")
            logger.debug(f"Formatting work: {title[:50]}")

            # Build snippet from abstract or first part of title
            abstract = None
            if work.get("abstract_inverted_index"):
                logger.debug(
                    f"Found abstract_inverted_index with {len(work['abstract_inverted_index'])} words"
                )
                # Reconstruct abstract from inverted index
                abstract = self._reconstruct_abstract(
                    work["abstract_inverted_index"]
                )
                logger.debug(
                    f"Reconstructed abstract length: {len(abstract) if abstract else 0}"
                )
            else:
                logger.debug("No abstract_inverted_index found")

            snippet = (
                abstract[:SNIPPET_LENGTH_LONG]
                if abstract
                else f"Academic paper: {title}"
            )
            logger.debug(f"Created snippet: {snippet[:100]}...")

            # Get publication info
            publication_year = work.get("publication_year", "unknown")
            publication_date = work.get("publication_date", "unknown")

            # Get venue/journal info
            venue = work.get("primary_location", {})
            journal_name = "unknown"
            if venue:
                source = venue.get("source", {})
                if source:
                    journal_name = source.get("display_name", "unknown")

            # Get authors
            authors = []
            for authorship in work.get("authorships", [])[
                :5
            ]:  # Limit to 5 authors
                author = authorship.get("author", {})
                if author:
                    authors.append(author.get("display_name", ""))

            authors_str = ", ".join(authors)
            if len(work.get("authorships", [])) > 5:
                authors_str += " et al."

            # Get metrics
            cited_by_count = work.get("cited_by_count", 0)

            # Get URL - prefer DOI, fallback to OpenAlex URL.
            # NOTE: OpenAlex sends "doi": null for works without a DOI, so
            # work.get("doi", work_id) would return None and the
            # .startswith() calls below would raise AttributeError, silently
            # dropping the result via the except clause.  Use `or` so a null
            # DOI falls back to the OpenAlex work URL instead.
            url = work.get("doi") or work_id
            if not url.startswith("http"):
                if url.startswith("https://doi.org/"):
                    pass  # Already a full DOI URL
                elif url.startswith("10."):
                    # Bare DOI like "10.1234/abcd" - expand to a full URL
                    url = f"https://doi.org/{url}"
                else:
                    url = work_id  # OpenAlex URL

            # Check if open access
            open_access_info = work.get("open_access", {})
            is_oa = (
                open_access_info.get("is_oa", False)
                if open_access_info
                else False
            )
            oa_url = None
            if is_oa:
                best_location = work.get("best_oa_location", {})
                if best_location:
                    # Prefer a direct PDF link over the landing page
                    oa_url = best_location.get("pdf_url") or best_location.get(
                        "landing_page_url"
                    )

            preview = {
                "id": work_id,
                "title": title,
                "link": url,
                "snippet": snippet,
                "authors": authors_str,
                "year": publication_year,
                "date": publication_date,
                "journal": journal_name,
                "citations": cited_by_count,
                "is_open_access": is_oa,
                "oa_url": oa_url,
                "abstract": abstract,
                "type": "academic_paper",
            }

            return preview

        except Exception:
            logger.exception(
                f"Error formatting OpenAlex work: {work.get('id', 'unknown')}"
            )
            return None

    def _reconstruct_abstract(
        self, inverted_index: Dict[str, List[int]]
    ) -> str:
        """
        Reconstruct abstract text from OpenAlex inverted index format.

        Args:
            inverted_index: Dictionary mapping words to their positions

        Returns:
            Reconstructed abstract text (empty string on failure)
        """
        try:
            # Create position-word mapping
            position_word = {}
            for word, positions in inverted_index.items():
                for pos in positions:
                    position_word[pos] = word

            # Sort by position and reconstruct
            sorted_positions = sorted(position_word.keys())
            words = [position_word[pos] for pos in sorted_positions]

            return " ".join(words)

        except Exception:
            logger.debug("Could not reconstruct abstract from inverted index")
            return ""

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for relevant items (OpenAlex provides most content in preview).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # OpenAlex returns comprehensive data in the initial search,
        # so we don't need a separate full content fetch
        results = []
        for item in relevant_items:
            result = {
                "title": item.get("title", ""),
                "link": item.get("link", ""),
                "snippet": item.get("snippet", ""),
                # Prefer the full abstract as content, fall back to snippet
                "content": item.get("abstract", item.get("snippet", "")),
                "metadata": {
                    "authors": item.get("authors", ""),
                    "year": item.get("year", ""),
                    "journal": item.get("journal", ""),
                    "citations": item.get("citations", 0),
                    "is_open_access": item.get("is_open_access", False),
                    "oa_url": item.get("oa_url"),
                },
            }
            results.append(result)

        return results