Coverage for src/local_deep_research/web_search_engines/engines/search_engine_arxiv.py: 70%

158 statements  

coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

from typing import Any, Dict, List, Optional

import arxiv
from langchain_core.language_models import BaseLLM
from loguru import logger

from ...advanced_search_system.filters.journal_reputation_filter import (
    JournalReputationFilter,
)
from ...config import search_config
from ...constants import SNIPPET_LENGTH_SHORT
from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine


class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with a two-phase approach."""

    # Mark as public search engine
    is_public = True
    # Not a generic search engine (specialized for academic papers)
    is_generic = False
    # Scientific/academic search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 10,
        sort_by: str = "relevance",
        sort_order: str = "descending",
        include_full_text: bool = False,
        download_dir: Optional[str] = None,
        max_full_text: int = 1,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results
            sort_by: Sorting criterion ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort order ('ascending' or 'descending')
            include_full_text: Whether to include full paper content in results (downloads PDF)
            download_dir: Directory to download PDFs to (if include_full_text is True)
            max_full_text: Maximum number of PDFs to download and process (default: 1)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
        """

        # Initialize the journal reputation filter if needed.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm, engine_name="arxiv", settings_snapshot=settings_snapshot
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            # We deliberately do this filtering after relevancy checks,
            # because it is potentially quite slow.
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
        )
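        # Note: the next line enforces a floor of 25 results, silently
        # overriding any smaller max_results passed to the constructor.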

        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            "relevance": arxiv.SortCriterion.Relevance,
            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
            "submittedDate": arxiv.SortCriterion.SubmittedDate,
        }

        self.sort_directions = {
            "ascending": arxiv.SortOrder.Ascending,
            "descending": arxiv.SortOrder.Descending,
        }
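    # Illustrative construction sketch (not part of the original class; the
    # `my_llm` instance and the download path are hypothetical):
    #
    #     engine = ArXivSearchEngine(
    #         max_results=30,
    #         sort_by="submittedDate",
    #         include_full_text=True,
    #         download_dir="/tmp/arxiv_pdfs",
    #         max_full_text=2,
    #         llm=my_llm,
    #     )
    #     results = engine.run("transformer interpretability")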

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from the arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Configure the search client
        sort_criteria = self.sort_criteria.get(
            self.sort_by, arxiv.SortCriterion.Relevance
        )
        sort_order = self.sort_directions.get(
            self.sort_order, arxiv.SortOrder.Descending
        )

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order,
        )

        # Apply rate limiting before making the request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Get the search results
        papers = list(client.results(search))

        return papers

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            previews = []
            for paper in papers:
                preview = {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": (
                        paper.summary[:SNIPPET_LENGTH_SHORT] + "..."
                        if len(paper.summary) > SNIPPET_LENGTH_SHORT
                        else paper.summary
                    ),
                    "authors": [
                        author.name for author in paper.authors[:3]
                    ],  # First 3 authors
                    "published": (
                        paper.published.strftime("%Y-%m-%d")
                        if paper.published
                        else None
                    ),
                    "journal_ref": paper.journal_ref,
                    "source": "arXiv",
                }

                previews.append(preview)

            return previews

        except Exception as e:
            error_msg = str(e)
            logger.exception("Error getting arXiv previews")

            # Check for rate limiting patterns
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
            ):
                raise RateLimitError(f"arXiv rate limit hit: {error_msg}")

            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should get full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Getting full content for relevant arXiv papers")

        results = []
        pdf_count = 0  # Track number of PDFs processed

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Get the paper ID
            paper_id = item.get("id")

            # Try to get the full paper from our cache
            paper = None
            if hasattr(self, "_papers") and paper_id in self._papers:
                paper = self._papers[paper_id]

            if paper:
                # Add complete paper information
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "doi": paper.doi,
                    }
                )

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                # (coverage: condition never true in the recorded run)
                if (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count < self.max_full_text
                ):
                    try:
                        # Increment counter before attempting download
                        pdf_count += 1
                        # Apply rate limiting before PDF download
                        self.rate_tracker.apply_rate_limit(self.engine_type)

                        paper_path = paper.download_pdf(
                            dirpath=self.download_dir
                        )
                        result["pdf_path"] = str(paper_path)

                        # Extract text from PDF
                        try:
                            # Try PyPDF2 first
                            try:
                                import PyPDF2

                                with open(paper_path, "rb") as pdf_file:
                                    pdf_reader = PyPDF2.PdfReader(pdf_file)
                                    pdf_text = ""
                                    for page in pdf_reader.pages:
                                        pdf_text += page.extract_text() + "\n\n"

                                # Only use if we got meaningful text
                                if pdf_text.strip():
                                    result["content"] = pdf_text
                                    result["full_content"] = pdf_text
                                    logger.info(
                                        "Successfully extracted text from PDF using PyPDF2"
                                    )
                            except Exception as e1:  # includes ImportError
                                # Fall back to pdfplumber
                                try:
                                    import pdfplumber

                                    with pdfplumber.open(paper_path) as pdf:
                                        pdf_text = ""
                                        for page in pdf.pages:
                                            pdf_text += (
                                                page.extract_text() + "\n\n"
                                            )

                                    # Only use if we got meaningful text
                                    if pdf_text.strip():
                                        result["content"] = pdf_text
                                        result["full_content"] = pdf_text
                                        logger.info(
                                            "Successfully extracted text from PDF using pdfplumber"
                                        )
                                except Exception as e2:  # includes ImportError
                                    logger.exception(
                                        f"PDF text extraction failed: {e1!s}, then {e2!s}"
                                    )
                                    logger.info(
                                        "Using paper summary as content instead"
                                    )
                        except Exception:
                            logger.exception("Error extracting text from PDF")
                            logger.info(
                                "Using paper summary as content instead"
                            )
                    except Exception:
                        logger.exception(
                            f"Error downloading paper {paper.title}"
                        )
                        result["pdf_path"] = None
                        pdf_count -= 1  # Decrement counter if download fails
                # (coverage: condition never true in the recorded run)
                elif (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count >= self.max_full_text
                ):
                    # Reached PDF limit
                    logger.info(
                        f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs."
                    )
                    result["content"] = paper.summary
                    result["full_content"] = paper.summary

            results.append(result)

        return results
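    # Note: the extraction fallback order above is PyPDF2, then pdfplumber,
    # then the arXiv summary if both extractors fail or are not installed.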

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using arXiv---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query, research_context=research_context)

        # Clean up
        if hasattr(self, "_papers"):
            del self._papers

        return results

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Apply rate limiting before fetching paper by ID
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": (
                    paper.summary[:250] + "..."
                    if len(paper.summary) > 250
                    else paper.summary
                ),
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
                "journal_ref": paper.journal_ref,
            }

            # Add full content if not in snippet-only mode
            # (coverage: condition always true in the recorded run)
            if (
                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
                or not search_config.SEARCH_SNIPPETS_ONLY
            ):
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "doi": paper.doi,
                        "content": paper.summary,  # Use summary as content
                        "full_content": paper.summary,  # For consistency
                    }
                )

                # Download PDF if requested
                # (coverage: condition never true in the recorded run)
                if self.include_full_text and self.download_dir:
                    try:
                        # Apply rate limiting before PDF download
                        self.rate_tracker.apply_rate_limit(self.engine_type)

                        # Download the paper
                        paper_path = paper.download_pdf(
                            dirpath=self.download_dir
                        )
                        result["pdf_path"] = str(paper_path)
                    except Exception:
                        logger.exception("Error downloading paper")

            return result

        except Exception:
            logger.exception("Error getting paper details")
            return {}
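    # Illustrative call, assuming the `engine` instance from the construction
    # sketch above (the arXiv ID is the hypothetical one from the docstring):
    #
    #     details = engine.get_paper_details("2101.12345")
    #     print(details.get("title"), details.get("doi"))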

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f'au:"{author_name}"'
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_category(
        self, category: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"cat:{category}"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results
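
# Illustrative usage sketch for the convenience helpers (not part of the
# original module; the author name is a placeholder, and `engine` is assumed
# to be constructed as sketched near the top of the class):
#
#     by_author = engine.search_by_author("Jane Doe")    # issues au:"Jane Doe"
#     in_category = engine.search_by_category("cs.AI")   # issues cat:cs.AI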