Coverage for src / local_deep_research / web_search_engines / engines / search_engine_arxiv.py: 70%
158 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1from typing import Any, Dict, List, Optional
3import arxiv
4from langchain_core.language_models import BaseLLM
5from loguru import logger
7from ...advanced_search_system.filters.journal_reputation_filter import (
8 JournalReputationFilter,
9)
10from ...config import search_config
11from ...constants import SNIPPET_LENGTH_SHORT
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with two-phase approach.

    Phase 1 (``_get_previews``) fetches lightweight metadata previews from
    the arXiv API; phase 2 (``_get_full_content``) enriches only the
    relevant results, optionally downloading PDFs and extracting their text.
    """

    # Mark as public search engine
    is_public = True
    # Not a generic search engine (specialized for academic papers)
    is_generic = False
    # Scientific/academic search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 10,
        sort_by: str = "relevance",
        sort_order: str = "descending",
        include_full_text: bool = False,
        download_dir: Optional[str] = None,
        max_full_text: int = 1,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results
            sort_by: Sorting criteria ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort order ('ascending' or 'descending')
            include_full_text: Whether to include full paper content in results (downloads PDF)
            download_dir: Directory to download PDFs to (if include_full_text is True)
            max_full_text: Maximum number of PDFs to download and process (default: 1)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        # Journal reputation filtering is optional: the factory returns None
        # when it cannot be configured from the settings snapshot.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm, engine_name="arxiv", settings_snapshot=settings_snapshot
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            # We deliberately do this filtering after relevancy checks,
            # because it is potentially quite slow.
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
        )
        # Enforce a floor of 25 results so downstream relevance filtering
        # has enough candidates to work with.
        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            "relevance": arxiv.SortCriterion.Relevance,
            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
            "submittedDate": arxiv.SortCriterion.SubmittedDate,
        }

        self.sort_directions = {
            "ascending": arxiv.SortOrder.Ascending,
            "descending": arxiv.SortOrder.Descending,
        }

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Configure the search client; unknown sort settings fall back to
        # relevance / descending rather than raising.
        sort_criteria = self.sort_criteria.get(
            self.sort_by, arxiv.SortCriterion.Relevance
        )
        sort_order = self.sort_directions.get(
            self.sort_order, arxiv.SortOrder.Descending
        )

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order,
        )

        # Apply rate limiting before making the request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Get the search results
        papers = list(client.results(search))

        return papers

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries

        Raises:
            RateLimitError: If the API response looks like a rate limit /
                service-unavailable error.
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use in _get_full_content
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            previews = []
            for paper in papers:
                preview = {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": (
                        paper.summary[:SNIPPET_LENGTH_SHORT] + "..."
                        if len(paper.summary) > SNIPPET_LENGTH_SHORT
                        else paper.summary
                    ),
                    "authors": [
                        author.name for author in paper.authors[:3]
                    ],  # First 3 authors
                    "published": (
                        paper.published.strftime("%Y-%m-%d")
                        if paper.published
                        else None
                    ),
                    "journal_ref": paper.journal_ref,
                    "source": "arXiv",
                }

                previews.append(preview)

            return previews

        except Exception as e:
            error_msg = str(e)
            logger.exception("Error getting arXiv previews")

            # Check for rate limiting patterns so the caller's retry
            # machinery can back off instead of treating this as "no results"
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
            ):
                raise RateLimitError(f"arXiv rate limit hit: {error_msg}")

            return []

    def _extract_pdf_text(self, paper_path: Any) -> Optional[str]:
        """
        Extract text from a downloaded PDF, trying PyPDF2 then pdfplumber.

        Mirrors the historical behavior: pdfplumber is only tried when
        PyPDF2 raises (not when it merely returns empty text).

        Args:
            paper_path: Filesystem path of the downloaded PDF.

        Returns:
            The extracted text if a backend produced non-whitespace output,
            otherwise None. Never raises.
        """
        try:
            # Try PyPDF2 first
            import PyPDF2

            with open(paper_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                pdf_text = ""
                for page in pdf_reader.pages:
                    pdf_text += page.extract_text() + "\n\n"

            if pdf_text.strip():  # Only use if we got meaningful text
                logger.info(
                    "Successfully extracted text from PDF using PyPDF2"
                )
                return pdf_text
        except Exception as e1:
            # Fall back to pdfplumber (also covers PyPDF2 not installed)
            try:
                import pdfplumber

                with pdfplumber.open(paper_path) as pdf:
                    pdf_text = ""
                    for page in pdf.pages:
                        pdf_text += page.extract_text() + "\n\n"

                if pdf_text.strip():  # Only use if we got meaningful text
                    logger.info(
                        "Successfully extracted text from PDF using pdfplumber"
                    )
                    return pdf_text
            except Exception as e2:
                logger.exception(
                    f"PDF text extraction failed: {e1!s}, then {e2!s}"
                )
                logger.info("Using paper summary as content instead")

        return None

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should get full content
        if (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        ):
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Getting full content for relevant arXiv papers")

        results = []
        pdf_count = 0  # Track number of PDFs processed

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Try to get the full paper object from the cache populated
            # by _get_previews
            paper_id = item.get("id")
            paper = None
            if hasattr(self, "_papers") and paper_id in self._papers:
                paper = self._papers[paper_id]

            if paper:
                # Add complete paper information
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "doi": paper.doi,
                    }
                )

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                if (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count < self.max_full_text
                ):
                    try:
                        # Increment counter before attempting download
                        pdf_count += 1
                        # Apply rate limiting before PDF download
                        self.rate_tracker.apply_rate_limit(self.engine_type)

                        paper_path = paper.download_pdf(
                            dirpath=self.download_dir
                        )
                        result["pdf_path"] = str(paper_path)

                        # Extract text from PDF; None means extraction
                        # failed and we keep the summary as content
                        pdf_text = self._extract_pdf_text(paper_path)
                        if pdf_text is not None:
                            result["content"] = pdf_text
                            result["full_content"] = pdf_text
                    except Exception:
                        logger.exception(
                            f"Error downloading paper {paper.title}"
                        )
                        result["pdf_path"] = None
                        pdf_count -= 1  # Decrement counter if download fails
                elif (
                    self.include_full_text
                    and self.download_dir
                    and pdf_count >= self.max_full_text
                ):
                    # Reached PDF limit
                    logger.info(
                        f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs."
                    )
                    result["content"] = paper.summary
                    result["full_content"] = paper.summary

            results.append(result)

        return results

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using arXiv---")

        # Use the implementation from the parent class which handles all phases
        results = super().run(query, research_context=research_context)

        # Clean up the per-run paper cache
        if hasattr(self, "_papers"):
            del self._papers

        return results

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information, or {} if the paper is not
            found or an error occurs.
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Apply rate limiting before fetching paper by ID
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config; use the shared snippet-length
            # constant for consistency with _get_previews (was a
            # hard-coded 250)
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": (
                    paper.summary[:SNIPPET_LENGTH_SHORT] + "..."
                    if len(paper.summary) > SNIPPET_LENGTH_SHORT
                    else paper.summary
                ),
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
                "journal_ref": paper.journal_ref,
            }

            # Add full content if not in snippet-only mode
            if (
                not hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
                or not search_config.SEARCH_SNIPPETS_ONLY
            ):
                result.update(
                    {
                        "pdf_url": paper.pdf_url,
                        "authors": [
                            author.name for author in paper.authors
                        ],  # All authors
                        "published": (
                            paper.published.strftime("%Y-%m-%d")
                            if paper.published
                            else None
                        ),
                        "updated": (
                            paper.updated.strftime("%Y-%m-%d")
                            if paper.updated
                            else None
                        ),
                        "categories": paper.categories,
                        "summary": paper.summary,  # Full summary
                        "comment": paper.comment,
                        "doi": paper.doi,
                        "content": paper.summary,  # Use summary as content
                        "full_content": paper.summary,  # For consistency
                    }
                )

                # Download PDF if requested
                if self.include_full_text and self.download_dir:
                    try:
                        # Apply rate limiting before PDF download
                        self.rate_tracker.apply_rate_limit(self.engine_type)

                        # Download the paper
                        paper_path = paper.download_pdf(
                            dirpath=self.download_dir
                        )
                        result["pdf_path"] = str(paper_path)
                    except Exception:
                        logger.exception("Error downloading paper")

            return result

        except Exception:
            logger.exception("Error getting paper details")
            return {}

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # arXiv field query: au: restricts matching to the author field
            query = f'au:"{author_name}"'
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_category(
        self, category: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # arXiv field query: cat: restricts matching to the category taxonomy
            query = f"cat:{category}"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results