Coverage for src/local_deep_research/web_search_engines/engines/search_engine

1from typing import Any, Dict, List, Optional

3import arxiv

4from langchain_core.language_models import BaseLLM

5from loguru import logger

7from ...advanced_search_system.filters.journal_reputation_filter import (

8 JournalReputationFilter,

10from ...config import search_config

11from ...constants import SNIPPET_LENGTH_SHORT

12from ..rate_limiting import RateLimitError

13from ..search_engine_base import BaseSearchEngine

class ArXivSearchEngine(BaseSearchEngine):
    """arXiv search engine implementation with two-phase approach.

    ``_get_previews`` queries the arXiv API and returns short previews while
    caching the underlying paper objects on ``self._papers``;
    ``_get_full_content`` later enriches the selected previews from that
    cache and, when configured, downloads the PDFs and extracts their text.
    ``run`` delegates the orchestration of these phases to the parent class.
    """

    # Mark as public search engine
    is_public = True
    # Not a generic search engine (specialized for academic papers)
    is_generic = False
    # Scientific/academic search engine
    is_scientific = True

    # Snippet cut-off used by ``get_paper_details``.
    # NOTE(review): ``_get_previews`` truncates at SNIPPET_LENGTH_SHORT
    # instead; confirm whether both are meant to share that constant.
    _DETAILS_SNIPPET_LENGTH = 250

    def __init__(
        self,
        max_results: int = 10,
        sort_by: str = "relevance",
        sort_order: str = "descending",
        include_full_text: bool = False,
        download_dir: Optional[str] = None,
        max_full_text: int = 1,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the arXiv search engine.

        Args:
            max_results: Maximum number of search results. After the parent
                initializer has run, the stored value is raised to at
                least 25.
            sort_by: Sorting criteria ('relevance', 'lastUpdatedDate', or 'submittedDate')
            sort_order: Sort order ('ascending' or 'descending')
            include_full_text: Whether to include full paper content in results (downloads PDF)
            download_dir: Directory to download PDFs to (if include_full_text is True)
            max_full_text: Maximum number of PDFs to download and process (default: 1)
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            settings_snapshot: Settings snapshot for thread context
        """
        # Initialize the journal reputation filter if needed.
        content_filters = []
        journal_filter = JournalReputationFilter.create_default(
            model=llm, engine_name="arxiv", settings_snapshot=settings_snapshot
        )
        if journal_filter is not None:
            content_filters.append(journal_filter)

        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            # We deliberately do this filtering after relevancy checks,
            # because it is potentially quite slow.
            content_filters=content_filters,
            settings_snapshot=settings_snapshot,
        )
        # Never request fewer than 25 papers from arXiv.
        self.max_results = max(self.max_results, 25)
        self.sort_by = sort_by
        self.sort_order = sort_order
        self.include_full_text = include_full_text
        self.download_dir = download_dir
        self.max_full_text = max_full_text

        # Map sort parameters to arxiv package parameters
        self.sort_criteria = {
            "relevance": arxiv.SortCriterion.Relevance,
            "lastUpdatedDate": arxiv.SortCriterion.LastUpdatedDate,
            "submittedDate": arxiv.SortCriterion.SubmittedDate,
        }

        self.sort_directions = {
            "ascending": arxiv.SortOrder.Ascending,
            "descending": arxiv.SortOrder.Descending,
        }

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _format_date(value: Any) -> Optional[str]:
        """Return *value* formatted as ``YYYY-MM-DD``, or None if it is falsy."""
        return value.strftime("%Y-%m-%d") if value else None

    @staticmethod
    def _truncate(text: str, limit: int) -> str:
        """Return *text* cut to *limit* characters plus ``...`` when longer."""
        return text[:limit] + "..." if len(text) > limit else text

    @staticmethod
    def _snippets_only() -> bool:
        """Return True when ``search_config.SEARCH_SNIPPETS_ONLY`` is set and truthy."""
        return bool(getattr(search_config, "SEARCH_SNIPPETS_ONLY", False))

    @classmethod
    def _paper_metadata(cls, paper: Any) -> Dict[str, Any]:
        """Build the complete metadata dictionary shared by full results."""
        return {
            "pdf_url": paper.pdf_url,
            "authors": [author.name for author in paper.authors],  # All authors
            "published": cls._format_date(paper.published),
            "updated": cls._format_date(paper.updated),
            "categories": paper.categories,
            "summary": paper.summary,  # Full summary
            "comment": paper.comment,
            "doi": paper.doi,
        }

    @staticmethod
    def _extract_pdf_text(paper_path: Any) -> Optional[str]:
        """
        Extract the text of a downloaded PDF.

        PyPDF2 is tried first; pdfplumber is used only when PyPDF2 cannot be
        imported or raises. Failures are logged, never propagated.

        Args:
            paper_path: Path of the PDF file on disk

        Returns:
            The extracted text, or None when extraction failed or produced
            only whitespace (callers then keep the paper summary).
        """
        try:
            import PyPDF2

            with open(paper_path, "rb") as pdf_file:
                pdf_reader = PyPDF2.PdfReader(pdf_file)
                # ``extract_text`` may yield None for a page without text;
                # treat that as an empty page instead of failing.
                pdf_text = "".join(
                    (page.extract_text() or "") + "\n\n"
                    for page in pdf_reader.pages
                )
            backend = "PyPDF2"
        except Exception as first_error:
            # Fall back to pdfplumber
            try:
                import pdfplumber

                with pdfplumber.open(paper_path) as pdf:
                    pdf_text = "".join(
                        (page.extract_text() or "") + "\n\n"
                        for page in pdf.pages
                    )
                backend = "pdfplumber"
            except Exception as second_error:
                logger.exception(
                    f"PDF text extraction failed: {first_error!s}, then {second_error!s}"
                )
                logger.info("Using paper summary as content instead")
                return None

        # Only use the text if we got something meaningful
        if not pdf_text.strip():
            return None

        logger.info(f"Successfully extracted text from PDF using {backend}")
        return pdf_text

    def _attach_pdf(self, paper: Any, result: Dict[str, Any]) -> bool:
        """
        Download the PDF of *paper* and store its path and text in *result*.

        Args:
            paper: arXiv paper object to download
            result: Result dictionary updated in place (``pdf_path`` and,
                when text extraction succeeds, ``content``/``full_content``)

        Returns:
            True when the download succeeded, so that it counts against the
            ``max_full_text`` limit; False when the download failed.
        """
        try:
            # Apply rate limiting before PDF download
            self.rate_tracker.apply_rate_limit(self.engine_type)
            paper_path = paper.download_pdf(dirpath=self.download_dir)
        except Exception:
            logger.exception(f"Error downloading paper {paper.title}")
            result["pdf_path"] = None
            return False

        result["pdf_path"] = str(paper_path)

        pdf_text = self._extract_pdf_text(paper_path)
        if pdf_text is not None:
            result["content"] = pdf_text
            result["full_content"] = pdf_text
        return True

    def _run_with_max_results(
        self, query: str, max_results: Optional[int]
    ) -> List[Dict[str, Any]]:
        """Run *query* with ``self.max_results`` temporarily overridden."""
        original_max_results = self.max_results
        try:
            if max_results:
                self.max_results = max_results
            return self.run(query)
        finally:
            # Restore original value
            self.max_results = original_max_results

    # ------------------------------------------------------------------
    # Search phases
    # ------------------------------------------------------------------

    def _get_search_results(self, query: str) -> List[Any]:
        """
        Helper method to get search results from arXiv API.

        Args:
            query: The search query

        Returns:
            List of arXiv paper objects
        """
        # Unknown sort settings fall back to relevance / descending.
        sort_criteria = self.sort_criteria.get(
            self.sort_by, arxiv.SortCriterion.Relevance
        )
        sort_order = self.sort_directions.get(
            self.sort_order, arxiv.SortOrder.Descending
        )

        # Create the search client
        client = arxiv.Client(page_size=self.max_results)

        # Create the search query
        search = arxiv.Search(
            query=query,
            max_results=self.max_results,
            sort_by=sort_criteria,
            sort_order=sort_order,
        )

        # Apply rate limiting before making the request
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Get the search results
        return list(client.results(search))

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for arXiv papers.

        The fetched paper objects are cached on ``self._papers`` (keyed by
        ``entry_id``) so that ``_get_full_content`` can reuse them.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries; an empty list when the lookup
            failed for a reason that does not look like rate limiting.

        Raises:
            RateLimitError: If the error message suggests rate limiting or
                service unavailability (429/503 and related phrases).
        """
        logger.info("Getting paper previews from arXiv")

        try:
            # Get search results from arXiv
            papers = self._get_search_results(query)

            # Store the paper objects for later use
            self._papers = {paper.entry_id: paper for paper in papers}

            # Format results as previews with basic information
            return [
                {
                    "id": paper.entry_id,  # Use entry_id as ID
                    "title": paper.title,
                    "link": paper.entry_id,  # arXiv URL
                    "snippet": self._truncate(
                        paper.summary, SNIPPET_LENGTH_SHORT
                    ),
                    "authors": [
                        author.name for author in paper.authors[:3]
                    ],  # First 3 authors
                    "published": self._format_date(paper.published),
                    "journal_ref": paper.journal_ref,
                    "source": "arXiv",
                }
                for paper in papers
            ]

        except Exception as e:
            error_msg = str(e)
            logger.exception("Error getting arXiv previews")

            # Check for rate limiting patterns
            lowered = error_msg.lower()
            rate_limit_markers = (
                "429",
                "too many requests",
                "rate limit",
                "service unavailable",
                "503",
            )
            if any(marker in lowered for marker in rate_limit_markers):
                # Chain the cause so the original traceback is preserved.
                raise RateLimitError(
                    f"arXiv rate limit hit: {error_msg}"
                ) from e

            return []

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant arXiv papers.
        Downloads PDFs and extracts text when include_full_text is True.
        Limits the number of PDFs processed to max_full_text.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content. In snippet-only
            mode the input list is returned unchanged. Items whose paper is
            not in the preview cache are copied without enrichment.
        """
        # Check if we should get full content
        if self._snippets_only():
            logger.info("Snippet-only mode, skipping full content retrieval")
            return relevant_items

        logger.info("Getting full content for relevant arXiv papers")

        cached_papers = getattr(self, "_papers", {})
        wants_pdf = bool(self.include_full_text and self.download_dir)
        results = []
        pdf_count = 0  # Number of PDFs successfully downloaded so far

        for item in relevant_items:
            # Start with the preview data
            result = item.copy()

            # Try to get the full paper from our cache
            paper = cached_papers.get(item.get("id"))

            if paper:
                # Add complete paper information
                result.update(self._paper_metadata(paper))

                # Default to using summary as content
                result["content"] = paper.summary
                result["full_content"] = paper.summary

                # Download PDF and extract text if requested and within limit
                if wants_pdf and pdf_count < self.max_full_text:
                    # A failed download does not count against the limit.
                    if self._attach_pdf(paper, result):
                        pdf_count += 1
                elif wants_pdf:
                    # Reached PDF limit; the summary set above stays in place
                    logger.info(
                        f"Maximum number of PDFs ({self.max_full_text}) reached. Skipping remaining PDFs."
                    )

            results.append(result)

        return results

    def run(
        self, query: str, research_context: Dict[str, Any] | None = None
    ) -> List[Dict[str, Any]]:
        """
        Execute a search using arXiv with the two-phase approach.

        Args:
            query: The search query
            research_context: Context from previous research to use.

        Returns:
            List of search results
        """
        logger.info("---Execute a search using arXiv---")

        try:
            # Use the implementation from the parent class which handles all phases
            return super().run(query, research_context=research_context)
        finally:
            # Clean up the paper cache even when the search raised, so a
            # later run can never pick up stale papers.
            if hasattr(self, "_papers"):
                del self._papers

    def get_paper_details(self, arxiv_id: str) -> Dict[str, Any]:
        """
        Get detailed information about a specific arXiv paper.

        Args:
            arxiv_id: arXiv ID of the paper (e.g., '2101.12345')

        Returns:
            Dictionary with paper information; an empty dictionary when the
            paper was not found or any error occurred (errors are logged).
        """
        try:
            # Create the search client
            client = arxiv.Client()

            # Search for the specific paper
            search = arxiv.Search(id_list=[arxiv_id], max_results=1)

            # Apply rate limiting before fetching paper by ID
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Get the paper
            papers = list(client.results(search))
            if not papers:
                return {}

            paper = papers[0]

            # Format result based on config
            result = {
                "title": paper.title,
                "link": paper.entry_id,
                "snippet": self._truncate(
                    paper.summary, self._DETAILS_SNIPPET_LENGTH
                ),
                "authors": [
                    author.name for author in paper.authors[:3]
                ],  # First 3 authors
                "journal_ref": paper.journal_ref,
            }

            # Add full content if not in snippet-only mode
            if not self._snippets_only():
                result.update(self._paper_metadata(paper))
                result["content"] = paper.summary  # Use summary as content
                result["full_content"] = paper.summary  # For consistency

                # Download PDF if requested
                if self.include_full_text and self.download_dir:
                    try:
                        # Apply rate limiting before PDF download
                        self.rate_tracker.apply_rate_limit(self.engine_type)

                        # Download the paper
                        paper_path = paper.download_pdf(
                            dirpath=self.download_dir
                        )
                        result["pdf_path"] = str(paper_path)
                    except Exception:
                        # Best effort: keep the metadata without a PDF path.
                        logger.exception("Error downloading paper")

            return result

        except Exception:
            logger.exception("Error getting paper details")
            return {}

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers by the author
        """
        return self._run_with_max_results(f'au:"{author_name}"', max_results)

    def search_by_category(
        self, category: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for papers in a specific arXiv category.

        Args:
            category: arXiv category (e.g., 'cs.AI', 'physics.optics')
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of papers in the category
        """
        return self._run_with_max_results(f"cat:{category}", max_results)

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_arxiv.py: 70%

158 statements