Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py: 92%

715 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1import re 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4from defusedxml import ElementTree as ET 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...config import search_config 

10from ...constants import SNIPPET_LENGTH_LONG 

11from ...security.safe_requests import safe_get 

12from ...advanced_search_system.filters.journal_reputation_filter import ( 

13 JournalReputationFilter, 

14) 

15from ..rate_limiting import RateLimitError 

16from ..search_engine_base import BaseSearchEngine 

17 

18 

19class PubMedSearchEngine(BaseSearchEngine): 

20 """ 

21 PubMed search engine implementation with two-phase approach and adaptive search. 

22 Provides efficient access to biomedical literature while minimizing API usage. 

23 """ 

24 

25 # Mark as public search engine 

26 is_public = True 

27 # Scientific/medical search engine 

28 is_scientific = True 

29 is_lexical = True 

30 needs_llm_relevance_filter = True 

31 

def __init__(
    self,
    max_results: int = 10,
    api_key: Optional[str] = None,
    days_limit: Optional[int] = None,
    get_abstracts: bool = True,
    get_full_text: bool = False,
    full_text_limit: int = 3,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    optimize_queries: bool = True,
    include_publication_type_in_context: bool = True,
    include_journal_in_context: bool = True,
    include_year_in_context: bool = True,
    include_authors_in_context: bool = False,
    include_full_date_in_context: bool = False,
    include_mesh_terms_in_context: bool = True,
    include_keywords_in_context: bool = True,
    include_doi_in_context: bool = False,
    include_pmid_in_context: bool = False,
    include_pmc_availability_in_context: bool = False,
    max_mesh_terms: int = 3,
    max_keywords: int = 3,
    include_citation_in_context: bool = False,
    include_language_in_context: bool = False,
    settings_snapshot: Optional[Dict[str, Any]] = None,
):
    """
    Configure the PubMed engine: preview filters, base-class state,
    per-instance options and the E-utilities endpoint URLs.

    Args:
        max_results: Requested number of search results. The value kept on
            the instance is raised to at least 25 after base-class init.
        api_key: NCBI API key, sent with requests when set (optional)
        days_limit: Limit results to the last N days (optional)
        get_abstracts: Whether to fetch abstracts for all results
        get_full_text: Whether to fetch full text content (when available in PMC)
        full_text_limit: Max number of full-text articles to retrieve
        llm: Language model; also handed to the journal reputation filter
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries for PubMed
        include_*_in_context: Boolean switches stored unchanged on the instance.
            NOTE(review): presumably they select which metadata fields are
            added to result context; the consumers are outside this chunk.
        max_mesh_terms: Stored on the instance (consumer not visible here)
        max_keywords: Stored on the instance (consumer not visible here)
        settings_snapshot: Passed through to the journal filter factory and
            to the base class
    """
    # The journal reputation filter runs as a preview filter so results are
    # scored against bundled OpenAlex/DOAJ/predatory data before the (more
    # expensive) LLM relevance pass. The factory may return None, in which
    # case no preview filter is installed.
    reputation_filter = JournalReputationFilter.create_default(
        model=llm,  # type: ignore[arg-type]
        engine_name="pubmed",
        settings_snapshot=settings_snapshot,
    )
    preview_filters = (
        [] if reputation_filter is None else [reputation_filter]
    )

    # Base class receives the LLM, both result limits and the filters.
    super().__init__(
        llm=llm,
        max_filtered_results=max_filtered_results,
        max_results=max_results,
        preview_filters=preview_filters,  # type: ignore[arg-type]
        settings_snapshot=settings_snapshot,
    )

    # Never ask PubMed for fewer than 25 results.
    self.max_results = max(self.max_results, 25)

    # Retrieval options
    self.api_key = api_key
    self.days_limit = days_limit
    self.get_abstracts = get_abstracts
    self.get_full_text = get_full_text
    self.full_text_limit = full_text_limit
    self.optimize_queries = optimize_queries

    # Context-composition switches and limits
    self.include_publication_type_in_context = (
        include_publication_type_in_context
    )
    self.include_journal_in_context = include_journal_in_context
    self.include_year_in_context = include_year_in_context
    self.include_authors_in_context = include_authors_in_context
    self.include_full_date_in_context = include_full_date_in_context
    self.include_mesh_terms_in_context = include_mesh_terms_in_context
    self.include_keywords_in_context = include_keywords_in_context
    self.include_doi_in_context = include_doi_in_context
    self.include_pmid_in_context = include_pmid_in_context
    self.include_pmc_availability_in_context = (
        include_pmc_availability_in_context
    )
    self.max_mesh_terms = max_mesh_terms
    self.max_keywords = max_keywords
    self.include_citation_in_context = include_citation_in_context
    self.include_language_in_context = include_language_in_context

    # NCBI E-utilities endpoints, all derived from one base URL
    self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    self.search_url = f"{self.base_url}/esearch.fcgi"
    self.summary_url = f"{self.base_url}/esummary.fcgi"
    self.fetch_url = f"{self.base_url}/efetch.fcgi"
    self.link_url = f"{self.base_url}/elink.fcgi"

    # PMC base URL for full text
    self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"

128 

def _get_result_count(self, query: str) -> int:
    """
    Ask esearch how many records match a query, without fetching any IDs.

    Args:
        query: The search query

    Returns:
        Total number of matching results, or 0 if the request or the
        parsing of its response fails (the error is logged).
    """
    try:
        # retmax=0: only the count is needed, not the results themselves
        request_params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 0,
        }
        if self.api_key:
            request_params["api_key"] = self.api_key

        # Honour the shared rate limiter before touching the API
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        reply = safe_get(self.search_url, params=request_params)
        reply.raise_for_status()

        total = int(reply.json()["esearchresult"]["count"])
        logger.info(
            "Query '{}' has {} total results in PubMed", query, total
        )
        return total

    except Exception:
        logger.exception("Error getting result count")
        return 0

172 

173 def _extract_core_terms(self, query: str) -> str: 

174 """ 

175 Extract core terms from a complex query for volume estimation. 

176 

177 Args: 

178 query: PubMed query string 

179 

180 Returns: 

181 Simplified query with core terms 

182 """ 

183 # Remove field specifications and operators 

184 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags 

185 simplified = re.sub( 

186 r"\b(AND|OR|NOT)\b", "", simplified 

187 ) # Remove operators 

188 

189 # Remove quotes and parentheses 

190 simplified = ( 

191 simplified.replace('"', "").replace("(", "").replace(")", "") 

192 ) 

193 

194 # Split by whitespace and join terms with 4+ chars (likely meaningful) 

195 terms = [term for term in simplified.split() if len(term) >= 4] 

196 

197 # Join with AND to create a basic search 

198 return " ".join(terms[:5]) # Limit to top 5 terms 

199 

200 def _expand_time_window(self, time_filter: str) -> str: 

201 """ 

202 Expand a time window to get more results. 

203 

204 Args: 

205 time_filter: Current time filter 

206 

207 Returns: 

208 Expanded time filter 

209 """ 

210 # Parse current time window 

211 import re 

212 

213 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter) 

214 if not match: 

215 return '"last 10 years"[pdat]' 

216 

217 amount, unit = int(match.group(1)), match.group(2) 

218 

219 # Expand based on current unit 

220 if unit == "months" or unit == "month": 

221 if amount < 6: 

222 return '"last 6 months"[pdat]' 

223 if amount < 12: 

224 return '"last 1 year"[pdat]' 

225 return '"last 2 years"[pdat]' 

226 if unit == "years" or unit == "year": 226 ↛ 233line 226 didn't jump to line 233 because the condition on line 226 was always true

227 if amount < 2: 

228 return '"last 2 years"[pdat]' 

229 if amount < 5: 

230 return '"last 5 years"[pdat]' 

231 return '"last 10 years"[pdat]' 

232 

233 return '"last 10 years"[pdat]' 

234 

235 def _optimize_query_for_pubmed(self, query: str) -> str: 

236 """ 

237 Optimize a natural language query for PubMed search. 

238 Uses LLM to transform questions into effective keyword-based queries. 

239 

240 Args: 

241 query: Natural language query 

242 

243 Returns: 

244 Optimized query string for PubMed 

245 """ 

246 if not self.llm or not self.optimize_queries: 

247 # Return original query if no LLM available or optimization disabled 

248 return query 

249 

250 try: 

251 # Prompt for query optimization 

252 prompt = f"""Transform this natural language question into an optimized PubMed search query. 

253 

254Original query: "{query}" 

255 

256CRITICAL RULES: 

2571. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS 

2582. DO NOT wrap the entire query in quotes 

2593. DO NOT include ANY date restrictions or year filters 

2604. Use parentheses around OR statements: (term1[Field] OR term2[Field]) 

2615. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh] 

2626. KEEP IT SIMPLE - use 2-3 main concepts maximum 

2637. Focus on Title/Abstract searches for reliability: term[Title/Abstract] 

2648. Use wildcards for variations: vaccin*[Title/Abstract] 

265 

266EXAMPLE QUERIES: 

267✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract] 

268✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract] 

269✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp] 

270✗ BAD: "Here's a query to find articles about vaccines..." 

271 

272Return ONLY the search query without any explanations. 

273""" 

274 

275 # Get response from LLM 

276 response = self.llm.invoke(prompt) 

277 raw_response = ( 

278 str(response.content) 

279 if hasattr(response, "content") 

280 else str(response) 

281 ).strip() 

282 

283 # Clean up the query - extract only the actual query and remove any explanations 

284 # First check if there are multiple lines and take the first non-empty line 

285 lines = raw_response.split("\n") 

286 cleaned_lines = [line.strip() for line in lines if line.strip()] 

287 

288 if cleaned_lines: 288 ↛ 338line 288 didn't jump to line 338 because the condition on line 288 was always true

289 optimized_query = cleaned_lines[0] 

290 

291 # Remove any quotes that wrap the entire query 

292 if optimized_query.startswith('"') and optimized_query.endswith( 

293 '"' 

294 ): 

295 optimized_query = optimized_query[1:-1] 

296 

297 # Remove any explanation phrases that might be at the beginning 

298 explanation_starters = [ 

299 "here is", 

300 "here's", 

301 "this query", 

302 "the following", 

303 ] 

304 for starter in explanation_starters: 

305 if optimized_query.lower().startswith(starter): 

306 # Find the actual query part - typically after a colon 

307 colon_pos = optimized_query.find(":") 

308 if colon_pos > 0: 

309 optimized_query = optimized_query[ 

310 colon_pos + 1 : 

311 ].strip() 

312 

313 # Check if the query still seems to contain explanations 

314 if ( 

315 len(optimized_query) > 200 

316 or "this query will" in optimized_query.lower() 

317 ): 

318 # It's probably still an explanation - try to extract just the query part 

319 # Look for common patterns in the explanation like parentheses 

320 pattern = r"\([^)]+\)\s+AND\s+" 

321 import re 

322 

323 matches = re.findall(pattern, optimized_query) 

324 if matches: 324 ↛ 344line 324 didn't jump to line 344 because the condition on line 324 was always true

325 # Extract just the query syntax parts 

326 query_parts = [] 

327 for part in re.split(r"\.\s+", optimized_query): 

328 if ( 

329 "(" in part 

330 and ")" in part 

331 and ("AND" in part or "OR" in part) 

332 ): 

333 query_parts.append(part) 

334 if query_parts: 334 ↛ 344line 334 didn't jump to line 344 because the condition on line 334 was always true

335 optimized_query = " ".join(query_parts) 

336 else: 

337 # Fall back to original query if cleaning fails 

338 logger.warning( 

339 "Failed to extract a clean query from LLM response" 

340 ) 

341 optimized_query = query 

342 

343 # Final safety check - if query looks too much like an explanation, use original 

344 if len(optimized_query.split()) > 30: 

345 logger.warning( 

346 "Query too verbose, falling back to simpler form" 

347 ) 

348 # Create a simple query from the original 

349 words = [ 

350 w 

351 for w in query.split() 

352 if len(w) > 3 

353 and w.lower() 

354 not in ( 

355 "what", 

356 "are", 

357 "the", 

358 "and", 

359 "for", 

360 "with", 

361 "from", 

362 "have", 

363 "been", 

364 "recent", 

365 ) 

366 ] 

367 optimized_query = " AND ".join(words[:3]) 

368 

369 # Basic cleanup: standardize field tag case for consistency 

370 import re 

371 

372 optimized_query = re.sub( 

373 r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE 

374 ) 

375 optimized_query = re.sub( 

376 r"\[title/abstract\]", 

377 "[Title/Abstract]", 

378 optimized_query, 

379 flags=re.IGNORECASE, 

380 ) 

381 optimized_query = re.sub( 

382 r"\[publication type\]", 

383 "[Publication Type]", 

384 optimized_query, 

385 flags=re.IGNORECASE, 

386 ) 

387 

388 # Fix unclosed quotes followed by field tags 

389 # Pattern: "term[Field] -> "term"[Field] 

390 optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query) 

391 

392 # Simplify the query if still no results are found 

393 self._simplify_query_cache = optimized_query 

394 

395 # Log original and optimized queries 

396 logger.info("Original query: '{}'", query) 

397 logger.info(f"Optimized for PubMed: '{optimized_query}'") 

398 logger.debug( 

399 f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'" 

400 ) 

401 

402 return optimized_query 

403 

404 except Exception: 

405 logger.exception("Error optimizing query") 

406 logger.debug(f"Falling back to original query: '{query}'") 

407 return query # Fall back to original query on error 

408 

def _simplify_query(self, query: str) -> str:
    """
    Broaden a PubMed query that returned no results.

    Strips the [Mesh] and [Publication Type] field tags (any letter case)
    so those terms are no longer field-restricted, then collapses runs of
    whitespace. [Title/Abstract] tags are kept.

    Args:
        query: The original query that returned no results

    Returns:
        Simplified query (whitespace-normalized even if no tag was removed)
    """
    logger.info(f"Simplifying query: {query}")
    logger.debug(f"Query simplification started for: '{query[:100]}...'")

    # Drop the restrictive field tags one pattern at a time
    broadened = query
    for tag_pattern in (r"\[Mesh\]", r"\[Publication Type\]"):
        broadened = re.sub(tag_pattern, "", broadened, flags=re.IGNORECASE)

    # Clean up any double spaces left behind
    broadened = re.sub(r"\s+", " ", broadened).strip()

    if broadened == query:
        logger.debug("No simplification possible, returning original query")

    logger.info(f"Simplified query: {broadened}")
    logger.debug(
        f"Query simplified from {len(query)} to {len(broadened)} chars"
    )
    return broadened

450 

451 def _is_historical_focused(self, query: str) -> bool: 

452 """ 

453 Determine if a query is specifically focused on historical/older information using LLM. 

454 Default assumption is that queries should prioritize recent information unless 

455 explicitly asking for historical content. 

456 

457 Args: 

458 query: The search query 

459 

460 Returns: 

461 Boolean indicating if the query is focused on historical information 

462 """ 

463 if not self.llm: 

464 # Fall back to basic keyword check if no LLM available 

465 historical_terms = [ 

466 "history", 

467 "historical", 

468 "early", 

469 "initial", 

470 "first", 

471 "original", 

472 "before", 

473 "prior to", 

474 "origins", 

475 "evolution", 

476 "development", 

477 ] 

478 historical_years = [str(year) for year in range(1900, 2020)] 

479 

480 query_lower = query.lower() 

481 has_historical_term = any( 

482 term in query_lower for term in historical_terms 

483 ) 

484 has_past_year = any(year in query for year in historical_years) 

485 

486 return has_historical_term or has_past_year 

487 

488 try: 

489 # Use LLM to determine if the query is focused on historical information 

490 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information. 

491 

492Query: "{query}" 

493 

494Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago. 

495Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus. 

496 

497The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise. 

498""" 

499 

500 response = self.llm.invoke(prompt) 

501 answer = ( 

502 ( 

503 str(response.content) 

504 if hasattr(response, "content") 

505 else str(response) 

506 ) 

507 .strip() 

508 .lower() 

509 ) 

510 

511 # Log the determination 

512 logger.info(f"Historical focus determination for query: '{query}'") 

513 logger.info(f"LLM determined historical focus: {answer}") 

514 

515 return "yes" in answer 

516 

517 except Exception: 

518 logger.exception("Error determining historical focus") 

519 # Fall back to basic keyword check 

520 historical_terms = [ 

521 "history", 

522 "historical", 

523 "early", 

524 "initial", 

525 "first", 

526 "original", 

527 "before", 

528 "prior to", 

529 "origins", 

530 "evolution", 

531 "development", 

532 ] 

533 return any(term in query.lower() for term in historical_terms) 

534 

def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
    """
    Search with a recency window chosen from the topic's result volume,
    unless the query is judged to be about historical material.

    Args:
        query: The search query (already optimized)

    Returns:
        Tuple of (list of PMIDs, search strategy used). The strategy is
        one of "historical_focus", "high_volume", "common_topic",
        "moderate_volume", "rare_topic", "<strategy>_expanded" or
        "no_time_filter".
    """
    # Estimate topic volume first, then check for a historical focus
    estimated_volume = self._get_result_count(query)
    is_historical_focused = self._is_historical_focused(query)

    if is_historical_focused:
        # User wants historical information - no date filtering at all
        logger.info(
            "Using historical search strategy without date filtering"
        )
        return self._search_pubmed(query), "historical_focus"

    # (exclusive lower bound on volume, recency filter, strategy name);
    # the more results a topic has, the tighter the window.
    volume_tiers = (
        (5000, '"last 1 year"[pdat]', "high_volume"),
        (1000, '"last 3 years"[pdat]', "common_topic"),
        (100, '"last 5 years"[pdat]', "moderate_volume"),
    )
    # Rare topic default - still use recency but with the widest range
    time_filter, strategy = '"last 10 years"[pdat]', "rare_topic"
    for floor, tier_filter, tier_name in volume_tiers:
        if estimated_volume > floor:
            time_filter, strategy = tier_filter, tier_name
            break

    # Try with adaptive time filter
    query_with_time = f"({query}) AND {time_filter}"
    logger.info(
        f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
    )
    results = self._search_pubmed(query_with_time)

    # If too few results, widen the window once (unless already widest)
    if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
        logger.info(
            f"Insufficient results ({len(results)}), expanding time window"
        )
        expanded_time = self._expand_time_window(time_filter)
        expanded_results = self._search_pubmed(
            f"({query}) AND {expanded_time}"
        )

        if len(expanded_results) > len(results):
            logger.info(
                f"Expanded time window yielded {len(expanded_results)} results"
            )
            return expanded_results, f"{strategy}_expanded"

    # If still no results, try without time filter
    if not results:
        logger.info(
            "No results with time filter, trying without time restrictions"
        )
        results = self._search_pubmed(query)
        strategy = "no_time_filter"

    return results, strategy

612 

def _search_pubmed(self, query: str) -> List[str]:
    """
    Run an esearch request and return the matching article IDs.

    Args:
        query: The search query

    Returns:
        List of PubMed IDs matching the query (at most self.max_results
        are requested); an empty list if anything goes wrong.
    """
    try:
        request_params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": self.max_results,
            "usehistory": "y",
        }

        # An API key, when configured, is sent along with the request
        if not self.api_key:
            logger.debug("No PubMed API key - using default rate limits")
        else:
            request_params["api_key"] = self.api_key
            logger.debug("Using PubMed API key for higher rate limits")

        # Optional restriction to the last N days by publication date
        if self.days_limit:
            request_params["reldate"] = self.days_limit
            request_params["datetype"] = "pdat"  # Publication date
            logger.debug(f"Limiting results to last {self.days_limit} days")

        logger.debug(
            f"PubMed search query: '{query}' with max_results={self.max_results}"
        )

        # Wait as instructed by the shared rate limiter
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        logger.debug(f"Sending request to PubMed API: {self.search_url}")
        reply = safe_get(self.search_url, params=request_params)
        reply.raise_for_status()
        logger.debug(f"PubMed API response status: {reply.status_code}")

        # Pull the ID list (and the overall hit count for logging)
        search_result = reply.json()["esearchresult"]
        id_list: list[str] = search_result["idlist"]
        total_count = search_result.get("count", "unknown")

        logger.info(
            f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
        )
        if id_list:
            logger.debug(f"First 5 PMIDs: {id_list[:5]}")
        return id_list

    except Exception:
        logger.exception(f"Error searching PubMed for query '{query}'")
        return []

678 

def _get_article_summaries(
    self, id_list: List[str]
) -> List[Dict[str, Any]]:
    """
    Get summaries for a list of PubMed article IDs via esummary.

    Args:
        id_list: List of PubMed IDs

    Returns:
        List of article summary dictionaries, in the order of id_list;
        IDs missing from the response are skipped with a warning. An empty
        list is returned for empty input or on a non-rate-limit failure.

    Raises:
        RateLimitError: If the failure looks like throttling (HTTP
            429/503/403 or a matching phrase in the error message).
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_summaries")
        return []

    logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

    try:
        # Prepare parameters
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "json",
            "rettype": "summary",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        # Execute request
        logger.debug(f"Requesting summaries from: {self.summary_url}")
        response = safe_get(self.summary_url, params=params)
        response.raise_for_status()
        logger.debug(f"Summary API response status: {response.status_code}")

        # Parse response
        data = response.json()
        logger.debug(
            f"PubMed API returned data for {len(id_list)} requested IDs"
        )
        results_by_id = data["result"]
        summaries = []

        for pmid in id_list:
            if pmid not in results_by_id:
                logger.warning(
                    f"PMID {pmid} not found in PubMed API response"
                )
                continue

            article = results_by_id[pmid]
            logger.debug(
                f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
            )

            # Extract authors (if available)
            authors = []
            if "authors" in article:
                authors = [author["name"] for author in article["authors"]]

            # Prefer the top-level DOI; otherwise take the first
            # articleids entry whose idtype is "doi"
            doi = article.get("doi", "")
            if not doi and "articleids" in article:
                doi = next(
                    (
                        aid.get("value", "")
                        for aid in article["articleids"]
                        if aid.get("idtype") == "doi"
                    ),
                    doi,
                )

            # Create summary dictionary with all available fields
            summaries.append(
                {
                    "id": pmid,
                    "title": article.get("title", ""),
                    "pubdate": article.get("pubdate", ""),
                    "epubdate": article.get("epubdate", ""),
                    "source": article.get("source", ""),
                    "authors": authors,
                    "lastauthor": article.get("lastauthor", ""),
                    "journal": article.get("fulljournalname", ""),
                    "volume": article.get("volume", ""),
                    "issue": article.get("issue", ""),
                    "pages": article.get("pages", ""),
                    "doi": doi,
                    "issn": article.get("issn", ""),
                    "essn": article.get("essn", ""),
                    # Publication types from esummary
                    "pubtype": article.get("pubtype", []),
                    "recordstatus": article.get("recordstatus", ""),
                    "lang": article.get("lang", []),
                    "pmcrefcount": article.get("pmcrefcount", None),
                    "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                }
            )

        return summaries

    except Exception as e:
        error_msg = str(e)
        logger.exception(
            f"Error getting article summaries for {len(id_list)} articles"
        )

        # Check for rate limiting patterns. Error messages can embed the
        # request URL, whose PMIDs may contain "429"/"503"/"403" as digit
        # runs, so prefer the real status code and only accept the codes
        # in the message when they are not part of a longer number.
        status_code = getattr(
            getattr(e, "response", None), "status_code", None
        )
        lowered_msg = error_msg.lower()
        looks_rate_limited = (
            status_code in (429, 503, 403)
            or re.search(r"(?<!\d)(?:429|503|403)(?!\d)", error_msg)
            is not None
            or "too many requests" in lowered_msg
            or "rate limit" in lowered_msg
            or "service unavailable" in lowered_msg
        )
        if looks_rate_limited:
            raise RateLimitError(
                f"PubMed rate limit hit: {error_msg}"
            ) from e

        return []

803 

def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
    """
    Get abstracts for a list of PubMed article IDs via efetch (XML).

    A single-section abstract is returned as plain text. A multi-section
    abstract is joined with blank lines, each section prefixed by its
    "Label" attribute when present; every section appears exactly once.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their abstracts. Articles without
        abstract text are omitted; an empty dict is returned for empty
        input or on any error (which is logged).
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_abstracts")
        return {}

    logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")

    try:
        # Prepare parameters
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "abstract",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        # Execute request
        logger.debug(f"Requesting abstracts from: {self.fetch_url}")
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()
        logger.debug(
            f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
        )

        # Parse XML response
        root = ET.fromstring(response.text)
        logger.debug(
            f"Parsing abstracts from XML for {len(id_list)} articles"
        )

        abstracts = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            abstract_sections = article.findall(".//AbstractText")

            if len(abstract_sections) > 1:
                # Structured abstract: build it from the sections alone so
                # the first section is not emitted twice.
                logger.debug(
                    f"Article {pmid} has {len(abstract_sections)} abstract sections"
                )
                parts = []
                for section in abstract_sections:
                    # itertext() keeps text that follows inline markup
                    # (e.g. <i>, <sup>), which .text alone would drop
                    section_text = "".join(section.itertext())
                    if not section_text:
                        continue
                    label = section.get("Label")
                    parts.append(
                        f"{label}: {section_text}" if label else section_text
                    )
                abstract_text = "\n\n".join(parts)
            elif abstract_sections:
                abstract_text = "".join(abstract_sections[0].itertext())
            else:
                abstract_text = ""

            # Store in dictionary
            if pmid and abstract_text:
                abstracts[pmid] = abstract_text
                logger.debug(
                    f"Abstract for {pmid}: {len(abstract_text)} chars"
                )
            elif pmid:
                logger.warning(f"No abstract found for PMID {pmid}")

        logger.info(
            f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
        )
        return abstracts

    except Exception:
        logger.exception(
            f"Error getting article abstracts for {len(id_list)} articles"
        )
        return {}

913 

914 def _get_article_detailed_metadata( 

915 self, id_list: List[str] 

916 ) -> Dict[str, Dict[str, Any]]: 

917 """ 

918 Get detailed metadata for PubMed articles including publication types, 

919 MeSH terms, keywords, and affiliations. 

920 

921 Args: 

922 id_list: List of PubMed IDs 

923 

924 Returns: 

925 Dictionary mapping PubMed IDs to their detailed metadata 

926 """ 

927 if not id_list: 

928 return {} 

929 

930 try: 

931 # Prepare parameters 

932 params = { 

933 "db": "pubmed", 

934 "id": ",".join(id_list), 

935 "retmode": "xml", 

936 "rettype": "medline", 

937 } 

938 

939 # Add API key if available 

940 if self.api_key: 940 ↛ 941line 940 didn't jump to line 941 because the condition on line 940 was never true

941 params["api_key"] = self.api_key 

942 

943 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

944 self.engine_type 

945 ) 

946 

947 # Execute request 

948 response = safe_get(self.fetch_url, params=params) 

949 response.raise_for_status() 

950 

951 # Parse XML response 

952 root = ET.fromstring(response.text) 

953 

954 metadata = {} 

955 

956 for article in root.findall(".//PubmedArticle"): 

957 pmid_elem = article.find(".//PMID") 

958 pmid = pmid_elem.text if pmid_elem is not None else None 

959 

960 if pmid is None: 960 ↛ 961line 960 didn't jump to line 961 because the condition on line 960 was never true

961 continue 

962 

963 article_metadata: Dict[str, Any] = {} 

964 

965 # Extract publication types 

966 pub_types = [] 

967 for pub_type in article.findall(".//PublicationType"): 

968 if pub_type.text: 968 ↛ 967line 968 didn't jump to line 967 because the condition on line 968 was always true

969 pub_types.append(pub_type.text) 

970 if pub_types: 

971 article_metadata["publication_types"] = pub_types 

972 

973 # Extract MeSH terms 

974 mesh_terms = [] 

975 for mesh in article.findall(".//MeshHeading"): 

976 descriptor = mesh.find(".//DescriptorName") 

977 if descriptor is not None and descriptor.text: 977 ↛ 975line 977 didn't jump to line 975 because the condition on line 977 was always true

978 mesh_terms.append(descriptor.text) 

979 if mesh_terms: 

980 article_metadata["mesh_terms"] = mesh_terms 

981 

982 # Extract keywords 

983 keywords = [] 

984 for keyword in article.findall(".//Keyword"): 

985 if keyword.text: 985 ↛ 984line 985 didn't jump to line 984 because the condition on line 985 was always true

986 keywords.append(keyword.text) 

987 if keywords: 

988 article_metadata["keywords"] = keywords 

989 

990 # Extract affiliations 

991 affiliations = [] 

992 for affiliation in article.findall(".//Affiliation"): 

993 if affiliation.text: 993 ↛ 992line 993 didn't jump to line 992 because the condition on line 993 was always true

994 affiliations.append(affiliation.text) 

995 if affiliations: 

996 article_metadata["affiliations"] = affiliations 

997 

998 # Extract grant information 

999 grants = [] 

1000 for grant in article.findall(".//Grant"): 

1001 grant_info = {} 

1002 grant_id = grant.find(".//GrantID") 

1003 if grant_id is not None and grant_id.text: 1003 ↛ 1005line 1003 didn't jump to line 1005 because the condition on line 1003 was always true

1004 grant_info["id"] = grant_id.text 

1005 agency = grant.find(".//Agency") 

1006 if agency is not None and agency.text: 1006 ↛ 1008line 1006 didn't jump to line 1008 because the condition on line 1006 was always true

1007 grant_info["agency"] = agency.text 

1008 if grant_info: 1008 ↛ 1000line 1008 didn't jump to line 1000 because the condition on line 1008 was always true

1009 grants.append(grant_info) 

1010 if grants: 

1011 article_metadata["grants"] = grants 

1012 

1013 # Check for free full text in PMC 

1014 pmc_elem = article.find(".//ArticleId[@IdType='pmc']") 

1015 if pmc_elem is not None: 

1016 article_metadata["has_free_full_text"] = True 

1017 article_metadata["pmc_id"] = pmc_elem.text 

1018 

1019 # Extract conflict of interest statement 

1020 coi_elem = article.find(".//CoiStatement") 

1021 if coi_elem is not None and coi_elem.text: 

1022 article_metadata["conflict_of_interest"] = coi_elem.text 

1023 

1024 metadata[pmid] = article_metadata 

1025 

1026 return metadata 

1027 

1028 except Exception: 

1029 logger.exception("Error getting detailed article metadata") 

1030 return {} 

1031 

1032 def _create_enriched_content( 

1033 self, result: Dict[str, Any], base_content: str 

1034 ) -> str: 

1035 """ 

1036 Create enriched content by adding relevant metadata context to help the LLM. 

1037 

1038 Args: 

1039 result: The result dictionary with metadata 

1040 base_content: The base content (abstract or full text) 

1041 

1042 Returns: 

1043 Enriched content string with metadata context 

1044 """ 

1045 enriched_parts = [] 

1046 

1047 # Add study type information 

1048 if "publication_types" in result: 

1049 pub_types = result["publication_types"] 

1050 # Filter for significant types 

1051 significant_types = [ 

1052 pt 

1053 for pt in pub_types 

1054 if any( 

1055 key in pt.lower() 

1056 for key in [ 

1057 "clinical trial", 

1058 "randomized", 

1059 "meta-analysis", 

1060 "systematic review", 

1061 "case report", 

1062 "guideline", 

1063 "comparative study", 

1064 "multicenter", 

1065 ] 

1066 ) 

1067 ] 

1068 if significant_types: 

1069 enriched_parts.append( 

1070 f"[Study Type: {', '.join(significant_types)}]" 

1071 ) 

1072 

1073 # Add the main content 

1074 enriched_parts.append(base_content) 

1075 

1076 # Add metadata footer 

1077 metadata_footer = [] 

1078 

1079 # Add ALL MeSH terms 

1080 if "mesh_terms" in result and len(result["mesh_terms"]) > 0: 

1081 metadata_footer.append( 

1082 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}" 

1083 ) 

1084 

1085 # Add ALL keywords 

1086 if "keywords" in result and len(result["keywords"]) > 0: 

1087 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}") 

1088 

1089 # Add ALL affiliations 

1090 if "affiliations" in result and len(result["affiliations"]) > 0: 

1091 if len(result["affiliations"]) == 1: 

1092 metadata_footer.append( 

1093 f"Institution: {result['affiliations'][0]}" 

1094 ) 

1095 else: 

1096 affiliations_text = "\n - " + "\n - ".join( 

1097 result["affiliations"] 

1098 ) 

1099 metadata_footer.append(f"Institutions:{affiliations_text}") 

1100 

1101 # Add ALL funding information with full details 

1102 if "grants" in result and len(result["grants"]) > 0: 

1103 grant_details = [] 

1104 for grant in result["grants"]: 

1105 grant_text = [] 

1106 if "agency" in grant: 

1107 grant_text.append(grant["agency"]) 

1108 if "id" in grant: 

1109 grant_text.append(f"(Grant ID: {grant['id']})") 

1110 if grant_text: 

1111 grant_details.append(" ".join(grant_text)) 

1112 if grant_details: 

1113 if len(grant_details) == 1: 

1114 metadata_footer.append(f"Funded by: {grant_details[0]}") 

1115 else: 

1116 funding_text = "\n - " + "\n - ".join(grant_details) 

1117 metadata_footer.append(f"Funding Sources:{funding_text}") 

1118 

1119 # Add FULL conflict of interest statement 

1120 if "conflict_of_interest" in result: 

1121 coi_text = result["conflict_of_interest"] 

1122 if coi_text: 

1123 # Still skip trivial "no conflict" statements to reduce noise 

1124 if not any( 

1125 phrase in coi_text.lower() 

1126 for phrase in [ 

1127 "no conflict", 

1128 "no competing", 

1129 "nothing to disclose", 

1130 "none declared", 

1131 "authors declare no", 

1132 ] 

1133 ): 

1134 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1135 elif ( 

1136 "but" in coi_text.lower() 

1137 or "except" in coi_text.lower() 

1138 or "however" in coi_text.lower() 

1139 ): 

1140 # Include if there's a "no conflict BUT..." type statement 

1141 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1142 

1143 # Combine everything 

1144 if metadata_footer: 

1145 enriched_parts.append("\n---\nStudy Metadata:") 

1146 enriched_parts.extend(metadata_footer) 

1147 

1148 return "\n".join(enriched_parts) 

1149 

1150 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]: 

1151 """ 

1152 Find PMC IDs for the given PubMed IDs (for full-text access). 

1153 

1154 Args: 

1155 pmid_list: List of PubMed IDs 

1156 

1157 Returns: 

1158 Dictionary mapping PubMed IDs to their PMC IDs (if available) 

1159 """ 

1160 if not pmid_list or not self.get_full_text: 

1161 return {} 

1162 

1163 try: 

1164 # Prepare parameters 

1165 params = { 

1166 "dbfrom": "pubmed", 

1167 "db": "pmc", 

1168 "linkname": "pubmed_pmc", 

1169 "id": ",".join(pmid_list), 

1170 "retmode": "json", 

1171 } 

1172 

1173 # Add API key if available 

1174 if self.api_key: 1174 ↛ 1175line 1174 didn't jump to line 1175 because the condition on line 1174 was never true

1175 params["api_key"] = self.api_key 

1176 

1177 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

1178 self.engine_type 

1179 ) 

1180 

1181 # Execute request 

1182 response = safe_get(self.link_url, params=params) 

1183 response.raise_for_status() 

1184 

1185 # Parse response 

1186 data = response.json() 

1187 

1188 # Map PubMed IDs to PMC IDs 

1189 pmid_to_pmcid = {} 

1190 

1191 for linkset in data.get("linksets", []): 

1192 pmid = linkset.get("ids", [None])[0] 

1193 

1194 if not pmid: 1194 ↛ 1195line 1194 didn't jump to line 1195 because the condition on line 1194 was never true

1195 continue 

1196 

1197 for link in linkset.get("linksetdbs", []): 

1198 if link.get("linkname") == "pubmed_pmc": 1198 ↛ 1197line 1198 didn't jump to line 1197 because the condition on line 1198 was always true

1199 pmcids = link.get("links", []) 

1200 if pmcids: 1200 ↛ 1197line 1200 didn't jump to line 1197 because the condition on line 1200 was always true

1201 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}" 

1202 

1203 logger.info( 

1204 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access" 

1205 ) 

1206 return pmid_to_pmcid 

1207 

1208 except Exception: 

1209 logger.exception("Error finding PMC IDs") 

1210 return {} 

1211 

def _get_pmc_full_text(self, pmcid: str) -> str:
    """
    Get full text for a PMC article.

    The article title, abstract paragraphs and body are converted to a
    simple Markdown-like text: ``# title``, ``## Abstract`` and one
    ``## heading`` per titled section, with paragraphs separated by blank
    lines. The body is walked once in document order so each paragraph is
    emitted exactly once, directly below the heading of its own
    (sub)section.

    Args:
        pmcid: PMC ID of the article

    Returns:
        Full text content or empty string if not available
    """
    try:
        # Prepare parameters
        params = {
            "db": "pmc",
            "id": pmcid,
            "retmode": "xml",
            "rettype": "full",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute request
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)

        # Extract full text
        full_text = []

        # Extract article title (itertext keeps titles with inline markup)
        title_elem = root.find(".//article-title")
        if title_elem is not None:
            article_title = "".join(title_elem.itertext()).strip()
            if article_title:
                full_text.append(f"# {article_title}")

        # Extract abstract
        abstract_paras = root.findall(".//abstract//p")
        if abstract_paras:
            full_text.append("\n## Abstract\n")
            for p in abstract_paras:
                text = "".join(p.itertext())
                if text:
                    full_text.append(text)

        # Extract body content
        body = root.find(".//body")
        if body is not None:
            # Only a <title> that is a direct child of a <sec> is a section
            # heading; titles of figures, tables or boxes are not.
            section_titles = set()
            for section in body.iter("sec"):
                heading = section.find("title")
                if heading is not None:
                    section_titles.add(heading)

            # Paragraphs nested inside an already emitted paragraph (for
            # example list items) are part of that paragraph's text.
            emitted_paragraphs = set()

            for elem in body.iter():
                if elem.tag == "title":
                    if elem in section_titles:
                        heading_text = "".join(elem.itertext()).strip()
                        if heading_text:
                            full_text.append(f"\n## {heading_text}\n")
                elif elem.tag == "p":
                    if elem in emitted_paragraphs:
                        continue
                    emitted_paragraphs.update(elem.iter("p"))
                    text = "".join(elem.itertext())
                    if text:
                        full_text.append(text)

        result_text = "\n\n".join(full_text)
        logger.debug(
            f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
        )
        return result_text

    except Exception:
        logger.exception("Error getting PMC full text")
        return ""

1287 

def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Build preview entries for the PubMed articles matching ``query``.

    The query is optimized, searched adaptively and, when nothing is found,
    retried once in simplified form. Summaries and abstracts are then
    fetched for the resulting PMIDs and combined into one preview per
    summary whose snippet holds the title, the (truncated) abstract and a
    metadata line assembled according to the ``include_*_in_context``
    settings.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries (empty when no article is found)
    """
    logger.info(f"Getting PubMed previews for query: {query}")

    # Let the optimizer rewrite the query, then search adaptively
    pubmed_query = self._optimize_query_for_pubmed(query)
    found_ids, strategy = self._adaptive_search(pubmed_query)

    # Nothing found: retry once with a simplified query, if it differs
    if not found_ids:
        logger.warning(
            f"No PubMed results found using strategy: {strategy}"
        )
        fallback_query = self._simplify_query(pubmed_query)
        if fallback_query != pubmed_query:
            logger.info(f"Trying with simplified query: {fallback_query}")
            found_ids, strategy = self._adaptive_search(fallback_query)
            if found_ids:
                logger.info(
                    f"Simplified query found {len(found_ids)} results"
                )

    if not found_ids:
        logger.warning("No PubMed results found after query simplification")
        return []

    # Article summaries
    logger.debug(f"Fetching article summaries for {len(found_ids)} PMIDs")
    summaries = self._get_article_summaries(found_ids)
    logger.debug(f"Retrieved {len(summaries)} summaries")

    # Abstracts are always fetched: they give the LLM context in the snippet
    logger.debug(
        f"Fetching abstracts for {len(found_ids)} articles for snippet enrichment"
    )
    abstracts = self._get_article_abstracts(found_ids)
    logger.debug(f"Retrieved {len(abstracts)} abstracts")

    # (label, summary key) pairs making up the optional citation detail
    citation_fields = (("Vol", "volume"), ("Issue", "issue"), ("pp", "pages"))

    previews = []
    for summary in summaries:
        pieces = []

        # Publication type from esummary becomes a "[type] " prefix
        type_prefix = ""
        if self.include_publication_type_in_context and summary.get(
            "pubtype"
        ):
            type_prefix = f"[{summary['pubtype'][0]}] "

        # Authors, truncated to 100 characters
        if self.include_authors_in_context and summary.get("authors"):
            author_line = ", ".join(summary.get("authors", []))
            if len(author_line) > 100:
                author_line = author_line[:97] + "..."
            pieces.append(author_line)

        # Journal
        if self.include_journal_in_context and summary.get("journal"):
            pieces.append(summary["journal"])

        # Date: the full string, or only its first four characters (year)
        pubdate = summary.get("pubdate")
        if pubdate:
            if self.include_full_date_in_context:
                pieces.append(pubdate)
            elif self.include_year_in_context and len(pubdate) >= 4:
                pieces.append(pubdate[:4])

        # Citation details
        if self.include_citation_in_context:
            citation = [
                f"{label} {summary[field]}"
                for label, field in citation_fields
                if summary.get(field)
            ]
            if citation:
                pieces.append(f"({', '.join(citation)})")

        # Authors read better with a period separator; journal/year use a dash
        if pieces:
            separator = ". " if self.include_authors_in_context else " - "
            snippet = separator.join(pieces)
        else:
            snippet = "Research article"

        snippet = type_prefix + snippet

        # Language indicator for non-English articles
        if self.include_language_in_context and summary.get("lang"):
            langs = summary["lang"]
            if langs and langs[0] != "eng" and langs[0]:
                snippet = f"{snippet} [{langs[0].upper()}]"

        # Identifiers
        identifiers = []
        if self.include_pmid_in_context and summary.get("id"):
            identifiers.append(f"PMID: {summary['id']}")
        if self.include_doi_in_context and summary.get("doi"):
            identifiers.append(f"DOI: {summary['doi']}")
        if identifiers:
            snippet = f"{snippet} | {' | '.join(identifiers)}"

        # Title and abstract always go into the snippet for LLM analysis
        pmid = summary["id"]
        title = summary["title"]
        abstract_text = abstracts.get(pmid, "")

        # Keep the abstract within 500 characters
        if len(abstract_text) > 500:
            abstract_text = abstract_text[:497] + "..."

        if abstract_text:
            enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
        else:
            enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"

        # Trace the assembled snippet for debugging
        logger.debug(f"Complete snippet for PMID {pmid}:")
        logger.debug(f"  Title: {title[:100]}...")
        logger.debug(f"  Abstract length: {len(abstract_text)} chars")
        logger.debug(f"  Metadata: {snippet}")
        logger.debug(
            f"  Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
        )

        journal = summary.get("journal")
        previews.append(
            {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                # The journal reputation filter reads `journal_ref`; a
                # missing journal is reported as None, not "".
                "journal_ref": journal or None,
                # ISSN for the reputation filter lookups; the electronic
                # ISSN (essn) is used when the issn field is blank.
                "issn": summary.get("issn") or summary.get("essn") or None,
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": summary["id"],  # temporary: PMID for later use
                "_search_strategy": strategy,  # temporary: for analytics
            }
        )

    logger.info(
        f"Found {len(previews)} PubMed previews using strategy: {strategy}"
    )
    if previews:
        logger.debug(
            f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
        )
    return previews

1476 

def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant PubMed articles.
    Efficiently manages which content to retrieve (abstracts and/or full text).

    For every item the detailed metadata is merged in, the snippet is
    rebuilt as ``Title / Abstract / Metadata`` and ``content`` /
    ``full_content`` are set to the enriched abstract. For at most
    ``self.full_text_limit`` articles with a PMC ID (and only outside
    snippet-only mode) the content is replaced by the enriched full text.
    The temporary ``_pmid`` and ``_search_strategy`` fields are removed.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content
    """
    # Check if we should add full content
    snippets_only_mode = (
        hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
        and search_config.SEARCH_SNIPPETS_ONLY
    )

    if snippets_only_mode:
        logger.info(
            "Snippet-only mode enabled, will fetch abstracts as snippets"
        )
        # For PubMed, we still need to fetch abstracts as they serve as snippets
        # But we'll skip full-text retrieval

    logger.info(
        f"Getting content for {len(relevant_items)} PubMed articles"
    )

    # Collect all PMIDs for relevant items
    pmids = [item["_pmid"] for item in relevant_items if "_pmid" in item]

    # Get abstracts if requested and PMIDs exist
    # In snippet-only mode, always get abstracts as they serve as snippets
    abstracts = {}
    if (self.get_abstracts or snippets_only_mode) and pmids:
        abstracts = self._get_article_abstracts(pmids)

    # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
    detailed_metadata = {}
    if pmids:
        detailed_metadata = self._get_article_detailed_metadata(pmids)

    # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
    pmid_to_pmcid = {}
    if self.get_full_text and pmids and not snippets_only_mode:
        pmid_to_pmcid = self._find_pmc_ids(pmids)

    # Separator placed in front of the metadata line of a preview snippet
    metadata_marker = "\n\nMetadata: "

    # Add content to results
    results: List[Dict[str, Any]] = []
    full_text_count = 0  # number of results that received PMC full text
    for item in relevant_items:
        result = item.copy()
        pmid = item.get("_pmid", "")

        # A preview snippet already has the form
        # "Title: ...[\n\nAbstract: ...]\n\nMetadata: ...". Reduce it to its
        # metadata line first so the title and abstract are not repeated
        # when the snippet is rebuilt below.
        preview_snippet = result.get("snippet")
        had_title_wrapper = (
            isinstance(preview_snippet, str)
            and preview_snippet.startswith("Title: ")
            and metadata_marker in preview_snippet
        )
        if had_title_wrapper:
            result["snippet"] = preview_snippet.rsplit(metadata_marker, 1)[1]

        # Add detailed metadata if available
        if pmid in detailed_metadata:
            metadata = detailed_metadata[pmid]

            # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
            if "publication_types" in metadata:
                result["publication_types"] = metadata["publication_types"]

                # Add first publication type to snippet if enabled
                if (
                    self.include_publication_type_in_context
                    and metadata["publication_types"]
                ):
                    # Just take the first publication type as is
                    pub_type = metadata["publication_types"][0]
                    if "snippet" in result:
                        result["snippet"] = (
                            f"[{pub_type}] {result['snippet']}"
                        )

            # Add MeSH terms for medical categorization
            if "mesh_terms" in metadata:
                result["mesh_terms"] = metadata["mesh_terms"]

                # Add MeSH terms to snippet if enabled
                if (
                    self.include_mesh_terms_in_context
                    and metadata["mesh_terms"]
                ):
                    # A non-positive max_mesh_terms means "no limit"
                    mesh_to_show = (
                        metadata["mesh_terms"][: self.max_mesh_terms]
                        if self.max_mesh_terms > 0
                        else metadata["mesh_terms"]
                    )
                    if mesh_to_show and "snippet" in result:
                        mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                        result["snippet"] = (
                            f"{result['snippet']} | {mesh_text}"
                        )

            # Add keywords
            if "keywords" in metadata:
                result["keywords"] = metadata["keywords"]

                # Add keywords to snippet if enabled
                if (
                    self.include_keywords_in_context
                    and metadata["keywords"]
                ):
                    # A non-positive max_keywords means "no limit"
                    keywords_to_show = (
                        metadata["keywords"][: self.max_keywords]
                        if self.max_keywords > 0
                        else metadata["keywords"]
                    )
                    if keywords_to_show and "snippet" in result:
                        keywords_text = "Keywords: " + ", ".join(
                            keywords_to_show
                        )
                        result["snippet"] = (
                            f"{result['snippet']} | {keywords_text}"
                        )

            # Add affiliations
            if "affiliations" in metadata:
                result["affiliations"] = metadata["affiliations"]

            # Add funding/grant information
            if "grants" in metadata:
                result["grants"] = metadata["grants"]

            # Add conflict of interest statement
            if "conflict_of_interest" in metadata:
                result["conflict_of_interest"] = metadata[
                    "conflict_of_interest"
                ]

            # Add free full text availability
            if "has_free_full_text" in metadata:
                result["has_free_full_text"] = metadata[
                    "has_free_full_text"
                ]
                if "pmc_id" in metadata:
                    result["pmc_id"] = metadata["pmc_id"]

                # Add PMC availability to snippet if enabled
                if (
                    self.include_pmc_availability_in_context
                    and metadata["has_free_full_text"]
                    and "snippet" in result
                ):
                    result["snippet"] = (
                        f"{result['snippet']} | [Free Full Text]"
                    )

        # Add abstract if available
        if pmid in abstracts:
            abstract = abstracts[pmid]
            result["abstract"] = abstract

            # Create enriched content with metadata context
            enriched_content = self._create_enriched_content(
                result, abstract
            )

            # ALWAYS include title and abstract in snippet for LLM analysis
            title = result.get("title", "")
            abstract_text = abstract[:SNIPPET_LENGTH_LONG]

            if "snippet" in result:
                # Keep metadata snippet and add content
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                )
            else:
                # No metadata snippet, just title and abstract
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}"
                )

            # The enriched abstract is the default content. It is replaced
            # below only when full text is actually retrieved, so articles
            # with a PMC ID beyond full_text_limit still get content.
            result["full_content"] = enriched_content
            result["content"] = enriched_content
            result["content_type"] = "abstract"
        elif had_title_wrapper:
            # No abstract: restore the title in front of the metadata line
            result["snippet"] = (
                f"Title: {result.get('title', '')}\n\nMetadata: {result['snippet']}"
            )

        # Add full text for a limited number of top articles
        if (
            pmid in pmid_to_pmcid
            and self.get_full_text
            and full_text_count < self.full_text_limit
        ):
            # Get full text content
            pmcid = pmid_to_pmcid[pmid]
            full_text = self._get_pmc_full_text(pmcid)

            # On failure the abstract content set above (if any) is kept
            if full_text:
                enriched_full_text = self._create_enriched_content(
                    result, full_text
                )
                result["full_content"] = enriched_full_text
                result["content"] = enriched_full_text
                result["content_type"] = "full_text"
                result["pmcid"] = pmcid
                full_text_count += 1

        # Remove temporary fields
        result.pop("_pmid", None)
        result.pop("_search_strategy", None)

        results.append(result)

    return results

1709 

def search_by_author(
    self, author_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a PubMed search restricted to one author.

    ``self.max_results`` is overridden for the duration of the call when
    ``max_results`` is truthy and is always restored afterwards, even if
    the search raises.

    Args:
        author_name: Name of the author
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles by the author
    """
    previous_limit = self.max_results

    try:
        if max_results:
            self.max_results = max_results

        # "[Author]" is the PubMed field tag for the author index
        return self.run(f"{author_name}[Author]")

    finally:
        # Put the configured limit back
        self.max_results = previous_limit

1735 

def search_by_journal(
    self, journal_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a PubMed search restricted to one journal.

    ``self.max_results`` is overridden for the duration of the call when
    ``max_results`` is truthy and is always restored afterwards, even if
    the search raises.

    Args:
        journal_name: Name of the journal
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles from the journal
    """
    previous_limit = self.max_results

    try:
        if max_results:
            self.max_results = max_results

        # "[Journal]" is the PubMed field tag for the journal index
        return self.run(f"{journal_name}[Journal]")

    finally:
        # Put the configured limit back
        self.max_results = previous_limit

1761 

def search_recent(
    self, query: str, days: int = 30, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search limited to recently published articles.

    ``self.days_limit`` is set to ``days`` and ``self.max_results`` is
    overridden when ``max_results`` is truthy; both are restored after the
    call, even if the search raises.

    Args:
        query: The search query
        days: Number of days to look back
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of recent articles matching the query
    """
    previous_limit, previous_days = self.max_results, self.days_limit

    try:
        if max_results:
            self.max_results = max_results

        # Look-back window applied to this search only
        self.days_limit = days

        return self.run(query)

    finally:
        # Put both configured values back
        self.max_results = previous_limit
        self.days_limit = previous_days

1792 

def advanced_search(
    self, terms: Dict[str, str], max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Perform an advanced search with field-specific terms.

    Each non-blank term is rendered as ``term[Field]`` and the parts are
    combined with ``AND``. Blank terms are ignored, because ``[Field]``
    without a term is not a valid PubMed query. When no usable term
    remains, no request is made and an empty list is returned.

    ``self.max_results`` is overridden for the duration of the call when
    ``max_results`` is truthy and is always restored afterwards.

    Args:
        terms: Dictionary mapping fields to search terms
              Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles matching the advanced query
    """
    # Build advanced query string, skipping empty / whitespace-only terms
    query_parts = [
        f"{term}[{field}]"
        for field, term in (terms or {}).items()
        if term and str(term).strip()
    ]
    if not query_parts:
        # Nothing to search for: avoid sending an empty query to the API
        return []

    original_max_results = self.max_results

    try:
        if max_results:
            self.max_results = max_results

        return self.run(" AND ".join(query_parts))

    finally:
        # Restore original value
        self.max_results = original_max_results