# src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py


import re
import xml.etree.ElementTree as ET
from typing import Any, Dict, List, Optional, Tuple

from langchain_core.language_models import BaseLLM
from loguru import logger

from ...config import search_config
from ...security.safe_requests import safe_get
from ..rate_limiting import RateLimitError
from ..search_engine_base import BaseSearchEngine


class PubMedSearchEngine(BaseSearchEngine):
    """
    PubMed search engine implementation with a two-phase approach and adaptive search.
    Provides efficient access to biomedical literature while minimizing API usage.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/medical search engine
    is_scientific = True

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        days_limit: Optional[int] = None,
        get_abstracts: bool = True,
        get_full_text: bool = False,
        full_text_limit: int = 3,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        include_publication_type_in_context: bool = True,
        include_journal_in_context: bool = True,
        include_year_in_context: bool = True,
        include_authors_in_context: bool = False,
        include_full_date_in_context: bool = False,
        include_mesh_terms_in_context: bool = True,
        include_keywords_in_context: bool = True,
        include_doi_in_context: bool = False,
        include_pmid_in_context: bool = False,
        include_pmc_availability_in_context: bool = False,
        max_mesh_terms: int = 3,
        max_keywords: int = 3,
        include_citation_in_context: bool = False,
        include_language_in_context: bool = False,
    ):
        """
        Initialize the PubMed search engine.

        Args:
            max_results: Maximum number of search results
            api_key: NCBI API key for higher rate limits (optional)
            days_limit: Limit results to the last N days (optional)
            get_abstracts: Whether to fetch abstracts for all results
            get_full_text: Whether to fetch full text content (when available in PMC)
            full_text_limit: Max number of full-text articles to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries for PubMed
            include_*_in_context: Flags controlling which metadata fields
                (publication type, journal, year, authors, ...) are added to
                result snippets
            max_mesh_terms: Max MeSH terms shown in context (non-positive = all)
            max_keywords: Max keywords shown in context (non-positive = all)
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        # Note: enforces a floor of 25 results, overriding smaller requested values
        self.max_results = max(self.max_results, 25)
        self.api_key = api_key
        self.days_limit = days_limit
        self.get_abstracts = get_abstracts
        self.get_full_text = get_full_text
        self.full_text_limit = full_text_limit
        self.optimize_queries = optimize_queries
        self.include_publication_type_in_context = (
            include_publication_type_in_context
        )
        self.include_journal_in_context = include_journal_in_context
        self.include_year_in_context = include_year_in_context
        self.include_authors_in_context = include_authors_in_context
        self.include_full_date_in_context = include_full_date_in_context
        self.include_mesh_terms_in_context = include_mesh_terms_in_context
        self.include_keywords_in_context = include_keywords_in_context
        self.include_doi_in_context = include_doi_in_context
        self.include_pmid_in_context = include_pmid_in_context
        self.include_pmc_availability_in_context = (
            include_pmc_availability_in_context
        )
        self.max_mesh_terms = max_mesh_terms
        self.max_keywords = max_keywords
        self.include_citation_in_context = include_citation_in_context
        self.include_language_in_context = include_language_in_context

        # Base API URLs
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.search_url = f"{self.base_url}/esearch.fcgi"
        self.summary_url = f"{self.base_url}/esummary.fcgi"
        self.fetch_url = f"{self.base_url}/efetch.fcgi"
        self.link_url = f"{self.base_url}/elink.fcgi"

        # PMC base URL for full text
        self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
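    # Illustrative usage sketch (hedged: assumes the surrounding package is importable
    # and a LangChain-compatible LLM is on hand; `run()` is inherited from
    # BaseSearchEngine, and `my_llm` is a hypothetical placeholder):
    #
    #     engine = PubMedSearchEngine(max_results=10, get_full_text=False, llm=my_llm)
    #     results = engine.run("What are recent advances in mRNA vaccines?")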

    def _get_result_count(self, query: str) -> int:
        """
        Get the total number of results for a query without retrieving the results themselves.

        Args:
            query: The search query

        Returns:
            Total number of matching results
        """
        try:
            # Prepare search parameters
            params = {
                "db": "pubmed",
                "term": query,
                "retmode": "json",
                "retmax": 0,  # Don't need actual results, just the count
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute search request
            response = safe_get(self.search_url, params=params)
            response.raise_for_status()

            # Parse response
            data = response.json()
            count = int(data["esearchresult"]["count"])

            logger.info(
                f"Query '{query}' has {count} total results in PubMed"
            )
            return count

        except Exception:
            logger.exception("Error getting result count")
            return 0
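    # For reference, the count lookup above reduces to a single esearch call of the form
    #
    #     GET {base_url}/esearch.fcgi?db=pubmed&term=<query>&retmode=json&retmax=0
    #
    # with the total read from response["esearchresult"]["count"].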

    def _extract_core_terms(self, query: str) -> str:
        """
        Extract core terms from a complex query for volume estimation.

        Args:
            query: PubMed query string

        Returns:
            Simplified query with core terms
        """
        # Remove field specifications and operators
        simplified = re.sub(r"\[\w+\]", "", query)  # Remove [Field] tags
        simplified = re.sub(
            r"\b(AND|OR|NOT)\b", "", simplified
        )  # Remove operators

        # Remove quotes and parentheses
        simplified = (
            simplified.replace('"', "").replace("(", "").replace(")", "")
        )

        # Split on whitespace and keep terms with 4+ chars (likely meaningful)
        terms = [term for term in simplified.split() if len(term) >= 4]

        # Join with spaces (PubMed treats adjacent terms as an implicit AND)
        return " ".join(terms[:5])  # Limit to the top 5 terms
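    # Worked example, traced through the substitutions above:
    #
    #     _extract_core_terms('("Vaccines"[Mesh] OR vaccination) AND mRNA')
    #     -> 'Vaccines vaccination mRNA'
    #
    # Note that r"\[\w+\]" only strips simple tags such as [Mesh]; compound tags like
    # [Title/Abstract] survive because "/" is not a word character.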

    def _expand_time_window(self, time_filter: str) -> str:
        """
        Expand a time window to get more results.

        Args:
            time_filter: Current time filter

        Returns:
            Expanded time filter
        """
        # Parse current time window
        match = re.match(r'"last (\d+) (\w+)"\[pdat\]', time_filter)
        if not match:
            return '"last 10 years"[pdat]'

        amount, unit = int(match.group(1)), match.group(2)

        # Expand based on current unit
        if unit == "months" or unit == "month":
            if amount < 6:
                return '"last 6 months"[pdat]'
            elif amount < 12:
                return '"last 1 year"[pdat]'
            else:
                return '"last 2 years"[pdat]'
        elif unit == "years" or unit == "year":
            if amount < 2:
                return '"last 2 years"[pdat]'
            elif amount < 5:
                return '"last 5 years"[pdat]'
            else:
                return '"last 10 years"[pdat]'

        return '"last 10 years"[pdat]'
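    # Expansion ladder implied by the branches above:
    #
    #     '"last 3 months"[pdat]' -> '"last 6 months"[pdat]'
    #     '"last 1 year"[pdat]'   -> '"last 2 years"[pdat]'
    #     '"last 3 years"[pdat]'  -> '"last 5 years"[pdat]'
    #     '"last 5 years"[pdat]'  -> '"last 10 years"[pdat]'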

    def _optimize_query_for_pubmed(self, query: str) -> str:
        """
        Optimize a natural language query for PubMed search.
        Uses an LLM to transform questions into effective keyword-based queries.

        Args:
            query: Natural language query

        Returns:
            Optimized query string for PubMed
        """
        if not self.llm or not self.optimize_queries:
            # Return the original query if no LLM is available or optimization is disabled
            return query

        try:
            # Prompt for query optimization
            prompt = f"""Transform this natural language question into an optimized PubMed search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. DO NOT wrap the entire query in quotes
3. DO NOT include ANY date restrictions or year filters
4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
6. KEEP IT SIMPLE - use 2-3 main concepts maximum
7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
8. Use wildcards for variations: vaccin*[Title/Abstract]

EXAMPLE QUERIES:
✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
✗ BAD: "Here's a query to find articles about vaccines..."

Return ONLY the search query without any explanations.
"""

            # Get response from LLM
            response = self.llm.invoke(prompt)
            raw_response = response.content.strip()

            # Clean up the response - extract only the actual query and drop any
            # explanations. First, take the first non-empty line.
            lines = raw_response.split("\n")
            cleaned_lines = [line.strip() for line in lines if line.strip()]

            if cleaned_lines:
                optimized_query = cleaned_lines[0]

                # Remove any quotes that wrap the entire query
                if optimized_query.startswith('"') and optimized_query.endswith(
                    '"'
                ):
                    optimized_query = optimized_query[1:-1]

                # Remove any explanation phrases at the beginning
                explanation_starters = [
                    "here is",
                    "here's",
                    "this query",
                    "the following",
                ]
                for starter in explanation_starters:
                    if optimized_query.lower().startswith(starter):
                        # Find the actual query part - typically after a colon
                        colon_pos = optimized_query.find(":")
                        if colon_pos > 0:
                            optimized_query = optimized_query[
                                colon_pos + 1 :
                            ].strip()

                # Check if the query still seems to contain explanations
                if (
                    len(optimized_query) > 200
                    or "this query will" in optimized_query.lower()
                ):
                    # It's probably still an explanation - try to extract just the
                    # query part by looking for patterns such as parenthesized groups
                    pattern = r"\([^)]+\)\s+AND\s+"
                    matches = re.findall(pattern, optimized_query)
                    if matches:
                        # Extract just the query syntax parts
                        query_parts = []
                        for part in re.split(r"\.\s+", optimized_query):
                            if (
                                "(" in part
                                and ")" in part
                                and ("AND" in part or "OR" in part)
                            ):
                                query_parts.append(part)
                        if query_parts:
                            optimized_query = " ".join(query_parts)
            else:
                # Fall back to the original query if cleaning fails
                logger.warning(
                    "Failed to extract a clean query from LLM response"
                )
                optimized_query = query

            # Final safety check - if the query still reads like an explanation,
            # rebuild a simple one from the original
            if len(optimized_query.split()) > 30:
                logger.warning(
                    "Query too verbose, falling back to simpler form"
                )
                # Create a simple query from the original
                words = [
                    w
                    for w in query.split()
                    if len(w) > 3
                    and w.lower()
                    not in (
                        "what",
                        "are",
                        "the",
                        "and",
                        "for",
                        "with",
                        "from",
                        "have",
                        "been",
                        "recent",
                    )
                ]
                optimized_query = " AND ".join(words[:3])

            # Basic cleanup: standardize field tag case for consistency
            optimized_query = re.sub(
                r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
            )
            optimized_query = re.sub(
                r"\[title/abstract\]",
                "[Title/Abstract]",
                optimized_query,
                flags=re.IGNORECASE,
            )
            optimized_query = re.sub(
                r"\[publication type\]",
                "[Publication Type]",
                optimized_query,
                flags=re.IGNORECASE,
            )

            # Fix unclosed quotes followed by field tags
            # Pattern: "term[Field] -> "term"[Field]
            optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query)

            # Cache the optimized query so it can be simplified later if no results are found
            self._simplify_query_cache = optimized_query

            # Log original and optimized queries
            logger.info(f"Original query: '{query}'")
            logger.info(f"Optimized for PubMed: '{optimized_query}'")
            logger.debug(
                f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
            )

            return optimized_query

        except Exception:
            logger.exception("Error optimizing query")
            logger.debug(f"Falling back to original query: '{query}'")
            return query  # Fall back to the original query on error

    def _simplify_query(self, query: str) -> str:
        """
        Simplify a PubMed query that returned no results.
        Progressively removes elements to get a more basic query.

        Args:
            query: The original query that returned no results

        Returns:
            Simplified query
        """
        logger.info(f"Simplifying query: {query}")
        logger.debug(f"Query simplification started for: '{query[:100]}...'")

        # Simple approach: remove field restrictions to broaden the search
        simplified = query

        # Remove [Mesh] tags - search in all fields instead
        simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE)

        # Remove [Publication Type] tags
        simplified = re.sub(
            r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE
        )

        # Keep [Title/Abstract] as it's usually helpful
        # Clean up any double spaces
        simplified = re.sub(r"\s+", " ", simplified).strip()

        # If no simplification was possible, note it (the query is returned unchanged)
        if simplified == query:
            logger.debug("No simplification possible, returning original query")

        logger.info(f"Simplified query: {simplified}")
        logger.debug(
            f"Query simplified from {len(query)} to {len(simplified)} chars"
        )
        return simplified
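    # Worked example of the tag stripping above:
    #
    #     _simplify_query('("Vaccines"[Mesh]) AND efficacy[Title/Abstract]')
    #     -> '("Vaccines") AND efficacy[Title/Abstract]'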

    def _is_historical_focused(self, query: str) -> bool:
        """
        Determine if a query is specifically focused on historical/older information using an LLM.
        The default assumption is that queries should prioritize recent information unless
        explicitly asking for historical content.

        Args:
            query: The search query

        Returns:
            Boolean indicating if the query is focused on historical information
        """
        if not self.llm:
            # Fall back to a basic keyword check if no LLM is available
            historical_terms = [
                "history",
                "historical",
                "early",
                "initial",
                "first",
                "original",
                "before",
                "prior to",
                "origins",
                "evolution",
                "development",
            ]
            historical_years = [str(year) for year in range(1900, 2020)]

            query_lower = query.lower()
            has_historical_term = any(
                term in query_lower for term in historical_terms
            )
            has_past_year = any(year in query for year in historical_years)

            return has_historical_term or has_past_year

        try:
            # Use the LLM to determine if the query is focused on historical information
            prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.

Query: "{query}"

Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.

The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
"""

            response = self.llm.invoke(prompt)
            answer = response.content.strip().lower()

            # Log the determination
            logger.info(f"Historical focus determination for query: '{query}'")
            logger.info(f"LLM determined historical focus: {answer}")

            return "yes" in answer

        except Exception:
            logger.exception("Error determining historical focus")
            # Fall back to a basic keyword check
            historical_terms = [
                "history",
                "historical",
                "early",
                "initial",
                "first",
                "original",
                "before",
                "prior to",
                "origins",
                "evolution",
                "development",
            ]
            return any(term in query.lower() for term in historical_terms)

    def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
        """
        Perform an adaptive search that adjusts based on topic volume and whether
        the query focuses on historical information.

        Args:
            query: The search query (already optimized)

        Returns:
            Tuple of (list of PMIDs, search strategy used)
        """
        # Estimate topic volume
        estimated_volume = self._get_result_count(query)

        # Determine if the query is focused on historical information
        is_historical_focused = self._is_historical_focused(query)

        if is_historical_focused:
            # User wants historical information - no date filtering
            time_filter = None
            strategy = "historical_focus"
        elif estimated_volume > 5000:
            # Very common topic - use tighter recency filter
            time_filter = '"last 1 year"[pdat]'
            strategy = "high_volume"
        elif estimated_volume > 1000:
            # Common topic
            time_filter = '"last 3 years"[pdat]'
            strategy = "common_topic"
        elif estimated_volume > 100:
            # Moderate volume
            time_filter = '"last 5 years"[pdat]'
            strategy = "moderate_volume"
        else:
            # Rare topic - still use recency but with wider range
            time_filter = '"last 10 years"[pdat]'
            strategy = "rare_topic"

        # Run search based on strategy
        if time_filter:
            # Try with adaptive time filter
            query_with_time = f"({query}) AND {time_filter}"
            logger.info(
                f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
            )
            results = self._search_pubmed(query_with_time)

            # If too few results, gradually expand time window
            if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
                logger.info(
                    f"Insufficient results ({len(results)}), expanding time window"
                )
                expanded_time = self._expand_time_window(time_filter)
                query_with_expanded_time = f"({query}) AND {expanded_time}"
                expanded_results = self._search_pubmed(query_with_expanded_time)

                if len(expanded_results) > len(results):
                    logger.info(
                        f"Expanded time window yielded {len(expanded_results)} results"
                    )
                    return expanded_results, f"{strategy}_expanded"

            # If still no results, try without time filter
            if not results:
                logger.info(
                    "No results with time filter, trying without time restrictions"
                )
                results = self._search_pubmed(query)
                strategy = "no_time_filter"
        else:
            # Historical query - run without time filter
            logger.info(
                "Using historical search strategy without date filtering"
            )
            results = self._search_pubmed(query)

        return results, strategy
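    # Volume-to-filter mapping used by _adaptive_search:
    #
    #     > 5000 results -> '"last 1 year"[pdat]'   (high_volume)
    #     > 1000         -> '"last 3 years"[pdat]'  (common_topic)
    #     > 100          -> '"last 5 years"[pdat]'  (moderate_volume)
    #     otherwise      -> '"last 10 years"[pdat]' (rare_topic)
    #     historical     -> no date filter          (historical_focus)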

    def _search_pubmed(self, query: str) -> List[str]:
        """
        Search PubMed and return a list of article IDs.

        Args:
            query: The search query

        Returns:
            List of PubMed IDs matching the query
        """
        try:
            # Prepare search parameters
            params = {
                "db": "pubmed",
                "term": query,
                "retmode": "json",
                "retmax": self.max_results,
                "usehistory": "y",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key
                logger.debug("Using PubMed API key for higher rate limits")
            else:
                logger.debug("No PubMed API key - using default rate limits")

            # Add date restriction if specified
            if self.days_limit:
                params["reldate"] = self.days_limit
                params["datetype"] = "pdat"  # Publication date
                logger.debug(f"Limiting results to last {self.days_limit} days")

            logger.debug(
                f"PubMed search query: '{query}' with max_results={self.max_results}"
            )

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute search request
            logger.debug(f"Sending request to PubMed API: {self.search_url}")
            response = safe_get(self.search_url, params=params)
            response.raise_for_status()
            logger.debug(f"PubMed API response status: {response.status_code}")

            # Parse response
            data = response.json()
            id_list = data["esearchresult"]["idlist"]
            total_count = data["esearchresult"].get("count", "unknown")

            logger.info(
                f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
            )
            if len(id_list) > 0:
                logger.debug(f"First 5 PMIDs: {id_list[:5]}")
            return id_list

        except Exception:
            logger.exception(f"Error searching PubMed for query '{query}'")
            return []
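    # The esearch JSON consumed above looks roughly like this (abridged,
    # hypothetical IDs):
    #
    #     {"esearchresult": {"count": "1234", "idlist": ["38012345", "37998877", ...]}}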

    def _get_article_summaries(
        self, id_list: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get summaries for a list of PubMed article IDs.

        Args:
            id_list: List of PubMed IDs

        Returns:
            List of article summary dictionaries
        """
        if not id_list:
            logger.debug("Empty ID list provided to _get_article_summaries")
            return []

        logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

        try:
            # Prepare parameters
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "json",
                "rettype": "summary",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute request
            logger.debug(f"Requesting summaries from: {self.summary_url}")
            response = safe_get(self.summary_url, params=params)
            response.raise_for_status()
            logger.debug(f"Summary API response status: {response.status_code}")

            # Parse response
            data = response.json()
            logger.debug(
                f"PubMed API returned data for {len(id_list)} requested IDs"
            )
            summaries = []

            for pmid in id_list:
                if pmid in data["result"]:
                    article = data["result"][pmid]
                    logger.debug(
                        f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
                    )

                    # Extract authors (if available)
                    authors = []
                    if "authors" in article:
                        authors = [
                            author["name"] for author in article["authors"]
                        ]

                    # Extract DOI from articleids if not in the main field
                    doi = article.get("doi", "")
                    if not doi and "articleids" in article:
                        for aid in article["articleids"]:
                            if aid.get("idtype") == "doi":
                                doi = aid.get("value", "")
                                break

                    # Create summary dictionary with all available fields
                    summary = {
                        "id": pmid,
                        "title": article.get("title", ""),
                        "pubdate": article.get("pubdate", ""),
                        "epubdate": article.get("epubdate", ""),
                        "source": article.get("source", ""),
                        "authors": authors,
                        "lastauthor": article.get("lastauthor", ""),
                        "journal": article.get("fulljournalname", ""),
                        "volume": article.get("volume", ""),
                        "issue": article.get("issue", ""),
                        "pages": article.get("pages", ""),
                        "doi": doi,
                        "issn": article.get("issn", ""),
                        "essn": article.get("essn", ""),
                        "pubtype": article.get(
                            "pubtype", []
                        ),  # Publication types from esummary
                        "recordstatus": article.get("recordstatus", ""),
                        "lang": article.get("lang", []),
                        "pmcrefcount": article.get("pmcrefcount", None),
                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    }

                    summaries.append(summary)
                else:
                    logger.warning(
                        f"PMID {pmid} not found in PubMed API response"
                    )

            return summaries

        except Exception as e:
            error_msg = str(e)
            logger.exception(
                f"Error getting article summaries for {len(id_list)} articles"
            )

            # Check for rate limiting patterns
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
                or "403" in error_msg
            ):
                raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

            return []
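    # esummary responses are keyed by PMID under "result", roughly (abridged,
    # hypothetical values):
    #
    #     {"result": {"38012345": {"title": "...", "fulljournalname": "...",
    #                              "pubdate": "2024 Jan", "articleids": [...]}}}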

    def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
        """
        Get abstracts for a list of PubMed article IDs.

        Args:
            id_list: List of PubMed IDs

        Returns:
            Dictionary mapping PubMed IDs to their abstracts
        """
        if not id_list:
            logger.debug("Empty ID list provided to _get_article_abstracts")
            return {}

        logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")

        try:
            # Prepare parameters
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "xml",
                "rettype": "abstract",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute request
            logger.debug(f"Requesting abstracts from: {self.fetch_url}")
            response = safe_get(self.fetch_url, params=params)
            response.raise_for_status()
            logger.debug(
                f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
            )

            # Parse XML response
            root = ET.fromstring(response.text)
            logger.debug(
                f"Parsing abstracts from XML for {len(id_list)} articles"
            )

            # Extract abstracts
            abstracts = {}

            for article in root.findall(".//PubmedArticle"):
                pmid_elem = article.find(".//PMID")
                pmid = pmid_elem.text if pmid_elem is not None else None

                if pmid is None:
                    continue

                # Find abstract text
                abstract_text = ""
                abstract_elem = article.find(".//AbstractText")

                if abstract_elem is not None:
                    abstract_text = abstract_elem.text or ""

                # Some abstracts are split into multiple sections
                abstract_sections = article.findall(".//AbstractText")
                if len(abstract_sections) > 1:
                    logger.debug(
                        f"Article {pmid} has {len(abstract_sections)} abstract sections"
                    )

                    for section in abstract_sections:
                        # Get the section label if it exists
                        label = section.get("Label")
                        section_text = section.text or ""

                        if label and section_text:
                            if abstract_text:
                                abstract_text += f"\n\n{label}: {section_text}"
                            else:
                                abstract_text = f"{label}: {section_text}"
                        elif section_text:
                            if abstract_text:
                                abstract_text += f"\n\n{section_text}"
                            else:
                                abstract_text = section_text

                # Store in dictionary
                if pmid and abstract_text:
                    abstracts[pmid] = abstract_text
                    logger.debug(
                        f"Abstract for {pmid}: {len(abstract_text)} chars"
                    )
                elif pmid:
                    logger.warning(f"No abstract found for PMID {pmid}")

            logger.info(
                f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
            )
            return abstracts

        except Exception as e:
            logger.exception(
                f"Error getting article abstracts for {len(id_list)} articles: {str(e)}"
            )
            return {}
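    # The efetch XML parsed above nests abstracts roughly like this (abridged):
    #
    #     <PubmedArticle>
    #       <MedlineCitation><PMID>38012345</PMID>
    #         <Abstract>
    #           <AbstractText Label="BACKGROUND">...</AbstractText>
    #           <AbstractText Label="RESULTS">...</AbstractText>
    #         </Abstract>
    #       </MedlineCitation>
    #     </PubmedArticle>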

    def _get_article_detailed_metadata(
        self, id_list: List[str]
    ) -> Dict[str, Dict[str, Any]]:
        """
        Get detailed metadata for PubMed articles including publication types,
        MeSH terms, keywords, and affiliations.

        Args:
            id_list: List of PubMed IDs

        Returns:
            Dictionary mapping PubMed IDs to their detailed metadata
        """
        if not id_list:
            return {}

        try:
            # Prepare parameters
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "xml",
                "rettype": "medline",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute request
            response = safe_get(self.fetch_url, params=params)
            response.raise_for_status()

            # Parse XML response
            root = ET.fromstring(response.text)

            metadata = {}

            for article in root.findall(".//PubmedArticle"):
                pmid_elem = article.find(".//PMID")
                pmid = pmid_elem.text if pmid_elem is not None else None

                if pmid is None:
                    continue

                article_metadata = {}

                # Extract publication types
                pub_types = []
                for pub_type in article.findall(".//PublicationType"):
                    if pub_type.text:
                        pub_types.append(pub_type.text)
                if pub_types:
                    article_metadata["publication_types"] = pub_types

                # Extract MeSH terms
                mesh_terms = []
                for mesh in article.findall(".//MeshHeading"):
                    descriptor = mesh.find(".//DescriptorName")
                    if descriptor is not None and descriptor.text:
                        mesh_terms.append(descriptor.text)
                if mesh_terms:
                    article_metadata["mesh_terms"] = mesh_terms

                # Extract keywords
                keywords = []
                for keyword in article.findall(".//Keyword"):
                    if keyword.text:
                        keywords.append(keyword.text)
                if keywords:
                    article_metadata["keywords"] = keywords

                # Extract affiliations
                affiliations = []
                for affiliation in article.findall(".//Affiliation"):
                    if affiliation.text:
                        affiliations.append(affiliation.text)
                if affiliations:
                    article_metadata["affiliations"] = affiliations

                # Extract grant information
                grants = []
                for grant in article.findall(".//Grant"):
                    grant_info = {}
                    grant_id = grant.find(".//GrantID")
                    if grant_id is not None and grant_id.text:
                        grant_info["id"] = grant_id.text
                    agency = grant.find(".//Agency")
                    if agency is not None and agency.text:
                        grant_info["agency"] = agency.text
                    if grant_info:
                        grants.append(grant_info)
                if grants:
                    article_metadata["grants"] = grants

                # Check for free full text in PMC
                pmc_elem = article.find(".//ArticleId[@IdType='pmc']")
                if pmc_elem is not None:
                    article_metadata["has_free_full_text"] = True
                    article_metadata["pmc_id"] = pmc_elem.text

                # Extract conflict of interest statement
                coi_elem = article.find(".//CoiStatement")
                if coi_elem is not None and coi_elem.text:
                    article_metadata["conflict_of_interest"] = coi_elem.text

                metadata[pmid] = article_metadata

            return metadata

        except Exception:
            logger.exception("Error getting detailed article metadata")
            return {}
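    # The MEDLINE XML consumed above carries the extra fields roughly as (abridged):
    #
    #     <PublicationType>Randomized Controlled Trial</PublicationType>
    #     <MeshHeading><DescriptorName>Influenza Vaccines</DescriptorName></MeshHeading>
    #     <Keyword>immunogenicity</Keyword>
    #     <ArticleId IdType="pmc">PMC9876543</ArticleId>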

    def _create_enriched_content(
        self, result: Dict[str, Any], base_content: str
    ) -> str:
        """
        Create enriched content by adding relevant metadata context to help the LLM.

        Args:
            result: The result dictionary with metadata
            base_content: The base content (abstract or full text)

        Returns:
            Enriched content string with metadata context
        """
        enriched_parts = []

        # Add study type information
        if "publication_types" in result:
            pub_types = result["publication_types"]
            # Filter for significant types
            significant_types = [
                pt
                for pt in pub_types
                if any(
                    key in pt.lower()
                    for key in [
                        "clinical trial",
                        "randomized",
                        "meta-analysis",
                        "systematic review",
                        "case report",
                        "guideline",
                        "comparative study",
                        "multicenter",
                    ]
                )
            ]
            if significant_types:
                enriched_parts.append(
                    f"[Study Type: {', '.join(significant_types)}]"
                )

        # Add the main content
        enriched_parts.append(base_content)

        # Add metadata footer
        metadata_footer = []

        # Add ALL MeSH terms
        if "mesh_terms" in result and len(result["mesh_terms"]) > 0:
            metadata_footer.append(
                f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"
            )

        # Add ALL keywords
        if "keywords" in result and len(result["keywords"]) > 0:
            metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")

        # Add ALL affiliations
        if "affiliations" in result and len(result["affiliations"]) > 0:
            if len(result["affiliations"]) == 1:
                metadata_footer.append(
                    f"Institution: {result['affiliations'][0]}"
                )
            else:
                affiliations_text = "\n - " + "\n - ".join(
                    result["affiliations"]
                )
                metadata_footer.append(f"Institutions:{affiliations_text}")

        # Add ALL funding information with full details
        if "grants" in result and len(result["grants"]) > 0:
            grant_details = []
            for grant in result["grants"]:
                grant_text = []
                if "agency" in grant:
                    grant_text.append(grant["agency"])
                if "id" in grant:
                    grant_text.append(f"(Grant ID: {grant['id']})")
                if grant_text:
                    grant_details.append(" ".join(grant_text))
            if grant_details:
                if len(grant_details) == 1:
                    metadata_footer.append(f"Funded by: {grant_details[0]}")
                else:
                    funding_text = "\n - " + "\n - ".join(grant_details)
                    metadata_footer.append(f"Funding Sources:{funding_text}")

        # Add the FULL conflict of interest statement
        if "conflict_of_interest" in result:
            coi_text = result["conflict_of_interest"]
            if coi_text:
                # Still skip trivial "no conflict" statements to reduce noise
                if not any(
                    phrase in coi_text.lower()
                    for phrase in [
                        "no conflict",
                        "no competing",
                        "nothing to disclose",
                        "none declared",
                        "authors declare no",
                    ]
                ):
                    metadata_footer.append(f"Conflict of Interest: {coi_text}")
                elif (
                    "but" in coi_text.lower()
                    or "except" in coi_text.lower()
                    or "however" in coi_text.lower()
                ):
                    # Include if there's a "no conflict BUT..." type statement
                    metadata_footer.append(f"Conflict of Interest: {coi_text}")

        # Combine everything
        if metadata_footer:
            enriched_parts.append("\n---\nStudy Metadata:")
            enriched_parts.extend(metadata_footer)

        return "\n".join(enriched_parts)
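    # Resulting layout sketch (only sections with data are emitted):
    #
    #     [Study Type: Randomized Controlled Trial]
    #     <abstract or full text>
    #
    #     ---
    #     Study Metadata:
    #     Medical Topics (MeSH): ...
    #     Keywords: ...
    #     Funded by: ...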

    def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
        """
        Find PMC IDs for the given PubMed IDs (for full-text access).

        Args:
            pmid_list: List of PubMed IDs

        Returns:
            Dictionary mapping PubMed IDs to their PMC IDs (if available)
        """
        if not pmid_list or not self.get_full_text:
            return {}

        try:
            # Prepare parameters
            params = {
                "dbfrom": "pubmed",
                "db": "pmc",
                "linkname": "pubmed_pmc",
                "id": ",".join(pmid_list),
                "retmode": "json",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute request
            response = safe_get(self.link_url, params=params)
            response.raise_for_status()

            # Parse response
            data = response.json()

            # Map PubMed IDs to PMC IDs
            pmid_to_pmcid = {}

            for linkset in data.get("linksets", []):
                pmid = linkset.get("ids", [None])[0]

                if not pmid:
                    continue

                for link in linkset.get("linksetdbs", []):
                    if link.get("linkname") == "pubmed_pmc":
                        pmcids = link.get("links", [])
                        if pmcids:
                            pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"

            logger.info(
                f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
            )
            return pmid_to_pmcid

        except Exception:
            logger.exception("Error finding PMC IDs")
            return {}
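    # elink responses map source PMIDs to linked PMC IDs, roughly (abridged):
    #
    #     {"linksets": [{"ids": [38012345],
    #                    "linksetdbs": [{"linkname": "pubmed_pmc", "links": [9876543]}]}]}
    #
    # which the loop above turns into {"38012345": "PMC9876543"}.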

    def _get_pmc_full_text(self, pmcid: str) -> str:
        """
        Get full text for a PMC article.

        Args:
            pmcid: PMC ID of the article

        Returns:
            Full text content or empty string if not available
        """
        try:
            # Prepare parameters
            params = {
                "db": "pmc",
                "id": pmcid,
                "retmode": "xml",
                "rettype": "full",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute request
            response = safe_get(self.fetch_url, params=params)
            response.raise_for_status()

            # Parse XML response
            root = ET.fromstring(response.text)

            # Extract full text
            full_text = []

            # Extract article title
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                full_text.append(f"# {title_elem.text}")

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            if abstract_paras:
                full_text.append("\n## Abstract\n")
                for p in abstract_paras:
                    text = "".join(p.itertext())
                    if text:
                        full_text.append(text)

            # Extract body content
            body = root.find(".//body")
            if body is not None:
                for section in body.findall(".//sec"):
                    # Get section title
                    title = section.find(".//title")
                    if title is not None and title.text:
                        full_text.append(f"\n## {title.text}\n")

                    # Get paragraphs
                    for p in section.findall(".//p"):
                        text = "".join(p.itertext())
                        if text:
                            full_text.append(text)

            result_text = "\n\n".join(full_text)
            logger.debug(
                f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
            )
            return result_text

        except Exception:
            logger.exception("Error getting PMC full text")
            return ""
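    # PMC efetch returns JATS-style XML; the extraction above walks roughly:
    #
    #     <article-title>...</article-title>
    #     <abstract><p>...</p></abstract>
    #     <body><sec><title>...</title><p>...</p></sec></body>
    #
    # and joins the pieces into a Markdown-flavoured document.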

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubMed articles.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting PubMed previews for query: {query}")

        # Optimize the query for PubMed if an LLM is available
        optimized_query = self._optimize_query_for_pubmed(query)

        # Perform adaptive search
        pmid_list, strategy = self._adaptive_search(optimized_query)

        # If no results, try a simplified query
        if not pmid_list:
            logger.warning(
                f"No PubMed results found using strategy: {strategy}"
            )
            simplified_query = self._simplify_query(optimized_query)
            if simplified_query != optimized_query:
                logger.info(f"Trying with simplified query: {simplified_query}")
                pmid_list, strategy = self._adaptive_search(simplified_query)
                if pmid_list:
                    logger.info(
                        f"Simplified query found {len(pmid_list)} results"
                    )

        if not pmid_list:
            logger.warning("No PubMed results found after query simplification")
            return []

        # Get article summaries
        logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
        summaries = self._get_article_summaries(pmid_list)
        logger.debug(f"Retrieved {len(summaries)} summaries")

        # ALWAYS fetch abstracts for snippet-only mode to provide context for the LLM
        logger.debug(
            f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
        )
        abstracts = self._get_article_abstracts(pmid_list)
        logger.debug(f"Retrieved {len(abstracts)} abstracts")

        # Format as previews
        previews = []
        for summary in summaries:
            # Build the snippet from individual metadata preferences
            snippet_parts = []

            # Check for publication type from esummary (available earlier than detailed metadata)
            pub_type_prefix = ""
            if self.include_publication_type_in_context and summary.get(
                "pubtype"
            ):
                # Use the first publication type from esummary
                pub_type_prefix = f"[{summary['pubtype'][0]}] "

            # Add authors if enabled
            if self.include_authors_in_context and summary.get("authors"):
                authors_text = ", ".join(summary.get("authors", []))
                if len(authors_text) > 100:
                    # Truncate long author lists
                    authors_text = authors_text[:97] + "..."
                snippet_parts.append(authors_text)

            # Add journal if enabled
            if self.include_journal_in_context and summary.get("journal"):
                snippet_parts.append(summary["journal"])

            # Add date (full or year only)
            if summary.get("pubdate"):
                if self.include_full_date_in_context:
                    snippet_parts.append(summary["pubdate"])
                elif (
                    self.include_year_in_context
                    and len(summary["pubdate"]) >= 4
                ):
                    snippet_parts.append(summary["pubdate"][:4])

            # Add citation details if enabled
            if self.include_citation_in_context:
                citation_parts = []
                if summary.get("volume"):
                    citation_parts.append(f"Vol {summary['volume']}")
                if summary.get("issue"):
                    citation_parts.append(f"Issue {summary['issue']}")
                if summary.get("pages"):
                    citation_parts.append(f"pp {summary['pages']}")
                if citation_parts:
                    snippet_parts.append(f"({', '.join(citation_parts)})")

            # Join snippet parts or provide a default
            if snippet_parts:
                # Use different separators based on what's included
                if self.include_authors_in_context:
                    snippet = ". ".join(
                        snippet_parts
                    )  # Authors need a period separator
                else:
                    snippet = " - ".join(
                        snippet_parts
                    )  # Journal and year use a dash
            else:
                snippet = "Research article"

            # Add publication type prefix
            snippet = pub_type_prefix + snippet

            # Add language indicator if not English
            if self.include_language_in_context and summary.get("lang"):
                langs = summary["lang"]
                if langs and langs[0] != "eng" and langs[0]:
                    snippet = f"{snippet} [{langs[0].upper()}]"

            # Add identifiers if enabled
            identifier_parts = []
            if self.include_pmid_in_context and summary.get("id"):
                identifier_parts.append(f"PMID: {summary['id']}")
            if self.include_doi_in_context and summary.get("doi"):
                identifier_parts.append(f"DOI: {summary['doi']}")

            if identifier_parts:
                snippet = f"{snippet} | {' | '.join(identifier_parts)}"

            # ALWAYS include title and abstract in the snippet for LLM analysis
            pmid = summary["id"]
            title = summary["title"]
            abstract_text = abstracts.get(pmid, "")

            # Truncate the abstract if too long
            if len(abstract_text) > 500:
                abstract_text = abstract_text[:497] + "..."

            # Build the enriched snippet with title and abstract
            if abstract_text:
                enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
            else:
                enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"

            # Log the complete snippet for debugging
            logger.debug(f"Complete snippet for PMID {pmid}:")
            logger.debug(f" Title: {title[:100]}...")
            logger.debug(f" Abstract length: {len(abstract_text)} chars")
            logger.debug(f" Metadata: {snippet}")
            logger.debug(
                f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
            )

            # Create preview with basic information
            preview = {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,  # Use enriched snippet with title and abstract
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": summary["id"],  # Store PMID for later use
                "_search_strategy": strategy,  # Store search strategy for analytics
            }

            previews.append(preview)

        logger.info(
            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
        )
        if previews:
            logger.debug(
                f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
            )
        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubMed articles.
        Efficiently manages which content to retrieve (abstracts and/or full text).

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        snippets_only_mode = (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        )

        if snippets_only_mode:
            logger.info(
                "Snippet-only mode enabled, will fetch abstracts as snippets"
            )
            # For PubMed, we still need to fetch abstracts as they serve as snippets,
            # but we'll skip full-text retrieval

        logger.info(
            f"Getting content for {len(relevant_items)} PubMed articles"
        )

        # Collect all PMIDs for relevant items
        pmids = []
        for item in relevant_items:
            if "_pmid" in item:
                pmids.append(item["_pmid"])

        # Get abstracts if requested and PMIDs exist.
        # In snippet-only mode, always get abstracts as they serve as snippets.
        abstracts = {}
        if (self.get_abstracts or snippets_only_mode) and pmids:
            abstracts = self._get_article_abstracts(pmids)

        # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
        detailed_metadata = {}
        if pmids:
            detailed_metadata = self._get_article_detailed_metadata(pmids)

        # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
        pmid_to_pmcid = {}
        if self.get_full_text and pmids and not snippets_only_mode:
            pmid_to_pmcid = self._find_pmc_ids(pmids)

        # Add content to results
        results = []
        for item in relevant_items:
            result = item.copy()
            pmid = item.get("_pmid", "")

            # Add detailed metadata if available
            if pmid in detailed_metadata:
                metadata = detailed_metadata[pmid]

                # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
                if "publication_types" in metadata:
                    result["publication_types"] = metadata["publication_types"]

                    # Add the first publication type to the snippet if enabled
                    if (
                        self.include_publication_type_in_context
                        and metadata["publication_types"]
                    ):
                        # Just take the first publication type as is
                        pub_type = metadata["publication_types"][0]
                        if "snippet" in result:
                            result["snippet"] = (
                                f"[{pub_type}] {result['snippet']}"
                            )

                # Add MeSH terms for medical categorization
                if "mesh_terms" in metadata:
                    result["mesh_terms"] = metadata["mesh_terms"]

                    # Add MeSH terms to the snippet if enabled
                    if (
                        self.include_mesh_terms_in_context
                        and metadata["mesh_terms"]
                    ):
                        mesh_to_show = (
                            metadata["mesh_terms"][: self.max_mesh_terms]
                            if self.max_mesh_terms > 0
                            else metadata["mesh_terms"]
                        )
                        if mesh_to_show and "snippet" in result:
                            mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                            result["snippet"] = (
                                f"{result['snippet']} | {mesh_text}"
                            )

                # Add keywords
                if "keywords" in metadata:
                    result["keywords"] = metadata["keywords"]

                    # Add keywords to the snippet if enabled
                    if (
                        self.include_keywords_in_context
                        and metadata["keywords"]
                    ):
                        keywords_to_show = (
                            metadata["keywords"][: self.max_keywords]
                            if self.max_keywords > 0
                            else metadata["keywords"]
                        )
                        if keywords_to_show and "snippet" in result:
                            keywords_text = "Keywords: " + ", ".join(
                                keywords_to_show
                            )
                            result["snippet"] = (
                                f"{result['snippet']} | {keywords_text}"
                            )

                # Add affiliations
                if "affiliations" in metadata:
                    result["affiliations"] = metadata["affiliations"]

                # Add funding/grant information
                if "grants" in metadata:
                    result["grants"] = metadata["grants"]

                # Add conflict of interest statement
                if "conflict_of_interest" in metadata:
                    result["conflict_of_interest"] = metadata[
                        "conflict_of_interest"
                    ]

                # Add free full text availability
                if "has_free_full_text" in metadata:
                    result["has_free_full_text"] = metadata[
                        "has_free_full_text"
                    ]
                    if "pmc_id" in metadata:
                        result["pmc_id"] = metadata["pmc_id"]

                    # Add PMC availability to the snippet if enabled
                    if (
                        self.include_pmc_availability_in_context
                        and metadata["has_free_full_text"]
                        and "snippet" in result
                    ):
                        result["snippet"] = (
                            f"{result['snippet']} | [Free Full Text]"
                        )

            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]

                # Create enriched content with metadata context
                enriched_content = self._create_enriched_content(
                    result, abstracts[pmid]
                )

                # ALWAYS include title and abstract in the snippet for LLM analysis:
                # build a comprehensive snippet with both
                title = result.get("title", "")
                abstract_text = (
                    abstracts[pmid][:500]
                    if len(abstracts[pmid]) > 500
                    else abstracts[pmid]
                )

                # Prepend title and abstract to the existing metadata snippet
                if "snippet" in result:
                    # Keep the metadata snippet and add content
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                    )
                else:
                    # No metadata snippet, just title and abstract
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}"
                    )

                # In snippet-only mode, use enriched content
                if snippets_only_mode:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"
                # Use the abstract as content if no full text is available
                elif pmid not in pmid_to_pmcid:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Add full text for a limited number of top articles
            if (
                pmid in pmid_to_pmcid
                and self.get_full_text
                and len(
                    [r for r in results if r.get("content_type") == "full_text"]
                )
                < self.full_text_limit
            ):
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)

                if full_text:
                    enriched_full_text = self._create_enriched_content(
                        result, full_text
                    )
                    result["full_content"] = enriched_full_text
                    result["content"] = enriched_full_text
                    result["content_type"] = "full_text"
                    result["pmcid"] = pmcid
                elif pmid in abstracts:
                    # Fall back to the abstract if full text retrieval fails
                    enriched_content = self._create_enriched_content(
                        result, abstracts[pmid]
                    )
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Remove temporary fields
            if "_pmid" in result:
                del result["_pmid"]
            if "_search_strategy" in result:
                del result["_search_strategy"]

            results.append(result)

        return results

    def search_by_author(
        self, author_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for articles by a specific author.

        Args:
            author_name: Name of the author
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of articles by the author
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"{author_name}[Author]"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_by_journal(
        self, journal_name: str, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for articles in a specific journal.

        Args:
            journal_name: Name of the journal
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of articles from the journal
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            query = f"{journal_name}[Journal]"
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results

    def search_recent(
        self, query: str, days: int = 30, max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Search for recent articles matching the query.

        Args:
            query: The search query
            days: Number of days to look back
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of recent articles matching the query
        """
        original_max_results = self.max_results
        original_days_limit = self.days_limit

        try:
            if max_results:
                self.max_results = max_results

            # Set days limit for this search
            self.days_limit = days

            return self.run(query)

        finally:
            # Restore original values
            self.max_results = original_max_results
            self.days_limit = original_days_limit

    def advanced_search(
        self, terms: Dict[str, str], max_results: Optional[int] = None
    ) -> List[Dict[str, Any]]:
        """
        Perform an advanced search with field-specific terms.

        Args:
            terms: Dictionary mapping fields to search terms.
                Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
            max_results: Maximum number of results (defaults to self.max_results)

        Returns:
            List of articles matching the advanced query
        """
        original_max_results = self.max_results

        try:
            if max_results:
                self.max_results = max_results

            # Build advanced query string
            query_parts = []
            for field, term in terms.items():
                query_parts.append(f"{term}[{field}]")

            query = " AND ".join(query_parts)
            return self.run(query)

        finally:
            # Restore original value
            self.max_results = original_max_results
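
    # Convenience-method usage sketch (hypothetical values; `run()` is inherited
    # from BaseSearchEngine):
    #
    #     engine.search_by_author("Smith J", max_results=5)
    #     engine.search_recent("CRISPR gene editing", days=90)
    #     engine.advanced_search({"Author": "Smith J", "Journal": "Nature"})
    #     # advanced_search builds the query 'Smith J[Author] AND Nature[Journal]'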