Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubmed.py: 71%

708 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-02-25 01:07 +0000

1import re 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4from defusedxml import ElementTree as ET 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...config import search_config 

10from ...constants import SNIPPET_LENGTH_LONG 

11from ...security.safe_requests import safe_get 

12from ..rate_limiting import RateLimitError 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

16class PubMedSearchEngine(BaseSearchEngine): 

17 """ 

18 PubMed search engine implementation with two-phase approach and adaptive search. 

19 Provides efficient access to biomedical literature while minimizing API usage. 

20 """ 

21 

22 # Mark as public search engine 

23 is_public = True 

24 # Scientific/medical search engine 

25 is_scientific = True 

26 

27 def __init__( 

28 self, 

29 max_results: int = 10, 

30 api_key: Optional[str] = None, 

31 days_limit: Optional[int] = None, 

32 get_abstracts: bool = True, 

33 get_full_text: bool = False, 

34 full_text_limit: int = 3, 

35 llm: Optional[BaseLLM] = None, 

36 max_filtered_results: Optional[int] = None, 

37 optimize_queries: bool = True, 

38 include_publication_type_in_context: bool = True, 

39 include_journal_in_context: bool = True, 

40 include_year_in_context: bool = True, 

41 include_authors_in_context: bool = False, 

42 include_full_date_in_context: bool = False, 

43 include_mesh_terms_in_context: bool = True, 

44 include_keywords_in_context: bool = True, 

45 include_doi_in_context: bool = False, 

46 include_pmid_in_context: bool = False, 

47 include_pmc_availability_in_context: bool = False, 

48 max_mesh_terms: int = 3, 

49 max_keywords: int = 3, 

50 include_citation_in_context: bool = False, 

51 include_language_in_context: bool = False, 

52 ): 

53 """ 

54 Initialize the PubMed search engine. 

55 

56 Args: 

57 max_results: Maximum number of search results 

58 api_key: NCBI API key for higher rate limits (optional) 

59 days_limit: Limit results to N days (optional) 

60 get_abstracts: Whether to fetch abstracts for all results 

61 get_full_text: Whether to fetch full text content (when available in PMC) 

62 full_text_limit: Max number of full-text articles to retrieve 

63 llm: Language model for relevance filtering 

64 max_filtered_results: Maximum number of results to keep after filtering 

65 optimize_queries: Whether to optimize natural language queries for PubMed 

66 """ 

67 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results 

68 super().__init__( 

69 llm=llm, 

70 max_filtered_results=max_filtered_results, 

71 max_results=max_results, 

72 ) 

73 self.max_results = max(self.max_results, 25) 

74 self.api_key = api_key 

75 self.days_limit = days_limit 

76 self.get_abstracts = get_abstracts 

77 self.get_full_text = get_full_text 

78 self.full_text_limit = full_text_limit 

79 self.optimize_queries = optimize_queries 

80 self.include_publication_type_in_context = ( 

81 include_publication_type_in_context 

82 ) 

83 self.include_journal_in_context = include_journal_in_context 

84 self.include_year_in_context = include_year_in_context 

85 self.include_authors_in_context = include_authors_in_context 

86 self.include_full_date_in_context = include_full_date_in_context 

87 self.include_mesh_terms_in_context = include_mesh_terms_in_context 

88 self.include_keywords_in_context = include_keywords_in_context 

89 self.include_doi_in_context = include_doi_in_context 

90 self.include_pmid_in_context = include_pmid_in_context 

91 self.include_pmc_availability_in_context = ( 

92 include_pmc_availability_in_context 

93 ) 

94 self.max_mesh_terms = max_mesh_terms 

95 self.max_keywords = max_keywords 

96 self.include_citation_in_context = include_citation_in_context 

97 self.include_language_in_context = include_language_in_context 

98 

99 # Base API URLs 

100 self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils" 

101 self.search_url = f"{self.base_url}/esearch.fcgi" 

102 self.summary_url = f"{self.base_url}/esummary.fcgi" 

103 self.fetch_url = f"{self.base_url}/efetch.fcgi" 

104 self.link_url = f"{self.base_url}/elink.fcgi" 

105 

106 # PMC base URL for full text 

107 self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/" 

108 

109 def _get_result_count(self, query: str) -> int: 

110 """ 

111 Get the total number of results for a query without retrieving the results themselves. 

112 

113 Args: 

114 query: The search query 

115 

116 Returns: 

117 Total number of matching results 

118 """ 

119 try: 

120 # Prepare search parameters 

121 params = { 

122 "db": "pubmed", 

123 "term": query, 

124 "retmode": "json", 

125 "retmax": 0, # Don't need actual results, just the count 

126 } 

127 

128 # Add API key if available 

129 if self.api_key: 

130 params["api_key"] = self.api_key 

131 

132 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

133 self.engine_type 

134 ) 

135 

136 # Execute search request 

137 response = safe_get(self.search_url, params=params) 

138 response.raise_for_status() 

139 

140 # Parse response 

141 data = response.json() 

142 count = int(data["esearchresult"]["count"]) 

143 

144 logger.info( 

145 "Query '%s' has %s total results in PubMed", query, count 

146 ) 

147 return count 

148 

149 except Exception: 

150 logger.exception("Error getting result count") 

151 return 0 

152 

153 def _extract_core_terms(self, query: str) -> str: 

154 """ 

155 Extract core terms from a complex query for volume estimation. 

156 

157 Args: 

158 query: PubMed query string 

159 

160 Returns: 

161 Simplified query with core terms 

162 """ 

163 # Remove field specifications and operators 

164 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags 

165 simplified = re.sub( 

166 r"\b(AND|OR|NOT)\b", "", simplified 

167 ) # Remove operators 

168 

169 # Remove quotes and parentheses 

170 simplified = ( 

171 simplified.replace('"', "").replace("(", "").replace(")", "") 

172 ) 

173 

174 # Split by whitespace and join terms with 4+ chars (likely meaningful) 

175 terms = [term for term in simplified.split() if len(term) >= 4] 

176 

177 # Join with AND to create a basic search 

178 return " ".join(terms[:5]) # Limit to top 5 terms 

179 

180 def _expand_time_window(self, time_filter: str) -> str: 

181 """ 

182 Expand a time window to get more results. 

183 

184 Args: 

185 time_filter: Current time filter 

186 

187 Returns: 

188 Expanded time filter 

189 """ 

190 # Parse current time window 

191 import re 

192 

193 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter) 

194 if not match: 

195 return '"last 10 years"[pdat]' 

196 

197 amount, unit = int(match.group(1)), match.group(2) 

198 

199 # Expand based on current unit 

200 if unit == "months" or unit == "month": 

201 if amount < 6: 

202 return '"last 6 months"[pdat]' 

203 elif amount < 12: 203 ↛ 206line 203 didn't jump to line 206 because the condition on line 203 was always true

204 return '"last 1 year"[pdat]' 

205 else: 

206 return '"last 2 years"[pdat]' 

207 elif unit == "years" or unit == "year": 207 ↛ 215line 207 didn't jump to line 215 because the condition on line 207 was always true

208 if amount < 2: 

209 return '"last 2 years"[pdat]' 

210 elif amount < 5: 210 ↛ 213line 210 didn't jump to line 213 because the condition on line 210 was always true

211 return '"last 5 years"[pdat]' 

212 else: 

213 return '"last 10 years"[pdat]' 

214 

215 return '"last 10 years"[pdat]' 

216 

217 def _optimize_query_for_pubmed(self, query: str) -> str: 

218 """ 

219 Optimize a natural language query for PubMed search. 

220 Uses LLM to transform questions into effective keyword-based queries. 

221 

222 Args: 

223 query: Natural language query 

224 

225 Returns: 

226 Optimized query string for PubMed 

227 """ 

228 if not self.llm or not self.optimize_queries: 

229 # Return original query if no LLM available or optimization disabled 

230 return query 

231 

232 try: 

233 # Prompt for query optimization 

234 prompt = f"""Transform this natural language question into an optimized PubMed search query. 

235 

236Original query: "{query}" 

237 

238CRITICAL RULES: 

2391. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS 

2402. DO NOT wrap the entire query in quotes 

2413. DO NOT include ANY date restrictions or year filters 

2424. Use parentheses around OR statements: (term1[Field] OR term2[Field]) 

2435. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh] 

2446. KEEP IT SIMPLE - use 2-3 main concepts maximum 

2457. Focus on Title/Abstract searches for reliability: term[Title/Abstract] 

2468. Use wildcards for variations: vaccin*[Title/Abstract] 

247 

248EXAMPLE QUERIES: 

249✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract] 

250✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract] 

251✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp] 

252✗ BAD: "Here's a query to find articles about vaccines..." 

253 

254Return ONLY the search query without any explanations. 

255""" 

256 

257 # Get response from LLM 

258 response = self.llm.invoke(prompt) 

259 raw_response = response.content.strip() 

260 

261 # Clean up the query - extract only the actual query and remove any explanations 

262 # First check if there are multiple lines and take the first non-empty line 

263 lines = raw_response.split("\n") 

264 cleaned_lines = [line.strip() for line in lines if line.strip()] 

265 

266 if cleaned_lines: 266 ↛ 316line 266 didn't jump to line 316 because the condition on line 266 was always true

267 optimized_query = cleaned_lines[0] 

268 

269 # Remove any quotes that wrap the entire query 

270 if optimized_query.startswith('"') and optimized_query.endswith( 270 ↛ 273line 270 didn't jump to line 273 because the condition on line 270 was never true

271 '"' 

272 ): 

273 optimized_query = optimized_query[1:-1] 

274 

275 # Remove any explanation phrases that might be at the beginning 

276 explanation_starters = [ 

277 "here is", 

278 "here's", 

279 "this query", 

280 "the following", 

281 ] 

282 for starter in explanation_starters: 

283 if optimized_query.lower().startswith(starter): 283 ↛ 285line 283 didn't jump to line 285 because the condition on line 283 was never true

284 # Find the actual query part - typically after a colon 

285 colon_pos = optimized_query.find(":") 

286 if colon_pos > 0: 

287 optimized_query = optimized_query[ 

288 colon_pos + 1 : 

289 ].strip() 

290 

291 # Check if the query still seems to contain explanations 

292 if ( 292 ↛ 298line 292 didn't jump to line 298 because the condition on line 292 was never true

293 len(optimized_query) > 200 

294 or "this query will" in optimized_query.lower() 

295 ): 

296 # It's probably still an explanation - try to extract just the query part 

297 # Look for common patterns in the explanation like parentheses 

298 pattern = r"\([^)]+\)\s+AND\s+" 

299 import re 

300 

301 matches = re.findall(pattern, optimized_query) 

302 if matches: 

303 # Extract just the query syntax parts 

304 query_parts = [] 

305 for part in re.split(r"\.\s+", optimized_query): 

306 if ( 

307 "(" in part 

308 and ")" in part 

309 and ("AND" in part or "OR" in part) 

310 ): 

311 query_parts.append(part) 

312 if query_parts: 

313 optimized_query = " ".join(query_parts) 

314 else: 

315 # Fall back to original query if cleaning fails 

316 logger.warning( 

317 "Failed to extract a clean query from LLM response" 

318 ) 

319 optimized_query = query 

320 

321 # Final safety check - if query looks too much like an explanation, use original 

322 if len(optimized_query.split()) > 30: 322 ↛ 323line 322 didn't jump to line 323 because the condition on line 322 was never true

323 logger.warning( 

324 "Query too verbose, falling back to simpler form" 

325 ) 

326 # Create a simple query from the original 

327 words = [ 

328 w 

329 for w in query.split() 

330 if len(w) > 3 

331 and w.lower() 

332 not in ( 

333 "what", 

334 "are", 

335 "the", 

336 "and", 

337 "for", 

338 "with", 

339 "from", 

340 "have", 

341 "been", 

342 "recent", 

343 ) 

344 ] 

345 optimized_query = " AND ".join(words[:3]) 

346 

347 # Basic cleanup: standardize field tag case for consistency 

348 import re 

349 

350 optimized_query = re.sub( 

351 r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE 

352 ) 

353 optimized_query = re.sub( 

354 r"\[title/abstract\]", 

355 "[Title/Abstract]", 

356 optimized_query, 

357 flags=re.IGNORECASE, 

358 ) 

359 optimized_query = re.sub( 

360 r"\[publication type\]", 

361 "[Publication Type]", 

362 optimized_query, 

363 flags=re.IGNORECASE, 

364 ) 

365 

366 # Fix unclosed quotes followed by field tags 

367 # Pattern: "term[Field] -> "term"[Field] 

368 optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query) 

369 

370 # Simplify the query if still no results are found 

371 self._simplify_query_cache = optimized_query 

372 

373 # Log original and optimized queries 

374 logger.info("Original query: '%s'", query) 

375 logger.info(f"Optimized for PubMed: '{optimized_query}'") 

376 logger.debug( 

377 f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'" 

378 ) 

379 

380 return optimized_query 

381 

382 except Exception: 

383 logger.exception("Error optimizing query") 

384 logger.debug(f"Falling back to original query: '{query}'") 

385 return query # Fall back to original query on error 

386 

387 def _simplify_query(self, query: str) -> str: 

388 """ 

389 Simplify a PubMed query that returned no results. 

390 Progressively removes elements to get a more basic query. 

391 

392 Args: 

393 query: The original query that returned no results 

394 

395 Returns: 

396 Simplified query 

397 """ 

398 logger.info(f"Simplifying query: {query}") 

399 logger.debug(f"Query simplification started for: '{query[:100]}...'") 

400 

401 # Simple approach: remove field restrictions to broaden the search 

402 import re 

403 

404 # Remove field tags to make search broader 

405 simplified = query 

406 

407 # Remove [Mesh] tags - search in all fields instead 

408 simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE) 

409 

410 # Remove [Publication Type] tags 

411 simplified = re.sub( 

412 r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE 

413 ) 

414 

415 # Keep [Title/Abstract] as it's usually helpful 

416 # Clean up any double spaces 

417 simplified = re.sub(r"\s+", " ", simplified).strip() 

418 

419 # If no simplification was possible, return the original query 

420 if simplified == query: 

421 logger.debug("No simplification possible, returning original query") 

422 

423 logger.info(f"Simplified query: {simplified}") 

424 logger.debug( 

425 f"Query simplified from {len(query)} to {len(simplified)} chars" 

426 ) 

427 return simplified 

428 

429 def _is_historical_focused(self, query: str) -> bool: 

430 """ 

431 Determine if a query is specifically focused on historical/older information using LLM. 

432 Default assumption is that queries should prioritize recent information unless 

433 explicitly asking for historical content. 

434 

435 Args: 

436 query: The search query 

437 

438 Returns: 

439 Boolean indicating if the query is focused on historical information 

440 """ 

441 if not self.llm: 

442 # Fall back to basic keyword check if no LLM available 

443 historical_terms = [ 

444 "history", 

445 "historical", 

446 "early", 

447 "initial", 

448 "first", 

449 "original", 

450 "before", 

451 "prior to", 

452 "origins", 

453 "evolution", 

454 "development", 

455 ] 

456 historical_years = [str(year) for year in range(1900, 2020)] 

457 

458 query_lower = query.lower() 

459 has_historical_term = any( 

460 term in query_lower for term in historical_terms 

461 ) 

462 has_past_year = any(year in query for year in historical_years) 

463 

464 return has_historical_term or has_past_year 

465 

466 try: 

467 # Use LLM to determine if the query is focused on historical information 

468 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information. 

469 

470Query: "{query}" 

471 

472Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago. 

473Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus. 

474 

475The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise. 

476""" 

477 

478 response = self.llm.invoke(prompt) 

479 answer = response.content.strip().lower() 

480 

481 # Log the determination 

482 logger.info(f"Historical focus determination for query: '{query}'") 

483 logger.info(f"LLM determined historical focus: {answer}") 

484 

485 return "yes" in answer 

486 

487 except Exception: 

488 logger.exception("Error determining historical focus") 

489 # Fall back to basic keyword check 

490 historical_terms = [ 

491 "history", 

492 "historical", 

493 "early", 

494 "initial", 

495 "first", 

496 "original", 

497 "before", 

498 "prior to", 

499 "origins", 

500 "evolution", 

501 "development", 

502 ] 

503 return any(term in query.lower() for term in historical_terms) 

504 

505 def _adaptive_search(self, query: str) -> Tuple[List[str], str]: 

506 """ 

507 Perform an adaptive search that adjusts based on topic volume and whether 

508 the query focuses on historical information. 

509 

510 Args: 

511 query: The search query (already optimized) 

512 

513 Returns: 

514 Tuple of (list of PMIDs, search strategy used) 

515 """ 

516 # Estimate topic volume 

517 estimated_volume = self._get_result_count(query) 

518 

519 # Determine if the query is focused on historical information 

520 is_historical_focused = self._is_historical_focused(query) 

521 

522 if is_historical_focused: 

523 # User wants historical information - no date filtering 

524 time_filter = None 

525 strategy = "historical_focus" 

526 elif estimated_volume > 5000: 

527 # Very common topic - use tighter recency filter 

528 time_filter = '"last 1 year"[pdat]' 

529 strategy = "high_volume" 

530 elif estimated_volume > 1000: 

531 # Common topic 

532 time_filter = '"last 3 years"[pdat]' 

533 strategy = "common_topic" 

534 elif estimated_volume > 100: 534 ↛ 540line 534 didn't jump to line 540 because the condition on line 534 was always true

535 # Moderate volume 

536 time_filter = '"last 5 years"[pdat]' 

537 strategy = "moderate_volume" 

538 else: 

539 # Rare topic - still use recency but with wider range 

540 time_filter = '"last 10 years"[pdat]' 

541 strategy = "rare_topic" 

542 

543 # Run search based on strategy 

544 if time_filter: 

545 # Try with adaptive time filter 

546 query_with_time = f"({query}) AND {time_filter}" 

547 logger.info( 

548 f"Using adaptive search strategy: {strategy} with filter: {time_filter}" 

549 ) 

550 results = self._search_pubmed(query_with_time) 

551 

552 # If too few results, gradually expand time window 

553 if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter: 

554 logger.info( 

555 f"Insufficient results ({len(results)}), expanding time window" 

556 ) 

557 expanded_time = self._expand_time_window(time_filter) 

558 query_with_expanded_time = f"({query}) AND {expanded_time}" 

559 expanded_results = self._search_pubmed(query_with_expanded_time) 

560 

561 if len(expanded_results) > len(results): 

562 logger.info( 

563 f"Expanded time window yielded {len(expanded_results)} results" 

564 ) 

565 return expanded_results, f"{strategy}_expanded" 

566 

567 # If still no results, try without time filter 

568 if not results: 

569 logger.info( 

570 "No results with time filter, trying without time restrictions" 

571 ) 

572 results = self._search_pubmed(query) 

573 strategy = "no_time_filter" 

574 else: 

575 # Historical query - run without time filter 

576 logger.info( 

577 "Using historical search strategy without date filtering" 

578 ) 

579 results = self._search_pubmed(query) 

580 

581 return results, strategy 

582 

583 def _search_pubmed(self, query: str) -> List[str]: 

584 """ 

585 Search PubMed and return a list of article IDs. 

586 

587 Args: 

588 query: The search query 

589 

590 Returns: 

591 List of PubMed IDs matching the query 

592 """ 

593 try: 

594 # Prepare search parameters 

595 params = { 

596 "db": "pubmed", 

597 "term": query, 

598 "retmode": "json", 

599 "retmax": self.max_results, 

600 "usehistory": "y", 

601 } 

602 

603 # Add API key if available 

604 if self.api_key: 604 ↛ 605line 604 didn't jump to line 605 because the condition on line 604 was never true

605 params["api_key"] = self.api_key 

606 logger.debug("Using PubMed API key for higher rate limits") 

607 else: 

608 logger.debug("No PubMed API key - using default rate limits") 

609 

610 # Add date restriction if specified 

611 if self.days_limit: 611 ↛ 612line 611 didn't jump to line 612 because the condition on line 611 was never true

612 params["reldate"] = self.days_limit 

613 params["datetype"] = "pdat" # Publication date 

614 logger.debug(f"Limiting results to last {self.days_limit} days") 

615 

616 logger.debug( 

617 f"PubMed search query: '{query}' with max_results={self.max_results}" 

618 ) 

619 

620 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

621 self.engine_type 

622 ) 

623 logger.debug( 

624 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

625 ) 

626 

627 # Execute search request 

628 logger.debug(f"Sending request to PubMed API: {self.search_url}") 

629 response = safe_get(self.search_url, params=params) 

630 response.raise_for_status() 

631 logger.debug(f"PubMed API response status: {response.status_code}") 

632 

633 # Parse response 

634 data = response.json() 

635 id_list = data["esearchresult"]["idlist"] 

636 total_count = data["esearchresult"].get("count", "unknown") 

637 

638 logger.info( 

639 f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})" 

640 ) 

641 if len(id_list) > 0: 641 ↛ 643line 641 didn't jump to line 643 because the condition on line 641 was always true

642 logger.debug(f"First 5 PMIDs: {id_list[:5]}") 

643 return id_list 

644 

645 except Exception: 

646 logger.exception(f"Error searching PubMed for query '{query}'") 

647 return [] 

648 

649 def _get_article_summaries( 

650 self, id_list: List[str] 

651 ) -> List[Dict[str, Any]]: 

652 """ 

653 Get summaries for a list of PubMed article IDs. 

654 

655 Args: 

656 id_list: List of PubMed IDs 

657 

658 Returns: 

659 List of article summary dictionaries 

660 """ 

661 if not id_list: 

662 logger.debug("Empty ID list provided to _get_article_summaries") 

663 return [] 

664 

665 logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles") 

666 

667 try: 

668 # Prepare parameters 

669 params = { 

670 "db": "pubmed", 

671 "id": ",".join(id_list), 

672 "retmode": "json", 

673 "rettype": "summary", 

674 } 

675 

676 # Add API key if available 

677 if self.api_key: 677 ↛ 678line 677 didn't jump to line 678 because the condition on line 677 was never true

678 params["api_key"] = self.api_key 

679 

680 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

681 self.engine_type 

682 ) 

683 logger.debug( 

684 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

685 ) 

686 

687 # Execute request 

688 logger.debug(f"Requesting summaries from: {self.summary_url}") 

689 response = safe_get(self.summary_url, params=params) 

690 response.raise_for_status() 

691 logger.debug(f"Summary API response status: {response.status_code}") 

692 

693 # Parse response 

694 data = response.json() 

695 logger.debug( 

696 f"PubMed API returned data for {len(id_list)} requested IDs" 

697 ) 

698 summaries = [] 

699 

700 for pmid in id_list: 

701 if pmid in data["result"]: 701 ↛ 749line 701 didn't jump to line 749 because the condition on line 701 was always true

702 article = data["result"][pmid] 

703 logger.debug( 

704 f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}" 

705 ) 

706 

707 # Extract authors (if available) 

708 authors = [] 

709 if "authors" in article: 709 ↛ 715line 709 didn't jump to line 715 because the condition on line 709 was always true

710 authors = [ 

711 author["name"] for author in article["authors"] 

712 ] 

713 

714 # Extract DOI from articleids if not in main field 

715 doi = article.get("doi", "") 

716 if not doi and "articleids" in article: 716 ↛ 723line 716 didn't jump to line 723 because the condition on line 716 was always true

717 for aid in article["articleids"]: 717 ↛ 723line 717 didn't jump to line 723 because the loop on line 717 didn't complete

718 if aid.get("idtype") == "doi": 718 ↛ 717line 718 didn't jump to line 717 because the condition on line 718 was always true

719 doi = aid.get("value", "") 

720 break 

721 

722 # Create summary dictionary with all available fields 

723 summary = { 

724 "id": pmid, 

725 "title": article.get("title", ""), 

726 "pubdate": article.get("pubdate", ""), 

727 "epubdate": article.get("epubdate", ""), 

728 "source": article.get("source", ""), 

729 "authors": authors, 

730 "lastauthor": article.get("lastauthor", ""), 

731 "journal": article.get("fulljournalname", ""), 

732 "volume": article.get("volume", ""), 

733 "issue": article.get("issue", ""), 

734 "pages": article.get("pages", ""), 

735 "doi": doi, 

736 "issn": article.get("issn", ""), 

737 "essn": article.get("essn", ""), 

738 "pubtype": article.get( 

739 "pubtype", [] 

740 ), # Publication types from esummary 

741 "recordstatus": article.get("recordstatus", ""), 

742 "lang": article.get("lang", []), 

743 "pmcrefcount": article.get("pmcrefcount", None), 

744 "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/", 

745 } 

746 

747 summaries.append(summary) 

748 else: 

749 logger.warning( 

750 f"PMID {pmid} not found in PubMed API response" 

751 ) 

752 

753 return summaries 

754 

755 except Exception as e: 

756 error_msg = str(e) 

757 logger.exception( 

758 f"Error getting article summaries for {len(id_list)} articles" 

759 ) 

760 

761 # Check for rate limiting patterns 

762 if ( 

763 "429" in error_msg 

764 or "too many requests" in error_msg.lower() 

765 or "rate limit" in error_msg.lower() 

766 or "service unavailable" in error_msg.lower() 

767 or "503" in error_msg 

768 or "403" in error_msg 

769 ): 

770 raise RateLimitError(f"PubMed rate limit hit: {error_msg}") 

771 

772 return [] 

773 

774 def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]: 

775 """ 

776 Get abstracts for a list of PubMed article IDs. 

777 

778 Args: 

779 id_list: List of PubMed IDs 

780 

781 Returns: 

782 Dictionary mapping PubMed IDs to their abstracts 

783 """ 

784 if not id_list: 

785 logger.debug("Empty ID list provided to _get_article_abstracts") 

786 return {} 

787 

788 logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles") 

789 

790 try: 

791 # Prepare parameters 

792 params = { 

793 "db": "pubmed", 

794 "id": ",".join(id_list), 

795 "retmode": "xml", 

796 "rettype": "abstract", 

797 } 

798 

799 # Add API key if available 

800 if self.api_key: 800 ↛ 801line 800 didn't jump to line 801 because the condition on line 800 was never true

801 params["api_key"] = self.api_key 

802 

803 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

804 self.engine_type 

805 ) 

806 logger.debug( 

807 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

808 ) 

809 

810 # Execute request 

811 logger.debug(f"Requesting abstracts from: {self.fetch_url}") 

812 response = safe_get(self.fetch_url, params=params) 

813 response.raise_for_status() 

814 logger.debug( 

815 f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes" 

816 ) 

817 

818 # Parse XML response 

819 root = ET.fromstring(response.text) 

820 logger.debug( 

821 f"Parsing abstracts from XML for {len(id_list)} articles" 

822 ) 

823 

824 # Extract abstracts 

825 abstracts = {} 

826 

827 for article in root.findall(".//PubmedArticle"): 

828 pmid_elem = article.find(".//PMID") 

829 pmid = pmid_elem.text if pmid_elem is not None else None 

830 

831 if pmid is None: 831 ↛ 832line 831 didn't jump to line 832 because the condition on line 831 was never true

832 continue 

833 

834 # Find abstract text 

835 abstract_text = "" 

836 abstract_elem = article.find(".//AbstractText") 

837 

838 if abstract_elem is not None: 838 ↛ 842line 838 didn't jump to line 842 because the condition on line 838 was always true

839 abstract_text = abstract_elem.text or "" 

840 

841 # Some abstracts are split into multiple sections 

842 abstract_sections = article.findall(".//AbstractText") 

843 if len(abstract_sections) > 1: 

844 logger.debug( 

845 f"Article {pmid} has {len(abstract_sections)} abstract sections" 

846 ) 

847 

848 for section in abstract_sections: 

849 # Get section label if it exists 

850 label = section.get("Label") 

851 section_text = section.text or "" 

852 

853 if label and section_text: 

854 if abstract_text: 854 ↛ 857line 854 didn't jump to line 857 because the condition on line 854 was always true

855 abstract_text += f"\n\n{label}: {section_text}" 

856 else: 

857 abstract_text = f"{label}: {section_text}" 

858 elif section_text: 858 ↛ 848line 858 didn't jump to line 848 because the condition on line 858 was always true

859 if abstract_text: 859 ↛ 862line 859 didn't jump to line 862 because the condition on line 859 was always true

860 abstract_text += f"\n\n{section_text}" 

861 else: 

862 abstract_text = section_text 

863 

864 # Store in dictionary 

865 if pmid and abstract_text: 865 ↛ 870line 865 didn't jump to line 870 because the condition on line 865 was always true

866 abstracts[pmid] = abstract_text 

867 logger.debug( 

868 f"Abstract for {pmid}: {len(abstract_text)} chars" 

869 ) 

870 elif pmid: 

871 logger.warning(f"No abstract found for PMID {pmid}") 

872 

873 logger.info( 

874 f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested" 

875 ) 

876 return abstracts 

877 

878 except Exception: 

879 logger.exception( 

880 f"Error getting article abstracts for {len(id_list)} articles" 

881 ) 

882 return {} 

883 

884 def _get_article_detailed_metadata( 

885 self, id_list: List[str] 

886 ) -> Dict[str, Dict[str, Any]]: 

887 """ 

888 Get detailed metadata for PubMed articles including publication types, 

889 MeSH terms, keywords, and affiliations. 

890 

891 Args: 

892 id_list: List of PubMed IDs 

893 

894 Returns: 

895 Dictionary mapping PubMed IDs to their detailed metadata 

896 """ 

897 if not id_list: 

898 return {} 

899 

900 try: 

901 # Prepare parameters 

902 params = { 

903 "db": "pubmed", 

904 "id": ",".join(id_list), 

905 "retmode": "xml", 

906 "rettype": "medline", 

907 } 

908 

909 # Add API key if available 

910 if self.api_key: 910 ↛ 911line 910 didn't jump to line 911 because the condition on line 910 was never true

911 params["api_key"] = self.api_key 

912 

913 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

914 self.engine_type 

915 ) 

916 

917 # Execute request 

918 response = safe_get(self.fetch_url, params=params) 

919 response.raise_for_status() 

920 

921 # Parse XML response 

922 root = ET.fromstring(response.text) 

923 

924 metadata = {} 

925 

926 for article in root.findall(".//PubmedArticle"): 

927 pmid_elem = article.find(".//PMID") 

928 pmid = pmid_elem.text if pmid_elem is not None else None 

929 

930 if pmid is None: 930 ↛ 931line 930 didn't jump to line 931 because the condition on line 930 was never true

931 continue 

932 

933 article_metadata = {} 

934 

935 # Extract publication types 

936 pub_types = [] 

937 for pub_type in article.findall(".//PublicationType"): 

938 if pub_type.text: 938 ↛ 937line 938 didn't jump to line 937 because the condition on line 938 was always true

939 pub_types.append(pub_type.text) 

940 if pub_types: 

941 article_metadata["publication_types"] = pub_types 

942 

943 # Extract MeSH terms 

944 mesh_terms = [] 

945 for mesh in article.findall(".//MeshHeading"): 

946 descriptor = mesh.find(".//DescriptorName") 

947 if descriptor is not None and descriptor.text: 947 ↛ 945line 947 didn't jump to line 945 because the condition on line 947 was always true

948 mesh_terms.append(descriptor.text) 

949 if mesh_terms: 

950 article_metadata["mesh_terms"] = mesh_terms 

951 

952 # Extract keywords 

953 keywords = [] 

954 for keyword in article.findall(".//Keyword"): 

955 if keyword.text: 955 ↛ 954line 955 didn't jump to line 954 because the condition on line 955 was always true

956 keywords.append(keyword.text) 

957 if keywords: 

958 article_metadata["keywords"] = keywords 

959 

960 # Extract affiliations 

961 affiliations = [] 

962 for affiliation in article.findall(".//Affiliation"): 962 ↛ 963line 962 didn't jump to line 963 because the loop on line 962 never started

963 if affiliation.text: 

964 affiliations.append(affiliation.text) 

965 if affiliations: 965 ↛ 966line 965 didn't jump to line 966 because the condition on line 965 was never true

966 article_metadata["affiliations"] = affiliations 

967 

968 # Extract grant information 

969 grants = [] 

970 for grant in article.findall(".//Grant"): 970 ↛ 971line 970 didn't jump to line 971 because the loop on line 970 never started

971 grant_info = {} 

972 grant_id = grant.find(".//GrantID") 

973 if grant_id is not None and grant_id.text: 

974 grant_info["id"] = grant_id.text 

975 agency = grant.find(".//Agency") 

976 if agency is not None and agency.text: 

977 grant_info["agency"] = agency.text 

978 if grant_info: 

979 grants.append(grant_info) 

980 if grants: 980 ↛ 981line 980 didn't jump to line 981 because the condition on line 980 was never true

981 article_metadata["grants"] = grants 

982 

983 # Check for free full text in PMC 

984 pmc_elem = article.find(".//ArticleId[@IdType='pmc']") 

985 if pmc_elem is not None: 985 ↛ 986line 985 didn't jump to line 986 because the condition on line 985 was never true

986 article_metadata["has_free_full_text"] = True 

987 article_metadata["pmc_id"] = pmc_elem.text 

988 

989 # Extract conflict of interest statement 

990 coi_elem = article.find(".//CoiStatement") 

991 if coi_elem is not None and coi_elem.text: 991 ↛ 992line 991 didn't jump to line 992 because the condition on line 991 was never true

992 article_metadata["conflict_of_interest"] = coi_elem.text 

993 

994 metadata[pmid] = article_metadata 

995 

996 return metadata 

997 

998 except Exception: 

999 logger.exception("Error getting detailed article metadata") 

1000 return {} 

1001 

1002 def _create_enriched_content( 

1003 self, result: Dict[str, Any], base_content: str 

1004 ) -> str: 

1005 """ 

1006 Create enriched content by adding relevant metadata context to help the LLM. 

1007 

1008 Args: 

1009 result: The result dictionary with metadata 

1010 base_content: The base content (abstract or full text) 

1011 

1012 Returns: 

1013 Enriched content string with metadata context 

1014 """ 

1015 enriched_parts = [] 

1016 

1017 # Add study type information 

1018 if "publication_types" in result: 

1019 pub_types = result["publication_types"] 

1020 # Filter for significant types 

1021 significant_types = [ 

1022 pt 

1023 for pt in pub_types 

1024 if any( 

1025 key in pt.lower() 

1026 for key in [ 

1027 "clinical trial", 

1028 "randomized", 

1029 "meta-analysis", 

1030 "systematic review", 

1031 "case report", 

1032 "guideline", 

1033 "comparative study", 

1034 "multicenter", 

1035 ] 

1036 ) 

1037 ] 

1038 if significant_types: 1038 ↛ 1044line 1038 didn't jump to line 1044 because the condition on line 1038 was always true

1039 enriched_parts.append( 

1040 f"[Study Type: {', '.join(significant_types)}]" 

1041 ) 

1042 

1043 # Add the main content 

1044 enriched_parts.append(base_content) 

1045 

1046 # Add metadata footer 

1047 metadata_footer = [] 

1048 

1049 # Add ALL MeSH terms 

1050 if "mesh_terms" in result and len(result["mesh_terms"]) > 0: 

1051 metadata_footer.append( 

1052 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}" 

1053 ) 

1054 

1055 # Add ALL keywords 

1056 if "keywords" in result and len(result["keywords"]) > 0: 1056 ↛ 1057line 1056 didn't jump to line 1057 because the condition on line 1056 was never true

1057 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}") 

1058 

1059 # Add ALL affiliations 

1060 if "affiliations" in result and len(result["affiliations"]) > 0: 1060 ↛ 1061line 1060 didn't jump to line 1061 because the condition on line 1060 was never true

1061 if len(result["affiliations"]) == 1: 

1062 metadata_footer.append( 

1063 f"Institution: {result['affiliations'][0]}" 

1064 ) 

1065 else: 

1066 affiliations_text = "\n - " + "\n - ".join( 

1067 result["affiliations"] 

1068 ) 

1069 metadata_footer.append(f"Institutions:{affiliations_text}") 

1070 

1071 # Add ALL funding information with full details 

1072 if "grants" in result and len(result["grants"]) > 0: 

1073 grant_details = [] 

1074 for grant in result["grants"]: 

1075 grant_text = [] 

1076 if "agency" in grant: 1076 ↛ 1078line 1076 didn't jump to line 1078 because the condition on line 1076 was always true

1077 grant_text.append(grant["agency"]) 

1078 if "id" in grant: 1078 ↛ 1080line 1078 didn't jump to line 1080 because the condition on line 1078 was always true

1079 grant_text.append(f"(Grant ID: {grant['id']})") 

1080 if grant_text: 1080 ↛ 1074line 1080 didn't jump to line 1074 because the condition on line 1080 was always true

1081 grant_details.append(" ".join(grant_text)) 

1082 if grant_details: 1082 ↛ 1090line 1082 didn't jump to line 1090 because the condition on line 1082 was always true

1083 if len(grant_details) == 1: 1083 ↛ 1086line 1083 didn't jump to line 1086 because the condition on line 1083 was always true

1084 metadata_footer.append(f"Funded by: {grant_details[0]}") 

1085 else: 

1086 funding_text = "\n - " + "\n - ".join(grant_details) 

1087 metadata_footer.append(f"Funding Sources:{funding_text}") 

1088 

1089 # Add FULL conflict of interest statement 

1090 if "conflict_of_interest" in result: 1090 ↛ 1091line 1090 didn't jump to line 1091 because the condition on line 1090 was never true

1091 coi_text = result["conflict_of_interest"] 

1092 if coi_text: 

1093 # Still skip trivial "no conflict" statements to reduce noise 

1094 if not any( 

1095 phrase in coi_text.lower() 

1096 for phrase in [ 

1097 "no conflict", 

1098 "no competing", 

1099 "nothing to disclose", 

1100 "none declared", 

1101 "authors declare no", 

1102 ] 

1103 ): 

1104 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1105 elif ( 

1106 "but" in coi_text.lower() 

1107 or "except" in coi_text.lower() 

1108 or "however" in coi_text.lower() 

1109 ): 

1110 # Include if there's a "no conflict BUT..." type statement 

1111 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1112 

1113 # Combine everything 

1114 if metadata_footer: 

1115 enriched_parts.append("\n---\nStudy Metadata:") 

1116 enriched_parts.extend(metadata_footer) 

1117 

1118 return "\n".join(enriched_parts) 

1119 

1120 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]: 

1121 """ 

1122 Find PMC IDs for the given PubMed IDs (for full-text access). 

1123 

1124 Args: 

1125 pmid_list: List of PubMed IDs 

1126 

1127 Returns: 

1128 Dictionary mapping PubMed IDs to their PMC IDs (if available) 

1129 """ 

1130 if not pmid_list or not self.get_full_text: 

1131 return {} 

1132 

1133 try: 

1134 # Prepare parameters 

1135 params = { 

1136 "dbfrom": "pubmed", 

1137 "db": "pmc", 

1138 "linkname": "pubmed_pmc", 

1139 "id": ",".join(pmid_list), 

1140 "retmode": "json", 

1141 } 

1142 

1143 # Add API key if available 

1144 if self.api_key: 1144 ↛ 1145line 1144 didn't jump to line 1145 because the condition on line 1144 was never true

1145 params["api_key"] = self.api_key 

1146 

1147 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

1148 self.engine_type 

1149 ) 

1150 

1151 # Execute request 

1152 response = safe_get(self.link_url, params=params) 

1153 response.raise_for_status() 

1154 

1155 # Parse response 

1156 data = response.json() 

1157 

1158 # Map PubMed IDs to PMC IDs 

1159 pmid_to_pmcid = {} 

1160 

1161 for linkset in data.get("linksets", []): 

1162 pmid = linkset.get("ids", [None])[0] 

1163 

1164 if not pmid: 1164 ↛ 1165line 1164 didn't jump to line 1165 because the condition on line 1164 was never true

1165 continue 

1166 

1167 for link in linkset.get("linksetdbs", []): 

1168 if link.get("linkname") == "pubmed_pmc": 1168 ↛ 1167line 1168 didn't jump to line 1167 because the condition on line 1168 was always true

1169 pmcids = link.get("links", []) 

1170 if pmcids: 1170 ↛ 1167line 1170 didn't jump to line 1167 because the condition on line 1170 was always true

1171 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}" 

1172 

1173 logger.info( 

1174 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access" 

1175 ) 

1176 return pmid_to_pmcid 

1177 

1178 except Exception: 

1179 logger.exception("Error finding PMC IDs") 

1180 return {} 

1181 

1182 def _get_pmc_full_text(self, pmcid: str) -> str: 

1183 """ 

1184 Get full text for a PMC article. 

1185 

1186 Args: 

1187 pmcid: PMC ID of the article 

1188 

1189 Returns: 

1190 Full text content or empty string if not available 

1191 """ 

1192 try: 

1193 # Prepare parameters 

1194 params = { 

1195 "db": "pmc", 

1196 "id": pmcid, 

1197 "retmode": "xml", 

1198 "rettype": "full", 

1199 } 

1200 

1201 # Add API key if available 

1202 if self.api_key: 1202 ↛ 1203line 1202 didn't jump to line 1203 because the condition on line 1202 was never true

1203 params["api_key"] = self.api_key 

1204 

1205 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

1206 self.engine_type 

1207 ) 

1208 

1209 # Execute request 

1210 response = safe_get(self.fetch_url, params=params) 

1211 response.raise_for_status() 

1212 

1213 # Parse XML response 

1214 root = ET.fromstring(response.text) 

1215 

1216 # Extract full text 

1217 full_text = [] 

1218 

1219 # Extract article title 

1220 title_elem = root.find(".//article-title") 

1221 if title_elem is not None and title_elem.text: 1221 ↛ 1225line 1221 didn't jump to line 1225 because the condition on line 1221 was always true

1222 full_text.append(f"# {title_elem.text}") 

1223 

1224 # Extract abstract 

1225 abstract_paras = root.findall(".//abstract//p") 

1226 if abstract_paras: 1226 ↛ 1227line 1226 didn't jump to line 1227 because the condition on line 1226 was never true

1227 full_text.append("\n## Abstract\n") 

1228 for p in abstract_paras: 

1229 text = "".join(p.itertext()) 

1230 if text: 

1231 full_text.append(text) 

1232 

1233 # Extract body content 

1234 body = root.find(".//body") 

1235 if body is not None: 1235 ↛ 1248line 1235 didn't jump to line 1248 because the condition on line 1235 was always true

1236 for section in body.findall(".//sec"): 

1237 # Get section title 

1238 title = section.find(".//title") 

1239 if title is not None and title.text: 1239 ↛ 1243line 1239 didn't jump to line 1243 because the condition on line 1239 was always true

1240 full_text.append(f"\n## {title.text}\n") 

1241 

1242 # Get paragraphs 

1243 for p in section.findall(".//p"): 

1244 text = "".join(p.itertext()) 

1245 if text: 1245 ↛ 1243line 1245 didn't jump to line 1243 because the condition on line 1245 was always true

1246 full_text.append(text) 

1247 

1248 result_text = "\n\n".join(full_text) 

1249 logger.debug( 

1250 f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections" 

1251 ) 

1252 return result_text 

1253 

1254 except Exception: 

1255 logger.exception("Error getting PMC full text") 

1256 return "" 

1257 

1258 def _get_previews(self, query: str) -> List[Dict[str, Any]]: 

1259 """ 

1260 Get preview information for PubMed articles. 

1261 

1262 Args: 

1263 query: The search query 

1264 

1265 Returns: 

1266 List of preview dictionaries 

1267 """ 

1268 logger.info(f"Getting PubMed previews for query: {query}") 

1269 

1270 # Optimize the query for PubMed if LLM is available 

1271 optimized_query = self._optimize_query_for_pubmed(query) 

1272 

1273 # Perform adaptive search 

1274 pmid_list, strategy = self._adaptive_search(optimized_query) 

1275 

1276 # If no results, try a simplified query 

1277 if not pmid_list: 

1278 logger.warning( 

1279 f"No PubMed results found using strategy: {strategy}" 

1280 ) 

1281 simplified_query = self._simplify_query(optimized_query) 

1282 if simplified_query != optimized_query: 

1283 logger.info(f"Trying with simplified query: {simplified_query}") 

1284 pmid_list, strategy = self._adaptive_search(simplified_query) 

1285 if pmid_list: 1285 ↛ 1286line 1285 didn't jump to line 1286 because the condition on line 1285 was never true

1286 logger.info( 

1287 f"Simplified query found {len(pmid_list)} results" 

1288 ) 

1289 

1290 if not pmid_list: 

1291 logger.warning("No PubMed results found after query simplification") 

1292 return [] 

1293 

1294 # Get article summaries 

1295 logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs") 

1296 summaries = self._get_article_summaries(pmid_list) 

1297 logger.debug(f"Retrieved {len(summaries)} summaries") 

1298 

1299 # ALWAYS fetch abstracts for snippet-only mode to provide context for LLM 

1300 logger.debug( 

1301 f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment" 

1302 ) 

1303 abstracts = self._get_article_abstracts(pmid_list) 

1304 logger.debug(f"Retrieved {len(abstracts)} abstracts") 

1305 

1306 # Format as previews 

1307 previews = [] 

1308 for summary in summaries: 

1309 # Build snippet from individual metadata preferences 

1310 snippet_parts = [] 

1311 

1312 # Check for publication type from esummary (earlier than detailed metadata) 

1313 pub_type_prefix = "" 

1314 if self.include_publication_type_in_context and summary.get( 1314 ↛ 1318line 1314 didn't jump to line 1318 because the condition on line 1314 was never true

1315 "pubtype" 

1316 ): 

1317 # Use first publication type from esummary 

1318 pub_type_prefix = f"[{summary['pubtype'][0]}] " 

1319 

1320 # Add authors if enabled 

1321 if self.include_authors_in_context and summary.get("authors"): 1321 ↛ 1322line 1321 didn't jump to line 1322 because the condition on line 1321 was never true

1322 authors_text = ", ".join(summary.get("authors", [])) 

1323 if len(authors_text) > 100: 

1324 # Truncate long author lists 

1325 authors_text = authors_text[:97] + "..." 

1326 snippet_parts.append(authors_text) 

1327 

1328 # Add journal if enabled 

1329 if self.include_journal_in_context and summary.get("journal"): 1329 ↛ 1333line 1329 didn't jump to line 1333 because the condition on line 1329 was always true

1330 snippet_parts.append(summary["journal"]) 

1331 

1332 # Add date (full or year only) 

1333 if summary.get("pubdate"): 1333 ↛ 1343line 1333 didn't jump to line 1343 because the condition on line 1333 was always true

1334 if self.include_full_date_in_context: 1334 ↛ 1335line 1334 didn't jump to line 1335 because the condition on line 1334 was never true

1335 snippet_parts.append(summary["pubdate"]) 

1336 elif ( 1336 ↛ 1343line 1336 didn't jump to line 1343 because the condition on line 1336 was always true

1337 self.include_year_in_context 

1338 and len(summary["pubdate"]) >= 4 

1339 ): 

1340 snippet_parts.append(summary["pubdate"][:4]) 

1341 

1342 # Add citation details if enabled 

1343 if self.include_citation_in_context: 1343 ↛ 1344line 1343 didn't jump to line 1344 because the condition on line 1343 was never true

1344 citation_parts = [] 

1345 if summary.get("volume"): 

1346 citation_parts.append(f"Vol {summary['volume']}") 

1347 if summary.get("issue"): 

1348 citation_parts.append(f"Issue {summary['issue']}") 

1349 if summary.get("pages"): 

1350 citation_parts.append(f"pp {summary['pages']}") 

1351 if citation_parts: 

1352 snippet_parts.append(f"({', '.join(citation_parts)})") 

1353 

1354 # Join snippet parts or provide default 

1355 if snippet_parts: 1355 ↛ 1366line 1355 didn't jump to line 1366 because the condition on line 1355 was always true

1356 # Use different separators based on what's included 

1357 if self.include_authors_in_context: 1357 ↛ 1358line 1357 didn't jump to line 1358 because the condition on line 1357 was never true

1358 snippet = ". ".join( 

1359 snippet_parts 

1360 ) # Authors need period separator 

1361 else: 

1362 snippet = " - ".join( 

1363 snippet_parts 

1364 ) # Journal and year use dash 

1365 else: 

1366 snippet = "Research article" 

1367 

1368 # Add publication type prefix 

1369 snippet = pub_type_prefix + snippet 

1370 

1371 # Add language indicator if not English 

1372 if self.include_language_in_context and summary.get("lang"): 1372 ↛ 1373line 1372 didn't jump to line 1373 because the condition on line 1372 was never true

1373 langs = summary["lang"] 

1374 if langs and langs[0] != "eng" and langs[0]: 

1375 snippet = f"{snippet} [{langs[0].upper()}]" 

1376 

1377 # Add identifiers if enabled 

1378 identifier_parts = [] 

1379 if self.include_pmid_in_context and summary.get("id"): 1379 ↛ 1380line 1379 didn't jump to line 1380 because the condition on line 1379 was never true

1380 identifier_parts.append(f"PMID: {summary['id']}") 

1381 if self.include_doi_in_context and summary.get("doi"): 1381 ↛ 1382line 1381 didn't jump to line 1382 because the condition on line 1381 was never true

1382 identifier_parts.append(f"DOI: {summary['doi']}") 

1383 

1384 if identifier_parts: 1384 ↛ 1385line 1384 didn't jump to line 1385 because the condition on line 1384 was never true

1385 snippet = f"{snippet} | {' | '.join(identifier_parts)}" 

1386 

1387 # ALWAYS include title and abstract in snippet for LLM analysis 

1388 pmid = summary["id"] 

1389 title = summary["title"] 

1390 abstract_text = abstracts.get(pmid, "") 

1391 

1392 # Truncate abstract if too long 

1393 if len(abstract_text) > 500: 1393 ↛ 1394line 1393 didn't jump to line 1394 because the condition on line 1393 was never true

1394 abstract_text = abstract_text[:497] + "..." 

1395 

1396 # Build the enriched snippet with title and abstract 

1397 if abstract_text: 1397 ↛ 1400line 1397 didn't jump to line 1400 because the condition on line 1397 was always true

1398 enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}" 

1399 else: 

1400 enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}" 

1401 

1402 # Log the complete snippet for debugging 

1403 logger.debug(f"Complete snippet for PMID {pmid}:") 

1404 logger.debug(f" Title: {title[:100]}...") 

1405 logger.debug(f" Abstract length: {len(abstract_text)} chars") 

1406 logger.debug(f" Metadata: {snippet}") 

1407 logger.debug( 

1408 f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..." 

1409 ) 

1410 

1411 # Create preview with basic information 

1412 preview = { 

1413 "id": summary["id"], 

1414 "title": summary["title"], 

1415 "link": summary["link"], 

1416 "snippet": enriched_snippet, # Use enriched snippet with title and abstract 

1417 "authors": summary.get("authors", []), 

1418 "journal": summary.get("journal", ""), 

1419 "pubdate": summary.get("pubdate", ""), 

1420 "doi": summary.get("doi", ""), 

1421 "source": "PubMed", 

1422 "_pmid": summary["id"], # Store PMID for later use 

1423 "_search_strategy": strategy, # Store search strategy for analytics 

1424 } 

1425 

1426 previews.append(preview) 

1427 

1428 logger.info( 

1429 f"Found {len(previews)} PubMed previews using strategy: {strategy}" 

1430 ) 

1431 if previews: 1431 ↛ 1435line 1431 didn't jump to line 1435 because the condition on line 1431 was always true

1432 logger.debug( 

1433 f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'" 

1434 ) 

1435 return previews 

1436 

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubMed articles.
        Efficiently manages which content to retrieve (abstracts and/or full text).

        For each item this enriches the preview ``snippet`` in stages
        (publication type prefix, MeSH/keyword/PMC suffixes, then a
        Title/Abstract/Metadata rollup), attaches detailed metadata keys,
        and sets ``content``/``full_content``/``content_type`` from either
        the abstract or, for up to ``self.full_text_limit`` articles with a
        PMC record, the PMC full text.  Temporary ``_pmid`` and
        ``_search_strategy`` fields are stripped from the returned dicts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        snippets_only_mode = (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        )

        if snippets_only_mode:
            logger.info(
                "Snippet-only mode enabled, will fetch abstracts as snippets"
            )
            # For PubMed, we still need to fetch abstracts as they serve as snippets
            # But we'll skip full-text retrieval

        logger.info(
            f"Getting content for {len(relevant_items)} PubMed articles"
        )

        # Collect all PMIDs for relevant items
        pmids: List[str] = []
        for item in relevant_items:
            if "_pmid" in item:
                pmids.append(item["_pmid"])

        # Get abstracts if requested and PMIDs exist
        # In snippet-only mode, always get abstracts as they serve as snippets
        abstracts: Dict[str, str] = {}
        if (self.get_abstracts or snippets_only_mode) and pmids:
            abstracts = self._get_article_abstracts(pmids)

        # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
        detailed_metadata: Dict[str, Dict[str, Any]] = {}
        if pmids:
            detailed_metadata = self._get_article_detailed_metadata(pmids)

        # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
        pmid_to_pmcid: Dict[str, str] = {}
        if self.get_full_text and pmids and not snippets_only_mode:
            pmid_to_pmcid = self._find_pmc_ids(pmids)

        # Add content to results.  Order matters below: snippet decorations
        # accumulate on result["snippet"], so each step sees prior edits.
        results: List[Dict[str, Any]] = []
        for item in relevant_items:
            result = item.copy()
            pmid = item.get("_pmid", "")

            # Add detailed metadata if available
            if pmid in detailed_metadata:
                metadata = detailed_metadata[pmid]

                # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
                if "publication_types" in metadata:
                    result["publication_types"] = metadata["publication_types"]

                    # Add first publication type to snippet if enabled
                    if (
                        self.include_publication_type_in_context
                        and metadata["publication_types"]
                    ):
                        # Just take the first publication type as is
                        pub_type = metadata["publication_types"][0]
                        if "snippet" in result:
                            result["snippet"] = (
                                f"[{pub_type}] {result['snippet']}"
                            )

                # Add MeSH terms for medical categorization
                if "mesh_terms" in metadata:
                    result["mesh_terms"] = metadata["mesh_terms"]

                    # Add MeSH terms to snippet if enabled
                    if (
                        self.include_mesh_terms_in_context
                        and metadata["mesh_terms"]
                    ):
                        # max_mesh_terms <= 0 means "show all".
                        mesh_to_show = (
                            metadata["mesh_terms"][: self.max_mesh_terms]
                            if self.max_mesh_terms > 0
                            else metadata["mesh_terms"]
                        )
                        if mesh_to_show and "snippet" in result:
                            mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                            result["snippet"] = (
                                f"{result['snippet']} | {mesh_text}"
                            )

                # Add keywords
                if "keywords" in metadata:
                    result["keywords"] = metadata["keywords"]

                    # Add keywords to snippet if enabled
                    if (
                        self.include_keywords_in_context
                        and metadata["keywords"]
                    ):
                        # max_keywords <= 0 means "show all".
                        keywords_to_show = (
                            metadata["keywords"][: self.max_keywords]
                            if self.max_keywords > 0
                            else metadata["keywords"]
                        )
                        if keywords_to_show and "snippet" in result:
                            keywords_text = "Keywords: " + ", ".join(
                                keywords_to_show
                            )
                            result["snippet"] = (
                                f"{result['snippet']} | {keywords_text}"
                            )

                # Add affiliations
                if "affiliations" in metadata:
                    result["affiliations"] = metadata["affiliations"]

                # Add funding/grant information
                if "grants" in metadata:
                    result["grants"] = metadata["grants"]

                # Add conflict of interest statement
                if "conflict_of_interest" in metadata:
                    result["conflict_of_interest"] = metadata[
                        "conflict_of_interest"
                    ]

                # Add free full text availability
                if "has_free_full_text" in metadata:
                    result["has_free_full_text"] = metadata[
                        "has_free_full_text"
                    ]
                    if "pmc_id" in metadata:
                        result["pmc_id"] = metadata["pmc_id"]

                    # Add PMC availability to snippet if enabled
                    if (
                        self.include_pmc_availability_in_context
                        and metadata["has_free_full_text"]
                        and "snippet" in result
                    ):
                        result["snippet"] = (
                            f"{result['snippet']} | [Free Full Text]"
                        )

            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]

                # Create enriched content with metadata context
                enriched_content = self._create_enriched_content(
                    result, abstracts[pmid]
                )

                # ALWAYS include title and abstract in snippet for LLM analysis
                # Build comprehensive snippet with title and abstract
                title = result.get("title", "")
                abstract_text = (
                    abstracts[pmid][:SNIPPET_LENGTH_LONG]
                    if len(abstracts[pmid]) > SNIPPET_LENGTH_LONG
                    else abstracts[pmid]
                )

                # Prepend title and abstract to the existing metadata snippet
                if "snippet" in result:
                    # Keep metadata snippet and add content
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                    )
                else:
                    # No metadata snippet, just title and abstract
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}"
                    )

                # In snippet-only mode, use enriched content
                if snippets_only_mode:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"
                # Use abstract as content if no full text
                elif pmid not in pmid_to_pmcid:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Add full text for a limited number of top articles.
            # The count scan over `results` is O(n) per item but bounded in
            # practice by full_text_limit and the result-set size.
            if (
                pmid in pmid_to_pmcid
                and self.get_full_text
                and len(
                    [r for r in results if r.get("content_type") == "full_text"]
                )
                < self.full_text_limit
            ):
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)

                if full_text:
                    enriched_full_text = self._create_enriched_content(
                        result, full_text
                    )
                    result["full_content"] = enriched_full_text
                    result["content"] = enriched_full_text
                    result["content_type"] = "full_text"
                    result["pmcid"] = pmcid
                elif pmid in abstracts:
                    # Fall back to abstract if full text retrieval fails
                    enriched_content = self._create_enriched_content(
                        result, abstracts[pmid]
                    )
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Remove temporary fields
            if "_pmid" in result:
                del result["_pmid"]
            if "_search_strategy" in result:
                del result["_search_strategy"]

            results.append(result)

        return results

1669 

1670 def search_by_author( 

1671 self, author_name: str, max_results: Optional[int] = None 

1672 ) -> List[Dict[str, Any]]: 

1673 """ 

1674 Search for articles by a specific author. 

1675 

1676 Args: 

1677 author_name: Name of the author 

1678 max_results: Maximum number of results (defaults to self.max_results) 

1679 

1680 Returns: 

1681 List of articles by the author 

1682 """ 

1683 original_max_results = self.max_results 

1684 

1685 try: 

1686 if max_results: 1686 ↛ 1687line 1686 didn't jump to line 1687 because the condition on line 1686 was never true

1687 self.max_results = max_results 

1688 

1689 query = f"{author_name}[Author]" 

1690 return self.run(query) 

1691 

1692 finally: 

1693 # Restore original value 

1694 self.max_results = original_max_results 

1695 

1696 def search_by_journal( 

1697 self, journal_name: str, max_results: Optional[int] = None 

1698 ) -> List[Dict[str, Any]]: 

1699 """ 

1700 Search for articles in a specific journal. 

1701 

1702 Args: 

1703 journal_name: Name of the journal 

1704 max_results: Maximum number of results (defaults to self.max_results) 

1705 

1706 Returns: 

1707 List of articles from the journal 

1708 """ 

1709 original_max_results = self.max_results 

1710 

1711 try: 

1712 if max_results: 1712 ↛ 1713line 1712 didn't jump to line 1713 because the condition on line 1712 was never true

1713 self.max_results = max_results 

1714 

1715 query = f"{journal_name}[Journal]" 

1716 return self.run(query) 

1717 

1718 finally: 

1719 # Restore original value 

1720 self.max_results = original_max_results 

1721 

1722 def search_recent( 

1723 self, query: str, days: int = 30, max_results: Optional[int] = None 

1724 ) -> List[Dict[str, Any]]: 

1725 """ 

1726 Search for recent articles matching the query. 

1727 

1728 Args: 

1729 query: The search query 

1730 days: Number of days to look back 

1731 max_results: Maximum number of results (defaults to self.max_results) 

1732 

1733 Returns: 

1734 List of recent articles matching the query 

1735 """ 

1736 original_max_results = self.max_results 

1737 original_days_limit = self.days_limit 

1738 

1739 try: 

1740 if max_results: 

1741 self.max_results = max_results 

1742 

1743 # Set days limit for this search 

1744 self.days_limit = days 

1745 

1746 return self.run(query) 

1747 

1748 finally: 

1749 # Restore original values 

1750 self.max_results = original_max_results 

1751 self.days_limit = original_days_limit 

1752 

1753 def advanced_search( 

1754 self, terms: Dict[str, str], max_results: Optional[int] = None 

1755 ) -> List[Dict[str, Any]]: 

1756 """ 

1757 Perform an advanced search with field-specific terms. 

1758 

1759 Args: 

1760 terms: Dictionary mapping fields to search terms 

1761 Valid fields: Author, Journal, Title, MeSH, Affiliation, etc. 

1762 max_results: Maximum number of results (defaults to self.max_results) 

1763 

1764 Returns: 

1765 List of articles matching the advanced query 

1766 """ 

1767 original_max_results = self.max_results 

1768 

1769 try: 

1770 if max_results: 

1771 self.max_results = max_results 

1772 

1773 # Build advanced query string 

1774 query_parts = [] 

1775 for field, term in terms.items(): 

1776 query_parts.append(f"{term}[{field}]") 

1777 

1778 query = " AND ".join(query_parts) 

1779 return self.run(query) 

1780 

1781 finally: 

1782 # Restore original value 

1783 self.max_results = original_max_results