Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubmed.py: 92%

710 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1import re 

2from typing import Any, Dict, List, Optional, Tuple 

3 

4from defusedxml import ElementTree as ET 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...config import search_config 

10from ...constants import SNIPPET_LENGTH_LONG 

11from ...security.safe_requests import safe_get 

12from ..rate_limiting import RateLimitError 

13from ..search_engine_base import BaseSearchEngine 

14 

15 

class PubMedSearchEngine(BaseSearchEngine):
    """
    PubMed search engine implementation with two-phase approach and adaptive search.
    Provides efficient access to biomedical literature while minimizing API usage.
    """

    # Mark as public search engine
    is_public = True
    # Scientific/medical search engine
    is_scientific = True
    # Keyword-based (lexical) matching rather than semantic/vector search
    is_lexical = True
    # Results should be post-filtered for relevance by an LLM
    needs_llm_relevance_filter = True

28 

    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        days_limit: Optional[int] = None,
        get_abstracts: bool = True,
        get_full_text: bool = False,
        full_text_limit: int = 3,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        include_publication_type_in_context: bool = True,
        include_journal_in_context: bool = True,
        include_year_in_context: bool = True,
        include_authors_in_context: bool = False,
        include_full_date_in_context: bool = False,
        include_mesh_terms_in_context: bool = True,
        include_keywords_in_context: bool = True,
        include_doi_in_context: bool = False,
        include_pmid_in_context: bool = False,
        include_pmc_availability_in_context: bool = False,
        max_mesh_terms: int = 3,
        max_keywords: int = 3,
        include_citation_in_context: bool = False,
        include_language_in_context: bool = False,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the PubMed search engine.

        Args:
            max_results: Maximum number of search results (note: raised to a
                floor of 25 after base-class initialization, see below)
            api_key: NCBI API key for higher rate limits (optional)
            days_limit: Limit results to N days (optional)
            get_abstracts: Whether to fetch abstracts for all results
            get_full_text: Whether to fetch full text content (when available in PMC)
            full_text_limit: Max number of full-text articles to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries for PubMed
            include_publication_type_in_context: Include publication type in result context
            include_journal_in_context: Include journal name in result context
            include_year_in_context: Include publication year in result context
            include_authors_in_context: Include author list in result context
            include_full_date_in_context: Include full publication date in result context
            include_mesh_terms_in_context: Include MeSH terms in result context
            include_keywords_in_context: Include author keywords in result context
            include_doi_in_context: Include DOI in result context
            include_pmid_in_context: Include PMID in result context
            include_pmc_availability_in_context: Include PMC availability in result context
            max_mesh_terms: Max number of MeSH terms to include in context
            max_keywords: Max number of keywords to include in context
            include_citation_in_context: Include a formatted citation in result context
            include_language_in_context: Include article language in result context
            settings_snapshot: Settings snapshot forwarded to the base class
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )
        # NOTE(review): silently raises max_results to at least 25 —
        # presumably to give the LLM relevance filter enough candidates;
        # confirm intent, since it overrides smaller caller-supplied values.
        self.max_results = max(self.max_results, 25)
        self.api_key = api_key
        self.days_limit = days_limit
        self.get_abstracts = get_abstracts
        self.get_full_text = get_full_text
        self.full_text_limit = full_text_limit
        self.optimize_queries = optimize_queries
        # Flags selecting which metadata fields appear in result context
        self.include_publication_type_in_context = (
            include_publication_type_in_context
        )
        self.include_journal_in_context = include_journal_in_context
        self.include_year_in_context = include_year_in_context
        self.include_authors_in_context = include_authors_in_context
        self.include_full_date_in_context = include_full_date_in_context
        self.include_mesh_terms_in_context = include_mesh_terms_in_context
        self.include_keywords_in_context = include_keywords_in_context
        self.include_doi_in_context = include_doi_in_context
        self.include_pmid_in_context = include_pmid_in_context
        self.include_pmc_availability_in_context = (
            include_pmc_availability_in_context
        )
        self.max_mesh_terms = max_mesh_terms
        self.max_keywords = max_keywords
        self.include_citation_in_context = include_citation_in_context
        self.include_language_in_context = include_language_in_context

        # Base API URLs (NCBI Entrez E-utilities endpoints)
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.search_url = f"{self.base_url}/esearch.fcgi"
        self.summary_url = f"{self.base_url}/esummary.fcgi"
        self.fetch_url = f"{self.base_url}/efetch.fcgi"
        self.link_url = f"{self.base_url}/elink.fcgi"

        # PMC base URL for full text
        self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"

112 

113 def _get_result_count(self, query: str) -> int: 

114 """ 

115 Get the total number of results for a query without retrieving the results themselves. 

116 

117 Args: 

118 query: The search query 

119 

120 Returns: 

121 Total number of matching results 

122 """ 

123 try: 

124 # Prepare search parameters 

125 params = { 

126 "db": "pubmed", 

127 "term": query, 

128 "retmode": "json", 

129 "retmax": 0, # Don't need actual results, just the count 

130 } 

131 

132 # Add API key if available 

133 if self.api_key: 

134 params["api_key"] = self.api_key 

135 

136 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

137 self.engine_type 

138 ) 

139 

140 # Execute search request 

141 response = safe_get(self.search_url, params=params) 

142 response.raise_for_status() 

143 

144 # Parse response 

145 data = response.json() 

146 count = int(data["esearchresult"]["count"]) 

147 

148 logger.info( 

149 "Query '{}' has {} total results in PubMed", query, count 

150 ) 

151 return count 

152 

153 except Exception: 

154 logger.exception("Error getting result count") 

155 return 0 

156 

157 def _extract_core_terms(self, query: str) -> str: 

158 """ 

159 Extract core terms from a complex query for volume estimation. 

160 

161 Args: 

162 query: PubMed query string 

163 

164 Returns: 

165 Simplified query with core terms 

166 """ 

167 # Remove field specifications and operators 

168 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags 

169 simplified = re.sub( 

170 r"\b(AND|OR|NOT)\b", "", simplified 

171 ) # Remove operators 

172 

173 # Remove quotes and parentheses 

174 simplified = ( 

175 simplified.replace('"', "").replace("(", "").replace(")", "") 

176 ) 

177 

178 # Split by whitespace and join terms with 4+ chars (likely meaningful) 

179 terms = [term for term in simplified.split() if len(term) >= 4] 

180 

181 # Join with AND to create a basic search 

182 return " ".join(terms[:5]) # Limit to top 5 terms 

183 

184 def _expand_time_window(self, time_filter: str) -> str: 

185 """ 

186 Expand a time window to get more results. 

187 

188 Args: 

189 time_filter: Current time filter 

190 

191 Returns: 

192 Expanded time filter 

193 """ 

194 # Parse current time window 

195 import re 

196 

197 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter) 

198 if not match: 

199 return '"last 10 years"[pdat]' 

200 

201 amount, unit = int(match.group(1)), match.group(2) 

202 

203 # Expand based on current unit 

204 if unit == "months" or unit == "month": 

205 if amount < 6: 

206 return '"last 6 months"[pdat]' 

207 if amount < 12: 

208 return '"last 1 year"[pdat]' 

209 return '"last 2 years"[pdat]' 

210 if unit == "years" or unit == "year": 210 ↛ 217line 210 didn't jump to line 217 because the condition on line 210 was always true

211 if amount < 2: 

212 return '"last 2 years"[pdat]' 

213 if amount < 5: 

214 return '"last 5 years"[pdat]' 

215 return '"last 10 years"[pdat]' 

216 

217 return '"last 10 years"[pdat]' 

218 

    def _optimize_query_for_pubmed(self, query: str) -> str:
        """
        Optimize a natural language query for PubMed search.
        Uses LLM to transform questions into effective keyword-based queries,
        then applies several heuristics to strip any explanatory prose the
        LLM may have wrapped around the query.

        Args:
            query: Natural language query

        Returns:
            Optimized query string for PubMed (falls back to the original
            query when no LLM is configured, optimization is disabled, the
            cleaned result is too verbose, or any error occurs)
        """
        if not self.llm or not self.optimize_queries:
            # Return original query if no LLM available or optimization disabled
            return query

        try:
            # Prompt for query optimization
            prompt = f"""Transform this natural language question into an optimized PubMed search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. DO NOT wrap the entire query in quotes
3. DO NOT include ANY date restrictions or year filters
4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
6. KEEP IT SIMPLE - use 2-3 main concepts maximum
7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
8. Use wildcards for variations: vaccin*[Title/Abstract]

EXAMPLE QUERIES:
✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
✗ BAD: "Here's a query to find articles about vaccines..."

Return ONLY the search query without any explanations.
"""

            # Get response from LLM; chat models expose .content, plain
            # LLMs return a string
            response = self.llm.invoke(prompt)
            raw_response = (
                str(response.content)
                if hasattr(response, "content")
                else str(response)
            ).strip()

            # Clean up the query - extract only the actual query and remove any explanations
            # First check if there are multiple lines and take the first non-empty line
            lines = raw_response.split("\n")
            cleaned_lines = [line.strip() for line in lines if line.strip()]

            if cleaned_lines:
                optimized_query = cleaned_lines[0]

                # Remove any quotes that wrap the entire query
                if optimized_query.startswith('"') and optimized_query.endswith(
                    '"'
                ):
                    optimized_query = optimized_query[1:-1]

                # Remove any explanation phrases that might be at the beginning
                explanation_starters = [
                    "here is",
                    "here's",
                    "this query",
                    "the following",
                ]
                for starter in explanation_starters:
                    if optimized_query.lower().startswith(starter):
                        # Find the actual query part - typically after a colon
                        colon_pos = optimized_query.find(":")
                        if colon_pos > 0:
                            optimized_query = optimized_query[
                                colon_pos + 1 :
                            ].strip()

                # Check if the query still seems to contain explanations
                if (
                    len(optimized_query) > 200
                    or "this query will" in optimized_query.lower()
                ):
                    # It's probably still an explanation - try to extract just the query part
                    # Look for common patterns in the explanation like parentheses
                    pattern = r"\([^)]+\)\s+AND\s+"
                    import re

                    matches = re.findall(pattern, optimized_query)
                    if matches:
                        # Extract just the query syntax parts: keep only
                        # sentence fragments that look like boolean query
                        # syntax (parenthesized, with AND/OR)
                        query_parts = []
                        for part in re.split(r"\.\s+", optimized_query):
                            if (
                                "(" in part
                                and ")" in part
                                and ("AND" in part or "OR" in part)
                            ):
                                query_parts.append(part)
                        if query_parts:
                            optimized_query = " ".join(query_parts)
            else:
                # Fall back to original query if cleaning fails
                logger.warning(
                    "Failed to extract a clean query from LLM response"
                )
                optimized_query = query

            # Final safety check - if query looks too much like an explanation, use original
            if len(optimized_query.split()) > 30:
                logger.warning(
                    "Query too verbose, falling back to simpler form"
                )
                # Create a simple query from the original, dropping short
                # words and common stopwords
                words = [
                    w
                    for w in query.split()
                    if len(w) > 3
                    and w.lower()
                    not in (
                        "what",
                        "are",
                        "the",
                        "and",
                        "for",
                        "with",
                        "from",
                        "have",
                        "been",
                        "recent",
                    )
                ]
                optimized_query = " AND ".join(words[:3])

            # Basic cleanup: standardize field tag case for consistency
            import re

            optimized_query = re.sub(
                r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
            )
            optimized_query = re.sub(
                r"\[title/abstract\]",
                "[Title/Abstract]",
                optimized_query,
                flags=re.IGNORECASE,
            )
            optimized_query = re.sub(
                r"\[publication type\]",
                "[Publication Type]",
                optimized_query,
                flags=re.IGNORECASE,
            )

            # Fix unclosed quotes followed by field tags
            # Pattern: "term[Field] -> "term"[Field]
            optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query)

            # Simplify the query if still no results are found
            # NOTE(review): _simplify_query_cache is written here but never
            # read in this method — presumably consumed elsewhere; confirm.
            self._simplify_query_cache = optimized_query

            # Log original and optimized queries
            logger.info("Original query: '{}'", query)
            logger.info(f"Optimized for PubMed: '{optimized_query}'")
            logger.debug(
                f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
            )

            return optimized_query

        except Exception:
            logger.exception("Error optimizing query")
            logger.debug(f"Falling back to original query: '{query}'")
            return query  # Fall back to original query on error

392 

393 def _simplify_query(self, query: str) -> str: 

394 """ 

395 Simplify a PubMed query that returned no results. 

396 Progressively removes elements to get a more basic query. 

397 

398 Args: 

399 query: The original query that returned no results 

400 

401 Returns: 

402 Simplified query 

403 """ 

404 logger.info(f"Simplifying query: {query}") 

405 logger.debug(f"Query simplification started for: '{query[:100]}...'") 

406 

407 # Simple approach: remove field restrictions to broaden the search 

408 import re 

409 

410 # Remove field tags to make search broader 

411 simplified = query 

412 

413 # Remove [Mesh] tags - search in all fields instead 

414 simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE) 

415 

416 # Remove [Publication Type] tags 

417 simplified = re.sub( 

418 r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE 

419 ) 

420 

421 # Keep [Title/Abstract] as it's usually helpful 

422 # Clean up any double spaces 

423 simplified = re.sub(r"\s+", " ", simplified).strip() 

424 

425 # If no simplification was possible, return the original query 

426 if simplified == query: 

427 logger.debug("No simplification possible, returning original query") 

428 

429 logger.info(f"Simplified query: {simplified}") 

430 logger.debug( 

431 f"Query simplified from {len(query)} to {len(simplified)} chars" 

432 ) 

433 return simplified 

434 

435 def _is_historical_focused(self, query: str) -> bool: 

436 """ 

437 Determine if a query is specifically focused on historical/older information using LLM. 

438 Default assumption is that queries should prioritize recent information unless 

439 explicitly asking for historical content. 

440 

441 Args: 

442 query: The search query 

443 

444 Returns: 

445 Boolean indicating if the query is focused on historical information 

446 """ 

447 if not self.llm: 

448 # Fall back to basic keyword check if no LLM available 

449 historical_terms = [ 

450 "history", 

451 "historical", 

452 "early", 

453 "initial", 

454 "first", 

455 "original", 

456 "before", 

457 "prior to", 

458 "origins", 

459 "evolution", 

460 "development", 

461 ] 

462 historical_years = [str(year) for year in range(1900, 2020)] 

463 

464 query_lower = query.lower() 

465 has_historical_term = any( 

466 term in query_lower for term in historical_terms 

467 ) 

468 has_past_year = any(year in query for year in historical_years) 

469 

470 return has_historical_term or has_past_year 

471 

472 try: 

473 # Use LLM to determine if the query is focused on historical information 

474 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information. 

475 

476Query: "{query}" 

477 

478Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago. 

479Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus. 

480 

481The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise. 

482""" 

483 

484 response = self.llm.invoke(prompt) 

485 answer = ( 

486 ( 

487 str(response.content) 

488 if hasattr(response, "content") 

489 else str(response) 

490 ) 

491 .strip() 

492 .lower() 

493 ) 

494 

495 # Log the determination 

496 logger.info(f"Historical focus determination for query: '{query}'") 

497 logger.info(f"LLM determined historical focus: {answer}") 

498 

499 return "yes" in answer 

500 

501 except Exception: 

502 logger.exception("Error determining historical focus") 

503 # Fall back to basic keyword check 

504 historical_terms = [ 

505 "history", 

506 "historical", 

507 "early", 

508 "initial", 

509 "first", 

510 "original", 

511 "before", 

512 "prior to", 

513 "origins", 

514 "evolution", 

515 "development", 

516 ] 

517 return any(term in query.lower() for term in historical_terms) 

518 

519 def _adaptive_search(self, query: str) -> Tuple[List[str], str]: 

520 """ 

521 Perform an adaptive search that adjusts based on topic volume and whether 

522 the query focuses on historical information. 

523 

524 Args: 

525 query: The search query (already optimized) 

526 

527 Returns: 

528 Tuple of (list of PMIDs, search strategy used) 

529 """ 

530 # Estimate topic volume 

531 estimated_volume = self._get_result_count(query) 

532 

533 # Determine if the query is focused on historical information 

534 is_historical_focused = self._is_historical_focused(query) 

535 

536 if is_historical_focused: 

537 # User wants historical information - no date filtering 

538 time_filter = None 

539 strategy = "historical_focus" 

540 elif estimated_volume > 5000: 

541 # Very common topic - use tighter recency filter 

542 time_filter = '"last 1 year"[pdat]' 

543 strategy = "high_volume" 

544 elif estimated_volume > 1000: 

545 # Common topic 

546 time_filter = '"last 3 years"[pdat]' 

547 strategy = "common_topic" 

548 elif estimated_volume > 100: 

549 # Moderate volume 

550 time_filter = '"last 5 years"[pdat]' 

551 strategy = "moderate_volume" 

552 else: 

553 # Rare topic - still use recency but with wider range 

554 time_filter = '"last 10 years"[pdat]' 

555 strategy = "rare_topic" 

556 

557 # Run search based on strategy 

558 if time_filter: 

559 # Try with adaptive time filter 

560 query_with_time = f"({query}) AND {time_filter}" 

561 logger.info( 

562 f"Using adaptive search strategy: {strategy} with filter: {time_filter}" 

563 ) 

564 results = self._search_pubmed(query_with_time) 

565 

566 # If too few results, gradually expand time window 

567 if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter: 

568 logger.info( 

569 f"Insufficient results ({len(results)}), expanding time window" 

570 ) 

571 expanded_time = self._expand_time_window(time_filter) 

572 query_with_expanded_time = f"({query}) AND {expanded_time}" 

573 expanded_results = self._search_pubmed(query_with_expanded_time) 

574 

575 if len(expanded_results) > len(results): 

576 logger.info( 

577 f"Expanded time window yielded {len(expanded_results)} results" 

578 ) 

579 return expanded_results, f"{strategy}_expanded" 

580 

581 # If still no results, try without time filter 

582 if not results: 

583 logger.info( 

584 "No results with time filter, trying without time restrictions" 

585 ) 

586 results = self._search_pubmed(query) 

587 strategy = "no_time_filter" 

588 else: 

589 # Historical query - run without time filter 

590 logger.info( 

591 "Using historical search strategy without date filtering" 

592 ) 

593 results = self._search_pubmed(query) 

594 

595 return results, strategy 

596 

597 def _search_pubmed(self, query: str) -> List[str]: 

598 """ 

599 Search PubMed and return a list of article IDs. 

600 

601 Args: 

602 query: The search query 

603 

604 Returns: 

605 List of PubMed IDs matching the query 

606 """ 

607 try: 

608 # Prepare search parameters 

609 params = { 

610 "db": "pubmed", 

611 "term": query, 

612 "retmode": "json", 

613 "retmax": self.max_results, 

614 "usehistory": "y", 

615 } 

616 

617 # Add API key if available 

618 if self.api_key: 

619 params["api_key"] = self.api_key 

620 logger.debug("Using PubMed API key for higher rate limits") 

621 else: 

622 logger.debug("No PubMed API key - using default rate limits") 

623 

624 # Add date restriction if specified 

625 if self.days_limit: 

626 params["reldate"] = self.days_limit 

627 params["datetype"] = "pdat" # Publication date 

628 logger.debug(f"Limiting results to last {self.days_limit} days") 

629 

630 logger.debug( 

631 f"PubMed search query: '{query}' with max_results={self.max_results}" 

632 ) 

633 

634 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

635 self.engine_type 

636 ) 

637 logger.debug( 

638 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

639 ) 

640 

641 # Execute search request 

642 logger.debug(f"Sending request to PubMed API: {self.search_url}") 

643 response = safe_get(self.search_url, params=params) 

644 response.raise_for_status() 

645 logger.debug(f"PubMed API response status: {response.status_code}") 

646 

647 # Parse response 

648 data = response.json() 

649 id_list: list[str] = data["esearchresult"]["idlist"] 

650 total_count = data["esearchresult"].get("count", "unknown") 

651 

652 logger.info( 

653 f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})" 

654 ) 

655 if len(id_list) > 0: 

656 logger.debug(f"First 5 PMIDs: {id_list[:5]}") 

657 return id_list 

658 

659 except Exception: 

660 logger.exception(f"Error searching PubMed for query '{query}'") 

661 return [] 

662 

    def _get_article_summaries(
        self, id_list: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get summaries for a list of PubMed article IDs via the esummary
        endpoint.

        Args:
            id_list: List of PubMed IDs

        Returns:
            List of article summary dictionaries (one per PMID found in
            the response); empty list on non-rate-limit errors

        Raises:
            RateLimitError: when the failure looks like API throttling
                (HTTP 429/503/403 or rate-limit wording in the message)
        """
        if not id_list:
            logger.debug("Empty ID list provided to _get_article_summaries")
            return []

        logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

        try:
            # Prepare parameters
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "json",
                "rettype": "summary",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            # Honor NCBI rate limits before issuing the request
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute request
            logger.debug(f"Requesting summaries from: {self.summary_url}")
            response = safe_get(self.summary_url, params=params)
            response.raise_for_status()
            logger.debug(f"Summary API response status: {response.status_code}")

            # Parse response
            data = response.json()
            logger.debug(
                f"PubMed API returned data for {len(id_list)} requested IDs"
            )
            summaries = []

            # Iterate in the requested order; esummary keys its results by PMID
            for pmid in id_list:
                if pmid in data["result"]:
                    article = data["result"][pmid]
                    logger.debug(
                        f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
                    )

                    # Extract authors (if available)
                    authors = []
                    if "authors" in article:
                        authors = [
                            author["name"] for author in article["authors"]
                        ]

                    # Extract DOI from articleids if not in main field
                    doi = article.get("doi", "")
                    if not doi and "articleids" in article:
                        for aid in article["articleids"]:
                            if aid.get("idtype") == "doi":
                                doi = aid.get("value", "")
                                break

                    # Create summary dictionary with all available fields
                    summary = {
                        "id": pmid,
                        "title": article.get("title", ""),
                        "pubdate": article.get("pubdate", ""),
                        "epubdate": article.get("epubdate", ""),
                        "source": article.get("source", ""),
                        "authors": authors,
                        "lastauthor": article.get("lastauthor", ""),
                        "journal": article.get("fulljournalname", ""),
                        "volume": article.get("volume", ""),
                        "issue": article.get("issue", ""),
                        "pages": article.get("pages", ""),
                        "doi": doi,
                        "issn": article.get("issn", ""),
                        "essn": article.get("essn", ""),
                        "pubtype": article.get(
                            "pubtype", []
                        ),  # Publication types from esummary
                        "recordstatus": article.get("recordstatus", ""),
                        "lang": article.get("lang", []),
                        "pmcrefcount": article.get("pmcrefcount", None),
                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    }

                    summaries.append(summary)
                else:
                    logger.warning(
                        f"PMID {pmid} not found in PubMed API response"
                    )

            return summaries

        except Exception as e:
            error_msg = str(e)
            logger.exception(
                f"Error getting article summaries for {len(id_list)} articles"
            )

            # Check for rate limiting patterns; re-raise as RateLimitError
            # so upstream rate-limit handling can react (confirm handler
            # lives in the rate_limiting module's callers).
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
                or "403" in error_msg
            ):
                raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

            return []

787 

788 def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]: 

789 """ 

790 Get abstracts for a list of PubMed article IDs. 

791 

792 Args: 

793 id_list: List of PubMed IDs 

794 

795 Returns: 

796 Dictionary mapping PubMed IDs to their abstracts 

797 """ 

798 if not id_list: 

799 logger.debug("Empty ID list provided to _get_article_abstracts") 

800 return {} 

801 

802 logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles") 

803 

804 try: 

805 # Prepare parameters 

806 params = { 

807 "db": "pubmed", 

808 "id": ",".join(id_list), 

809 "retmode": "xml", 

810 "rettype": "abstract", 

811 } 

812 

813 # Add API key if available 

814 if self.api_key: 814 ↛ 815line 814 didn't jump to line 815 because the condition on line 814 was never true

815 params["api_key"] = self.api_key 

816 

817 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

818 self.engine_type 

819 ) 

820 logger.debug( 

821 f"Applied rate limit wait: {self._last_wait_time:.2f}s" 

822 ) 

823 

824 # Execute request 

825 logger.debug(f"Requesting abstracts from: {self.fetch_url}") 

826 response = safe_get(self.fetch_url, params=params) 

827 response.raise_for_status() 

828 logger.debug( 

829 f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes" 

830 ) 

831 

832 # Parse XML response 

833 root = ET.fromstring(response.text) 

834 logger.debug( 

835 f"Parsing abstracts from XML for {len(id_list)} articles" 

836 ) 

837 

838 # Extract abstracts 

839 abstracts = {} 

840 

841 for article in root.findall(".//PubmedArticle"): 

842 pmid_elem = article.find(".//PMID") 

843 pmid = pmid_elem.text if pmid_elem is not None else None 

844 

845 if pmid is None: 

846 continue 

847 

848 # Find abstract text 

849 abstract_text = "" 

850 abstract_elem = article.find(".//AbstractText") 

851 

852 if abstract_elem is not None: 852 ↛ 856line 852 didn't jump to line 856 because the condition on line 852 was always true

853 abstract_text = abstract_elem.text or "" 

854 

855 # Some abstracts are split into multiple sections 

856 abstract_sections = article.findall(".//AbstractText") 

857 if len(abstract_sections) > 1: 

858 logger.debug( 

859 f"Article {pmid} has {len(abstract_sections)} abstract sections" 

860 ) 

861 

862 for section in abstract_sections: 

863 # Get section label if it exists 

864 label = section.get("Label") 

865 section_text = section.text or "" 

866 

867 if label and section_text: 

868 if abstract_text: 868 ↛ 871line 868 didn't jump to line 871 because the condition on line 868 was always true

869 abstract_text += f"\n\n{label}: {section_text}" 

870 else: 

871 abstract_text = f"{label}: {section_text}" 

872 elif section_text: 

873 if abstract_text: 873 ↛ 876line 873 didn't jump to line 876 because the condition on line 873 was always true

874 abstract_text += f"\n\n{section_text}" 

875 else: 

876 abstract_text = section_text 

877 

878 # Store in dictionary 

879 if pmid and abstract_text: 

880 abstracts[pmid] = abstract_text 

881 logger.debug( 

882 f"Abstract for {pmid}: {len(abstract_text)} chars" 

883 ) 

884 elif pmid: 884 ↛ 841line 884 didn't jump to line 841 because the condition on line 884 was always true

885 logger.warning(f"No abstract found for PMID {pmid}") 

886 

887 logger.info( 

888 f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested" 

889 ) 

890 return abstracts 

891 

892 except Exception: 

893 logger.exception( 

894 f"Error getting article abstracts for {len(id_list)} articles" 

895 ) 

896 return {} 

897 

898 def _get_article_detailed_metadata( 

899 self, id_list: List[str] 

900 ) -> Dict[str, Dict[str, Any]]: 

901 """ 

902 Get detailed metadata for PubMed articles including publication types, 

903 MeSH terms, keywords, and affiliations. 

904 

905 Args: 

906 id_list: List of PubMed IDs 

907 

908 Returns: 

909 Dictionary mapping PubMed IDs to their detailed metadata 

910 """ 

911 if not id_list: 

912 return {} 

913 

914 try: 

915 # Prepare parameters 

916 params = { 

917 "db": "pubmed", 

918 "id": ",".join(id_list), 

919 "retmode": "xml", 

920 "rettype": "medline", 

921 } 

922 

923 # Add API key if available 

924 if self.api_key: 924 ↛ 925line 924 didn't jump to line 925 because the condition on line 924 was never true

925 params["api_key"] = self.api_key 

926 

927 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

928 self.engine_type 

929 ) 

930 

931 # Execute request 

932 response = safe_get(self.fetch_url, params=params) 

933 response.raise_for_status() 

934 

935 # Parse XML response 

936 root = ET.fromstring(response.text) 

937 

938 metadata = {} 

939 

940 for article in root.findall(".//PubmedArticle"): 

941 pmid_elem = article.find(".//PMID") 

942 pmid = pmid_elem.text if pmid_elem is not None else None 

943 

944 if pmid is None: 944 ↛ 945line 944 didn't jump to line 945 because the condition on line 944 was never true

945 continue 

946 

947 article_metadata: Dict[str, Any] = {} 

948 

949 # Extract publication types 

950 pub_types = [] 

951 for pub_type in article.findall(".//PublicationType"): 

952 if pub_type.text: 952 ↛ 951line 952 didn't jump to line 951 because the condition on line 952 was always true

953 pub_types.append(pub_type.text) 

954 if pub_types: 

955 article_metadata["publication_types"] = pub_types 

956 

957 # Extract MeSH terms 

958 mesh_terms = [] 

959 for mesh in article.findall(".//MeshHeading"): 

960 descriptor = mesh.find(".//DescriptorName") 

961 if descriptor is not None and descriptor.text: 961 ↛ 959line 961 didn't jump to line 959 because the condition on line 961 was always true

962 mesh_terms.append(descriptor.text) 

963 if mesh_terms: 

964 article_metadata["mesh_terms"] = mesh_terms 

965 

966 # Extract keywords 

967 keywords = [] 

968 for keyword in article.findall(".//Keyword"): 

969 if keyword.text: 969 ↛ 968line 969 didn't jump to line 968 because the condition on line 969 was always true

970 keywords.append(keyword.text) 

971 if keywords: 

972 article_metadata["keywords"] = keywords 

973 

974 # Extract affiliations 

975 affiliations = [] 

976 for affiliation in article.findall(".//Affiliation"): 

977 if affiliation.text: 977 ↛ 976line 977 didn't jump to line 976 because the condition on line 977 was always true

978 affiliations.append(affiliation.text) 

979 if affiliations: 

980 article_metadata["affiliations"] = affiliations 

981 

982 # Extract grant information 

983 grants = [] 

984 for grant in article.findall(".//Grant"): 

985 grant_info = {} 

986 grant_id = grant.find(".//GrantID") 

987 if grant_id is not None and grant_id.text: 987 ↛ 989line 987 didn't jump to line 989 because the condition on line 987 was always true

988 grant_info["id"] = grant_id.text 

989 agency = grant.find(".//Agency") 

990 if agency is not None and agency.text: 990 ↛ 992line 990 didn't jump to line 992 because the condition on line 990 was always true

991 grant_info["agency"] = agency.text 

992 if grant_info: 992 ↛ 984line 992 didn't jump to line 984 because the condition on line 992 was always true

993 grants.append(grant_info) 

994 if grants: 

995 article_metadata["grants"] = grants 

996 

997 # Check for free full text in PMC 

998 pmc_elem = article.find(".//ArticleId[@IdType='pmc']") 

999 if pmc_elem is not None: 

1000 article_metadata["has_free_full_text"] = True 

1001 article_metadata["pmc_id"] = pmc_elem.text 

1002 

1003 # Extract conflict of interest statement 

1004 coi_elem = article.find(".//CoiStatement") 

1005 if coi_elem is not None and coi_elem.text: 

1006 article_metadata["conflict_of_interest"] = coi_elem.text 

1007 

1008 metadata[pmid] = article_metadata 

1009 

1010 return metadata 

1011 

1012 except Exception: 

1013 logger.exception("Error getting detailed article metadata") 

1014 return {} 

1015 

1016 def _create_enriched_content( 

1017 self, result: Dict[str, Any], base_content: str 

1018 ) -> str: 

1019 """ 

1020 Create enriched content by adding relevant metadata context to help the LLM. 

1021 

1022 Args: 

1023 result: The result dictionary with metadata 

1024 base_content: The base content (abstract or full text) 

1025 

1026 Returns: 

1027 Enriched content string with metadata context 

1028 """ 

1029 enriched_parts = [] 

1030 

1031 # Add study type information 

1032 if "publication_types" in result: 

1033 pub_types = result["publication_types"] 

1034 # Filter for significant types 

1035 significant_types = [ 

1036 pt 

1037 for pt in pub_types 

1038 if any( 

1039 key in pt.lower() 

1040 for key in [ 

1041 "clinical trial", 

1042 "randomized", 

1043 "meta-analysis", 

1044 "systematic review", 

1045 "case report", 

1046 "guideline", 

1047 "comparative study", 

1048 "multicenter", 

1049 ] 

1050 ) 

1051 ] 

1052 if significant_types: 

1053 enriched_parts.append( 

1054 f"[Study Type: {', '.join(significant_types)}]" 

1055 ) 

1056 

1057 # Add the main content 

1058 enriched_parts.append(base_content) 

1059 

1060 # Add metadata footer 

1061 metadata_footer = [] 

1062 

1063 # Add ALL MeSH terms 

1064 if "mesh_terms" in result and len(result["mesh_terms"]) > 0: 

1065 metadata_footer.append( 

1066 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}" 

1067 ) 

1068 

1069 # Add ALL keywords 

1070 if "keywords" in result and len(result["keywords"]) > 0: 

1071 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}") 

1072 

1073 # Add ALL affiliations 

1074 if "affiliations" in result and len(result["affiliations"]) > 0: 

1075 if len(result["affiliations"]) == 1: 

1076 metadata_footer.append( 

1077 f"Institution: {result['affiliations'][0]}" 

1078 ) 

1079 else: 

1080 affiliations_text = "\n - " + "\n - ".join( 

1081 result["affiliations"] 

1082 ) 

1083 metadata_footer.append(f"Institutions:{affiliations_text}") 

1084 

1085 # Add ALL funding information with full details 

1086 if "grants" in result and len(result["grants"]) > 0: 

1087 grant_details = [] 

1088 for grant in result["grants"]: 

1089 grant_text = [] 

1090 if "agency" in grant: 

1091 grant_text.append(grant["agency"]) 

1092 if "id" in grant: 

1093 grant_text.append(f"(Grant ID: {grant['id']})") 

1094 if grant_text: 

1095 grant_details.append(" ".join(grant_text)) 

1096 if grant_details: 

1097 if len(grant_details) == 1: 

1098 metadata_footer.append(f"Funded by: {grant_details[0]}") 

1099 else: 

1100 funding_text = "\n - " + "\n - ".join(grant_details) 

1101 metadata_footer.append(f"Funding Sources:{funding_text}") 

1102 

1103 # Add FULL conflict of interest statement 

1104 if "conflict_of_interest" in result: 

1105 coi_text = result["conflict_of_interest"] 

1106 if coi_text: 

1107 # Still skip trivial "no conflict" statements to reduce noise 

1108 if not any( 

1109 phrase in coi_text.lower() 

1110 for phrase in [ 

1111 "no conflict", 

1112 "no competing", 

1113 "nothing to disclose", 

1114 "none declared", 

1115 "authors declare no", 

1116 ] 

1117 ): 

1118 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1119 elif ( 

1120 "but" in coi_text.lower() 

1121 or "except" in coi_text.lower() 

1122 or "however" in coi_text.lower() 

1123 ): 

1124 # Include if there's a "no conflict BUT..." type statement 

1125 metadata_footer.append(f"Conflict of Interest: {coi_text}") 

1126 

1127 # Combine everything 

1128 if metadata_footer: 

1129 enriched_parts.append("\n---\nStudy Metadata:") 

1130 enriched_parts.extend(metadata_footer) 

1131 

1132 return "\n".join(enriched_parts) 

1133 

1134 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]: 

1135 """ 

1136 Find PMC IDs for the given PubMed IDs (for full-text access). 

1137 

1138 Args: 

1139 pmid_list: List of PubMed IDs 

1140 

1141 Returns: 

1142 Dictionary mapping PubMed IDs to their PMC IDs (if available) 

1143 """ 

1144 if not pmid_list or not self.get_full_text: 

1145 return {} 

1146 

1147 try: 

1148 # Prepare parameters 

1149 params = { 

1150 "dbfrom": "pubmed", 

1151 "db": "pmc", 

1152 "linkname": "pubmed_pmc", 

1153 "id": ",".join(pmid_list), 

1154 "retmode": "json", 

1155 } 

1156 

1157 # Add API key if available 

1158 if self.api_key: 1158 ↛ 1159line 1158 didn't jump to line 1159 because the condition on line 1158 was never true

1159 params["api_key"] = self.api_key 

1160 

1161 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

1162 self.engine_type 

1163 ) 

1164 

1165 # Execute request 

1166 response = safe_get(self.link_url, params=params) 

1167 response.raise_for_status() 

1168 

1169 # Parse response 

1170 data = response.json() 

1171 

1172 # Map PubMed IDs to PMC IDs 

1173 pmid_to_pmcid = {} 

1174 

1175 for linkset in data.get("linksets", []): 

1176 pmid = linkset.get("ids", [None])[0] 

1177 

1178 if not pmid: 1178 ↛ 1179line 1178 didn't jump to line 1179 because the condition on line 1178 was never true

1179 continue 

1180 

1181 for link in linkset.get("linksetdbs", []): 

1182 if link.get("linkname") == "pubmed_pmc": 1182 ↛ 1181line 1182 didn't jump to line 1181 because the condition on line 1182 was always true

1183 pmcids = link.get("links", []) 

1184 if pmcids: 1184 ↛ 1181line 1184 didn't jump to line 1181 because the condition on line 1184 was always true

1185 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}" 

1186 

1187 logger.info( 

1188 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access" 

1189 ) 

1190 return pmid_to_pmcid 

1191 

1192 except Exception: 

1193 logger.exception("Error finding PMC IDs") 

1194 return {} 

1195 

1196 def _get_pmc_full_text(self, pmcid: str) -> str: 

1197 """ 

1198 Get full text for a PMC article. 

1199 

1200 Args: 

1201 pmcid: PMC ID of the article 

1202 

1203 Returns: 

1204 Full text content or empty string if not available 

1205 """ 

1206 try: 

1207 # Prepare parameters 

1208 params = { 

1209 "db": "pmc", 

1210 "id": pmcid, 

1211 "retmode": "xml", 

1212 "rettype": "full", 

1213 } 

1214 

1215 # Add API key if available 

1216 if self.api_key: 1216 ↛ 1217line 1216 didn't jump to line 1217 because the condition on line 1216 was never true

1217 params["api_key"] = self.api_key 

1218 

1219 self._last_wait_time = self.rate_tracker.apply_rate_limit( 

1220 self.engine_type 

1221 ) 

1222 

1223 # Execute request 

1224 response = safe_get(self.fetch_url, params=params) 

1225 response.raise_for_status() 

1226 

1227 # Parse XML response 

1228 root = ET.fromstring(response.text) 

1229 

1230 # Extract full text 

1231 full_text = [] 

1232 

1233 # Extract article title 

1234 title_elem = root.find(".//article-title") 

1235 if title_elem is not None and title_elem.text: 1235 ↛ 1239line 1235 didn't jump to line 1239 because the condition on line 1235 was always true

1236 full_text.append(f"# {title_elem.text}") 

1237 

1238 # Extract abstract 

1239 abstract_paras = root.findall(".//abstract//p") 

1240 if abstract_paras: 

1241 full_text.append("\n## Abstract\n") 

1242 for p in abstract_paras: 

1243 text = "".join(p.itertext()) 

1244 if text: 1244 ↛ 1242line 1244 didn't jump to line 1242 because the condition on line 1244 was always true

1245 full_text.append(text) 

1246 

1247 # Extract body content 

1248 body = root.find(".//body") 

1249 if body is not None: 1249 ↛ 1262line 1249 didn't jump to line 1262 because the condition on line 1249 was always true

1250 for section in body.findall(".//sec"): 

1251 # Get section title 

1252 title = section.find(".//title") 

1253 if title is not None and title.text: 1253 ↛ 1257line 1253 didn't jump to line 1257 because the condition on line 1253 was always true

1254 full_text.append(f"\n## {title.text}\n") 

1255 

1256 # Get paragraphs 

1257 for p in section.findall(".//p"): 

1258 text = "".join(p.itertext()) 

1259 if text: 1259 ↛ 1257line 1259 didn't jump to line 1257 because the condition on line 1259 was always true

1260 full_text.append(text) 

1261 

1262 result_text = "\n\n".join(full_text) 

1263 logger.debug( 

1264 f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections" 

1265 ) 

1266 return result_text 

1267 

1268 except Exception: 

1269 logger.exception("Error getting PMC full text") 

1270 return "" 

1271 

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubMed articles.

        First phase of the two-phase search: runs an adaptive PMID search
        (falling back to a simplified query when the optimized query finds
        nothing), then fetches summaries plus abstracts and builds one
        preview dict per article. The preview "snippet" always embeds the
        title and (truncated) abstract so the LLM relevance filter has real
        content to judge; which metadata fields are appended is controlled
        by the ``include_*_in_context`` settings.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting PubMed previews for query: {query}")

        # Optimize the query for PubMed if LLM is available
        optimized_query = self._optimize_query_for_pubmed(query)

        # Perform adaptive search
        pmid_list, strategy = self._adaptive_search(optimized_query)

        # If no results, try a simplified query (only when simplification
        # actually changed the query text).
        if not pmid_list:
            logger.warning(
                f"No PubMed results found using strategy: {strategy}"
            )
            simplified_query = self._simplify_query(optimized_query)
            if simplified_query != optimized_query:
                logger.info(f"Trying with simplified query: {simplified_query}")
                pmid_list, strategy = self._adaptive_search(simplified_query)
                if pmid_list:
                    logger.info(
                        f"Simplified query found {len(pmid_list)} results"
                    )

        if not pmid_list:
            logger.warning("No PubMed results found after query simplification")
            return []

        # Get article summaries
        logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
        summaries = self._get_article_summaries(pmid_list)
        logger.debug(f"Retrieved {len(summaries)} summaries")

        # ALWAYS fetch abstracts for snippet-only mode to provide context for LLM
        logger.debug(
            f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
        )
        abstracts = self._get_article_abstracts(pmid_list)
        logger.debug(f"Retrieved {len(abstracts)} abstracts")

        # Format as previews.
        # NOTE(review): summary["id"], ["title"], ["link"] are accessed
        # without .get() below — assumes _get_article_summaries always
        # provides these keys; confirm against that method.
        previews = []
        for summary in summaries:
            # Build snippet from individual metadata preferences
            snippet_parts = []

            # Check for publication type from esummary (earlier than detailed metadata)
            pub_type_prefix = ""
            if self.include_publication_type_in_context and summary.get(
                "pubtype"
            ):
                # Use first publication type from esummary
                pub_type_prefix = f"[{summary['pubtype'][0]}] "

            # Add authors if enabled
            if self.include_authors_in_context and summary.get("authors"):
                authors_text = ", ".join(summary.get("authors", []))
                if len(authors_text) > 100:
                    # Truncate long author lists
                    authors_text = authors_text[:97] + "..."
                snippet_parts.append(authors_text)

            # Add journal if enabled
            if self.include_journal_in_context and summary.get("journal"):
                snippet_parts.append(summary["journal"])

            # Add date (full or year only). Year extraction assumes the
            # pubdate string starts with a 4-digit year (PubMed convention).
            if summary.get("pubdate"):
                if self.include_full_date_in_context:
                    snippet_parts.append(summary["pubdate"])
                elif (
                    self.include_year_in_context
                    and len(summary["pubdate"]) >= 4
                ):
                    snippet_parts.append(summary["pubdate"][:4])

            # Add citation details if enabled
            if self.include_citation_in_context:
                citation_parts = []
                if summary.get("volume"):
                    citation_parts.append(f"Vol {summary['volume']}")
                if summary.get("issue"):
                    citation_parts.append(f"Issue {summary['issue']}")
                if summary.get("pages"):
                    citation_parts.append(f"pp {summary['pages']}")
                if citation_parts:
                    snippet_parts.append(f"({', '.join(citation_parts)})")

            # Join snippet parts or provide default
            if snippet_parts:
                # Use different separators based on what's included
                if self.include_authors_in_context:
                    snippet = ". ".join(
                        snippet_parts
                    )  # Authors need period separator
                else:
                    snippet = " - ".join(
                        snippet_parts
                    )  # Journal and year use dash
            else:
                snippet = "Research article"

            # Add publication type prefix
            snippet = pub_type_prefix + snippet

            # Add language indicator if not English ("eng" is the ISO 639-2
            # code esummary uses for English).
            if self.include_language_in_context and summary.get("lang"):
                langs = summary["lang"]
                if langs and langs[0] != "eng" and langs[0]:
                    snippet = f"{snippet} [{langs[0].upper()}]"

            # Add identifiers if enabled
            identifier_parts = []
            if self.include_pmid_in_context and summary.get("id"):
                identifier_parts.append(f"PMID: {summary['id']}")
            if self.include_doi_in_context and summary.get("doi"):
                identifier_parts.append(f"DOI: {summary['doi']}")

            if identifier_parts:
                snippet = f"{snippet} | {' | '.join(identifier_parts)}"

            # ALWAYS include title and abstract in snippet for LLM analysis
            pmid = summary["id"]
            title = summary["title"]
            abstract_text = abstracts.get(pmid, "")

            # Truncate abstract if too long (500-char cap for previews;
            # full abstract is re-fetched later for relevant items).
            if len(abstract_text) > 500:
                abstract_text = abstract_text[:497] + "..."

            # Build the enriched snippet with title and abstract
            if abstract_text:
                enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
            else:
                enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"

            # Log the complete snippet for debugging
            logger.debug(f"Complete snippet for PMID {pmid}:")
            logger.debug(f" Title: {title[:100]}...")
            logger.debug(f" Abstract length: {len(abstract_text)} chars")
            logger.debug(f" Metadata: {snippet}")
            logger.debug(
                f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
            )

            # Create preview with basic information
            preview = {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,  # Use enriched snippet with title and abstract
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": summary["id"],  # Store PMID for later use
                "_search_strategy": strategy,  # Store search strategy for analytics
            }

            previews.append(preview)

        logger.info(
            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
        )
        if previews:
            logger.debug(
                f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
            )
        return previews

1450 

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubMed articles.
        Efficiently manages which content to retrieve (abstracts and/or full text).

        Second phase of the two-phase search: for items surviving relevance
        filtering, batch-fetches abstracts and detailed metadata, optionally
        resolves PMC IDs for full-text retrieval (capped at
        ``self.full_text_limit`` articles), and rewrites each item's
        "snippet"/"content" fields accordingly. Temporary bookkeeping keys
        ("_pmid", "_search_strategy") are stripped from the returned dicts.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        snippets_only_mode = (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        )

        if snippets_only_mode:
            logger.info(
                "Snippet-only mode enabled, will fetch abstracts as snippets"
            )
            # For PubMed, we still need to fetch abstracts as they serve as snippets
            # But we'll skip full-text retrieval

        logger.info(
            f"Getting content for {len(relevant_items)} PubMed articles"
        )

        # Collect all PMIDs for relevant items
        pmids = []
        for item in relevant_items:
            if "_pmid" in item:
                pmids.append(item["_pmid"])

        # Get abstracts if requested and PMIDs exist
        # In snippet-only mode, always get abstracts as they serve as snippets
        abstracts = {}
        if (self.get_abstracts or snippets_only_mode) and pmids:
            abstracts = self._get_article_abstracts(pmids)

        # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
        detailed_metadata = {}
        if pmids:
            detailed_metadata = self._get_article_detailed_metadata(pmids)

        # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
        pmid_to_pmcid = {}
        if self.get_full_text and pmids and not snippets_only_mode:
            pmid_to_pmcid = self._find_pmc_ids(pmids)

        # Add content to results
        results: List[Dict[str, Any]] = []
        for item in relevant_items:
            # Shallow copy: nested values are shared with the input item.
            result = item.copy()
            pmid = item.get("_pmid", "")

            # Add detailed metadata if available
            if pmid in detailed_metadata:
                metadata = detailed_metadata[pmid]

                # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
                if "publication_types" in metadata:
                    result["publication_types"] = metadata["publication_types"]

                    # Add first publication type to snippet if enabled
                    if (
                        self.include_publication_type_in_context
                        and metadata["publication_types"]
                    ):
                        # Just take the first publication type as is
                        pub_type = metadata["publication_types"][0]
                        if "snippet" in result:
                            result["snippet"] = (
                                f"[{pub_type}] {result['snippet']}"
                            )

                # Add MeSH terms for medical categorization
                if "mesh_terms" in metadata:
                    result["mesh_terms"] = metadata["mesh_terms"]

                    # Add MeSH terms to snippet if enabled
                    # (max_mesh_terms <= 0 means "show all").
                    if (
                        self.include_mesh_terms_in_context
                        and metadata["mesh_terms"]
                    ):
                        mesh_to_show = (
                            metadata["mesh_terms"][: self.max_mesh_terms]
                            if self.max_mesh_terms > 0
                            else metadata["mesh_terms"]
                        )
                        if mesh_to_show and "snippet" in result:
                            mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                            result["snippet"] = (
                                f"{result['snippet']} | {mesh_text}"
                            )

                # Add keywords
                if "keywords" in metadata:
                    result["keywords"] = metadata["keywords"]

                    # Add keywords to snippet if enabled
                    # (max_keywords <= 0 means "show all").
                    if (
                        self.include_keywords_in_context
                        and metadata["keywords"]
                    ):
                        keywords_to_show = (
                            metadata["keywords"][: self.max_keywords]
                            if self.max_keywords > 0
                            else metadata["keywords"]
                        )
                        if keywords_to_show and "snippet" in result:
                            keywords_text = "Keywords: " + ", ".join(
                                keywords_to_show
                            )
                            result["snippet"] = (
                                f"{result['snippet']} | {keywords_text}"
                            )

                # Add affiliations
                if "affiliations" in metadata:
                    result["affiliations"] = metadata["affiliations"]

                # Add funding/grant information
                if "grants" in metadata:
                    result["grants"] = metadata["grants"]

                # Add conflict of interest statement
                if "conflict_of_interest" in metadata:
                    result["conflict_of_interest"] = metadata[
                        "conflict_of_interest"
                    ]

                # Add free full text availability
                if "has_free_full_text" in metadata:
                    result["has_free_full_text"] = metadata[
                        "has_free_full_text"
                    ]
                    if "pmc_id" in metadata:
                        result["pmc_id"] = metadata["pmc_id"]

                    # Add PMC availability to snippet if enabled
                    if (
                        self.include_pmc_availability_in_context
                        and metadata["has_free_full_text"]
                        and "snippet" in result
                    ):
                        result["snippet"] = (
                            f"{result['snippet']} | [Free Full Text]"
                        )

            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]

                # Create enriched content with metadata context
                enriched_content = self._create_enriched_content(
                    result, abstracts[pmid]
                )

                # ALWAYS include title and abstract in snippet for LLM analysis
                # Build comprehensive snippet with title and abstract
                title = result.get("title", "")
                abstract_text = (
                    abstracts[pmid][:SNIPPET_LENGTH_LONG]
                    if len(abstracts[pmid]) > SNIPPET_LENGTH_LONG
                    else abstracts[pmid]
                )

                # Prepend title and abstract to the existing metadata snippet
                if "snippet" in result:
                    # Keep metadata snippet and add content
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                    )
                else:
                    # No metadata snippet, just title and abstract
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}"
                    )

                # In snippet-only mode, use enriched content
                if snippets_only_mode:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"
                # Use abstract as content if no full text
                elif pmid not in pmid_to_pmcid:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Add full text for a limited number of top articles.
            # NOTE(review): recounting full-text results on every iteration
            # is O(n^2) in the number of results — fine at typical result
            # counts, but worth a running counter if limits grow.
            if (
                pmid in pmid_to_pmcid
                and self.get_full_text
                and len(
                    [r for r in results if r.get("content_type") == "full_text"]
                )
                < self.full_text_limit
            ):
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)

                if full_text:
                    enriched_full_text = self._create_enriched_content(
                        result, full_text
                    )
                    result["full_content"] = enriched_full_text
                    result["content"] = enriched_full_text
                    result["content_type"] = "full_text"
                    result["pmcid"] = pmcid
                elif pmid in abstracts:
                    # Fall back to abstract if full text retrieval fails
                    enriched_content = self._create_enriched_content(
                        result, abstracts[pmid]
                    )
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Remove temporary fields
            if "_pmid" in result:
                del result["_pmid"]
            if "_search_strategy" in result:
                del result["_search_strategy"]

            results.append(result)

        return results

1683 

1684 def search_by_author( 

1685 self, author_name: str, max_results: Optional[int] = None 

1686 ) -> List[Dict[str, Any]]: 

1687 """ 

1688 Search for articles by a specific author. 

1689 

1690 Args: 

1691 author_name: Name of the author 

1692 max_results: Maximum number of results (defaults to self.max_results) 

1693 

1694 Returns: 

1695 List of articles by the author 

1696 """ 

1697 original_max_results = self.max_results 

1698 

1699 try: 

1700 if max_results: 

1701 self.max_results = max_results 

1702 

1703 query = f"{author_name}[Author]" 

1704 return self.run(query) 

1705 

1706 finally: 

1707 # Restore original value 

1708 self.max_results = original_max_results 

1709 

1710 def search_by_journal( 

1711 self, journal_name: str, max_results: Optional[int] = None 

1712 ) -> List[Dict[str, Any]]: 

1713 """ 

1714 Search for articles in a specific journal. 

1715 

1716 Args: 

1717 journal_name: Name of the journal 

1718 max_results: Maximum number of results (defaults to self.max_results) 

1719 

1720 Returns: 

1721 List of articles from the journal 

1722 """ 

1723 original_max_results = self.max_results 

1724 

1725 try: 

1726 if max_results: 

1727 self.max_results = max_results 

1728 

1729 query = f"{journal_name}[Journal]" 

1730 return self.run(query) 

1731 

1732 finally: 

1733 # Restore original value 

1734 self.max_results = original_max_results 

1735 

1736 def search_recent( 

1737 self, query: str, days: int = 30, max_results: Optional[int] = None 

1738 ) -> List[Dict[str, Any]]: 

1739 """ 

1740 Search for recent articles matching the query. 

1741 

1742 Args: 

1743 query: The search query 

1744 days: Number of days to look back 

1745 max_results: Maximum number of results (defaults to self.max_results) 

1746 

1747 Returns: 

1748 List of recent articles matching the query 

1749 """ 

1750 original_max_results = self.max_results 

1751 original_days_limit = self.days_limit 

1752 

1753 try: 

1754 if max_results: 

1755 self.max_results = max_results 

1756 

1757 # Set days limit for this search 

1758 self.days_limit = days 

1759 

1760 return self.run(query) 

1761 

1762 finally: 

1763 # Restore original values 

1764 self.max_results = original_max_results 

1765 self.days_limit = original_days_limit 

1766 

1767 def advanced_search( 

1768 self, terms: Dict[str, str], max_results: Optional[int] = None 

1769 ) -> List[Dict[str, Any]]: 

1770 """ 

1771 Perform an advanced search with field-specific terms. 

1772 

1773 Args: 

1774 terms: Dictionary mapping fields to search terms 

1775 Valid fields: Author, Journal, Title, MeSH, Affiliation, etc. 

1776 max_results: Maximum number of results (defaults to self.max_results) 

1777 

1778 Returns: 

1779 List of articles matching the advanced query 

1780 """ 

1781 original_max_results = self.max_results 

1782 

1783 try: 

1784 if max_results: 

1785 self.max_results = max_results 

1786 

1787 # Build advanced query string 

1788 query_parts = [] 

1789 for field, term in terms.items(): 

1790 query_parts.append(f"{term}[{field}]") 

1791 

1792 query = " AND ".join(query_parts) 

1793 return self.run(query) 

1794 

1795 finally: 

1796 # Restore original value 

1797 self.max_results = original_max_results