Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubmed.py: 71%
708 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-02-25 01:07 +0000
1import re
2from typing import Any, Dict, List, Optional, Tuple
4from defusedxml import ElementTree as ET
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...config import search_config
10from ...constants import SNIPPET_LENGTH_LONG
11from ...security.safe_requests import safe_get
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
16class PubMedSearchEngine(BaseSearchEngine):
17 """
18 PubMed search engine implementation with two-phase approach and adaptive search.
19 Provides efficient access to biomedical literature while minimizing API usage.
20 """
22 # Mark as public search engine
23 is_public = True
24 # Scientific/medical search engine
25 is_scientific = True
    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        days_limit: Optional[int] = None,
        get_abstracts: bool = True,
        get_full_text: bool = False,
        full_text_limit: int = 3,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        include_publication_type_in_context: bool = True,
        include_journal_in_context: bool = True,
        include_year_in_context: bool = True,
        include_authors_in_context: bool = False,
        include_full_date_in_context: bool = False,
        include_mesh_terms_in_context: bool = True,
        include_keywords_in_context: bool = True,
        include_doi_in_context: bool = False,
        include_pmid_in_context: bool = False,
        include_pmc_availability_in_context: bool = False,
        max_mesh_terms: int = 3,
        max_keywords: int = 3,
        include_citation_in_context: bool = False,
        include_language_in_context: bool = False,
    ):
        """
        Initialize the PubMed search engine.

        Args:
            max_results: Maximum number of search results (a floor of 25 is
                enforced below regardless of this value)
            api_key: NCBI API key for higher rate limits (optional)
            days_limit: Limit results to N days (optional)
            get_abstracts: Whether to fetch abstracts for all results
            get_full_text: Whether to fetch full text content (when available in PMC)
            full_text_limit: Max number of full-text articles to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries for PubMed
            include_publication_type_in_context: Include publication types in result context
            include_journal_in_context: Include the journal name in result context
            include_year_in_context: Include the publication year in result context
            include_authors_in_context: Include the author list in result context
            include_full_date_in_context: Include the full publication date in result context
            include_mesh_terms_in_context: Include MeSH terms in result context
            include_keywords_in_context: Include author keywords in result context
            include_doi_in_context: Include the DOI in result context
            include_pmid_in_context: Include the PMID in result context
            include_pmc_availability_in_context: Include PMC free-full-text availability
            max_mesh_terms: Maximum number of MeSH terms to include in context
            max_keywords: Maximum number of keywords to include in context
            include_citation_in_context: Include a formatted citation in result context
            include_language_in_context: Include article language(s) in result context
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
        )
        # Enforce a minimum of 25 candidate results even if the caller asked
        # for fewer (presumably to give relevance filtering a large enough
        # pool - TODO confirm intent).
        self.max_results = max(self.max_results, 25)
        self.api_key = api_key
        self.days_limit = days_limit
        self.get_abstracts = get_abstracts
        self.get_full_text = get_full_text
        self.full_text_limit = full_text_limit
        self.optimize_queries = optimize_queries
        # Flags controlling which metadata fields are appended to the
        # snippet/content given to the LLM (see _create_enriched_content).
        self.include_publication_type_in_context = (
            include_publication_type_in_context
        )
        self.include_journal_in_context = include_journal_in_context
        self.include_year_in_context = include_year_in_context
        self.include_authors_in_context = include_authors_in_context
        self.include_full_date_in_context = include_full_date_in_context
        self.include_mesh_terms_in_context = include_mesh_terms_in_context
        self.include_keywords_in_context = include_keywords_in_context
        self.include_doi_in_context = include_doi_in_context
        self.include_pmid_in_context = include_pmid_in_context
        self.include_pmc_availability_in_context = (
            include_pmc_availability_in_context
        )
        self.max_mesh_terms = max_mesh_terms
        self.max_keywords = max_keywords
        self.include_citation_in_context = include_citation_in_context
        self.include_language_in_context = include_language_in_context

        # NCBI E-utilities endpoints (esearch/esummary/efetch/elink)
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.search_url = f"{self.base_url}/esearch.fcgi"
        self.summary_url = f"{self.base_url}/esummary.fcgi"
        self.fetch_url = f"{self.base_url}/efetch.fcgi"
        self.link_url = f"{self.base_url}/elink.fcgi"

        # PMC base URL for full text
        self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
109 def _get_result_count(self, query: str) -> int:
110 """
111 Get the total number of results for a query without retrieving the results themselves.
113 Args:
114 query: The search query
116 Returns:
117 Total number of matching results
118 """
119 try:
120 # Prepare search parameters
121 params = {
122 "db": "pubmed",
123 "term": query,
124 "retmode": "json",
125 "retmax": 0, # Don't need actual results, just the count
126 }
128 # Add API key if available
129 if self.api_key:
130 params["api_key"] = self.api_key
132 self._last_wait_time = self.rate_tracker.apply_rate_limit(
133 self.engine_type
134 )
136 # Execute search request
137 response = safe_get(self.search_url, params=params)
138 response.raise_for_status()
140 # Parse response
141 data = response.json()
142 count = int(data["esearchresult"]["count"])
144 logger.info(
145 "Query '%s' has %s total results in PubMed", query, count
146 )
147 return count
149 except Exception:
150 logger.exception("Error getting result count")
151 return 0
153 def _extract_core_terms(self, query: str) -> str:
154 """
155 Extract core terms from a complex query for volume estimation.
157 Args:
158 query: PubMed query string
160 Returns:
161 Simplified query with core terms
162 """
163 # Remove field specifications and operators
164 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags
165 simplified = re.sub(
166 r"\b(AND|OR|NOT)\b", "", simplified
167 ) # Remove operators
169 # Remove quotes and parentheses
170 simplified = (
171 simplified.replace('"', "").replace("(", "").replace(")", "")
172 )
174 # Split by whitespace and join terms with 4+ chars (likely meaningful)
175 terms = [term for term in simplified.split() if len(term) >= 4]
177 # Join with AND to create a basic search
178 return " ".join(terms[:5]) # Limit to top 5 terms
180 def _expand_time_window(self, time_filter: str) -> str:
181 """
182 Expand a time window to get more results.
184 Args:
185 time_filter: Current time filter
187 Returns:
188 Expanded time filter
189 """
190 # Parse current time window
191 import re
193 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
194 if not match:
195 return '"last 10 years"[pdat]'
197 amount, unit = int(match.group(1)), match.group(2)
199 # Expand based on current unit
200 if unit == "months" or unit == "month":
201 if amount < 6:
202 return '"last 6 months"[pdat]'
203 elif amount < 12: 203 ↛ 206line 203 didn't jump to line 206 because the condition on line 203 was always true
204 return '"last 1 year"[pdat]'
205 else:
206 return '"last 2 years"[pdat]'
207 elif unit == "years" or unit == "year": 207 ↛ 215line 207 didn't jump to line 215 because the condition on line 207 was always true
208 if amount < 2:
209 return '"last 2 years"[pdat]'
210 elif amount < 5: 210 ↛ 213line 210 didn't jump to line 213 because the condition on line 210 was always true
211 return '"last 5 years"[pdat]'
212 else:
213 return '"last 10 years"[pdat]'
215 return '"last 10 years"[pdat]'
217 def _optimize_query_for_pubmed(self, query: str) -> str:
218 """
219 Optimize a natural language query for PubMed search.
220 Uses LLM to transform questions into effective keyword-based queries.
222 Args:
223 query: Natural language query
225 Returns:
226 Optimized query string for PubMed
227 """
228 if not self.llm or not self.optimize_queries:
229 # Return original query if no LLM available or optimization disabled
230 return query
232 try:
233 # Prompt for query optimization
234 prompt = f"""Transform this natural language question into an optimized PubMed search query.
236Original query: "{query}"
238CRITICAL RULES:
2391. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2402. DO NOT wrap the entire query in quotes
2413. DO NOT include ANY date restrictions or year filters
2424. Use parentheses around OR statements: (term1[Field] OR term2[Field])
2435. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
2446. KEEP IT SIMPLE - use 2-3 main concepts maximum
2457. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
2468. Use wildcards for variations: vaccin*[Title/Abstract]
248EXAMPLE QUERIES:
249✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
250✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
251✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
252✗ BAD: "Here's a query to find articles about vaccines..."
254Return ONLY the search query without any explanations.
255"""
257 # Get response from LLM
258 response = self.llm.invoke(prompt)
259 raw_response = response.content.strip()
261 # Clean up the query - extract only the actual query and remove any explanations
262 # First check if there are multiple lines and take the first non-empty line
263 lines = raw_response.split("\n")
264 cleaned_lines = [line.strip() for line in lines if line.strip()]
266 if cleaned_lines: 266 ↛ 316line 266 didn't jump to line 316 because the condition on line 266 was always true
267 optimized_query = cleaned_lines[0]
269 # Remove any quotes that wrap the entire query
270 if optimized_query.startswith('"') and optimized_query.endswith( 270 ↛ 273line 270 didn't jump to line 273 because the condition on line 270 was never true
271 '"'
272 ):
273 optimized_query = optimized_query[1:-1]
275 # Remove any explanation phrases that might be at the beginning
276 explanation_starters = [
277 "here is",
278 "here's",
279 "this query",
280 "the following",
281 ]
282 for starter in explanation_starters:
283 if optimized_query.lower().startswith(starter): 283 ↛ 285line 283 didn't jump to line 285 because the condition on line 283 was never true
284 # Find the actual query part - typically after a colon
285 colon_pos = optimized_query.find(":")
286 if colon_pos > 0:
287 optimized_query = optimized_query[
288 colon_pos + 1 :
289 ].strip()
291 # Check if the query still seems to contain explanations
292 if ( 292 ↛ 298line 292 didn't jump to line 298 because the condition on line 292 was never true
293 len(optimized_query) > 200
294 or "this query will" in optimized_query.lower()
295 ):
296 # It's probably still an explanation - try to extract just the query part
297 # Look for common patterns in the explanation like parentheses
298 pattern = r"\([^)]+\)\s+AND\s+"
299 import re
301 matches = re.findall(pattern, optimized_query)
302 if matches:
303 # Extract just the query syntax parts
304 query_parts = []
305 for part in re.split(r"\.\s+", optimized_query):
306 if (
307 "(" in part
308 and ")" in part
309 and ("AND" in part or "OR" in part)
310 ):
311 query_parts.append(part)
312 if query_parts:
313 optimized_query = " ".join(query_parts)
314 else:
315 # Fall back to original query if cleaning fails
316 logger.warning(
317 "Failed to extract a clean query from LLM response"
318 )
319 optimized_query = query
321 # Final safety check - if query looks too much like an explanation, use original
322 if len(optimized_query.split()) > 30: 322 ↛ 323line 322 didn't jump to line 323 because the condition on line 322 was never true
323 logger.warning(
324 "Query too verbose, falling back to simpler form"
325 )
326 # Create a simple query from the original
327 words = [
328 w
329 for w in query.split()
330 if len(w) > 3
331 and w.lower()
332 not in (
333 "what",
334 "are",
335 "the",
336 "and",
337 "for",
338 "with",
339 "from",
340 "have",
341 "been",
342 "recent",
343 )
344 ]
345 optimized_query = " AND ".join(words[:3])
347 # Basic cleanup: standardize field tag case for consistency
348 import re
350 optimized_query = re.sub(
351 r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
352 )
353 optimized_query = re.sub(
354 r"\[title/abstract\]",
355 "[Title/Abstract]",
356 optimized_query,
357 flags=re.IGNORECASE,
358 )
359 optimized_query = re.sub(
360 r"\[publication type\]",
361 "[Publication Type]",
362 optimized_query,
363 flags=re.IGNORECASE,
364 )
366 # Fix unclosed quotes followed by field tags
367 # Pattern: "term[Field] -> "term"[Field]
368 optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query)
370 # Simplify the query if still no results are found
371 self._simplify_query_cache = optimized_query
373 # Log original and optimized queries
374 logger.info("Original query: '%s'", query)
375 logger.info(f"Optimized for PubMed: '{optimized_query}'")
376 logger.debug(
377 f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
378 )
380 return optimized_query
382 except Exception:
383 logger.exception("Error optimizing query")
384 logger.debug(f"Falling back to original query: '{query}'")
385 return query # Fall back to original query on error
387 def _simplify_query(self, query: str) -> str:
388 """
389 Simplify a PubMed query that returned no results.
390 Progressively removes elements to get a more basic query.
392 Args:
393 query: The original query that returned no results
395 Returns:
396 Simplified query
397 """
398 logger.info(f"Simplifying query: {query}")
399 logger.debug(f"Query simplification started for: '{query[:100]}...'")
401 # Simple approach: remove field restrictions to broaden the search
402 import re
404 # Remove field tags to make search broader
405 simplified = query
407 # Remove [Mesh] tags - search in all fields instead
408 simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE)
410 # Remove [Publication Type] tags
411 simplified = re.sub(
412 r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE
413 )
415 # Keep [Title/Abstract] as it's usually helpful
416 # Clean up any double spaces
417 simplified = re.sub(r"\s+", " ", simplified).strip()
419 # If no simplification was possible, return the original query
420 if simplified == query:
421 logger.debug("No simplification possible, returning original query")
423 logger.info(f"Simplified query: {simplified}")
424 logger.debug(
425 f"Query simplified from {len(query)} to {len(simplified)} chars"
426 )
427 return simplified
429 def _is_historical_focused(self, query: str) -> bool:
430 """
431 Determine if a query is specifically focused on historical/older information using LLM.
432 Default assumption is that queries should prioritize recent information unless
433 explicitly asking for historical content.
435 Args:
436 query: The search query
438 Returns:
439 Boolean indicating if the query is focused on historical information
440 """
441 if not self.llm:
442 # Fall back to basic keyword check if no LLM available
443 historical_terms = [
444 "history",
445 "historical",
446 "early",
447 "initial",
448 "first",
449 "original",
450 "before",
451 "prior to",
452 "origins",
453 "evolution",
454 "development",
455 ]
456 historical_years = [str(year) for year in range(1900, 2020)]
458 query_lower = query.lower()
459 has_historical_term = any(
460 term in query_lower for term in historical_terms
461 )
462 has_past_year = any(year in query for year in historical_years)
464 return has_historical_term or has_past_year
466 try:
467 # Use LLM to determine if the query is focused on historical information
468 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
470Query: "{query}"
472Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
473Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
475The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
476"""
478 response = self.llm.invoke(prompt)
479 answer = response.content.strip().lower()
481 # Log the determination
482 logger.info(f"Historical focus determination for query: '{query}'")
483 logger.info(f"LLM determined historical focus: {answer}")
485 return "yes" in answer
487 except Exception:
488 logger.exception("Error determining historical focus")
489 # Fall back to basic keyword check
490 historical_terms = [
491 "history",
492 "historical",
493 "early",
494 "initial",
495 "first",
496 "original",
497 "before",
498 "prior to",
499 "origins",
500 "evolution",
501 "development",
502 ]
503 return any(term in query.lower() for term in historical_terms)
505 def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
506 """
507 Perform an adaptive search that adjusts based on topic volume and whether
508 the query focuses on historical information.
510 Args:
511 query: The search query (already optimized)
513 Returns:
514 Tuple of (list of PMIDs, search strategy used)
515 """
516 # Estimate topic volume
517 estimated_volume = self._get_result_count(query)
519 # Determine if the query is focused on historical information
520 is_historical_focused = self._is_historical_focused(query)
522 if is_historical_focused:
523 # User wants historical information - no date filtering
524 time_filter = None
525 strategy = "historical_focus"
526 elif estimated_volume > 5000:
527 # Very common topic - use tighter recency filter
528 time_filter = '"last 1 year"[pdat]'
529 strategy = "high_volume"
530 elif estimated_volume > 1000:
531 # Common topic
532 time_filter = '"last 3 years"[pdat]'
533 strategy = "common_topic"
534 elif estimated_volume > 100: 534 ↛ 540line 534 didn't jump to line 540 because the condition on line 534 was always true
535 # Moderate volume
536 time_filter = '"last 5 years"[pdat]'
537 strategy = "moderate_volume"
538 else:
539 # Rare topic - still use recency but with wider range
540 time_filter = '"last 10 years"[pdat]'
541 strategy = "rare_topic"
543 # Run search based on strategy
544 if time_filter:
545 # Try with adaptive time filter
546 query_with_time = f"({query}) AND {time_filter}"
547 logger.info(
548 f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
549 )
550 results = self._search_pubmed(query_with_time)
552 # If too few results, gradually expand time window
553 if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
554 logger.info(
555 f"Insufficient results ({len(results)}), expanding time window"
556 )
557 expanded_time = self._expand_time_window(time_filter)
558 query_with_expanded_time = f"({query}) AND {expanded_time}"
559 expanded_results = self._search_pubmed(query_with_expanded_time)
561 if len(expanded_results) > len(results):
562 logger.info(
563 f"Expanded time window yielded {len(expanded_results)} results"
564 )
565 return expanded_results, f"{strategy}_expanded"
567 # If still no results, try without time filter
568 if not results:
569 logger.info(
570 "No results with time filter, trying without time restrictions"
571 )
572 results = self._search_pubmed(query)
573 strategy = "no_time_filter"
574 else:
575 # Historical query - run without time filter
576 logger.info(
577 "Using historical search strategy without date filtering"
578 )
579 results = self._search_pubmed(query)
581 return results, strategy
583 def _search_pubmed(self, query: str) -> List[str]:
584 """
585 Search PubMed and return a list of article IDs.
587 Args:
588 query: The search query
590 Returns:
591 List of PubMed IDs matching the query
592 """
593 try:
594 # Prepare search parameters
595 params = {
596 "db": "pubmed",
597 "term": query,
598 "retmode": "json",
599 "retmax": self.max_results,
600 "usehistory": "y",
601 }
603 # Add API key if available
604 if self.api_key: 604 ↛ 605line 604 didn't jump to line 605 because the condition on line 604 was never true
605 params["api_key"] = self.api_key
606 logger.debug("Using PubMed API key for higher rate limits")
607 else:
608 logger.debug("No PubMed API key - using default rate limits")
610 # Add date restriction if specified
611 if self.days_limit: 611 ↛ 612line 611 didn't jump to line 612 because the condition on line 611 was never true
612 params["reldate"] = self.days_limit
613 params["datetype"] = "pdat" # Publication date
614 logger.debug(f"Limiting results to last {self.days_limit} days")
616 logger.debug(
617 f"PubMed search query: '{query}' with max_results={self.max_results}"
618 )
620 self._last_wait_time = self.rate_tracker.apply_rate_limit(
621 self.engine_type
622 )
623 logger.debug(
624 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
625 )
627 # Execute search request
628 logger.debug(f"Sending request to PubMed API: {self.search_url}")
629 response = safe_get(self.search_url, params=params)
630 response.raise_for_status()
631 logger.debug(f"PubMed API response status: {response.status_code}")
633 # Parse response
634 data = response.json()
635 id_list = data["esearchresult"]["idlist"]
636 total_count = data["esearchresult"].get("count", "unknown")
638 logger.info(
639 f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
640 )
641 if len(id_list) > 0: 641 ↛ 643line 641 didn't jump to line 643 because the condition on line 641 was always true
642 logger.debug(f"First 5 PMIDs: {id_list[:5]}")
643 return id_list
645 except Exception:
646 logger.exception(f"Error searching PubMed for query '{query}'")
647 return []
    def _get_article_summaries(
        self, id_list: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get summaries for a list of PubMed article IDs via the esummary API.

        Args:
            id_list: List of PubMed IDs

        Returns:
            List of article summary dictionaries (empty list on failure)

        Raises:
            RateLimitError: If the error message suggests the request was
                rate limited (HTTP 429/503/403 or similar wording).
        """
        if not id_list:
            logger.debug("Empty ID list provided to _get_article_summaries")
            return []

        logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

        try:
            # Prepare parameters: a single esummary call for all IDs
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "json",
                "rettype": "summary",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute request
            logger.debug(f"Requesting summaries from: {self.summary_url}")
            response = safe_get(self.summary_url, params=params)
            response.raise_for_status()
            logger.debug(f"Summary API response status: {response.status_code}")

            # Parse response
            data = response.json()
            logger.debug(
                f"PubMed API returned data for {len(id_list)} requested IDs"
            )
            summaries = []

            # Iterate over the requested IDs (not data["result"]) so the
            # output order matches the input order.
            for pmid in id_list:
                if pmid in data["result"]:
                    article = data["result"][pmid]
                    logger.debug(
                        f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
                    )

                    # Extract authors (if available)
                    authors = []
                    if "authors" in article:
                        authors = [
                            author["name"] for author in article["authors"]
                        ]

                    # Extract DOI from articleids if not in main field
                    doi = article.get("doi", "")
                    if not doi and "articleids" in article:
                        for aid in article["articleids"]:
                            if aid.get("idtype") == "doi":
                                doi = aid.get("value", "")
                                break

                    # Create summary dictionary with all available fields;
                    # missing fields default to empty strings/lists.
                    summary = {
                        "id": pmid,
                        "title": article.get("title", ""),
                        "pubdate": article.get("pubdate", ""),
                        "epubdate": article.get("epubdate", ""),
                        "source": article.get("source", ""),
                        "authors": authors,
                        "lastauthor": article.get("lastauthor", ""),
                        "journal": article.get("fulljournalname", ""),
                        "volume": article.get("volume", ""),
                        "issue": article.get("issue", ""),
                        "pages": article.get("pages", ""),
                        "doi": doi,
                        "issn": article.get("issn", ""),
                        "essn": article.get("essn", ""),
                        "pubtype": article.get(
                            "pubtype", []
                        ),  # Publication types from esummary
                        "recordstatus": article.get("recordstatus", ""),
                        "lang": article.get("lang", []),
                        "pmcrefcount": article.get("pmcrefcount", None),
                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    }

                    summaries.append(summary)
                else:
                    logger.warning(
                        f"PMID {pmid} not found in PubMed API response"
                    )

            return summaries

        except Exception as e:
            error_msg = str(e)
            logger.exception(
                f"Error getting article summaries for {len(id_list)} articles"
            )

            # Check for rate limiting patterns; re-raise as RateLimitError
            # so the base engine can back off and retry.
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
                or "403" in error_msg
            ):
                raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

            return []
774 def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
775 """
776 Get abstracts for a list of PubMed article IDs.
778 Args:
779 id_list: List of PubMed IDs
781 Returns:
782 Dictionary mapping PubMed IDs to their abstracts
783 """
784 if not id_list:
785 logger.debug("Empty ID list provided to _get_article_abstracts")
786 return {}
788 logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")
790 try:
791 # Prepare parameters
792 params = {
793 "db": "pubmed",
794 "id": ",".join(id_list),
795 "retmode": "xml",
796 "rettype": "abstract",
797 }
799 # Add API key if available
800 if self.api_key: 800 ↛ 801line 800 didn't jump to line 801 because the condition on line 800 was never true
801 params["api_key"] = self.api_key
803 self._last_wait_time = self.rate_tracker.apply_rate_limit(
804 self.engine_type
805 )
806 logger.debug(
807 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
808 )
810 # Execute request
811 logger.debug(f"Requesting abstracts from: {self.fetch_url}")
812 response = safe_get(self.fetch_url, params=params)
813 response.raise_for_status()
814 logger.debug(
815 f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
816 )
818 # Parse XML response
819 root = ET.fromstring(response.text)
820 logger.debug(
821 f"Parsing abstracts from XML for {len(id_list)} articles"
822 )
824 # Extract abstracts
825 abstracts = {}
827 for article in root.findall(".//PubmedArticle"):
828 pmid_elem = article.find(".//PMID")
829 pmid = pmid_elem.text if pmid_elem is not None else None
831 if pmid is None: 831 ↛ 832line 831 didn't jump to line 832 because the condition on line 831 was never true
832 continue
834 # Find abstract text
835 abstract_text = ""
836 abstract_elem = article.find(".//AbstractText")
838 if abstract_elem is not None: 838 ↛ 842line 838 didn't jump to line 842 because the condition on line 838 was always true
839 abstract_text = abstract_elem.text or ""
841 # Some abstracts are split into multiple sections
842 abstract_sections = article.findall(".//AbstractText")
843 if len(abstract_sections) > 1:
844 logger.debug(
845 f"Article {pmid} has {len(abstract_sections)} abstract sections"
846 )
848 for section in abstract_sections:
849 # Get section label if it exists
850 label = section.get("Label")
851 section_text = section.text or ""
853 if label and section_text:
854 if abstract_text: 854 ↛ 857line 854 didn't jump to line 857 because the condition on line 854 was always true
855 abstract_text += f"\n\n{label}: {section_text}"
856 else:
857 abstract_text = f"{label}: {section_text}"
858 elif section_text: 858 ↛ 848line 858 didn't jump to line 848 because the condition on line 858 was always true
859 if abstract_text: 859 ↛ 862line 859 didn't jump to line 862 because the condition on line 859 was always true
860 abstract_text += f"\n\n{section_text}"
861 else:
862 abstract_text = section_text
864 # Store in dictionary
865 if pmid and abstract_text: 865 ↛ 870line 865 didn't jump to line 870 because the condition on line 865 was always true
866 abstracts[pmid] = abstract_text
867 logger.debug(
868 f"Abstract for {pmid}: {len(abstract_text)} chars"
869 )
870 elif pmid:
871 logger.warning(f"No abstract found for PMID {pmid}")
873 logger.info(
874 f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
875 )
876 return abstracts
878 except Exception:
879 logger.exception(
880 f"Error getting article abstracts for {len(id_list)} articles"
881 )
882 return {}
884 def _get_article_detailed_metadata(
885 self, id_list: List[str]
886 ) -> Dict[str, Dict[str, Any]]:
887 """
888 Get detailed metadata for PubMed articles including publication types,
889 MeSH terms, keywords, and affiliations.
891 Args:
892 id_list: List of PubMed IDs
894 Returns:
895 Dictionary mapping PubMed IDs to their detailed metadata
896 """
897 if not id_list:
898 return {}
900 try:
901 # Prepare parameters
902 params = {
903 "db": "pubmed",
904 "id": ",".join(id_list),
905 "retmode": "xml",
906 "rettype": "medline",
907 }
909 # Add API key if available
910 if self.api_key: 910 ↛ 911line 910 didn't jump to line 911 because the condition on line 910 was never true
911 params["api_key"] = self.api_key
913 self._last_wait_time = self.rate_tracker.apply_rate_limit(
914 self.engine_type
915 )
917 # Execute request
918 response = safe_get(self.fetch_url, params=params)
919 response.raise_for_status()
921 # Parse XML response
922 root = ET.fromstring(response.text)
924 metadata = {}
926 for article in root.findall(".//PubmedArticle"):
927 pmid_elem = article.find(".//PMID")
928 pmid = pmid_elem.text if pmid_elem is not None else None
930 if pmid is None: 930 ↛ 931line 930 didn't jump to line 931 because the condition on line 930 was never true
931 continue
933 article_metadata = {}
935 # Extract publication types
936 pub_types = []
937 for pub_type in article.findall(".//PublicationType"):
938 if pub_type.text: 938 ↛ 937line 938 didn't jump to line 937 because the condition on line 938 was always true
939 pub_types.append(pub_type.text)
940 if pub_types:
941 article_metadata["publication_types"] = pub_types
943 # Extract MeSH terms
944 mesh_terms = []
945 for mesh in article.findall(".//MeshHeading"):
946 descriptor = mesh.find(".//DescriptorName")
947 if descriptor is not None and descriptor.text: 947 ↛ 945line 947 didn't jump to line 945 because the condition on line 947 was always true
948 mesh_terms.append(descriptor.text)
949 if mesh_terms:
950 article_metadata["mesh_terms"] = mesh_terms
952 # Extract keywords
953 keywords = []
954 for keyword in article.findall(".//Keyword"):
955 if keyword.text: 955 ↛ 954line 955 didn't jump to line 954 because the condition on line 955 was always true
956 keywords.append(keyword.text)
957 if keywords:
958 article_metadata["keywords"] = keywords
960 # Extract affiliations
961 affiliations = []
962 for affiliation in article.findall(".//Affiliation"): 962 ↛ 963line 962 didn't jump to line 963 because the loop on line 962 never started
963 if affiliation.text:
964 affiliations.append(affiliation.text)
965 if affiliations: 965 ↛ 966line 965 didn't jump to line 966 because the condition on line 965 was never true
966 article_metadata["affiliations"] = affiliations
968 # Extract grant information
969 grants = []
970 for grant in article.findall(".//Grant"): 970 ↛ 971line 970 didn't jump to line 971 because the loop on line 970 never started
971 grant_info = {}
972 grant_id = grant.find(".//GrantID")
973 if grant_id is not None and grant_id.text:
974 grant_info["id"] = grant_id.text
975 agency = grant.find(".//Agency")
976 if agency is not None and agency.text:
977 grant_info["agency"] = agency.text
978 if grant_info:
979 grants.append(grant_info)
980 if grants: 980 ↛ 981line 980 didn't jump to line 981 because the condition on line 980 was never true
981 article_metadata["grants"] = grants
983 # Check for free full text in PMC
984 pmc_elem = article.find(".//ArticleId[@IdType='pmc']")
985 if pmc_elem is not None: 985 ↛ 986line 985 didn't jump to line 986 because the condition on line 985 was never true
986 article_metadata["has_free_full_text"] = True
987 article_metadata["pmc_id"] = pmc_elem.text
989 # Extract conflict of interest statement
990 coi_elem = article.find(".//CoiStatement")
991 if coi_elem is not None and coi_elem.text: 991 ↛ 992line 991 didn't jump to line 992 because the condition on line 991 was never true
992 article_metadata["conflict_of_interest"] = coi_elem.text
994 metadata[pmid] = article_metadata
996 return metadata
998 except Exception:
999 logger.exception("Error getting detailed article metadata")
1000 return {}
1002 def _create_enriched_content(
1003 self, result: Dict[str, Any], base_content: str
1004 ) -> str:
1005 """
1006 Create enriched content by adding relevant metadata context to help the LLM.
1008 Args:
1009 result: The result dictionary with metadata
1010 base_content: The base content (abstract or full text)
1012 Returns:
1013 Enriched content string with metadata context
1014 """
1015 enriched_parts = []
1017 # Add study type information
1018 if "publication_types" in result:
1019 pub_types = result["publication_types"]
1020 # Filter for significant types
1021 significant_types = [
1022 pt
1023 for pt in pub_types
1024 if any(
1025 key in pt.lower()
1026 for key in [
1027 "clinical trial",
1028 "randomized",
1029 "meta-analysis",
1030 "systematic review",
1031 "case report",
1032 "guideline",
1033 "comparative study",
1034 "multicenter",
1035 ]
1036 )
1037 ]
1038 if significant_types: 1038 ↛ 1044line 1038 didn't jump to line 1044 because the condition on line 1038 was always true
1039 enriched_parts.append(
1040 f"[Study Type: {', '.join(significant_types)}]"
1041 )
1043 # Add the main content
1044 enriched_parts.append(base_content)
1046 # Add metadata footer
1047 metadata_footer = []
1049 # Add ALL MeSH terms
1050 if "mesh_terms" in result and len(result["mesh_terms"]) > 0:
1051 metadata_footer.append(
1052 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"
1053 )
1055 # Add ALL keywords
1056 if "keywords" in result and len(result["keywords"]) > 0: 1056 ↛ 1057line 1056 didn't jump to line 1057 because the condition on line 1056 was never true
1057 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")
1059 # Add ALL affiliations
1060 if "affiliations" in result and len(result["affiliations"]) > 0: 1060 ↛ 1061line 1060 didn't jump to line 1061 because the condition on line 1060 was never true
1061 if len(result["affiliations"]) == 1:
1062 metadata_footer.append(
1063 f"Institution: {result['affiliations'][0]}"
1064 )
1065 else:
1066 affiliations_text = "\n - " + "\n - ".join(
1067 result["affiliations"]
1068 )
1069 metadata_footer.append(f"Institutions:{affiliations_text}")
1071 # Add ALL funding information with full details
1072 if "grants" in result and len(result["grants"]) > 0:
1073 grant_details = []
1074 for grant in result["grants"]:
1075 grant_text = []
1076 if "agency" in grant: 1076 ↛ 1078line 1076 didn't jump to line 1078 because the condition on line 1076 was always true
1077 grant_text.append(grant["agency"])
1078 if "id" in grant: 1078 ↛ 1080line 1078 didn't jump to line 1080 because the condition on line 1078 was always true
1079 grant_text.append(f"(Grant ID: {grant['id']})")
1080 if grant_text: 1080 ↛ 1074line 1080 didn't jump to line 1074 because the condition on line 1080 was always true
1081 grant_details.append(" ".join(grant_text))
1082 if grant_details: 1082 ↛ 1090line 1082 didn't jump to line 1090 because the condition on line 1082 was always true
1083 if len(grant_details) == 1: 1083 ↛ 1086line 1083 didn't jump to line 1086 because the condition on line 1083 was always true
1084 metadata_footer.append(f"Funded by: {grant_details[0]}")
1085 else:
1086 funding_text = "\n - " + "\n - ".join(grant_details)
1087 metadata_footer.append(f"Funding Sources:{funding_text}")
1089 # Add FULL conflict of interest statement
1090 if "conflict_of_interest" in result: 1090 ↛ 1091line 1090 didn't jump to line 1091 because the condition on line 1090 was never true
1091 coi_text = result["conflict_of_interest"]
1092 if coi_text:
1093 # Still skip trivial "no conflict" statements to reduce noise
1094 if not any(
1095 phrase in coi_text.lower()
1096 for phrase in [
1097 "no conflict",
1098 "no competing",
1099 "nothing to disclose",
1100 "none declared",
1101 "authors declare no",
1102 ]
1103 ):
1104 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1105 elif (
1106 "but" in coi_text.lower()
1107 or "except" in coi_text.lower()
1108 or "however" in coi_text.lower()
1109 ):
1110 # Include if there's a "no conflict BUT..." type statement
1111 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1113 # Combine everything
1114 if metadata_footer:
1115 enriched_parts.append("\n---\nStudy Metadata:")
1116 enriched_parts.extend(metadata_footer)
1118 return "\n".join(enriched_parts)
1120 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
1121 """
1122 Find PMC IDs for the given PubMed IDs (for full-text access).
1124 Args:
1125 pmid_list: List of PubMed IDs
1127 Returns:
1128 Dictionary mapping PubMed IDs to their PMC IDs (if available)
1129 """
1130 if not pmid_list or not self.get_full_text:
1131 return {}
1133 try:
1134 # Prepare parameters
1135 params = {
1136 "dbfrom": "pubmed",
1137 "db": "pmc",
1138 "linkname": "pubmed_pmc",
1139 "id": ",".join(pmid_list),
1140 "retmode": "json",
1141 }
1143 # Add API key if available
1144 if self.api_key: 1144 ↛ 1145line 1144 didn't jump to line 1145 because the condition on line 1144 was never true
1145 params["api_key"] = self.api_key
1147 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1148 self.engine_type
1149 )
1151 # Execute request
1152 response = safe_get(self.link_url, params=params)
1153 response.raise_for_status()
1155 # Parse response
1156 data = response.json()
1158 # Map PubMed IDs to PMC IDs
1159 pmid_to_pmcid = {}
1161 for linkset in data.get("linksets", []):
1162 pmid = linkset.get("ids", [None])[0]
1164 if not pmid: 1164 ↛ 1165line 1164 didn't jump to line 1165 because the condition on line 1164 was never true
1165 continue
1167 for link in linkset.get("linksetdbs", []):
1168 if link.get("linkname") == "pubmed_pmc": 1168 ↛ 1167line 1168 didn't jump to line 1167 because the condition on line 1168 was always true
1169 pmcids = link.get("links", [])
1170 if pmcids: 1170 ↛ 1167line 1170 didn't jump to line 1167 because the condition on line 1170 was always true
1171 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
1173 logger.info(
1174 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
1175 )
1176 return pmid_to_pmcid
1178 except Exception:
1179 logger.exception("Error finding PMC IDs")
1180 return {}
1182 def _get_pmc_full_text(self, pmcid: str) -> str:
1183 """
1184 Get full text for a PMC article.
1186 Args:
1187 pmcid: PMC ID of the article
1189 Returns:
1190 Full text content or empty string if not available
1191 """
1192 try:
1193 # Prepare parameters
1194 params = {
1195 "db": "pmc",
1196 "id": pmcid,
1197 "retmode": "xml",
1198 "rettype": "full",
1199 }
1201 # Add API key if available
1202 if self.api_key: 1202 ↛ 1203line 1202 didn't jump to line 1203 because the condition on line 1202 was never true
1203 params["api_key"] = self.api_key
1205 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1206 self.engine_type
1207 )
1209 # Execute request
1210 response = safe_get(self.fetch_url, params=params)
1211 response.raise_for_status()
1213 # Parse XML response
1214 root = ET.fromstring(response.text)
1216 # Extract full text
1217 full_text = []
1219 # Extract article title
1220 title_elem = root.find(".//article-title")
1221 if title_elem is not None and title_elem.text: 1221 ↛ 1225line 1221 didn't jump to line 1225 because the condition on line 1221 was always true
1222 full_text.append(f"# {title_elem.text}")
1224 # Extract abstract
1225 abstract_paras = root.findall(".//abstract//p")
1226 if abstract_paras: 1226 ↛ 1227line 1226 didn't jump to line 1227 because the condition on line 1226 was never true
1227 full_text.append("\n## Abstract\n")
1228 for p in abstract_paras:
1229 text = "".join(p.itertext())
1230 if text:
1231 full_text.append(text)
1233 # Extract body content
1234 body = root.find(".//body")
1235 if body is not None: 1235 ↛ 1248line 1235 didn't jump to line 1248 because the condition on line 1235 was always true
1236 for section in body.findall(".//sec"):
1237 # Get section title
1238 title = section.find(".//title")
1239 if title is not None and title.text: 1239 ↛ 1243line 1239 didn't jump to line 1243 because the condition on line 1239 was always true
1240 full_text.append(f"\n## {title.text}\n")
1242 # Get paragraphs
1243 for p in section.findall(".//p"):
1244 text = "".join(p.itertext())
1245 if text: 1245 ↛ 1243line 1245 didn't jump to line 1243 because the condition on line 1245 was always true
1246 full_text.append(text)
1248 result_text = "\n\n".join(full_text)
1249 logger.debug(
1250 f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
1251 )
1252 return result_text
1254 except Exception:
1255 logger.exception("Error getting PMC full text")
1256 return ""
    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubMed articles (phase one of the
        two-phase search).

        Flow: optimize the query (LLM-assisted when available), run the
        adaptive search, retry once with a simplified query if nothing is
        found, then fetch esummary records plus abstracts and assemble an
        enriched snippet (title + truncated abstract + configurable
        metadata line) for each hit.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries (empty when nothing is found even
            after query simplification)
        """
        logger.info(f"Getting PubMed previews for query: {query}")

        # Optimize the query for PubMed if LLM is available
        optimized_query = self._optimize_query_for_pubmed(query)

        # Perform adaptive search
        pmid_list, strategy = self._adaptive_search(optimized_query)

        # If no results, try a simplified query (single retry; skipped when
        # simplification produces the same string).
        if not pmid_list:
            logger.warning(
                f"No PubMed results found using strategy: {strategy}"
            )
            simplified_query = self._simplify_query(optimized_query)
            if simplified_query != optimized_query:
                logger.info(f"Trying with simplified query: {simplified_query}")
                pmid_list, strategy = self._adaptive_search(simplified_query)
                if pmid_list:
                    logger.info(
                        f"Simplified query found {len(pmid_list)} results"
                    )

        if not pmid_list:
            logger.warning("No PubMed results found after query simplification")
            return []

        # Get article summaries (esummary metadata per PMID)
        logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
        summaries = self._get_article_summaries(pmid_list)
        logger.debug(f"Retrieved {len(summaries)} summaries")

        # ALWAYS fetch abstracts for snippet-only mode to provide context for LLM
        logger.debug(
            f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
        )
        abstracts = self._get_article_abstracts(pmid_list)
        logger.debug(f"Retrieved {len(abstracts)} abstracts")

        # Format as previews
        previews = []
        for summary in summaries:
            # Build snippet from individual metadata preferences
            # (each include_* flag set in __init__ gates one fragment).
            snippet_parts = []

            # Check for publication type from esummary (earlier than detailed metadata)
            pub_type_prefix = ""
            if self.include_publication_type_in_context and summary.get(
                "pubtype"
            ):
                # Use first publication type from esummary
                pub_type_prefix = f"[{summary['pubtype'][0]}] "

            # Add authors if enabled
            if self.include_authors_in_context and summary.get("authors"):
                authors_text = ", ".join(summary.get("authors", []))
                if len(authors_text) > 100:
                    # Truncate long author lists (100-char cap incl. "...")
                    authors_text = authors_text[:97] + "..."
                snippet_parts.append(authors_text)

            # Add journal if enabled
            if self.include_journal_in_context and summary.get("journal"):
                snippet_parts.append(summary["journal"])

            # Add date (full or year only; full date takes precedence)
            if summary.get("pubdate"):
                if self.include_full_date_in_context:
                    snippet_parts.append(summary["pubdate"])
                elif (
                    self.include_year_in_context
                    and len(summary["pubdate"]) >= 4
                ):
                    # pubdate is assumed to start with a 4-digit year
                    # (standard esummary format) — only the year is kept.
                    snippet_parts.append(summary["pubdate"][:4])

            # Add citation details if enabled
            if self.include_citation_in_context:
                citation_parts = []
                if summary.get("volume"):
                    citation_parts.append(f"Vol {summary['volume']}")
                if summary.get("issue"):
                    citation_parts.append(f"Issue {summary['issue']}")
                if summary.get("pages"):
                    citation_parts.append(f"pp {summary['pages']}")
                if citation_parts:
                    snippet_parts.append(f"({', '.join(citation_parts)})")

            # Join snippet parts or provide default
            if snippet_parts:
                # Use different separators based on what's included
                if self.include_authors_in_context:
                    snippet = ". ".join(
                        snippet_parts
                    )  # Authors need period separator
                else:
                    snippet = " - ".join(
                        snippet_parts
                    )  # Journal and year use dash
            else:
                snippet = "Research article"

            # Add publication type prefix
            snippet = pub_type_prefix + snippet

            # Add language indicator if not English
            if self.include_language_in_context and summary.get("lang"):
                langs = summary["lang"]
                # NOTE(review): compares against ISO 639-3 code "eng" —
                # assumes esummary reports 3-letter language codes.
                if langs and langs[0] != "eng" and langs[0]:
                    snippet = f"{snippet} [{langs[0].upper()}]"

            # Add identifiers if enabled
            identifier_parts = []
            if self.include_pmid_in_context and summary.get("id"):
                identifier_parts.append(f"PMID: {summary['id']}")
            if self.include_doi_in_context and summary.get("doi"):
                identifier_parts.append(f"DOI: {summary['doi']}")

            if identifier_parts:
                snippet = f"{snippet} | {' | '.join(identifier_parts)}"

            # ALWAYS include title and abstract in snippet for LLM analysis
            pmid = summary["id"]
            title = summary["title"]
            abstract_text = abstracts.get(pmid, "")

            # Truncate abstract if too long (500-char cap incl. "...")
            if len(abstract_text) > 500:
                abstract_text = abstract_text[:497] + "..."

            # Build the enriched snippet with title and abstract
            if abstract_text:
                enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
            else:
                enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"

            # Log the complete snippet for debugging
            logger.debug(f"Complete snippet for PMID {pmid}:")
            logger.debug(f" Title: {title[:100]}...")
            logger.debug(f" Abstract length: {len(abstract_text)} chars")
            logger.debug(f" Metadata: {snippet}")
            logger.debug(
                f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
            )

            # Create preview with basic information; underscore-prefixed
            # keys are internal and stripped later in _get_full_content.
            preview = {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,  # Use enriched snippet with title and abstract
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": summary["id"],  # Store PMID for later use
                "_search_strategy": strategy,  # Store search strategy for analytics
            }

            previews.append(preview)

        logger.info(
            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
        )
        if previews:
            logger.debug(
                f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
            )
        return previews
    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubMed articles (phase two of the
        two-phase search).

        Efficiently manages which content to retrieve (abstracts and/or
        full text): abstracts and detailed metadata are fetched for all
        PMIDs in batch; PMC full text is fetched only when enabled, outside
        snippet-only mode, and for at most ``self.full_text_limit``
        articles. Metadata fragments are appended to each result's snippet
        according to the ``include_*_in_context`` flags.

        Args:
            relevant_items: List of relevant preview dictionaries
                (as produced by _get_previews, including the internal
                "_pmid" / "_search_strategy" keys)

        Returns:
            List of result dictionaries with full content; internal
            underscore-prefixed keys are removed.
        """
        # Check if we should add full content
        snippets_only_mode = (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        )

        if snippets_only_mode:
            logger.info(
                "Snippet-only mode enabled, will fetch abstracts as snippets"
            )
            # For PubMed, we still need to fetch abstracts as they serve as snippets
            # But we'll skip full-text retrieval

        logger.info(
            f"Getting content for {len(relevant_items)} PubMed articles"
        )

        # Collect all PMIDs for relevant items
        pmids = []
        for item in relevant_items:
            if "_pmid" in item:
                pmids.append(item["_pmid"])

        # Get abstracts if requested and PMIDs exist
        # In snippet-only mode, always get abstracts as they serve as snippets
        abstracts = {}
        if (self.get_abstracts or snippets_only_mode) and pmids:
            abstracts = self._get_article_abstracts(pmids)

        # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
        detailed_metadata = {}
        if pmids:
            detailed_metadata = self._get_article_detailed_metadata(pmids)

        # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
        pmid_to_pmcid = {}
        if self.get_full_text and pmids and not snippets_only_mode:
            pmid_to_pmcid = self._find_pmc_ids(pmids)

        # Add content to results
        results = []
        for item in relevant_items:
            # Work on a shallow copy so the caller's previews stay intact.
            result = item.copy()
            pmid = item.get("_pmid", "")

            # Add detailed metadata if available
            if pmid in detailed_metadata:
                metadata = detailed_metadata[pmid]

                # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
                if "publication_types" in metadata:
                    result["publication_types"] = metadata["publication_types"]

                    # Add first publication type to snippet if enabled
                    if (
                        self.include_publication_type_in_context
                        and metadata["publication_types"]
                    ):
                        # Just take the first publication type as is
                        pub_type = metadata["publication_types"][0]
                        if "snippet" in result:
                            result["snippet"] = (
                                f"[{pub_type}] {result['snippet']}"
                            )

                # Add MeSH terms for medical categorization
                if "mesh_terms" in metadata:
                    result["mesh_terms"] = metadata["mesh_terms"]

                    # Add MeSH terms to snippet if enabled
                    # (max_mesh_terms <= 0 means "no cap")
                    if (
                        self.include_mesh_terms_in_context
                        and metadata["mesh_terms"]
                    ):
                        mesh_to_show = (
                            metadata["mesh_terms"][: self.max_mesh_terms]
                            if self.max_mesh_terms > 0
                            else metadata["mesh_terms"]
                        )
                        if mesh_to_show and "snippet" in result:
                            mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                            result["snippet"] = (
                                f"{result['snippet']} | {mesh_text}"
                            )

                # Add keywords
                if "keywords" in metadata:
                    result["keywords"] = metadata["keywords"]

                    # Add keywords to snippet if enabled
                    # (max_keywords <= 0 means "no cap")
                    if (
                        self.include_keywords_in_context
                        and metadata["keywords"]
                    ):
                        keywords_to_show = (
                            metadata["keywords"][: self.max_keywords]
                            if self.max_keywords > 0
                            else metadata["keywords"]
                        )
                        if keywords_to_show and "snippet" in result:
                            keywords_text = "Keywords: " + ", ".join(
                                keywords_to_show
                            )
                            result["snippet"] = (
                                f"{result['snippet']} | {keywords_text}"
                            )

                # Add affiliations
                if "affiliations" in metadata:
                    result["affiliations"] = metadata["affiliations"]

                # Add funding/grant information
                if "grants" in metadata:
                    result["grants"] = metadata["grants"]

                # Add conflict of interest statement
                if "conflict_of_interest" in metadata:
                    result["conflict_of_interest"] = metadata[
                        "conflict_of_interest"
                    ]

                # Add free full text availability
                if "has_free_full_text" in metadata:
                    result["has_free_full_text"] = metadata[
                        "has_free_full_text"
                    ]
                    if "pmc_id" in metadata:
                        result["pmc_id"] = metadata["pmc_id"]

                    # Add PMC availability to snippet if enabled
                    if (
                        self.include_pmc_availability_in_context
                        and metadata["has_free_full_text"]
                        and "snippet" in result
                    ):
                        result["snippet"] = (
                            f"{result['snippet']} | [Free Full Text]"
                        )

            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]

                # Create enriched content with metadata context
                enriched_content = self._create_enriched_content(
                    result, abstracts[pmid]
                )

                # ALWAYS include title and abstract in snippet for LLM analysis
                # Build comprehensive snippet with title and abstract
                title = result.get("title", "")
                abstract_text = (
                    abstracts[pmid][:SNIPPET_LENGTH_LONG]
                    if len(abstracts[pmid]) > SNIPPET_LENGTH_LONG
                    else abstracts[pmid]
                )

                # Prepend title and abstract to the existing metadata snippet
                if "snippet" in result:
                    # Keep metadata snippet and add content
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                    )
                else:
                    # No metadata snippet, just title and abstract
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}"
                    )

                # In snippet-only mode, use enriched content
                if snippets_only_mode:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"
                # Use abstract as content if no full text
                elif pmid not in pmid_to_pmcid:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Add full text for a limited number of top articles
            # (the list comprehension counts how many full-text results
            # have been produced so far against self.full_text_limit)
            if (
                pmid in pmid_to_pmcid
                and self.get_full_text
                and len(
                    [r for r in results if r.get("content_type") == "full_text"]
                )
                < self.full_text_limit
            ):
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)

                if full_text:
                    enriched_full_text = self._create_enriched_content(
                        result, full_text
                    )
                    result["full_content"] = enriched_full_text
                    result["content"] = enriched_full_text
                    result["content_type"] = "full_text"
                    result["pmcid"] = pmcid
                elif pmid in abstracts:
                    # Fall back to abstract if full text retrieval fails
                    enriched_content = self._create_enriched_content(
                        result, abstracts[pmid]
                    )
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Remove temporary fields
            if "_pmid" in result:
                del result["_pmid"]
            if "_search_strategy" in result:
                del result["_search_strategy"]

            results.append(result)

        return results
1670 def search_by_author(
1671 self, author_name: str, max_results: Optional[int] = None
1672 ) -> List[Dict[str, Any]]:
1673 """
1674 Search for articles by a specific author.
1676 Args:
1677 author_name: Name of the author
1678 max_results: Maximum number of results (defaults to self.max_results)
1680 Returns:
1681 List of articles by the author
1682 """
1683 original_max_results = self.max_results
1685 try:
1686 if max_results: 1686 ↛ 1687line 1686 didn't jump to line 1687 because the condition on line 1686 was never true
1687 self.max_results = max_results
1689 query = f"{author_name}[Author]"
1690 return self.run(query)
1692 finally:
1693 # Restore original value
1694 self.max_results = original_max_results
1696 def search_by_journal(
1697 self, journal_name: str, max_results: Optional[int] = None
1698 ) -> List[Dict[str, Any]]:
1699 """
1700 Search for articles in a specific journal.
1702 Args:
1703 journal_name: Name of the journal
1704 max_results: Maximum number of results (defaults to self.max_results)
1706 Returns:
1707 List of articles from the journal
1708 """
1709 original_max_results = self.max_results
1711 try:
1712 if max_results: 1712 ↛ 1713line 1712 didn't jump to line 1713 because the condition on line 1712 was never true
1713 self.max_results = max_results
1715 query = f"{journal_name}[Journal]"
1716 return self.run(query)
1718 finally:
1719 # Restore original value
1720 self.max_results = original_max_results
1722 def search_recent(
1723 self, query: str, days: int = 30, max_results: Optional[int] = None
1724 ) -> List[Dict[str, Any]]:
1725 """
1726 Search for recent articles matching the query.
1728 Args:
1729 query: The search query
1730 days: Number of days to look back
1731 max_results: Maximum number of results (defaults to self.max_results)
1733 Returns:
1734 List of recent articles matching the query
1735 """
1736 original_max_results = self.max_results
1737 original_days_limit = self.days_limit
1739 try:
1740 if max_results:
1741 self.max_results = max_results
1743 # Set days limit for this search
1744 self.days_limit = days
1746 return self.run(query)
1748 finally:
1749 # Restore original values
1750 self.max_results = original_max_results
1751 self.days_limit = original_days_limit
1753 def advanced_search(
1754 self, terms: Dict[str, str], max_results: Optional[int] = None
1755 ) -> List[Dict[str, Any]]:
1756 """
1757 Perform an advanced search with field-specific terms.
1759 Args:
1760 terms: Dictionary mapping fields to search terms
1761 Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
1762 max_results: Maximum number of results (defaults to self.max_results)
1764 Returns:
1765 List of articles matching the advanced query
1766 """
1767 original_max_results = self.max_results
1769 try:
1770 if max_results:
1771 self.max_results = max_results
1773 # Build advanced query string
1774 query_parts = []
1775 for field, term in terms.items():
1776 query_parts.append(f"{term}[{field}]")
1778 query = " AND ".join(query_parts)
1779 return self.run(query)
1781 finally:
1782 # Restore original value
1783 self.max_results = original_max_results