Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubmed.py: 92%
710 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1import re
2from typing import Any, Dict, List, Optional, Tuple
4from defusedxml import ElementTree as ET
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...config import search_config
10from ...constants import SNIPPET_LENGTH_LONG
11from ...security.safe_requests import safe_get
12from ..rate_limiting import RateLimitError
13from ..search_engine_base import BaseSearchEngine
class PubMedSearchEngine(BaseSearchEngine):
    """
    PubMed search engine implementation with two-phase approach and adaptive search.
    Provides efficient access to biomedical literature while minimizing API usage.
    """

    # Mark as public search engine (no credentials strictly required)
    is_public = True
    # Scientific/medical search engine
    is_scientific = True
    # Keyword-matching (lexical) engine, not embedding-based
    is_lexical = True
    # Results should be post-filtered for relevance by the LLM
    needs_llm_relevance_filter = True
    def __init__(
        self,
        max_results: int = 10,
        api_key: Optional[str] = None,
        days_limit: Optional[int] = None,
        get_abstracts: bool = True,
        get_full_text: bool = False,
        full_text_limit: int = 3,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        optimize_queries: bool = True,
        include_publication_type_in_context: bool = True,
        include_journal_in_context: bool = True,
        include_year_in_context: bool = True,
        include_authors_in_context: bool = False,
        include_full_date_in_context: bool = False,
        include_mesh_terms_in_context: bool = True,
        include_keywords_in_context: bool = True,
        include_doi_in_context: bool = False,
        include_pmid_in_context: bool = False,
        include_pmc_availability_in_context: bool = False,
        max_mesh_terms: int = 3,
        max_keywords: int = 3,
        include_citation_in_context: bool = False,
        include_language_in_context: bool = False,
        settings_snapshot: Optional[Dict[str, Any]] = None,
    ):
        """
        Initialize the PubMed search engine.

        Args:
            max_results: Maximum number of search results (note: floored
                at 25 below, so smaller values are silently raised)
            api_key: NCBI API key for higher rate limits (optional)
            days_limit: Limit results to N days (optional)
            get_abstracts: Whether to fetch abstracts for all results
            get_full_text: Whether to fetch full text content (when available in PMC)
            full_text_limit: Max number of full-text articles to retrieve
            llm: Language model for relevance filtering
            max_filtered_results: Maximum number of results to keep after filtering
            optimize_queries: Whether to optimize natural language queries for PubMed
            include_*_in_context: Flags controlling which metadata fields
                are surfaced in each result's context for the LLM
            max_mesh_terms: Max MeSH terms to include in context
            max_keywords: Max keywords to include in context
            include_citation_in_context: Include formatted citation in context
            include_language_in_context: Include article language in context
            settings_snapshot: Settings snapshot forwarded to the base engine
        """
        # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
        )
        # Enforce a minimum of 25 candidates, presumably so downstream
        # relevance filtering has enough to work with -- TODO confirm intent
        self.max_results = max(self.max_results, 25)
        self.api_key = api_key
        self.days_limit = days_limit
        self.get_abstracts = get_abstracts
        self.get_full_text = get_full_text
        self.full_text_limit = full_text_limit
        self.optimize_queries = optimize_queries
        # Context-inclusion flags (which metadata the LLM sees per result)
        self.include_publication_type_in_context = (
            include_publication_type_in_context
        )
        self.include_journal_in_context = include_journal_in_context
        self.include_year_in_context = include_year_in_context
        self.include_authors_in_context = include_authors_in_context
        self.include_full_date_in_context = include_full_date_in_context
        self.include_mesh_terms_in_context = include_mesh_terms_in_context
        self.include_keywords_in_context = include_keywords_in_context
        self.include_doi_in_context = include_doi_in_context
        self.include_pmid_in_context = include_pmid_in_context
        self.include_pmc_availability_in_context = (
            include_pmc_availability_in_context
        )
        self.max_mesh_terms = max_mesh_terms
        self.max_keywords = max_keywords
        self.include_citation_in_context = include_citation_in_context
        self.include_language_in_context = include_language_in_context

        # Base NCBI E-utilities API URLs
        self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
        self.search_url = f"{self.base_url}/esearch.fcgi"
        self.summary_url = f"{self.base_url}/esummary.fcgi"
        self.fetch_url = f"{self.base_url}/efetch.fcgi"
        self.link_url = f"{self.base_url}/elink.fcgi"

        # PMC base URL for full text
        self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
113 def _get_result_count(self, query: str) -> int:
114 """
115 Get the total number of results for a query without retrieving the results themselves.
117 Args:
118 query: The search query
120 Returns:
121 Total number of matching results
122 """
123 try:
124 # Prepare search parameters
125 params = {
126 "db": "pubmed",
127 "term": query,
128 "retmode": "json",
129 "retmax": 0, # Don't need actual results, just the count
130 }
132 # Add API key if available
133 if self.api_key:
134 params["api_key"] = self.api_key
136 self._last_wait_time = self.rate_tracker.apply_rate_limit(
137 self.engine_type
138 )
140 # Execute search request
141 response = safe_get(self.search_url, params=params)
142 response.raise_for_status()
144 # Parse response
145 data = response.json()
146 count = int(data["esearchresult"]["count"])
148 logger.info(
149 "Query '{}' has {} total results in PubMed", query, count
150 )
151 return count
153 except Exception:
154 logger.exception("Error getting result count")
155 return 0
157 def _extract_core_terms(self, query: str) -> str:
158 """
159 Extract core terms from a complex query for volume estimation.
161 Args:
162 query: PubMed query string
164 Returns:
165 Simplified query with core terms
166 """
167 # Remove field specifications and operators
168 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags
169 simplified = re.sub(
170 r"\b(AND|OR|NOT)\b", "", simplified
171 ) # Remove operators
173 # Remove quotes and parentheses
174 simplified = (
175 simplified.replace('"', "").replace("(", "").replace(")", "")
176 )
178 # Split by whitespace and join terms with 4+ chars (likely meaningful)
179 terms = [term for term in simplified.split() if len(term) >= 4]
181 # Join with AND to create a basic search
182 return " ".join(terms[:5]) # Limit to top 5 terms
184 def _expand_time_window(self, time_filter: str) -> str:
185 """
186 Expand a time window to get more results.
188 Args:
189 time_filter: Current time filter
191 Returns:
192 Expanded time filter
193 """
194 # Parse current time window
195 import re
197 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
198 if not match:
199 return '"last 10 years"[pdat]'
201 amount, unit = int(match.group(1)), match.group(2)
203 # Expand based on current unit
204 if unit == "months" or unit == "month":
205 if amount < 6:
206 return '"last 6 months"[pdat]'
207 if amount < 12:
208 return '"last 1 year"[pdat]'
209 return '"last 2 years"[pdat]'
210 if unit == "years" or unit == "year": 210 ↛ 217line 210 didn't jump to line 217 because the condition on line 210 was always true
211 if amount < 2:
212 return '"last 2 years"[pdat]'
213 if amount < 5:
214 return '"last 5 years"[pdat]'
215 return '"last 10 years"[pdat]'
217 return '"last 10 years"[pdat]'
    def _optimize_query_for_pubmed(self, query: str) -> str:
        """
        Optimize a natural language query for PubMed search.

        Uses the LLM to transform questions into effective keyword-based
        queries, then applies several heuristic clean-up passes to strip
        any explanation text the model may have wrapped around the query
        and to normalize PubMed field-tag casing.

        Args:
            query: Natural language query

        Returns:
            Optimized query string for PubMed; the original query is
            returned if no LLM is configured, optimization is disabled,
            or any step raises.
        """
        if not self.llm or not self.optimize_queries:
            # Return original query if no LLM available or optimization disabled
            return query

        try:
            # Prompt for query optimization
            prompt = f"""Transform this natural language question into an optimized PubMed search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. DO NOT wrap the entire query in quotes
3. DO NOT include ANY date restrictions or year filters
4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
6. KEEP IT SIMPLE - use 2-3 main concepts maximum
7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
8. Use wildcards for variations: vaccin*[Title/Abstract]

EXAMPLE QUERIES:
✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
✗ BAD: "Here's a query to find articles about vaccines..."

Return ONLY the search query without any explanations.
"""

            # Get response from LLM (content attr for chat models,
            # plain str otherwise)
            response = self.llm.invoke(prompt)
            raw_response = (
                str(response.content)
                if hasattr(response, "content")
                else str(response)
            ).strip()

            # Clean up the query - extract only the actual query and remove any explanations
            # First check if there are multiple lines and take the first non-empty line
            lines = raw_response.split("\n")
            cleaned_lines = [line.strip() for line in lines if line.strip()]

            if cleaned_lines:
                optimized_query = cleaned_lines[0]

                # Remove any quotes that wrap the entire query
                if optimized_query.startswith('"') and optimized_query.endswith(
                    '"'
                ):
                    optimized_query = optimized_query[1:-1]

                # Remove any explanation phrases that might be at the beginning
                explanation_starters = [
                    "here is",
                    "here's",
                    "this query",
                    "the following",
                ]
                for starter in explanation_starters:
                    if optimized_query.lower().startswith(starter):
                        # Find the actual query part - typically after a colon
                        colon_pos = optimized_query.find(":")
                        if colon_pos > 0:
                            optimized_query = optimized_query[
                                colon_pos + 1 :
                            ].strip()

                # Check if the query still seems to contain explanations
                if (
                    len(optimized_query) > 200
                    or "this query will" in optimized_query.lower()
                ):
                    # It's probably still an explanation - try to extract just the query part
                    # Look for common patterns in the explanation like parentheses
                    pattern = r"\([^)]+\)\s+AND\s+"
                    import re

                    matches = re.findall(pattern, optimized_query)
                    if matches:
                        # Extract just the query syntax parts: keep sentence
                        # fragments containing parentheses plus AND/OR
                        query_parts = []
                        for part in re.split(r"\.\s+", optimized_query):
                            if (
                                "(" in part
                                and ")" in part
                                and ("AND" in part or "OR" in part)
                            ):
                                query_parts.append(part)
                        if query_parts:
                            optimized_query = " ".join(query_parts)
            else:
                # Fall back to original query if cleaning fails
                logger.warning(
                    "Failed to extract a clean query from LLM response"
                )
                optimized_query = query

            # Final safety check - if query looks too much like an explanation, use original
            if len(optimized_query.split()) > 30:
                logger.warning(
                    "Query too verbose, falling back to simpler form"
                )
                # Create a simple query from the original (drop short words
                # and common stop words, keep at most three terms)
                words = [
                    w
                    for w in query.split()
                    if len(w) > 3
                    and w.lower()
                    not in (
                        "what",
                        "are",
                        "the",
                        "and",
                        "for",
                        "with",
                        "from",
                        "have",
                        "been",
                        "recent",
                    )
                ]
                optimized_query = " AND ".join(words[:3])

            # Basic cleanup: standardize field tag case for consistency
            import re

            optimized_query = re.sub(
                r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
            )
            optimized_query = re.sub(
                r"\[title/abstract\]",
                "[Title/Abstract]",
                optimized_query,
                flags=re.IGNORECASE,
            )
            optimized_query = re.sub(
                r"\[publication type\]",
                "[Publication Type]",
                optimized_query,
                flags=re.IGNORECASE,
            )

            # Fix unclosed quotes followed by field tags
            # Pattern: "term[Field] -> "term"[Field]
            optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query)

            # Cache the optimized query so it can be simplified later
            # if the search returns no results
            self._simplify_query_cache = optimized_query

            # Log original and optimized queries
            logger.info("Original query: '{}'", query)
            logger.info(f"Optimized for PubMed: '{optimized_query}'")
            logger.debug(
                f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
            )

            return optimized_query

        except Exception:
            logger.exception("Error optimizing query")
            logger.debug(f"Falling back to original query: '{query}'")
            return query  # Fall back to original query on error
393 def _simplify_query(self, query: str) -> str:
394 """
395 Simplify a PubMed query that returned no results.
396 Progressively removes elements to get a more basic query.
398 Args:
399 query: The original query that returned no results
401 Returns:
402 Simplified query
403 """
404 logger.info(f"Simplifying query: {query}")
405 logger.debug(f"Query simplification started for: '{query[:100]}...'")
407 # Simple approach: remove field restrictions to broaden the search
408 import re
410 # Remove field tags to make search broader
411 simplified = query
413 # Remove [Mesh] tags - search in all fields instead
414 simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE)
416 # Remove [Publication Type] tags
417 simplified = re.sub(
418 r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE
419 )
421 # Keep [Title/Abstract] as it's usually helpful
422 # Clean up any double spaces
423 simplified = re.sub(r"\s+", " ", simplified).strip()
425 # If no simplification was possible, return the original query
426 if simplified == query:
427 logger.debug("No simplification possible, returning original query")
429 logger.info(f"Simplified query: {simplified}")
430 logger.debug(
431 f"Query simplified from {len(query)} to {len(simplified)} chars"
432 )
433 return simplified
435 def _is_historical_focused(self, query: str) -> bool:
436 """
437 Determine if a query is specifically focused on historical/older information using LLM.
438 Default assumption is that queries should prioritize recent information unless
439 explicitly asking for historical content.
441 Args:
442 query: The search query
444 Returns:
445 Boolean indicating if the query is focused on historical information
446 """
447 if not self.llm:
448 # Fall back to basic keyword check if no LLM available
449 historical_terms = [
450 "history",
451 "historical",
452 "early",
453 "initial",
454 "first",
455 "original",
456 "before",
457 "prior to",
458 "origins",
459 "evolution",
460 "development",
461 ]
462 historical_years = [str(year) for year in range(1900, 2020)]
464 query_lower = query.lower()
465 has_historical_term = any(
466 term in query_lower for term in historical_terms
467 )
468 has_past_year = any(year in query for year in historical_years)
470 return has_historical_term or has_past_year
472 try:
473 # Use LLM to determine if the query is focused on historical information
474 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
476Query: "{query}"
478Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
479Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
481The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
482"""
484 response = self.llm.invoke(prompt)
485 answer = (
486 (
487 str(response.content)
488 if hasattr(response, "content")
489 else str(response)
490 )
491 .strip()
492 .lower()
493 )
495 # Log the determination
496 logger.info(f"Historical focus determination for query: '{query}'")
497 logger.info(f"LLM determined historical focus: {answer}")
499 return "yes" in answer
501 except Exception:
502 logger.exception("Error determining historical focus")
503 # Fall back to basic keyword check
504 historical_terms = [
505 "history",
506 "historical",
507 "early",
508 "initial",
509 "first",
510 "original",
511 "before",
512 "prior to",
513 "origins",
514 "evolution",
515 "development",
516 ]
517 return any(term in query.lower() for term in historical_terms)
519 def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
520 """
521 Perform an adaptive search that adjusts based on topic volume and whether
522 the query focuses on historical information.
524 Args:
525 query: The search query (already optimized)
527 Returns:
528 Tuple of (list of PMIDs, search strategy used)
529 """
530 # Estimate topic volume
531 estimated_volume = self._get_result_count(query)
533 # Determine if the query is focused on historical information
534 is_historical_focused = self._is_historical_focused(query)
536 if is_historical_focused:
537 # User wants historical information - no date filtering
538 time_filter = None
539 strategy = "historical_focus"
540 elif estimated_volume > 5000:
541 # Very common topic - use tighter recency filter
542 time_filter = '"last 1 year"[pdat]'
543 strategy = "high_volume"
544 elif estimated_volume > 1000:
545 # Common topic
546 time_filter = '"last 3 years"[pdat]'
547 strategy = "common_topic"
548 elif estimated_volume > 100:
549 # Moderate volume
550 time_filter = '"last 5 years"[pdat]'
551 strategy = "moderate_volume"
552 else:
553 # Rare topic - still use recency but with wider range
554 time_filter = '"last 10 years"[pdat]'
555 strategy = "rare_topic"
557 # Run search based on strategy
558 if time_filter:
559 # Try with adaptive time filter
560 query_with_time = f"({query}) AND {time_filter}"
561 logger.info(
562 f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
563 )
564 results = self._search_pubmed(query_with_time)
566 # If too few results, gradually expand time window
567 if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
568 logger.info(
569 f"Insufficient results ({len(results)}), expanding time window"
570 )
571 expanded_time = self._expand_time_window(time_filter)
572 query_with_expanded_time = f"({query}) AND {expanded_time}"
573 expanded_results = self._search_pubmed(query_with_expanded_time)
575 if len(expanded_results) > len(results):
576 logger.info(
577 f"Expanded time window yielded {len(expanded_results)} results"
578 )
579 return expanded_results, f"{strategy}_expanded"
581 # If still no results, try without time filter
582 if not results:
583 logger.info(
584 "No results with time filter, trying without time restrictions"
585 )
586 results = self._search_pubmed(query)
587 strategy = "no_time_filter"
588 else:
589 # Historical query - run without time filter
590 logger.info(
591 "Using historical search strategy without date filtering"
592 )
593 results = self._search_pubmed(query)
595 return results, strategy
597 def _search_pubmed(self, query: str) -> List[str]:
598 """
599 Search PubMed and return a list of article IDs.
601 Args:
602 query: The search query
604 Returns:
605 List of PubMed IDs matching the query
606 """
607 try:
608 # Prepare search parameters
609 params = {
610 "db": "pubmed",
611 "term": query,
612 "retmode": "json",
613 "retmax": self.max_results,
614 "usehistory": "y",
615 }
617 # Add API key if available
618 if self.api_key:
619 params["api_key"] = self.api_key
620 logger.debug("Using PubMed API key for higher rate limits")
621 else:
622 logger.debug("No PubMed API key - using default rate limits")
624 # Add date restriction if specified
625 if self.days_limit:
626 params["reldate"] = self.days_limit
627 params["datetype"] = "pdat" # Publication date
628 logger.debug(f"Limiting results to last {self.days_limit} days")
630 logger.debug(
631 f"PubMed search query: '{query}' with max_results={self.max_results}"
632 )
634 self._last_wait_time = self.rate_tracker.apply_rate_limit(
635 self.engine_type
636 )
637 logger.debug(
638 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
639 )
641 # Execute search request
642 logger.debug(f"Sending request to PubMed API: {self.search_url}")
643 response = safe_get(self.search_url, params=params)
644 response.raise_for_status()
645 logger.debug(f"PubMed API response status: {response.status_code}")
647 # Parse response
648 data = response.json()
649 id_list: list[str] = data["esearchresult"]["idlist"]
650 total_count = data["esearchresult"].get("count", "unknown")
652 logger.info(
653 f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
654 )
655 if len(id_list) > 0:
656 logger.debug(f"First 5 PMIDs: {id_list[:5]}")
657 return id_list
659 except Exception:
660 logger.exception(f"Error searching PubMed for query '{query}'")
661 return []
    def _get_article_summaries(
        self, id_list: List[str]
    ) -> List[Dict[str, Any]]:
        """
        Get summaries for a list of PubMed article IDs.

        Issues a single batched esummary request and flattens each record
        into a dictionary of the fields used downstream.

        Args:
            id_list: List of PubMed IDs

        Returns:
            List of article summary dictionaries (empty on error)

        Raises:
            RateLimitError: If the failure looks like an NCBI rate-limit
                response (HTTP 429/403/503 or similar wording)
        """
        if not id_list:
            logger.debug("Empty ID list provided to _get_article_summaries")
            return []

        logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

        try:
            # Prepare parameters (all IDs in one comma-separated request)
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "json",
                "rettype": "summary",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )
            logger.debug(
                f"Applied rate limit wait: {self._last_wait_time:.2f}s"
            )

            # Execute request
            logger.debug(f"Requesting summaries from: {self.summary_url}")
            response = safe_get(self.summary_url, params=params)
            response.raise_for_status()
            logger.debug(f"Summary API response status: {response.status_code}")

            # Parse response
            data = response.json()
            logger.debug(
                f"PubMed API returned data for {len(id_list)} requested IDs"
            )
            summaries = []

            # Iterate in the requested order so output order is stable
            for pmid in id_list:
                if pmid in data["result"]:
                    article = data["result"][pmid]
                    logger.debug(
                        f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
                    )

                    # Extract authors (if available)
                    authors = []
                    if "authors" in article:
                        authors = [
                            author["name"] for author in article["authors"]
                        ]

                    # Extract DOI from articleids if not in main field
                    doi = article.get("doi", "")
                    if not doi and "articleids" in article:
                        for aid in article["articleids"]:
                            if aid.get("idtype") == "doi":
                                doi = aid.get("value", "")
                                break

                    # Create summary dictionary with all available fields
                    summary = {
                        "id": pmid,
                        "title": article.get("title", ""),
                        "pubdate": article.get("pubdate", ""),
                        "epubdate": article.get("epubdate", ""),
                        "source": article.get("source", ""),
                        "authors": authors,
                        "lastauthor": article.get("lastauthor", ""),
                        "journal": article.get("fulljournalname", ""),
                        "volume": article.get("volume", ""),
                        "issue": article.get("issue", ""),
                        "pages": article.get("pages", ""),
                        "doi": doi,
                        "issn": article.get("issn", ""),
                        "essn": article.get("essn", ""),
                        "pubtype": article.get(
                            "pubtype", []
                        ),  # Publication types from esummary
                        "recordstatus": article.get("recordstatus", ""),
                        "lang": article.get("lang", []),
                        "pmcrefcount": article.get("pmcrefcount", None),
                        "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                    }

                    summaries.append(summary)
                else:
                    logger.warning(
                        f"PMID {pmid} not found in PubMed API response"
                    )

            return summaries

        except Exception as e:
            error_msg = str(e)
            logger.exception(
                f"Error getting article summaries for {len(id_list)} articles"
            )

            # Check for rate limiting patterns so callers' retry machinery
            # can back off instead of treating this as "no results"
            if (
                "429" in error_msg
                or "too many requests" in error_msg.lower()
                or "rate limit" in error_msg.lower()
                or "service unavailable" in error_msg.lower()
                or "503" in error_msg
                or "403" in error_msg
            ):
                raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

            return []
788 def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
789 """
790 Get abstracts for a list of PubMed article IDs.
792 Args:
793 id_list: List of PubMed IDs
795 Returns:
796 Dictionary mapping PubMed IDs to their abstracts
797 """
798 if not id_list:
799 logger.debug("Empty ID list provided to _get_article_abstracts")
800 return {}
802 logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")
804 try:
805 # Prepare parameters
806 params = {
807 "db": "pubmed",
808 "id": ",".join(id_list),
809 "retmode": "xml",
810 "rettype": "abstract",
811 }
813 # Add API key if available
814 if self.api_key: 814 ↛ 815line 814 didn't jump to line 815 because the condition on line 814 was never true
815 params["api_key"] = self.api_key
817 self._last_wait_time = self.rate_tracker.apply_rate_limit(
818 self.engine_type
819 )
820 logger.debug(
821 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
822 )
824 # Execute request
825 logger.debug(f"Requesting abstracts from: {self.fetch_url}")
826 response = safe_get(self.fetch_url, params=params)
827 response.raise_for_status()
828 logger.debug(
829 f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
830 )
832 # Parse XML response
833 root = ET.fromstring(response.text)
834 logger.debug(
835 f"Parsing abstracts from XML for {len(id_list)} articles"
836 )
838 # Extract abstracts
839 abstracts = {}
841 for article in root.findall(".//PubmedArticle"):
842 pmid_elem = article.find(".//PMID")
843 pmid = pmid_elem.text if pmid_elem is not None else None
845 if pmid is None:
846 continue
848 # Find abstract text
849 abstract_text = ""
850 abstract_elem = article.find(".//AbstractText")
852 if abstract_elem is not None: 852 ↛ 856line 852 didn't jump to line 856 because the condition on line 852 was always true
853 abstract_text = abstract_elem.text or ""
855 # Some abstracts are split into multiple sections
856 abstract_sections = article.findall(".//AbstractText")
857 if len(abstract_sections) > 1:
858 logger.debug(
859 f"Article {pmid} has {len(abstract_sections)} abstract sections"
860 )
862 for section in abstract_sections:
863 # Get section label if it exists
864 label = section.get("Label")
865 section_text = section.text or ""
867 if label and section_text:
868 if abstract_text: 868 ↛ 871line 868 didn't jump to line 871 because the condition on line 868 was always true
869 abstract_text += f"\n\n{label}: {section_text}"
870 else:
871 abstract_text = f"{label}: {section_text}"
872 elif section_text:
873 if abstract_text: 873 ↛ 876line 873 didn't jump to line 876 because the condition on line 873 was always true
874 abstract_text += f"\n\n{section_text}"
875 else:
876 abstract_text = section_text
878 # Store in dictionary
879 if pmid and abstract_text:
880 abstracts[pmid] = abstract_text
881 logger.debug(
882 f"Abstract for {pmid}: {len(abstract_text)} chars"
883 )
884 elif pmid: 884 ↛ 841line 884 didn't jump to line 841 because the condition on line 884 was always true
885 logger.warning(f"No abstract found for PMID {pmid}")
887 logger.info(
888 f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
889 )
890 return abstracts
892 except Exception:
893 logger.exception(
894 f"Error getting article abstracts for {len(id_list)} articles"
895 )
896 return {}
    def _get_article_detailed_metadata(
        self, id_list: List[str]
    ) -> Dict[str, Dict[str, Any]]:
        """
        Get detailed metadata for PubMed articles including publication types,
        MeSH terms, keywords, and affiliations.

        Uses a single batched efetch (MEDLINE XML) request; only non-empty
        fields are added to each article's metadata dictionary.

        Args:
            id_list: List of PubMed IDs

        Returns:
            Dictionary mapping PubMed IDs to their detailed metadata
            (empty dict on error)
        """
        if not id_list:
            return {}

        try:
            # Prepare parameters
            params = {
                "db": "pubmed",
                "id": ",".join(id_list),
                "retmode": "xml",
                "rettype": "medline",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute request
            response = safe_get(self.fetch_url, params=params)
            response.raise_for_status()

            # Parse XML response (defusedxml guards against XML attacks)
            root = ET.fromstring(response.text)

            metadata = {}

            for article in root.findall(".//PubmedArticle"):
                pmid_elem = article.find(".//PMID")
                pmid = pmid_elem.text if pmid_elem is not None else None

                if pmid is None:
                    continue

                article_metadata: Dict[str, Any] = {}

                # Extract publication types
                pub_types = []
                for pub_type in article.findall(".//PublicationType"):
                    if pub_type.text:
                        pub_types.append(pub_type.text)
                if pub_types:
                    article_metadata["publication_types"] = pub_types

                # Extract MeSH terms (descriptor names only; qualifiers
                # are not collected)
                mesh_terms = []
                for mesh in article.findall(".//MeshHeading"):
                    descriptor = mesh.find(".//DescriptorName")
                    if descriptor is not None and descriptor.text:
                        mesh_terms.append(descriptor.text)
                if mesh_terms:
                    article_metadata["mesh_terms"] = mesh_terms

                # Extract keywords
                keywords = []
                for keyword in article.findall(".//Keyword"):
                    if keyword.text:
                        keywords.append(keyword.text)
                if keywords:
                    article_metadata["keywords"] = keywords

                # Extract affiliations
                affiliations = []
                for affiliation in article.findall(".//Affiliation"):
                    if affiliation.text:
                        affiliations.append(affiliation.text)
                if affiliations:
                    article_metadata["affiliations"] = affiliations

                # Extract grant information (funding id + agency per grant)
                grants = []
                for grant in article.findall(".//Grant"):
                    grant_info = {}
                    grant_id = grant.find(".//GrantID")
                    if grant_id is not None and grant_id.text:
                        grant_info["id"] = grant_id.text
                    agency = grant.find(".//Agency")
                    if agency is not None and agency.text:
                        grant_info["agency"] = agency.text
                    if grant_info:
                        grants.append(grant_info)
                if grants:
                    article_metadata["grants"] = grants

                # Check for free full text in PMC
                pmc_elem = article.find(".//ArticleId[@IdType='pmc']")
                if pmc_elem is not None:
                    article_metadata["has_free_full_text"] = True
                    article_metadata["pmc_id"] = pmc_elem.text

                # Extract conflict of interest statement
                coi_elem = article.find(".//CoiStatement")
                if coi_elem is not None and coi_elem.text:
                    article_metadata["conflict_of_interest"] = coi_elem.text

                metadata[pmid] = article_metadata

            return metadata

        except Exception:
            logger.exception("Error getting detailed article metadata")
            return {}
1016 def _create_enriched_content(
1017 self, result: Dict[str, Any], base_content: str
1018 ) -> str:
1019 """
1020 Create enriched content by adding relevant metadata context to help the LLM.
1022 Args:
1023 result: The result dictionary with metadata
1024 base_content: The base content (abstract or full text)
1026 Returns:
1027 Enriched content string with metadata context
1028 """
1029 enriched_parts = []
1031 # Add study type information
1032 if "publication_types" in result:
1033 pub_types = result["publication_types"]
1034 # Filter for significant types
1035 significant_types = [
1036 pt
1037 for pt in pub_types
1038 if any(
1039 key in pt.lower()
1040 for key in [
1041 "clinical trial",
1042 "randomized",
1043 "meta-analysis",
1044 "systematic review",
1045 "case report",
1046 "guideline",
1047 "comparative study",
1048 "multicenter",
1049 ]
1050 )
1051 ]
1052 if significant_types:
1053 enriched_parts.append(
1054 f"[Study Type: {', '.join(significant_types)}]"
1055 )
1057 # Add the main content
1058 enriched_parts.append(base_content)
1060 # Add metadata footer
1061 metadata_footer = []
1063 # Add ALL MeSH terms
1064 if "mesh_terms" in result and len(result["mesh_terms"]) > 0:
1065 metadata_footer.append(
1066 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"
1067 )
1069 # Add ALL keywords
1070 if "keywords" in result and len(result["keywords"]) > 0:
1071 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")
1073 # Add ALL affiliations
1074 if "affiliations" in result and len(result["affiliations"]) > 0:
1075 if len(result["affiliations"]) == 1:
1076 metadata_footer.append(
1077 f"Institution: {result['affiliations'][0]}"
1078 )
1079 else:
1080 affiliations_text = "\n - " + "\n - ".join(
1081 result["affiliations"]
1082 )
1083 metadata_footer.append(f"Institutions:{affiliations_text}")
1085 # Add ALL funding information with full details
1086 if "grants" in result and len(result["grants"]) > 0:
1087 grant_details = []
1088 for grant in result["grants"]:
1089 grant_text = []
1090 if "agency" in grant:
1091 grant_text.append(grant["agency"])
1092 if "id" in grant:
1093 grant_text.append(f"(Grant ID: {grant['id']})")
1094 if grant_text:
1095 grant_details.append(" ".join(grant_text))
1096 if grant_details:
1097 if len(grant_details) == 1:
1098 metadata_footer.append(f"Funded by: {grant_details[0]}")
1099 else:
1100 funding_text = "\n - " + "\n - ".join(grant_details)
1101 metadata_footer.append(f"Funding Sources:{funding_text}")
1103 # Add FULL conflict of interest statement
1104 if "conflict_of_interest" in result:
1105 coi_text = result["conflict_of_interest"]
1106 if coi_text:
1107 # Still skip trivial "no conflict" statements to reduce noise
1108 if not any(
1109 phrase in coi_text.lower()
1110 for phrase in [
1111 "no conflict",
1112 "no competing",
1113 "nothing to disclose",
1114 "none declared",
1115 "authors declare no",
1116 ]
1117 ):
1118 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1119 elif (
1120 "but" in coi_text.lower()
1121 or "except" in coi_text.lower()
1122 or "however" in coi_text.lower()
1123 ):
1124 # Include if there's a "no conflict BUT..." type statement
1125 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1127 # Combine everything
1128 if metadata_footer:
1129 enriched_parts.append("\n---\nStudy Metadata:")
1130 enriched_parts.extend(metadata_footer)
1132 return "\n".join(enriched_parts)
1134 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
1135 """
1136 Find PMC IDs for the given PubMed IDs (for full-text access).
1138 Args:
1139 pmid_list: List of PubMed IDs
1141 Returns:
1142 Dictionary mapping PubMed IDs to their PMC IDs (if available)
1143 """
1144 if not pmid_list or not self.get_full_text:
1145 return {}
1147 try:
1148 # Prepare parameters
1149 params = {
1150 "dbfrom": "pubmed",
1151 "db": "pmc",
1152 "linkname": "pubmed_pmc",
1153 "id": ",".join(pmid_list),
1154 "retmode": "json",
1155 }
1157 # Add API key if available
1158 if self.api_key: 1158 ↛ 1159line 1158 didn't jump to line 1159 because the condition on line 1158 was never true
1159 params["api_key"] = self.api_key
1161 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1162 self.engine_type
1163 )
1165 # Execute request
1166 response = safe_get(self.link_url, params=params)
1167 response.raise_for_status()
1169 # Parse response
1170 data = response.json()
1172 # Map PubMed IDs to PMC IDs
1173 pmid_to_pmcid = {}
1175 for linkset in data.get("linksets", []):
1176 pmid = linkset.get("ids", [None])[0]
1178 if not pmid: 1178 ↛ 1179line 1178 didn't jump to line 1179 because the condition on line 1178 was never true
1179 continue
1181 for link in linkset.get("linksetdbs", []):
1182 if link.get("linkname") == "pubmed_pmc": 1182 ↛ 1181line 1182 didn't jump to line 1181 because the condition on line 1182 was always true
1183 pmcids = link.get("links", [])
1184 if pmcids: 1184 ↛ 1181line 1184 didn't jump to line 1181 because the condition on line 1184 was always true
1185 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
1187 logger.info(
1188 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
1189 )
1190 return pmid_to_pmcid
1192 except Exception:
1193 logger.exception("Error finding PMC IDs")
1194 return {}
    def _get_pmc_full_text(self, pmcid: str) -> str:
        """
        Get full text for a PMC article via the NCBI EFetch endpoint.

        Fetches the article XML and flattens the title, abstract, and body
        sections into a single Markdown-style string (``#`` title, ``##``
        section headings, paragraphs separated by blank lines).

        Args:
            pmcid: PMC ID of the article

        Returns:
            Full text content or empty string if not available
        """
        try:
            # Prepare parameters
            params = {
                "db": "pmc",
                "id": pmcid,
                "retmode": "xml",
                "rettype": "full",
            }

            # Add API key if available
            if self.api_key:
                params["api_key"] = self.api_key

            # Respect NCBI rate limits before issuing the request.
            self._last_wait_time = self.rate_tracker.apply_rate_limit(
                self.engine_type
            )

            # Execute request
            response = safe_get(self.fetch_url, params=params)
            response.raise_for_status()

            # Parse XML response (defusedxml protects against XML attacks)
            root = ET.fromstring(response.text)

            # Extract full text, accumulating Markdown fragments in order.
            full_text = []

            # Extract article title
            # NOTE(review): uses .text only, so a title that begins with
            # inline markup (e.g. italics) may be skipped — confirm.
            title_elem = root.find(".//article-title")
            if title_elem is not None and title_elem.text:
                full_text.append(f"# {title_elem.text}")

            # Extract abstract
            abstract_paras = root.findall(".//abstract//p")
            if abstract_paras:
                full_text.append("\n## Abstract\n")
                for p in abstract_paras:
                    # itertext() flattens nested inline markup to plain text
                    text = "".join(p.itertext())
                    if text:
                        full_text.append(text)

            # Extract body content
            body = root.find(".//body")
            if body is not None:
                for section in body.findall(".//sec"):
                    # Get section title
                    title = section.find(".//title")
                    if title is not None and title.text:
                        full_text.append(f"\n## {title.text}\n")

                    # Get paragraphs
                    for p in section.findall(".//p"):
                        text = "".join(p.itertext())
                        if text:
                            full_text.append(text)

            result_text = "\n\n".join(full_text)
            logger.debug(
                f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
            )
            return result_text

        except Exception:
            # Best-effort: full text is optional, so swallow and log.
            logger.exception("Error getting PMC full text")
            return ""
    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubMed articles.

        Runs the (optionally LLM-optimized) query through the adaptive
        search, retrying once with a simplified query when nothing matches,
        then builds one preview dict per article. Each preview's snippet
        always embeds the title and a truncated abstract so the downstream
        LLM relevance filter has real content to judge.

        Args:
            query: The search query

        Returns:
            List of preview dictionaries
        """
        logger.info(f"Getting PubMed previews for query: {query}")

        # Optimize the query for PubMed if LLM is available
        optimized_query = self._optimize_query_for_pubmed(query)

        # Perform adaptive search
        pmid_list, strategy = self._adaptive_search(optimized_query)

        # If no results, try a simplified query (single retry)
        if not pmid_list:
            logger.warning(
                f"No PubMed results found using strategy: {strategy}"
            )
            simplified_query = self._simplify_query(optimized_query)
            if simplified_query != optimized_query:
                logger.info(f"Trying with simplified query: {simplified_query}")
                pmid_list, strategy = self._adaptive_search(simplified_query)
                if pmid_list:
                    logger.info(
                        f"Simplified query found {len(pmid_list)} results"
                    )

        if not pmid_list:
            logger.warning("No PubMed results found after query simplification")
            return []

        # Get article summaries
        logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
        summaries = self._get_article_summaries(pmid_list)
        logger.debug(f"Retrieved {len(summaries)} summaries")

        # ALWAYS fetch abstracts for snippet-only mode to provide context for LLM
        logger.debug(
            f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
        )
        abstracts = self._get_article_abstracts(pmid_list)
        logger.debug(f"Retrieved {len(abstracts)} abstracts")

        # Format as previews
        previews = []
        for summary in summaries:
            # Build snippet from individual metadata preferences
            # (include_* flags set on the engine at construction time)
            snippet_parts = []

            # Check for publication type from esummary (earlier than detailed metadata)
            pub_type_prefix = ""
            if self.include_publication_type_in_context and summary.get(
                "pubtype"
            ):
                # Use first publication type from esummary
                pub_type_prefix = f"[{summary['pubtype'][0]}] "

            # Add authors if enabled
            if self.include_authors_in_context and summary.get("authors"):
                authors_text = ", ".join(summary.get("authors", []))
                if len(authors_text) > 100:
                    # Truncate long author lists
                    authors_text = authors_text[:97] + "..."
                snippet_parts.append(authors_text)

            # Add journal if enabled
            if self.include_journal_in_context and summary.get("journal"):
                snippet_parts.append(summary["journal"])

            # Add date (full or year only; full date takes precedence)
            if summary.get("pubdate"):
                if self.include_full_date_in_context:
                    snippet_parts.append(summary["pubdate"])
                elif (
                    self.include_year_in_context
                    and len(summary["pubdate"]) >= 4
                ):
                    # assumes pubdate starts with a 4-digit year — TODO confirm
                    snippet_parts.append(summary["pubdate"][:4])

            # Add citation details if enabled
            if self.include_citation_in_context:
                citation_parts = []
                if summary.get("volume"):
                    citation_parts.append(f"Vol {summary['volume']}")
                if summary.get("issue"):
                    citation_parts.append(f"Issue {summary['issue']}")
                if summary.get("pages"):
                    citation_parts.append(f"pp {summary['pages']}")
                if citation_parts:
                    snippet_parts.append(f"({', '.join(citation_parts)})")

            # Join snippet parts or provide default
            if snippet_parts:
                # Use different separators based on what's included
                if self.include_authors_in_context:
                    snippet = ". ".join(
                        snippet_parts
                    )  # Authors need period separator
                else:
                    snippet = " - ".join(
                        snippet_parts
                    )  # Journal and year use dash
            else:
                snippet = "Research article"

            # Add publication type prefix
            snippet = pub_type_prefix + snippet

            # Add language indicator if not English
            if self.include_language_in_context and summary.get("lang"):
                langs = summary["lang"]
                if langs and langs[0] != "eng" and langs[0]:
                    snippet = f"{snippet} [{langs[0].upper()}]"

            # Add identifiers if enabled
            identifier_parts = []
            if self.include_pmid_in_context and summary.get("id"):
                identifier_parts.append(f"PMID: {summary['id']}")
            if self.include_doi_in_context and summary.get("doi"):
                identifier_parts.append(f"DOI: {summary['doi']}")

            if identifier_parts:
                snippet = f"{snippet} | {' | '.join(identifier_parts)}"

            # ALWAYS include title and abstract in snippet for LLM analysis
            pmid = summary["id"]
            title = summary["title"]
            abstract_text = abstracts.get(pmid, "")

            # Truncate abstract if too long
            if len(abstract_text) > 500:
                abstract_text = abstract_text[:497] + "..."

            # Build the enriched snippet with title and abstract
            if abstract_text:
                enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
            else:
                enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"

            # Log the complete snippet for debugging
            logger.debug(f"Complete snippet for PMID {pmid}:")
            logger.debug(f"  Title: {title[:100]}...")
            logger.debug(f"  Abstract length: {len(abstract_text)} chars")
            logger.debug(f"  Metadata: {snippet}")
            logger.debug(
                f"  Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
            )

            # Create preview with basic information
            preview = {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,  # Use enriched snippet with title and abstract
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": summary["id"],  # Store PMID for later use
                "_search_strategy": strategy,  # Store search strategy for analytics
            }

            previews.append(preview)

        logger.info(
            f"Found {len(previews)} PubMed previews using strategy: {strategy}"
        )
        if previews:
            logger.debug(
                f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
            )
        return previews
    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubMed articles.
        Efficiently manages which content to retrieve (abstracts and/or full text).

        Batches all network calls up front (abstracts, detailed metadata,
        PMC links), then merges the fetched data into a copy of each item.
        Full-text retrieval is capped at ``self.full_text_limit`` articles
        and skipped entirely in snippet-only mode.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        # Check if we should add full content
        snippets_only_mode = (
            hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
            and search_config.SEARCH_SNIPPETS_ONLY
        )

        if snippets_only_mode:
            logger.info(
                "Snippet-only mode enabled, will fetch abstracts as snippets"
            )
            # For PubMed, we still need to fetch abstracts as they serve as snippets
            # But we'll skip full-text retrieval

        logger.info(
            f"Getting content for {len(relevant_items)} PubMed articles"
        )

        # Collect all PMIDs for relevant items
        pmids = []
        for item in relevant_items:
            if "_pmid" in item:
                pmids.append(item["_pmid"])

        # Get abstracts if requested and PMIDs exist
        # In snippet-only mode, always get abstracts as they serve as snippets
        abstracts = {}
        if (self.get_abstracts or snippets_only_mode) and pmids:
            abstracts = self._get_article_abstracts(pmids)

        # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
        detailed_metadata = {}
        if pmids:
            detailed_metadata = self._get_article_detailed_metadata(pmids)

        # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
        pmid_to_pmcid = {}
        if self.get_full_text and pmids and not snippets_only_mode:
            pmid_to_pmcid = self._find_pmc_ids(pmids)

        # Add content to results (items are copied, never mutated in place)
        results: List[Dict[str, Any]] = []
        for item in relevant_items:
            result = item.copy()
            pmid = item.get("_pmid", "")

            # Add detailed metadata if available
            if pmid in detailed_metadata:
                metadata = detailed_metadata[pmid]

                # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
                if "publication_types" in metadata:
                    result["publication_types"] = metadata["publication_types"]

                    # Add first publication type to snippet if enabled
                    if (
                        self.include_publication_type_in_context
                        and metadata["publication_types"]
                    ):
                        # Just take the first publication type as is
                        pub_type = metadata["publication_types"][0]
                        if "snippet" in result:
                            result["snippet"] = (
                                f"[{pub_type}] {result['snippet']}"
                            )

                # Add MeSH terms for medical categorization
                if "mesh_terms" in metadata:
                    result["mesh_terms"] = metadata["mesh_terms"]

                    # Add MeSH terms to snippet if enabled
                    # (max_mesh_terms <= 0 means "show all")
                    if (
                        self.include_mesh_terms_in_context
                        and metadata["mesh_terms"]
                    ):
                        mesh_to_show = (
                            metadata["mesh_terms"][: self.max_mesh_terms]
                            if self.max_mesh_terms > 0
                            else metadata["mesh_terms"]
                        )
                        if mesh_to_show and "snippet" in result:
                            mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                            result["snippet"] = (
                                f"{result['snippet']} | {mesh_text}"
                            )

                # Add keywords
                if "keywords" in metadata:
                    result["keywords"] = metadata["keywords"]

                    # Add keywords to snippet if enabled
                    # (max_keywords <= 0 means "show all")
                    if (
                        self.include_keywords_in_context
                        and metadata["keywords"]
                    ):
                        keywords_to_show = (
                            metadata["keywords"][: self.max_keywords]
                            if self.max_keywords > 0
                            else metadata["keywords"]
                        )
                        if keywords_to_show and "snippet" in result:
                            keywords_text = "Keywords: " + ", ".join(
                                keywords_to_show
                            )
                            result["snippet"] = (
                                f"{result['snippet']} | {keywords_text}"
                            )

                # Add affiliations
                if "affiliations" in metadata:
                    result["affiliations"] = metadata["affiliations"]

                # Add funding/grant information
                if "grants" in metadata:
                    result["grants"] = metadata["grants"]

                # Add conflict of interest statement
                if "conflict_of_interest" in metadata:
                    result["conflict_of_interest"] = metadata[
                        "conflict_of_interest"
                    ]

                # Add free full text availability
                if "has_free_full_text" in metadata:
                    result["has_free_full_text"] = metadata[
                        "has_free_full_text"
                    ]
                    if "pmc_id" in metadata:
                        result["pmc_id"] = metadata["pmc_id"]

                    # Add PMC availability to snippet if enabled
                    if (
                        self.include_pmc_availability_in_context
                        and metadata["has_free_full_text"]
                        and "snippet" in result
                    ):
                        result["snippet"] = (
                            f"{result['snippet']} | [Free Full Text]"
                        )

            # Add abstract if available
            if pmid in abstracts:
                result["abstract"] = abstracts[pmid]

                # Create enriched content with metadata context
                enriched_content = self._create_enriched_content(
                    result, abstracts[pmid]
                )

                # ALWAYS include title and abstract in snippet for LLM analysis
                # Build comprehensive snippet with title and abstract
                title = result.get("title", "")
                abstract_text = (
                    abstracts[pmid][:SNIPPET_LENGTH_LONG]
                    if len(abstracts[pmid]) > SNIPPET_LENGTH_LONG
                    else abstracts[pmid]
                )

                # Prepend title and abstract to the existing metadata snippet
                if "snippet" in result:
                    # Keep metadata snippet and add content
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                    )
                else:
                    # No metadata snippet, just title and abstract
                    result["snippet"] = (
                        f"Title: {title}\n\nAbstract: {abstract_text}"
                    )

                # In snippet-only mode, use enriched content
                if snippets_only_mode:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"
                # Use abstract as content if no full text
                elif pmid not in pmid_to_pmcid:
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Add full text for a limited number of top articles
            # NOTE(review): recounts full-text results on each iteration
            # (O(n^2)); acceptable for the small result lists handled here.
            if (
                pmid in pmid_to_pmcid
                and self.get_full_text
                and len(
                    [r for r in results if r.get("content_type") == "full_text"]
                )
                < self.full_text_limit
            ):
                # Get full text content
                pmcid = pmid_to_pmcid[pmid]
                full_text = self._get_pmc_full_text(pmcid)

                if full_text:
                    enriched_full_text = self._create_enriched_content(
                        result, full_text
                    )
                    result["full_content"] = enriched_full_text
                    result["content"] = enriched_full_text
                    result["content_type"] = "full_text"
                    result["pmcid"] = pmcid
                elif pmid in abstracts:
                    # Fall back to abstract if full text retrieval fails
                    enriched_content = self._create_enriched_content(
                        result, abstracts[pmid]
                    )
                    result["full_content"] = enriched_content
                    result["content"] = enriched_content
                    result["content_type"] = "abstract"

            # Remove temporary fields (internal-only bookkeeping)
            if "_pmid" in result:
                del result["_pmid"]
            if "_search_strategy" in result:
                del result["_search_strategy"]

            results.append(result)

        return results
1684 def search_by_author(
1685 self, author_name: str, max_results: Optional[int] = None
1686 ) -> List[Dict[str, Any]]:
1687 """
1688 Search for articles by a specific author.
1690 Args:
1691 author_name: Name of the author
1692 max_results: Maximum number of results (defaults to self.max_results)
1694 Returns:
1695 List of articles by the author
1696 """
1697 original_max_results = self.max_results
1699 try:
1700 if max_results:
1701 self.max_results = max_results
1703 query = f"{author_name}[Author]"
1704 return self.run(query)
1706 finally:
1707 # Restore original value
1708 self.max_results = original_max_results
1710 def search_by_journal(
1711 self, journal_name: str, max_results: Optional[int] = None
1712 ) -> List[Dict[str, Any]]:
1713 """
1714 Search for articles in a specific journal.
1716 Args:
1717 journal_name: Name of the journal
1718 max_results: Maximum number of results (defaults to self.max_results)
1720 Returns:
1721 List of articles from the journal
1722 """
1723 original_max_results = self.max_results
1725 try:
1726 if max_results:
1727 self.max_results = max_results
1729 query = f"{journal_name}[Journal]"
1730 return self.run(query)
1732 finally:
1733 # Restore original value
1734 self.max_results = original_max_results
1736 def search_recent(
1737 self, query: str, days: int = 30, max_results: Optional[int] = None
1738 ) -> List[Dict[str, Any]]:
1739 """
1740 Search for recent articles matching the query.
1742 Args:
1743 query: The search query
1744 days: Number of days to look back
1745 max_results: Maximum number of results (defaults to self.max_results)
1747 Returns:
1748 List of recent articles matching the query
1749 """
1750 original_max_results = self.max_results
1751 original_days_limit = self.days_limit
1753 try:
1754 if max_results:
1755 self.max_results = max_results
1757 # Set days limit for this search
1758 self.days_limit = days
1760 return self.run(query)
1762 finally:
1763 # Restore original values
1764 self.max_results = original_max_results
1765 self.days_limit = original_days_limit
1767 def advanced_search(
1768 self, terms: Dict[str, str], max_results: Optional[int] = None
1769 ) -> List[Dict[str, Any]]:
1770 """
1771 Perform an advanced search with field-specific terms.
1773 Args:
1774 terms: Dictionary mapping fields to search terms
1775 Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
1776 max_results: Maximum number of results (defaults to self.max_results)
1778 Returns:
1779 List of articles matching the advanced query
1780 """
1781 original_max_results = self.max_results
1783 try:
1784 if max_results:
1785 self.max_results = max_results
1787 # Build advanced query string
1788 query_parts = []
1789 for field, term in terms.items():
1790 query_parts.append(f"{term}[{field}]")
1792 query = " AND ".join(query_parts)
1793 return self.run(query)
1795 finally:
1796 # Restore original value
1797 self.max_results = original_max_results