Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py: 92%
715 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1import re
2from typing import Any, Dict, List, Optional, Tuple
4from defusedxml import ElementTree as ET
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...config import search_config
10from ...constants import SNIPPET_LENGTH_LONG
11from ...security.safe_requests import safe_get
12from ...advanced_search_system.filters.journal_reputation_filter import (
13 JournalReputationFilter,
14)
15from ..rate_limiting import RateLimitError
16from ..search_engine_base import BaseSearchEngine
19class PubMedSearchEngine(BaseSearchEngine):
20 """
21 PubMed search engine implementation with two-phase approach and adaptive search.
22 Provides efficient access to biomedical literature while minimizing API usage.
23 """
25 # Mark as public search engine
26 is_public = True
27 # Scientific/medical search engine
28 is_scientific = True
29 is_lexical = True
30 needs_llm_relevance_filter = True
def __init__(
    self,
    max_results: int = 10,
    api_key: Optional[str] = None,
    days_limit: Optional[int] = None,
    get_abstracts: bool = True,
    get_full_text: bool = False,
    full_text_limit: int = 3,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    optimize_queries: bool = True,
    include_publication_type_in_context: bool = True,
    include_journal_in_context: bool = True,
    include_year_in_context: bool = True,
    include_authors_in_context: bool = False,
    include_full_date_in_context: bool = False,
    include_mesh_terms_in_context: bool = True,
    include_keywords_in_context: bool = True,
    include_doi_in_context: bool = False,
    include_pmid_in_context: bool = False,
    include_pmc_availability_in_context: bool = False,
    max_mesh_terms: int = 3,
    max_keywords: int = 3,
    include_citation_in_context: bool = False,
    include_language_in_context: bool = False,
    settings_snapshot: Optional[Dict[str, Any]] = None,
):
    """
    Set up a PubMed search engine instance.

    Args:
        max_results: Requested number of search results. The stored value
            is raised to at least 25 after the base class is initialised.
        api_key: NCBI API key for higher rate limits (optional)
        days_limit: Limit results to the last N days (optional)
        get_abstracts: Whether to fetch abstracts for all results
        get_full_text: Whether to fetch full text content (when available in PMC)
        full_text_limit: Max number of full-text articles to retrieve
        llm: Language model used for query optimization and relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries for PubMed
        include_*_in_context: Boolean switches stored on the instance
            (presumably consumed when building enriched result content --
            confirm against the enrichment code)
        max_mesh_terms: Stored cap on MeSH terms (presumably per result)
        max_keywords: Stored cap on keywords (presumably per result)
        settings_snapshot: Settings passed through to the journal filter
            and the base search engine
    """
    # The journal reputation filter runs as a preview filter so results
    # are scored against bundled OpenAlex/DOAJ/predatory data before the
    # (more expensive) LLM relevance pass. It is optional: the factory
    # may return None.
    journal_filter = JournalReputationFilter.create_default(
        model=llm,  # type: ignore[arg-type]
        engine_name="pubmed",
        settings_snapshot=settings_snapshot,
    )
    preview_filters = [] if journal_filter is None else [journal_filter]

    # Let the base class wire up the LLM, result limits and filters
    super().__init__(
        llm=llm,
        max_filtered_results=max_filtered_results,
        max_results=max_results,
        preview_filters=preview_filters,  # type: ignore[arg-type]
        settings_snapshot=settings_snapshot,
    )
    # Never ask PubMed for fewer than 25 candidates
    self.max_results = max(self.max_results, 25)

    # Retrieval behaviour
    self.api_key = api_key
    self.days_limit = days_limit
    self.get_abstracts = get_abstracts
    self.get_full_text = get_full_text
    self.full_text_limit = full_text_limit
    self.optimize_queries = optimize_queries

    # Context switches
    self.include_publication_type_in_context = (
        include_publication_type_in_context
    )
    self.include_journal_in_context = include_journal_in_context
    self.include_year_in_context = include_year_in_context
    self.include_authors_in_context = include_authors_in_context
    self.include_full_date_in_context = include_full_date_in_context
    self.include_mesh_terms_in_context = include_mesh_terms_in_context
    self.include_keywords_in_context = include_keywords_in_context
    self.include_doi_in_context = include_doi_in_context
    self.include_pmid_in_context = include_pmid_in_context
    self.include_pmc_availability_in_context = (
        include_pmc_availability_in_context
    )
    self.include_citation_in_context = include_citation_in_context
    self.include_language_in_context = include_language_in_context
    self.max_mesh_terms = max_mesh_terms
    self.max_keywords = max_keywords

    # NCBI E-utilities endpoints, all derived from one base URL
    self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    self.search_url = f"{self.base_url}/esearch.fcgi"
    self.summary_url = f"{self.base_url}/esummary.fcgi"
    self.fetch_url = f"{self.base_url}/efetch.fcgi"
    self.link_url = f"{self.base_url}/elink.fcgi"

    # PMC base URL for full text
    self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
def _get_result_count(self, query: str) -> int:
    """
    Ask PubMed how many records match a query without fetching any IDs.

    Args:
        query: The search query

    Returns:
        Total number of matching results, or 0 if the request or the
        response parsing fails (the error is logged).
    """
    try:
        # retmax=0: only the count is needed, not the ID list
        request_params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 0,
        }
        if self.api_key:
            request_params["api_key"] = self.api_key

        # Respect the shared rate limiter before touching the API
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        response = safe_get(self.search_url, params=request_params)
        response.raise_for_status()

        total = int(response.json()["esearchresult"]["count"])
        logger.info(
            "Query '{}' has {} total results in PubMed", query, total
        )
        return total

    except Exception:
        logger.exception("Error getting result count")
        return 0
173 def _extract_core_terms(self, query: str) -> str:
174 """
175 Extract core terms from a complex query for volume estimation.
177 Args:
178 query: PubMed query string
180 Returns:
181 Simplified query with core terms
182 """
183 # Remove field specifications and operators
184 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags
185 simplified = re.sub(
186 r"\b(AND|OR|NOT)\b", "", simplified
187 ) # Remove operators
189 # Remove quotes and parentheses
190 simplified = (
191 simplified.replace('"', "").replace("(", "").replace(")", "")
192 )
194 # Split by whitespace and join terms with 4+ chars (likely meaningful)
195 terms = [term for term in simplified.split() if len(term) >= 4]
197 # Join with AND to create a basic search
198 return " ".join(terms[:5]) # Limit to top 5 terms
200 def _expand_time_window(self, time_filter: str) -> str:
201 """
202 Expand a time window to get more results.
204 Args:
205 time_filter: Current time filter
207 Returns:
208 Expanded time filter
209 """
210 # Parse current time window
211 import re
213 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)
214 if not match:
215 return '"last 10 years"[pdat]'
217 amount, unit = int(match.group(1)), match.group(2)
219 # Expand based on current unit
220 if unit == "months" or unit == "month":
221 if amount < 6:
222 return '"last 6 months"[pdat]'
223 if amount < 12:
224 return '"last 1 year"[pdat]'
225 return '"last 2 years"[pdat]'
226 if unit == "years" or unit == "year": 226 ↛ 233line 226 didn't jump to line 233 because the condition on line 226 was always true
227 if amount < 2:
228 return '"last 2 years"[pdat]'
229 if amount < 5:
230 return '"last 5 years"[pdat]'
231 return '"last 10 years"[pdat]'
233 return '"last 10 years"[pdat]'
def _optimize_query_for_pubmed(self, query: str) -> str:
    """
    Optimize a natural language query for PubMed search.
    Uses the LLM to transform questions into keyword-based queries and
    then cleans up the model output.

    Args:
        query: Natural language query

    Returns:
        Optimized query string for PubMed. The original query is returned
        unchanged when no LLM is configured, when optimization is
        disabled, or when any error occurs.
    """
    if not self.llm or not self.optimize_queries:
        # Return original query if no LLM available or optimization disabled
        return query

    try:
        # Prompt for query optimization
        prompt = f"""Transform this natural language question into an optimized PubMed search query.

Original query: "{query}"

CRITICAL RULES:
1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
2. DO NOT wrap the entire query in quotes
3. DO NOT include ANY date restrictions or year filters
4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
6. KEEP IT SIMPLE - use 2-3 main concepts maximum
7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
8. Use wildcards for variations: vaccin*[Title/Abstract]

EXAMPLE QUERIES:
✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
✗ BAD: "Here's a query to find articles about vaccines..."

Return ONLY the search query without any explanations.
"""

        # Get response from LLM (chat models expose .content, plain LLMs
        # return a string)
        response = self.llm.invoke(prompt)
        raw_response = (
            str(response.content)
            if hasattr(response, "content")
            else str(response)
        ).strip()

        # Keep only the first non-empty line; anything after it is
        # treated as explanation
        cleaned_lines = [
            line.strip() for line in raw_response.split("\n") if line.strip()
        ]

        if cleaned_lines:
            optimized_query = cleaned_lines[0]

            # Remove any quotes that wrap the entire query
            if optimized_query.startswith('"') and optimized_query.endswith(
                '"'
            ):
                optimized_query = optimized_query[1:-1]

            # Remove any explanation phrases that might be at the beginning
            explanation_starters = [
                "here is",
                "here's",
                "this query",
                "the following",
            ]
            for starter in explanation_starters:
                if optimized_query.lower().startswith(starter):
                    # The actual query typically follows a colon
                    colon_pos = optimized_query.find(":")
                    if colon_pos > 0:
                        optimized_query = optimized_query[
                            colon_pos + 1 :
                        ].strip()

            # Check if the query still seems to contain explanations
            if (
                len(optimized_query) > 200
                or "this query will" in optimized_query.lower()
            ):
                # Probably still an explanation: if it contains something
                # shaped like "(...) AND", keep only the sentences that
                # look like query syntax
                if re.findall(r"\([^)]+\)\s+AND\s+", optimized_query):
                    query_parts = [
                        part
                        for part in re.split(r"\.\s+", optimized_query)
                        if "(" in part
                        and ")" in part
                        and ("AND" in part or "OR" in part)
                    ]
                    if query_parts:
                        optimized_query = " ".join(query_parts)
        else:
            # Fall back to original query if cleaning fails
            logger.warning(
                "Failed to extract a clean query from LLM response"
            )
            optimized_query = query

        # Final safety check - if query looks too much like an explanation, use original
        if len(optimized_query.split()) > 30:
            logger.warning(
                "Query too verbose, falling back to simpler form"
            )
            # Create a simple query from the original
            words = [
                w
                for w in query.split()
                if len(w) > 3
                and w.lower()
                not in (
                    "what",
                    "are",
                    "the",
                    "and",
                    "for",
                    "with",
                    "from",
                    "have",
                    "been",
                    "recent",
                )
            ]
            optimized_query = " AND ".join(words[:3])

        # Basic cleanup: standardize field tag case for consistency
        optimized_query = re.sub(
            r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
        )
        optimized_query = re.sub(
            r"\[title/abstract\]",
            "[Title/Abstract]",
            optimized_query,
            flags=re.IGNORECASE,
        )
        optimized_query = re.sub(
            r"\[publication type\]",
            "[Publication Type]",
            optimized_query,
            flags=re.IGNORECASE,
        )

        # Fix unclosed quotes followed by field tags:
        #   "term[Field]  ->  "term"[Field]
        # The first alternative consumes correctly balanced "..." phrases
        # untouched, so a closing quote is never mistaken for an opening
        # one; the second closes a quote that runs straight into a tag.
        # Neither alternative may span a bracket, which keeps the match
        # from swallowing later field tags.
        def _close_dangling_quote(match):
            term = match.group(1)
            if term is None:
                # Balanced phrase - leave as is
                return match.group(0)
            return f'"{term}"'

        optimized_query = re.sub(
            r'"[^"\[\]]*"|"([^"\[\]]+)(?=\[)',
            _close_dangling_quote,
            optimized_query,
        )

        # Remember the optimized query on the instance
        self._simplify_query_cache = optimized_query

        # Log original and optimized queries
        logger.info("Original query: '{}'", query)
        logger.info(f"Optimized for PubMed: '{optimized_query}'")
        logger.debug(
            f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
        )

        return optimized_query

    except Exception:
        logger.exception("Error optimizing query")
        logger.debug(f"Falling back to original query: '{query}'")
        return query  # Fall back to original query on error
def _simplify_query(self, query: str) -> str:
    """
    Broaden a PubMed query that returned no results.

    The [Mesh] and [Publication Type] field tags are stripped
    (case-insensitively) so the affected terms are searched in all
    fields; [Title/Abstract] tags are kept. Whitespace is then collapsed.

    Args:
        query: The original query that returned no results

    Returns:
        Simplified query
    """
    logger.info(f"Simplifying query: {query}")
    logger.debug(f"Query simplification started for: '{query[:100]}...'")

    # Strip the restrictive field tags one after the other
    broadened = query
    for tag_pattern in (r"\[Mesh\]", r"\[Publication Type\]"):
        broadened = re.sub(tag_pattern, "", broadened, flags=re.IGNORECASE)

    # Collapse the whitespace runs left behind by the removed tags
    broadened = re.sub(r"\s+", " ", broadened).strip()

    if broadened == query:
        logger.debug("No simplification possible, returning original query")

    logger.info(f"Simplified query: {broadened}")
    logger.debug(
        f"Query simplified from {len(query)} to {len(broadened)} chars"
    )
    return broadened
451 def _is_historical_focused(self, query: str) -> bool:
452 """
453 Determine if a query is specifically focused on historical/older information using LLM.
454 Default assumption is that queries should prioritize recent information unless
455 explicitly asking for historical content.
457 Args:
458 query: The search query
460 Returns:
461 Boolean indicating if the query is focused on historical information
462 """
463 if not self.llm:
464 # Fall back to basic keyword check if no LLM available
465 historical_terms = [
466 "history",
467 "historical",
468 "early",
469 "initial",
470 "first",
471 "original",
472 "before",
473 "prior to",
474 "origins",
475 "evolution",
476 "development",
477 ]
478 historical_years = [str(year) for year in range(1900, 2020)]
480 query_lower = query.lower()
481 has_historical_term = any(
482 term in query_lower for term in historical_terms
483 )
484 has_past_year = any(year in query for year in historical_years)
486 return has_historical_term or has_past_year
488 try:
489 # Use LLM to determine if the query is focused on historical information
490 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
492Query: "{query}"
494Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
495Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
497The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
498"""
500 response = self.llm.invoke(prompt)
501 answer = (
502 (
503 str(response.content)
504 if hasattr(response, "content")
505 else str(response)
506 )
507 .strip()
508 .lower()
509 )
511 # Log the determination
512 logger.info(f"Historical focus determination for query: '{query}'")
513 logger.info(f"LLM determined historical focus: {answer}")
515 return "yes" in answer
517 except Exception:
518 logger.exception("Error determining historical focus")
519 # Fall back to basic keyword check
520 historical_terms = [
521 "history",
522 "historical",
523 "early",
524 "initial",
525 "first",
526 "original",
527 "before",
528 "prior to",
529 "origins",
530 "evolution",
531 "development",
532 ]
533 return any(term in query.lower() for term in historical_terms)
def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
    """
    Search PubMed with a recency window chosen from the topic's volume.

    Historical queries are searched without a date filter. Otherwise the
    more hits the bare query has, the tighter the window. If the windowed
    search returns fewer than five IDs the window is widened once, and if
    nothing at all is found the query is retried without any window.

    Args:
        query: The search query (already optimized)

    Returns:
        Tuple of (list of PMIDs, search strategy used)
    """
    # Estimate topic volume first, then ask whether the query is historical
    estimated_volume = self._get_result_count(query)
    wants_history = self._is_historical_focused(query)

    if wants_history:
        # User wants historical information - no date filtering
        time_filter, strategy = None, "historical_focus"
    else:
        # (volume threshold, window, strategy), tightest window first
        tiers = (
            (5000, '"last 1 year"[pdat]', "high_volume"),
            (1000, '"last 3 years"[pdat]', "common_topic"),
            (100, '"last 5 years"[pdat]', "moderate_volume"),
        )
        # Rare topic default: still prefer recency but with the widest range
        time_filter, strategy = '"last 10 years"[pdat]', "rare_topic"
        for threshold, window, label in tiers:
            if estimated_volume > threshold:
                time_filter, strategy = window, label
                break

    if not time_filter:
        # Historical query - run without time filter
        logger.info(
            "Using historical search strategy without date filtering"
        )
        return self._search_pubmed(query), strategy

    # Try with adaptive time filter
    logger.info(
        f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
    )
    results = self._search_pubmed(f"({query}) AND {time_filter}")

    # If too few results, expand the time window (unless already widest)
    if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
        logger.info(
            f"Insufficient results ({len(results)}), expanding time window"
        )
        expanded_time = self._expand_time_window(time_filter)
        expanded_results = self._search_pubmed(
            f"({query}) AND {expanded_time}"
        )

        if len(expanded_results) > len(results):
            logger.info(
                f"Expanded time window yielded {len(expanded_results)} results"
            )
            return expanded_results, f"{strategy}_expanded"

    # If still no results, try without time filter
    if not results:
        logger.info(
            "No results with time filter, trying without time restrictions"
        )
        results = self._search_pubmed(query)
        strategy = "no_time_filter"

    return results, strategy
def _search_pubmed(self, query: str) -> List[str]:
    """
    Run an ESearch request and return the matching PubMed IDs.

    Args:
        query: The search query

    Returns:
        List of PubMed IDs matching the query (at most ``self.max_results``
        are requested); an empty list if the request or parsing fails.
    """
    try:
        request_params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": self.max_results,
            "usehistory": "y",
        }

        # An API key is optional
        if not self.api_key:
            logger.debug("No PubMed API key - using default rate limits")
        else:
            request_params["api_key"] = self.api_key
            logger.debug("Using PubMed API key for higher rate limits")

        # Optional relative date restriction on the publication date
        if self.days_limit:
            request_params["reldate"] = self.days_limit
            request_params["datetype"] = "pdat"
            logger.debug(f"Limiting results to last {self.days_limit} days")

        logger.debug(
            f"PubMed search query: '{query}' with max_results={self.max_results}"
        )

        # Wait as instructed by the shared rate limiter
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        logger.debug(f"Sending request to PubMed API: {self.search_url}")
        response = safe_get(self.search_url, params=request_params)
        response.raise_for_status()
        logger.debug(f"PubMed API response status: {response.status_code}")

        search_result = response.json()["esearchresult"]
        pmids: list[str] = search_result["idlist"]
        total_count = search_result.get("count", "unknown")

        logger.info(
            f"PubMed search for '{query}' found {len(pmids)} results (total available: {total_count})"
        )
        if pmids:
            logger.debug(f"First 5 PMIDs: {pmids[:5]}")
        return pmids

    except Exception:
        logger.exception(f"Error searching PubMed for query '{query}'")
        return []
def _get_article_summaries(
    self, id_list: List[str]
) -> List[Dict[str, Any]]:
    """
    Fetch ESummary records for a list of PubMed article IDs.

    Args:
        id_list: List of PubMed IDs

    Returns:
        List of article summary dictionaries, in the order of ``id_list``;
        IDs missing from the response are skipped with a warning. An empty
        list is returned for empty input or on a non-rate-limit error.

    Raises:
        RateLimitError: If the failure message looks like rate limiting
            (429/503/403 or matching phrases).
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_summaries")
        return []

    logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

    try:
        request_params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "json",
            "rettype": "summary",
        }
        if self.api_key:
            request_params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        logger.debug(f"Requesting summaries from: {self.summary_url}")
        response = safe_get(self.summary_url, params=request_params)
        response.raise_for_status()
        logger.debug(f"Summary API response status: {response.status_code}")

        payload = response.json()
        logger.debug(
            f"PubMed API returned data for {len(id_list)} requested IDs"
        )
        records = payload["result"]
        summaries: List[Dict[str, Any]] = []

        for pmid in id_list:
            if pmid not in records:
                logger.warning(
                    f"PMID {pmid} not found in PubMed API response"
                )
                continue

            article = records[pmid]
            logger.debug(
                f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
            )

            # Author names, when the record lists any
            author_names = (
                [author["name"] for author in article["authors"]]
                if "authors" in article
                else []
            )

            # Prefer the top-level DOI; otherwise take the first
            # articleids entry whose idtype is "doi"
            doi = article.get("doi", "")
            if not doi and "articleids" in article:
                doi = next(
                    (
                        aid.get("value", "")
                        for aid in article["articleids"]
                        if aid.get("idtype") == "doi"
                    ),
                    doi,
                )

            summaries.append(
                {
                    "id": pmid,
                    "title": article.get("title", ""),
                    "pubdate": article.get("pubdate", ""),
                    "epubdate": article.get("epubdate", ""),
                    "source": article.get("source", ""),
                    "authors": author_names,
                    "lastauthor": article.get("lastauthor", ""),
                    "journal": article.get("fulljournalname", ""),
                    "volume": article.get("volume", ""),
                    "issue": article.get("issue", ""),
                    "pages": article.get("pages", ""),
                    "doi": doi,
                    "issn": article.get("issn", ""),
                    "essn": article.get("essn", ""),
                    # Publication types from esummary
                    "pubtype": article.get("pubtype", []),
                    "recordstatus": article.get("recordstatus", ""),
                    "lang": article.get("lang", []),
                    "pmcrefcount": article.get("pmcrefcount", None),
                    "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                }
            )

        return summaries

    except Exception as e:
        error_msg = str(e)
        logger.exception(
            f"Error getting article summaries for {len(id_list)} articles"
        )

        # Surface rate limiting to the caller instead of hiding it behind
        # an empty result
        lowered = error_msg.lower()
        rate_limit_markers = (
            "429",
            "too many requests",
            "rate limit",
            "service unavailable",
            "503",
            "403",
        )
        if any(marker in lowered for marker in rate_limit_markers):
            raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

        return []
def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
    """
    Get abstracts for a list of PubMed article IDs.

    A single-section abstract is returned as plain text. A structured
    abstract (several AbstractText sections) is joined with blank lines,
    each section prefixed by its Label when it has one. Every section
    appears exactly once.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their abstracts. Articles
        without abstract text are omitted; an empty dict is returned for
        empty input or if the request or parsing fails.
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_abstracts")
        return {}

    logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")

    try:
        # Prepare parameters
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "abstract",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        # Execute request
        logger.debug(f"Requesting abstracts from: {self.fetch_url}")
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()
        logger.debug(
            f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
        )

        # Parse XML response
        root = ET.fromstring(response.text)
        logger.debug(
            f"Parsing abstracts from XML for {len(id_list)} articles"
        )

        abstracts: Dict[str, str] = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            abstract_sections = article.findall(".//AbstractText")
            is_structured = len(abstract_sections) > 1
            if is_structured:
                logger.debug(
                    f"Article {pmid} has {len(abstract_sections)} abstract sections"
                )

            # Build the abstract from the sections in one pass so that no
            # section is emitted twice.
            parts = []
            for section in abstract_sections:
                # itertext() also collects text inside inline markup
                # (<i>, <sup>, ...), which .text alone would cut off
                section_text = "".join(section.itertext())
                if not section_text.strip():
                    continue

                # Labels are only used for structured abstracts
                label = section.get("Label") if is_structured else None
                parts.append(
                    f"{label}: {section_text}" if label else section_text
                )

            abstract_text = "\n\n".join(parts)

            # Store in dictionary
            if pmid and abstract_text:
                abstracts[pmid] = abstract_text
                logger.debug(
                    f"Abstract for {pmid}: {len(abstract_text)} chars"
                )
            elif pmid:
                logger.warning(f"No abstract found for PMID {pmid}")

        logger.info(
            f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
        )
        return abstracts

    except Exception:
        logger.exception(
            f"Error getting article abstracts for {len(id_list)} articles"
        )
        return {}
def _get_article_detailed_metadata(
    self, id_list: List[str]
) -> Dict[str, Dict[str, Any]]:
    """
    Fetch the EFetch XML records for the given articles and extract
    publication types, MeSH terms, keywords, affiliations, grants,
    PMC availability and the conflict-of-interest statement.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their detailed metadata. A key is
        only present in an article's metadata when the record contains
        that information. An empty dict is returned for empty input or if
        the request or parsing fails.
    """
    if not id_list:
        return {}

    def texts(parent, path):
        """Collect the non-empty .text of every element matching path."""
        return [node.text for node in parent.findall(path) if node.text]

    try:
        request_params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "medline",
        }
        if self.api_key:
            request_params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        response = safe_get(self.fetch_url, params=request_params)
        response.raise_for_status()

        root = ET.fromstring(response.text)

        metadata = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            if pmid_elem is None or pmid_elem.text is None:
                continue
            pmid = pmid_elem.text

            article_metadata: Dict[str, Any] = {}

            # Publication types
            pub_types = texts(article, ".//PublicationType")
            if pub_types:
                article_metadata["publication_types"] = pub_types

            # MeSH terms: one descriptor name per heading
            descriptors = (
                heading.find(".//DescriptorName")
                for heading in article.findall(".//MeshHeading")
            )
            mesh_terms = [
                descriptor.text
                for descriptor in descriptors
                if descriptor is not None and descriptor.text
            ]
            if mesh_terms:
                article_metadata["mesh_terms"] = mesh_terms

            # Keywords
            keywords = texts(article, ".//Keyword")
            if keywords:
                article_metadata["keywords"] = keywords

            # Affiliations
            affiliations = texts(article, ".//Affiliation")
            if affiliations:
                article_metadata["affiliations"] = affiliations

            # Grants: keep whichever of id/agency is present
            grants = []
            for grant in article.findall(".//Grant"):
                grant_info = {}
                for key, path in (
                    ("id", ".//GrantID"),
                    ("agency", ".//Agency"),
                ):
                    node = grant.find(path)
                    if node is not None and node.text:
                        grant_info[key] = node.text
                if grant_info:
                    grants.append(grant_info)
            if grants:
                article_metadata["grants"] = grants

            # A PMC article ID is taken to mean free full text is available
            pmc_elem = article.find(".//ArticleId[@IdType='pmc']")
            if pmc_elem is not None:
                article_metadata["has_free_full_text"] = True
                article_metadata["pmc_id"] = pmc_elem.text

            # Conflict of interest statement
            coi_elem = article.find(".//CoiStatement")
            if coi_elem is not None and coi_elem.text:
                article_metadata["conflict_of_interest"] = coi_elem.text

            metadata[pmid] = article_metadata

        return metadata

    except Exception:
        logger.exception("Error getting detailed article metadata")
        return {}
1032 def _create_enriched_content(
1033 self, result: Dict[str, Any], base_content: str
1034 ) -> str:
1035 """
1036 Create enriched content by adding relevant metadata context to help the LLM.
1038 Args:
1039 result: The result dictionary with metadata
1040 base_content: The base content (abstract or full text)
1042 Returns:
1043 Enriched content string with metadata context
1044 """
1045 enriched_parts = []
1047 # Add study type information
1048 if "publication_types" in result:
1049 pub_types = result["publication_types"]
1050 # Filter for significant types
1051 significant_types = [
1052 pt
1053 for pt in pub_types
1054 if any(
1055 key in pt.lower()
1056 for key in [
1057 "clinical trial",
1058 "randomized",
1059 "meta-analysis",
1060 "systematic review",
1061 "case report",
1062 "guideline",
1063 "comparative study",
1064 "multicenter",
1065 ]
1066 )
1067 ]
1068 if significant_types:
1069 enriched_parts.append(
1070 f"[Study Type: {', '.join(significant_types)}]"
1071 )
1073 # Add the main content
1074 enriched_parts.append(base_content)
1076 # Add metadata footer
1077 metadata_footer = []
1079 # Add ALL MeSH terms
1080 if "mesh_terms" in result and len(result["mesh_terms"]) > 0:
1081 metadata_footer.append(
1082 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"
1083 )
1085 # Add ALL keywords
1086 if "keywords" in result and len(result["keywords"]) > 0:
1087 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")
1089 # Add ALL affiliations
1090 if "affiliations" in result and len(result["affiliations"]) > 0:
1091 if len(result["affiliations"]) == 1:
1092 metadata_footer.append(
1093 f"Institution: {result['affiliations'][0]}"
1094 )
1095 else:
1096 affiliations_text = "\n - " + "\n - ".join(
1097 result["affiliations"]
1098 )
1099 metadata_footer.append(f"Institutions:{affiliations_text}")
1101 # Add ALL funding information with full details
1102 if "grants" in result and len(result["grants"]) > 0:
1103 grant_details = []
1104 for grant in result["grants"]:
1105 grant_text = []
1106 if "agency" in grant:
1107 grant_text.append(grant["agency"])
1108 if "id" in grant:
1109 grant_text.append(f"(Grant ID: {grant['id']})")
1110 if grant_text:
1111 grant_details.append(" ".join(grant_text))
1112 if grant_details:
1113 if len(grant_details) == 1:
1114 metadata_footer.append(f"Funded by: {grant_details[0]}")
1115 else:
1116 funding_text = "\n - " + "\n - ".join(grant_details)
1117 metadata_footer.append(f"Funding Sources:{funding_text}")
1119 # Add FULL conflict of interest statement
1120 if "conflict_of_interest" in result:
1121 coi_text = result["conflict_of_interest"]
1122 if coi_text:
1123 # Still skip trivial "no conflict" statements to reduce noise
1124 if not any(
1125 phrase in coi_text.lower()
1126 for phrase in [
1127 "no conflict",
1128 "no competing",
1129 "nothing to disclose",
1130 "none declared",
1131 "authors declare no",
1132 ]
1133 ):
1134 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1135 elif (
1136 "but" in coi_text.lower()
1137 or "except" in coi_text.lower()
1138 or "however" in coi_text.lower()
1139 ):
1140 # Include if there's a "no conflict BUT..." type statement
1141 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1143 # Combine everything
1144 if metadata_footer:
1145 enriched_parts.append("\n---\nStudy Metadata:")
1146 enriched_parts.extend(metadata_footer)
1148 return "\n".join(enriched_parts)
1150 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
1151 """
1152 Find PMC IDs for the given PubMed IDs (for full-text access).
1154 Args:
1155 pmid_list: List of PubMed IDs
1157 Returns:
1158 Dictionary mapping PubMed IDs to their PMC IDs (if available)
1159 """
1160 if not pmid_list or not self.get_full_text:
1161 return {}
1163 try:
1164 # Prepare parameters
1165 params = {
1166 "dbfrom": "pubmed",
1167 "db": "pmc",
1168 "linkname": "pubmed_pmc",
1169 "id": ",".join(pmid_list),
1170 "retmode": "json",
1171 }
1173 # Add API key if available
1174 if self.api_key: 1174 ↛ 1175line 1174 didn't jump to line 1175 because the condition on line 1174 was never true
1175 params["api_key"] = self.api_key
1177 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1178 self.engine_type
1179 )
1181 # Execute request
1182 response = safe_get(self.link_url, params=params)
1183 response.raise_for_status()
1185 # Parse response
1186 data = response.json()
1188 # Map PubMed IDs to PMC IDs
1189 pmid_to_pmcid = {}
1191 for linkset in data.get("linksets", []):
1192 pmid = linkset.get("ids", [None])[0]
1194 if not pmid: 1194 ↛ 1195line 1194 didn't jump to line 1195 because the condition on line 1194 was never true
1195 continue
1197 for link in linkset.get("linksetdbs", []):
1198 if link.get("linkname") == "pubmed_pmc": 1198 ↛ 1197line 1198 didn't jump to line 1197 because the condition on line 1198 was always true
1199 pmcids = link.get("links", [])
1200 if pmcids: 1200 ↛ 1197line 1200 didn't jump to line 1197 because the condition on line 1200 was always true
1201 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
1203 logger.info(
1204 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
1205 )
1206 return pmid_to_pmcid
1208 except Exception:
1209 logger.exception("Error finding PMC IDs")
1210 return {}
def _get_pmc_full_text(self, pmcid: str) -> str:
    """
    Get full text for a PMC article.

    The article XML is fetched from ``self.fetch_url`` and flattened into a
    markdown-like string: ``# title``, an ``## Abstract`` block, then the
    body in document order with one ``## heading`` per ``<sec>``.

    The body is walked exactly once, so paragraphs that live in nested
    sections are emitted a single time (under their own section heading),
    and paragraphs placed directly under ``<body>`` are kept.

    Args:
        pmcid: PMC ID of the article

    Returns:
        Full text content or empty string if not available (any error is
        logged and results in an empty string)
    """

    def flatten(element) -> str:
        # Include text inside inline markup (<italic>, <sup>, ...).
        return "".join(element.itertext())

    try:
        # Prepare parameters
        params = {
            "db": "pmc",
            "id": pmcid,
            "retmode": "xml",
            "rettype": "full",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute request
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)

        # Collected text chunks (headings and paragraphs)
        full_text: List[str] = []

        # Extract article title
        title_elem = root.find(".//article-title")
        if title_elem is not None:
            article_title = flatten(title_elem).strip()
            if article_title:
                full_text.append(f"# {article_title}")

        # Extract abstract
        abstract_paras = root.findall(".//abstract//p")
        if abstract_paras:
            full_text.append("\n## Abstract\n")
            for para in abstract_paras:
                text = flatten(para)
                if text.strip():
                    full_text.append(text)

        def walk(container) -> None:
            # Depth-first, document-order traversal. Each <p> is emitted
            # once; its own descendants are not visited again.
            for child in container:
                if child.tag == "sec":
                    # Only the section's own (direct child) title is a
                    # heading; nested titles belong to nested elements.
                    heading = child.find("title")
                    if heading is not None:
                        heading_text = flatten(heading).strip()
                        if heading_text:
                            full_text.append(f"\n## {heading_text}\n")
                    walk(child)
                elif child.tag == "p":
                    text = flatten(child)
                    if text.strip():
                        full_text.append(text)
                else:
                    # Paragraphs may sit inside wrappers such as lists
                    # or captions.
                    walk(child)

        # Extract body content
        body = root.find(".//body")
        if body is not None:
            walk(body)

        result_text = "\n\n".join(full_text)
        logger.debug(
            f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
        )
        return result_text

    except Exception:
        logger.exception("Error getting PMC full text")
        return ""
def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Build preview entries for the PubMed articles matching a query.

    The query is passed through ``_optimize_query_for_pubmed`` and then
    ``_adaptive_search``; when that yields nothing, a simplified query is
    tried once. Summaries and abstracts are fetched for the resulting PMIDs
    and combined into one preview dictionary per summary.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries (empty when no PMIDs were found)
    """
    logger.info(f"Getting PubMed previews for query: {query}")

    # Rewrite the query for PubMed (details live in the helper)
    pubmed_query = self._optimize_query_for_pubmed(query)

    # Perform adaptive search
    pmid_list, strategy = self._adaptive_search(pubmed_query)

    # Nothing found: retry once with a simplified query, if it differs
    if not pmid_list:
        logger.warning(f"No PubMed results found using strategy: {strategy}")
        fallback_query = self._simplify_query(pubmed_query)
        if fallback_query != pubmed_query:
            logger.info(f"Trying with simplified query: {fallback_query}")
            pmid_list, strategy = self._adaptive_search(fallback_query)
            if pmid_list:
                logger.info(f"Simplified query found {len(pmid_list)} results")

    if not pmid_list:
        logger.warning("No PubMed results found after query simplification")
        return []

    # Get article summaries
    logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
    summaries = self._get_article_summaries(pmid_list)
    logger.debug(f"Retrieved {len(summaries)} summaries")

    # Abstracts are always fetched: they are embedded in every snippet
    logger.debug(
        f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
    )
    abstracts = self._get_article_abstracts(pmid_list)
    logger.debug(f"Retrieved {len(abstracts)} abstracts")

    # (label, summary key) pairs that make up the optional citation part
    citation_fields = (("Vol", "volume"), ("Issue", "issue"), ("pp", "pages"))

    previews = []
    for summary in summaries:
        meta_parts = []

        # First publication type from the summary becomes a "[type] " prefix
        type_tag = ""
        if self.include_publication_type_in_context and summary.get("pubtype"):
            type_tag = f"[{summary['pubtype'][0]}] "

        # Authors, capped at 100 characters
        if self.include_authors_in_context and summary.get("authors"):
            author_line = ", ".join(summary.get("authors", []))
            if len(author_line) > 100:
                author_line = author_line[:97] + "..."
            meta_parts.append(author_line)

        # Journal
        if self.include_journal_in_context and summary.get("journal"):
            meta_parts.append(summary["journal"])

        # Publication date: the full value, or its first four characters
        pubdate = summary.get("pubdate")
        if pubdate:
            if self.include_full_date_in_context:
                meta_parts.append(pubdate)
            elif self.include_year_in_context and len(pubdate) >= 4:
                meta_parts.append(pubdate[:4])

        # Citation details (volume / issue / pages)
        if self.include_citation_in_context:
            citation_bits = [
                f"{label} {summary[key]}"
                for label, key in citation_fields
                if summary.get(key)
            ]
            if citation_bits:
                meta_parts.append(f"({', '.join(citation_bits)})")

        # Join the parts; author lists read better with a period separator
        if not meta_parts:
            meta_line = "Research article"
        elif self.include_authors_in_context:
            meta_line = ". ".join(meta_parts)
        else:
            meta_line = " - ".join(meta_parts)

        meta_line = type_tag + meta_line

        # Flag non-English articles with the upper-cased language code
        if self.include_language_in_context and summary.get("lang"):
            primary_lang = summary["lang"][0]
            if primary_lang and primary_lang != "eng":
                meta_line = f"{meta_line} [{primary_lang.upper()}]"

        # Optional identifiers
        id_bits = []
        if self.include_pmid_in_context and summary.get("id"):
            id_bits.append(f"PMID: {summary['id']}")
        if self.include_doi_in_context and summary.get("doi"):
            id_bits.append(f"DOI: {summary['doi']}")
        if id_bits:
            meta_line = f"{meta_line} | {' | '.join(id_bits)}"

        # Title and abstract always go into the snippet for LLM analysis
        pmid = summary["id"]
        title = summary["title"]
        abstract_text = abstracts.get(pmid, "")

        # Cap the abstract at 500 characters
        if len(abstract_text) > 500:
            abstract_text = abstract_text[:497] + "..."

        if abstract_text:
            enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {meta_line}"
        else:
            enriched_snippet = f"Title: {title}\n\nMetadata: {meta_line}"

        # Log the complete snippet for debugging
        logger.debug(f"Complete snippet for PMID {pmid}:")
        logger.debug(f" Title: {title[:100]}...")
        logger.debug(f" Abstract length: {len(abstract_text)} chars")
        logger.debug(f" Metadata: {meta_line}")
        logger.debug(
            f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
        )

        journal = summary.get("journal")
        previews.append(
            {
                "id": pmid,
                "title": title,
                "link": summary["link"],
                "snippet": enriched_snippet,
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                # Alias read by the journal reputation filter; None (not
                # an empty string) marks a missing journal.
                "journal_ref": journal or None,
                # Print ISSN first, electronic ISSN as fallback, for the
                # reputation filter's ISSN-keyed lookups.
                "issn": summary.get("issn") or summary.get("essn") or None,
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                "_pmid": pmid,  # Temporary: used when fetching full content
                "_search_strategy": strategy,  # Temporary: for analytics
            }
        )

    logger.info(
        f"Found {len(previews)} PubMed previews using strategy: {strategy}"
    )
    if previews:
        logger.debug(
            f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
        )
    return previews
def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant PubMed articles.
    Efficiently manages which content to retrieve (abstracts and/or full text).

    For every item a copy is enriched with detailed metadata, the abstract
    and, for at most ``self.full_text_limit`` articles, the PMC full text.
    Whenever an abstract is available it is stored as the baseline
    ``content`` / ``full_content``; retrieved full text replaces it. This
    guarantees that an article with a PMC ID that is not fetched (limit
    reached) or whose full-text fetch fails still carries its abstract.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content. The temporary
        ``_pmid`` and ``_search_strategy`` keys are removed.
    """
    # Check if we should add full content
    snippets_only_mode = bool(
        getattr(search_config, "SEARCH_SNIPPETS_ONLY", False)
    )

    if snippets_only_mode:
        # Abstracts are still fetched (they serve as snippets); only the
        # full-text retrieval is skipped.
        logger.info(
            "Snippet-only mode enabled, will fetch abstracts as snippets"
        )

    logger.info(f"Getting content for {len(relevant_items)} PubMed articles")

    # Collect all PMIDs for relevant items
    pmids = [item["_pmid"] for item in relevant_items if "_pmid" in item]

    # Get abstracts if requested; in snippet-only mode always
    abstracts: Dict[str, Any] = {}
    if (self.get_abstracts or snippets_only_mode) and pmids:
        abstracts = self._get_article_abstracts(pmids)

    # Get detailed metadata (publication types, MeSH terms, etc.)
    detailed_metadata: Dict[str, Any] = {}
    if pmids:
        detailed_metadata = self._get_article_detailed_metadata(pmids)

    # Find PMC IDs for full-text retrieval (enabled, not snippet-only)
    pmid_to_pmcid: Dict[str, str] = {}
    if self.get_full_text and pmids and not snippets_only_mode:
        pmid_to_pmcid = self._find_pmc_ids(pmids)

    results: List[Dict[str, Any]] = []
    # Running count of results carrying full text, so the limit check
    # does not rescan ``results`` for every item.
    full_text_count = 0

    for item in relevant_items:
        result = item.copy()
        pmid = item.get("_pmid", "")

        # Add detailed metadata if available
        if pmid in detailed_metadata:
            metadata = detailed_metadata[pmid]

            # Publication types (e.g., "Clinical Trial", "Meta-Analysis")
            if "publication_types" in metadata:
                result["publication_types"] = metadata["publication_types"]

                # Prefix the snippet with the first publication type
                if (
                    self.include_publication_type_in_context
                    and metadata["publication_types"]
                ):
                    pub_type = metadata["publication_types"][0]
                    if "snippet" in result:
                        result["snippet"] = (
                            f"[{pub_type}] {result['snippet']}"
                        )

            # MeSH terms for medical categorization
            if "mesh_terms" in metadata:
                result["mesh_terms"] = metadata["mesh_terms"]

                if (
                    self.include_mesh_terms_in_context
                    and metadata["mesh_terms"]
                ):
                    # A non-positive max_mesh_terms means "show all"
                    mesh_to_show = (
                        metadata["mesh_terms"][: self.max_mesh_terms]
                        if self.max_mesh_terms > 0
                        else metadata["mesh_terms"]
                    )
                    if mesh_to_show and "snippet" in result:
                        mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                        result["snippet"] = (
                            f"{result['snippet']} | {mesh_text}"
                        )

            # Keywords
            if "keywords" in metadata:
                result["keywords"] = metadata["keywords"]

                if self.include_keywords_in_context and metadata["keywords"]:
                    # A non-positive max_keywords means "show all"
                    keywords_to_show = (
                        metadata["keywords"][: self.max_keywords]
                        if self.max_keywords > 0
                        else metadata["keywords"]
                    )
                    if keywords_to_show and "snippet" in result:
                        keywords_text = "Keywords: " + ", ".join(
                            keywords_to_show
                        )
                        result["snippet"] = (
                            f"{result['snippet']} | {keywords_text}"
                        )

            # Affiliations, funding and conflict of interest are copied
            # as-is.
            for key in ("affiliations", "grants", "conflict_of_interest"):
                if key in metadata:
                    result[key] = metadata[key]

            # Free full text availability
            if "has_free_full_text" in metadata:
                result["has_free_full_text"] = metadata["has_free_full_text"]
                if "pmc_id" in metadata:
                    result["pmc_id"] = metadata["pmc_id"]

                if (
                    self.include_pmc_availability_in_context
                    and metadata["has_free_full_text"]
                    and "snippet" in result
                ):
                    result["snippet"] = (
                        f"{result['snippet']} | [Free Full Text]"
                    )

        # Add abstract if available
        if pmid in abstracts:
            abstract = abstracts[pmid]
            result["abstract"] = abstract

            # Create enriched content with metadata context
            enriched_content = self._create_enriched_content(result, abstract)

            # Title and (length-capped) abstract always go into the snippet
            title = result.get("title", "")
            abstract_text = abstract[:SNIPPET_LENGTH_LONG]

            if "snippet" in result:
                # Keep metadata snippet and add content
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                )
            else:
                # No metadata snippet, just title and abstract
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}"
                )

            # Baseline content is the enriched abstract. Setting it
            # unconditionally keeps articles that have a PMC ID but are
            # past the full-text limit from ending up with no content.
            result["full_content"] = enriched_content
            result["content"] = enriched_content
            result["content_type"] = "abstract"

        # Add full text for a limited number of top articles
        if (
            pmid in pmid_to_pmcid
            and self.get_full_text
            and full_text_count < self.full_text_limit
        ):
            pmcid = pmid_to_pmcid[pmid]
            full_text = self._get_pmc_full_text(pmcid)

            if full_text:
                enriched_full_text = self._create_enriched_content(
                    result, full_text
                )
                result["full_content"] = enriched_full_text
                result["content"] = enriched_full_text
                result["content_type"] = "full_text"
                result["pmcid"] = pmcid
            # Otherwise the abstract baseline set above (if any) stays.

        # Remove temporary fields
        result.pop("_pmid", None)
        result.pop("_search_strategy", None)

        if result.get("content_type") == "full_text":
            full_text_count += 1
        results.append(result)

    return results
def search_by_author(
    self, author_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search restricted to one author via the ``[Author]`` field tag.

    ``self.max_results`` is overridden for the duration of the call when a
    truthy ``max_results`` is given, and always restored afterwards.

    Args:
        author_name: Name of the author
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles by the author
    """
    saved_limit = self.max_results
    try:
        if max_results:
            self.max_results = max_results
        return self.run(f"{author_name}[Author]")
    finally:
        # Put the engine-wide limit back, even if the search raised
        self.max_results = saved_limit
def search_by_journal(
    self, journal_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search restricted to one journal via the ``[Journal]`` field tag.

    ``self.max_results`` is overridden for the duration of the call when a
    truthy ``max_results`` is given, and always restored afterwards.

    Args:
        journal_name: Name of the journal
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles from the journal
    """
    saved_limit = self.max_results
    try:
        if max_results:
            self.max_results = max_results
        return self.run(f"{journal_name}[Journal]")
    finally:
        # Put the engine-wide limit back, even if the search raised
        self.max_results = saved_limit
def search_recent(
    self, query: str, days: int = 30, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search with ``self.days_limit`` temporarily set to ``days``.

    ``self.max_results`` is also overridden when a truthy ``max_results`` is
    given. Both attributes are restored once the search returns or raises.

    Args:
        query: The search query
        days: Number of days to look back
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of recent articles matching the query
    """
    saved_limit, saved_days = self.max_results, self.days_limit
    try:
        if max_results:
            self.max_results = max_results
        # Date window for this search only
        self.days_limit = days
        return self.run(query)
    finally:
        # Restore the engine-wide settings
        self.max_results = saved_limit
        self.days_limit = saved_days
def advanced_search(
    self, terms: Dict[str, str], max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Perform an advanced search with field-specific terms.

    Each ``field -> term`` entry becomes ``term[field]``; the pieces are
    combined with ``AND`` in the dictionary's iteration order.
    ``self.max_results`` is overridden for the duration of the call when a
    truthy ``max_results`` is given, and always restored afterwards.

    Args:
        terms: Dictionary mapping fields to search terms
            Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles matching the advanced query
    """
    saved_limit = self.max_results
    try:
        if max_results:
            self.max_results = max_results
        # Build advanced query string
        tagged_terms = [f"{term}[{field}]" for field, term in terms.items()]
        return self.run(" AND ".join(tagged_terms))
    finally:
        # Put the engine-wide limit back, even if the search raised
        self.max_results = saved_limit