Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubmed.py: 10%
707 statements
coverage.py v7.12.0, created at 2026-01-11 00:51 +0000
1 import re
2 import xml.etree.ElementTree as ET
3 from typing import Any, Dict, List, Optional, Tuple
5 from langchain_core.language_models import BaseLLM
6 from loguru import logger
8 from ...config import search_config
9 from ...security.safe_requests import safe_get
10 from ..rate_limiting import RateLimitError
11 from ..search_engine_base import BaseSearchEngine
14 class PubMedSearchEngine(BaseSearchEngine):
15 """
16 PubMed search engine implementation with two-phase approach and adaptive search.
17 Provides efficient access to biomedical literature while minimizing API usage.
18 """
20 # Mark as public search engine
21 is_public = True
22 # Scientific/medical search engine
23 is_scientific = True
25 def __init__(
26 self,
27 max_results: int = 10,
28 api_key: Optional[str] = None,
29 days_limit: Optional[int] = None,
30 get_abstracts: bool = True,
31 get_full_text: bool = False,
32 full_text_limit: int = 3,
33 llm: Optional[BaseLLM] = None,
34 max_filtered_results: Optional[int] = None,
35 optimize_queries: bool = True,
36 include_publication_type_in_context: bool = True,
37 include_journal_in_context: bool = True,
38 include_year_in_context: bool = True,
39 include_authors_in_context: bool = False,
40 include_full_date_in_context: bool = False,
41 include_mesh_terms_in_context: bool = True,
42 include_keywords_in_context: bool = True,
43 include_doi_in_context: bool = False,
44 include_pmid_in_context: bool = False,
45 include_pmc_availability_in_context: bool = False,
46 max_mesh_terms: int = 3,
47 max_keywords: int = 3,
48 include_citation_in_context: bool = False,
49 include_language_in_context: bool = False,
50 ):
51 """
52 Initialize the PubMed search engine.
54 Args:
55 max_results: Maximum number of search results
56 api_key: NCBI API key for higher rate limits (optional)
57 days_limit: Limit results to N days (optional)
58 get_abstracts: Whether to fetch abstracts for all results
59 get_full_text: Whether to fetch full text content (when available in PMC)
60 full_text_limit: Max number of full-text articles to retrieve
61 llm: Language model for relevance filtering
62 max_filtered_results: Maximum number of results to keep after filtering
63 optimize_queries: Whether to optimize natural language queries for PubMed
include_*_in_context: Flags controlling which metadata (publication type, journal, year, authors, full date, MeSH terms, keywords, DOI, PMID, PMC availability, citation details, language) is appended to result snippets
max_mesh_terms: Maximum MeSH terms to include in context (0 or less = all)
max_keywords: Maximum keywords to include in context (0 or less = all)
64 """
65 # Initialize the BaseSearchEngine with LLM, max_filtered_results, and max_results
66 super().__init__(
67 llm=llm,
68 max_filtered_results=max_filtered_results,
69 max_results=max_results,
70 )
71 self.max_results = max(self.max_results, 25)  # Enforce a floor of 25 results regardless of the requested maximum
72 self.api_key = api_key
73 self.days_limit = days_limit
74 self.get_abstracts = get_abstracts
75 self.get_full_text = get_full_text
76 self.full_text_limit = full_text_limit
77 self.optimize_queries = optimize_queries
78 self.include_publication_type_in_context = (
79 include_publication_type_in_context
80 )
81 self.include_journal_in_context = include_journal_in_context
82 self.include_year_in_context = include_year_in_context
83 self.include_authors_in_context = include_authors_in_context
84 self.include_full_date_in_context = include_full_date_in_context
85 self.include_mesh_terms_in_context = include_mesh_terms_in_context
86 self.include_keywords_in_context = include_keywords_in_context
87 self.include_doi_in_context = include_doi_in_context
88 self.include_pmid_in_context = include_pmid_in_context
89 self.include_pmc_availability_in_context = (
90 include_pmc_availability_in_context
91 )
92 self.max_mesh_terms = max_mesh_terms
93 self.max_keywords = max_keywords
94 self.include_citation_in_context = include_citation_in_context
95 self.include_language_in_context = include_language_in_context
97 # Base API URLs
98 self.base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
99 self.search_url = f"{self.base_url}/esearch.fcgi"
100 self.summary_url = f"{self.base_url}/esummary.fcgi"
101 self.fetch_url = f"{self.base_url}/efetch.fcgi"
102 self.link_url = f"{self.base_url}/elink.fcgi"
104 # PMC base URL for full text
105 self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"
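# Illustrative construction (a sketch, not part of the original file; the
# API key variable and LLM instance are assumed to exist in the caller):
#
#   engine = PubMedSearchEngine(
#       max_results=25,
#       api_key=my_ncbi_api_key,  # hypothetical variable
#       get_abstracts=True,
#       llm=my_llm,               # any BaseLLM-compatible model
#   )
#   results = engine.run("mRNA vaccine efficacy")  # run() is inherited from BaseSearchEngine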
107 def _get_result_count(self, query: str) -> int:
108 """
109 Get the total number of results for a query without retrieving the results themselves.
111 Args:
112 query: The search query
114 Returns:
115 Total number of matching results
116 """
117 try:
118 # Prepare search parameters
119 params = {
120 "db": "pubmed",
121 "term": query,
122 "retmode": "json",
123 "retmax": 0, # Don't need actual results, just the count
124 }
126 # Add API key if available
127 if self.api_key:
128 params["api_key"] = self.api_key
130 self._last_wait_time = self.rate_tracker.apply_rate_limit(
131 self.engine_type
132 )
134 # Execute search request
135 response = safe_get(self.search_url, params=params)
136 response.raise_for_status()
138 # Parse response
139 data = response.json()
140 count = int(data["esearchresult"]["count"])
142 logger.info(
143 f"Query '{query}' has {count} total results in PubMed"
144 )
145 return count
147 except Exception:
148 logger.exception("Error getting result count")
149 return 0
151 def _extract_core_terms(self, query: str) -> str:
152 """
153 Extract core terms from a complex query for volume estimation.
155 Args:
156 query: PubMed query string
158 Returns:
159 Simplified query with core terms
160 """
161 # Remove field specifications and operators
161 simplified = re.sub(r"\[[^\]]+\]", "", query)  # Remove [Field] tags (including multi-word tags like [Title/Abstract])
163 simplified = re.sub(
164 r"\b(AND|OR|NOT)\b", "", simplified
165 ) # Remove operators
167 # Remove quotes and parentheses
168 simplified = (
169 simplified.replace('"', "").replace("(", "").replace(")", "")
170 )
172 # Split by whitespace and join terms with 4+ chars (likely meaningful)
173 terms = [term for term in simplified.split() if len(term) >= 4]
175 # Join with AND to create a basic search
176 return " ".join(terms[:5]) # Limit to top 5 terms
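# Illustrative behavior of the simplification above (bracketed field tags,
# boolean operators, quotes, and parentheses are stripped; terms of 4+
# characters are kept, capped at five):
#   _extract_core_terms('("mRNA vaccine"[Title/Abstract]) AND efficacy[Mesh]')
#   -> 'mRNA vaccine efficacy'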
178 def _expand_time_window(self, time_filter: str) -> str:
179 """
180 Expand a time window to get more results.
182 Args:
183 time_filter: Current time filter
185 Returns:
186 Expanded time filter
187 """
188 # Parse current time window
191 match = re.match(r'"last (\d+) (\w+)"\[pdat\]', time_filter)
192 if not match:
193 return '"last 10 years"[pdat]'
195 amount, unit = int(match.group(1)), match.group(2)
197 # Expand based on current unit
198 if unit == "months" or unit == "month":
199 if amount < 6:
200 return '"last 6 months"[pdat]'
201 elif amount < 12:
202 return '"last 1 year"[pdat]'
203 else:
204 return '"last 2 years"[pdat]'
205 elif unit == "years" or unit == "year":
206 if amount < 2:
207 return '"last 2 years"[pdat]'
208 elif amount < 5:
209 return '"last 5 years"[pdat]'
210 else:
211 return '"last 10 years"[pdat]'
213 return '"last 10 years"[pdat]'
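# Expansion ladder implemented above (sketch):
#   "last 3 months" -> "last 6 months"
#   "last 6 months" -> "last 1 year"
#   "last 1 year"   -> "last 2 years"
#   "last 2 years"  -> "last 5 years"
#   unparseable     -> "last 10 years"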
215 def _optimize_query_for_pubmed(self, query: str) -> str:
216 """
217 Optimize a natural language query for PubMed search.
218 Uses LLM to transform questions into effective keyword-based queries.
220 Args:
221 query: Natural language query
223 Returns:
224 Optimized query string for PubMed
225 """
226 if not self.llm or not self.optimize_queries:
227 # Return original query if no LLM available or optimization disabled
228 return query
230 try:
231 # Prompt for query optimization
232 prompt = f"""Transform this natural language question into an optimized PubMed search query.
234 Original query: "{query}"
236 CRITICAL RULES:
237 1. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS
238 2. DO NOT wrap the entire query in quotes
239 3. DO NOT include ANY date restrictions or year filters
240 4. Use parentheses around OR statements: (term1[Field] OR term2[Field])
241 5. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]
242 6. KEEP IT SIMPLE - use 2-3 main concepts maximum
243 7. Focus on Title/Abstract searches for reliability: term[Title/Abstract]
244 8. Use wildcards for variations: vaccin*[Title/Abstract]
246 EXAMPLE QUERIES:
247 ✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]
248 ✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]
249 ✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]
250 ✗ BAD: "Here's a query to find articles about vaccines..."
252 Return ONLY the search query without any explanations.
253 """
255 # Get response from LLM
256 response = self.llm.invoke(prompt)
257 raw_response = response.content.strip()
259 # Clean up the query - extract only the actual query and remove any explanations
260 # First check if there are multiple lines and take the first non-empty line
261 lines = raw_response.split("\n")
262 cleaned_lines = [line.strip() for line in lines if line.strip()]
264 if cleaned_lines:
265 optimized_query = cleaned_lines[0]
267 # Remove any quotes that wrap the entire query
268 if optimized_query.startswith('"') and optimized_query.endswith(
269 '"'
270 ):
271 optimized_query = optimized_query[1:-1]
273 # Remove any explanation phrases that might be at the beginning
274 explanation_starters = [
275 "here is",
276 "here's",
277 "this query",
278 "the following",
279 ]
280 for starter in explanation_starters:
281 if optimized_query.lower().startswith(starter):
282 # Find the actual query part - typically after a colon
283 colon_pos = optimized_query.find(":")
284 if colon_pos > 0:
285 optimized_query = optimized_query[
286 colon_pos + 1 :
287 ].strip()
289 # Check if the query still seems to contain explanations
290 if (
291 len(optimized_query) > 200
292 or "this query will" in optimized_query.lower()
293 ):
294 # It's probably still an explanation - try to extract just the query part
295 # Look for common patterns in the explanation like parentheses
296 pattern = r"\([^)]+\)\s+AND\s+"
299 matches = re.findall(pattern, optimized_query)
300 if matches:
301 # Extract just the query syntax parts
302 query_parts = []
303 for part in re.split(r"\.\s+", optimized_query):
304 if (
305 "(" in part
306 and ")" in part
307 and ("AND" in part or "OR" in part)
308 ):
309 query_parts.append(part)
310 if query_parts:
311 optimized_query = " ".join(query_parts)
312 else:
313 # Fall back to original query if cleaning fails
314 logger.warning(
315 "Failed to extract a clean query from LLM response"
316 )
317 optimized_query = query
319 # Final safety check - if query looks too much like an explanation, use original
320 if len(optimized_query.split()) > 30:
321 logger.warning(
322 "Query too verbose, falling back to simpler form"
323 )
324 # Create a simple query from the original
325 words = [
326 w
327 for w in query.split()
328 if len(w) > 3
329 and w.lower()
330 not in (
331 "what",
332 "are",
333 "the",
334 "and",
335 "for",
336 "with",
337 "from",
338 "have",
339 "been",
340 "recent",
341 )
342 ]
343 optimized_query = " AND ".join(words[:3])
345 # Basic cleanup: standardize field tag case for consistency
348 optimized_query = re.sub(
349 r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE
350 )
351 optimized_query = re.sub(
352 r"\[title/abstract\]",
353 "[Title/Abstract]",
354 optimized_query,
355 flags=re.IGNORECASE,
356 )
357 optimized_query = re.sub(
358 r"\[publication type\]",
359 "[Publication Type]",
360 optimized_query,
361 flags=re.IGNORECASE,
362 )
364 # Fix unclosed quotes followed by field tags
365 # Pattern: "term[Field] -> "term"[Field]
366 optimized_query = re.sub(r'"([^"\[\]]+)\[', r'"\1"[', optimized_query)
368 # Cache the optimized query so it can be simplified later if no results are found
369 self._simplify_query_cache = optimized_query
371 # Log original and optimized queries
372 logger.info(f"Original query: '{query}'")
373 logger.info(f"Optimized for PubMed: '{optimized_query}'")
374 logger.debug(
375 f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"
376 )
378 return optimized_query
380 except Exception:
381 logger.exception("Error optimizing query")
382 logger.debug(f"Falling back to original query: '{query}'")
383 return query # Fall back to original query on error
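# Illustrative transformation (assumed LLM output that follows the prompt's
# GOOD examples above):
#   "What are recent advances in mRNA vaccines?"
#   -> (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]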
385 def _simplify_query(self, query: str) -> str:
386 """
387 Simplify a PubMed query that returned no results.
388 Progressively removes elements to get a more basic query.
390 Args:
391 query: The original query that returned no results
393 Returns:
394 Simplified query
395 """
396 logger.info(f"Simplifying query: {query}")
397 logger.debug(f"Query simplification started for: '{query[:100]}...'")
399 # Simple approach: remove field restrictions to broaden the search
402 # Remove field tags to make search broader
403 simplified = query
405 # Remove [Mesh] tags - search in all fields instead
406 simplified = re.sub(r"\[Mesh\]", "", simplified, flags=re.IGNORECASE)
408 # Remove [Publication Type] tags
409 simplified = re.sub(
410 r"\[Publication Type\]", "", simplified, flags=re.IGNORECASE
411 )
413 # Keep [Title/Abstract] as it's usually helpful
414 # Clean up any double spaces
415 simplified = re.sub(r"\s+", " ", simplified).strip()
417 # If no simplification was possible, return the original query
418 if simplified == query:
419 logger.debug("No simplification possible, returning original query")
420 return query
421 logger.info(f"Simplified query: {simplified}")
422 logger.debug(
423 f"Query simplified from {len(query)} to {len(simplified)} chars"
424 )
425 return simplified
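# Illustrative simplification (sketch): [Mesh] and [Publication Type]
# restrictions are dropped so terms match in any field, while
# [Title/Abstract] tags are kept:
#   '"Vaccines"[Mesh] AND efficacy[Title/Abstract]'
#   -> '"Vaccines" AND efficacy[Title/Abstract]'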
427 def _is_historical_focused(self, query: str) -> bool:
428 """
429 Determine if a query is specifically focused on historical/older information using LLM.
430 Default assumption is that queries should prioritize recent information unless
431 explicitly asking for historical content.
433 Args:
434 query: The search query
436 Returns:
437 Boolean indicating if the query is focused on historical information
438 """
439 if not self.llm:
440 # Fall back to basic keyword check if no LLM available
441 historical_terms = [
442 "history",
443 "historical",
444 "early",
445 "initial",
446 "first",
447 "original",
448 "before",
449 "prior to",
450 "origins",
451 "evolution",
452 "development",
453 ]
454 historical_years = [str(year) for year in range(1900, 2020)]
456 query_lower = query.lower()
457 has_historical_term = any(
458 term in query_lower for term in historical_terms
459 )
460 has_past_year = any(year in query for year in historical_years)
462 return has_historical_term or has_past_year
464 try:
465 # Use LLM to determine if the query is focused on historical information
466 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.
468 Query: "{query}"
470 Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.
471 Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.
473 The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.
474 """
476 response = self.llm.invoke(prompt)
477 answer = response.content.strip().lower()
479 # Log the determination
480 logger.info(f"Historical focus determination for query: '{query}'")
481 logger.info(f"LLM determined historical focus: {answer}")
483 return "yes" in answer
485 except Exception:
486 logger.exception("Error determining historical focus")
487 # Fall back to basic keyword check
488 historical_terms = [
489 "history",
490 "historical",
491 "early",
492 "initial",
493 "first",
494 "original",
495 "before",
496 "prior to",
497 "origins",
498 "evolution",
499 "development",
500 ]
501 return any(term in query.lower() for term in historical_terms)
503 def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
504 """
505 Perform an adaptive search that adjusts based on topic volume and whether
506 the query focuses on historical information.
508 Args:
509 query: The search query (already optimized)
511 Returns:
512 Tuple of (list of PMIDs, search strategy used)
513 """
514 # Estimate topic volume
515 estimated_volume = self._get_result_count(query)
517 # Determine if the query is focused on historical information
518 is_historical_focused = self._is_historical_focused(query)
520 if is_historical_focused:
521 # User wants historical information - no date filtering
522 time_filter = None
523 strategy = "historical_focus"
524 elif estimated_volume > 5000:
525 # Very common topic - use tighter recency filter
526 time_filter = '"last 1 year"[pdat]'
527 strategy = "high_volume"
528 elif estimated_volume > 1000:
529 # Common topic
530 time_filter = '"last 3 years"[pdat]'
531 strategy = "common_topic"
532 elif estimated_volume > 100:
533 # Moderate volume
534 time_filter = '"last 5 years"[pdat]'
535 strategy = "moderate_volume"
536 else:
537 # Rare topic - still use recency but with wider range
538 time_filter = '"last 10 years"[pdat]'
539 strategy = "rare_topic"
541 # Run search based on strategy
542 if time_filter:
543 # Try with adaptive time filter
544 query_with_time = f"({query}) AND {time_filter}"
545 logger.info(
546 f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
547 )
548 results = self._search_pubmed(query_with_time)
550 # If too few results, gradually expand time window
551 if len(results) < 5 and '"last 10 years"[pdat]' not in time_filter:
552 logger.info(
553 f"Insufficient results ({len(results)}), expanding time window"
554 )
555 expanded_time = self._expand_time_window(time_filter)
556 query_with_expanded_time = f"({query}) AND {expanded_time}"
557 expanded_results = self._search_pubmed(query_with_expanded_time)
559 if len(expanded_results) > len(results):
560 logger.info(
561 f"Expanded time window yielded {len(expanded_results)} results"
562 )
563 return expanded_results, f"{strategy}_expanded"
565 # If still no results, try without time filter
566 if not results:
567 logger.info(
568 "No results with time filter, trying without time restrictions"
569 )
570 results = self._search_pubmed(query)
571 strategy = "no_time_filter"
572 else:
573 # Historical query - run without time filter
574 logger.info(
575 "Using historical search strategy without date filtering"
576 )
577 results = self._search_pubmed(query)
579 return results, strategy
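# Strategy thresholds applied above (sketch):
#   historical query -> no date filter        ("historical_focus")
#   > 5000 results   -> "last 1 year"[pdat]   ("high_volume")
#   > 1000 results   -> "last 3 years"[pdat]  ("common_topic")
#   > 100 results    -> "last 5 years"[pdat]  ("moderate_volume")
#   otherwise        -> "last 10 years"[pdat] ("rare_topic")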
581 def _search_pubmed(self, query: str) -> List[str]:
582 """
583 Search PubMed and return a list of article IDs.
585 Args:
586 query: The search query
588 Returns:
589 List of PubMed IDs matching the query
590 """
591 try:
592 # Prepare search parameters
593 params = {
594 "db": "pubmed",
595 "term": query,
596 "retmode": "json",
597 "retmax": self.max_results,
598 "usehistory": "y",
599 }
601 # Add API key if available
602 if self.api_key:
603 params["api_key"] = self.api_key
604 logger.debug("Using PubMed API key for higher rate limits")
605 else:
606 logger.debug("No PubMed API key - using default rate limits")
608 # Add date restriction if specified
609 if self.days_limit:
610 params["reldate"] = self.days_limit
611 params["datetype"] = "pdat" # Publication date
612 logger.debug(f"Limiting results to last {self.days_limit} days")
614 logger.debug(
615 f"PubMed search query: '{query}' with max_results={self.max_results}"
616 )
618 self._last_wait_time = self.rate_tracker.apply_rate_limit(
619 self.engine_type
620 )
621 logger.debug(
622 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
623 )
625 # Execute search request
626 logger.debug(f"Sending request to PubMed API: {self.search_url}")
627 response = safe_get(self.search_url, params=params)
628 response.raise_for_status()
629 logger.debug(f"PubMed API response status: {response.status_code}")
631 # Parse response
632 data = response.json()
633 id_list = data["esearchresult"]["idlist"]
634 total_count = data["esearchresult"].get("count", "unknown")
636 logger.info(
637 f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
638 )
639 if len(id_list) > 0:
640 logger.debug(f"First 5 PMIDs: {id_list[:5]}")
641 return id_list
643 except Exception:
644 logger.exception(f"Error searching PubMed for query '{query}'")
645 return []
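# Sketch of the ESearch request issued above (NCBI E-utilities):
#   GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi
#       ?db=pubmed&term=<query>&retmode=json&retmax=<max_results>&usehistory=y
# PMIDs are read from esearchresult.idlist in the JSON response.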
647 def _get_article_summaries(
648 self, id_list: List[str]
649 ) -> List[Dict[str, Any]]:
650 """
651 Get summaries for a list of PubMed article IDs.
653 Args:
654 id_list: List of PubMed IDs
656 Returns:
657 List of article summary dictionaries
658 """
659 if not id_list:
660 logger.debug("Empty ID list provided to _get_article_summaries")
661 return []
663 logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")
665 try:
666 # Prepare parameters
667 params = {
668 "db": "pubmed",
669 "id": ",".join(id_list),
670 "retmode": "json",
671 "rettype": "summary",
672 }
674 # Add API key if available
675 if self.api_key:
676 params["api_key"] = self.api_key
678 self._last_wait_time = self.rate_tracker.apply_rate_limit(
679 self.engine_type
680 )
681 logger.debug(
682 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
683 )
685 # Execute request
686 logger.debug(f"Requesting summaries from: {self.summary_url}")
687 response = safe_get(self.summary_url, params=params)
688 response.raise_for_status()
689 logger.debug(f"Summary API response status: {response.status_code}")
691 # Parse response
692 data = response.json()
693 logger.debug(
694 f"PubMed API returned data for {len(id_list)} requested IDs"
695 )
696 summaries = []
698 for pmid in id_list:
699 if pmid in data["result"]:
700 article = data["result"][pmid]
701 logger.debug(
702 f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
703 )
705 # Extract authors (if available)
706 authors = []
707 if "authors" in article:
708 authors = [
709 author["name"] for author in article["authors"]
710 ]
712 # Extract DOI from articleids if not in main field
713 doi = article.get("doi", "")
714 if not doi and "articleids" in article:
715 for aid in article["articleids"]:
716 if aid.get("idtype") == "doi":
717 doi = aid.get("value", "")
718 break
720 # Create summary dictionary with all available fields
721 summary = {
722 "id": pmid,
723 "title": article.get("title", ""),
724 "pubdate": article.get("pubdate", ""),
725 "epubdate": article.get("epubdate", ""),
726 "source": article.get("source", ""),
727 "authors": authors,
728 "lastauthor": article.get("lastauthor", ""),
729 "journal": article.get("fulljournalname", ""),
730 "volume": article.get("volume", ""),
731 "issue": article.get("issue", ""),
732 "pages": article.get("pages", ""),
733 "doi": doi,
734 "issn": article.get("issn", ""),
735 "essn": article.get("essn", ""),
736 "pubtype": article.get(
737 "pubtype", []
738 ), # Publication types from esummary
739 "recordstatus": article.get("recordstatus", ""),
740 "lang": article.get("lang", []),
741 "pmcrefcount": article.get("pmcrefcount", None),
742 "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
743 }
745 summaries.append(summary)
746 else:
747 logger.warning(
748 f"PMID {pmid} not found in PubMed API response"
749 )
751 return summaries
753 except Exception as e:
754 error_msg = str(e)
755 logger.exception(
756 f"Error getting article summaries for {len(id_list)} articles"
757 )
759 # Check for rate limiting patterns
760 if (
761 "429" in error_msg
762 or "too many requests" in error_msg.lower()
763 or "rate limit" in error_msg.lower()
764 or "service unavailable" in error_msg.lower()
765 or "503" in error_msg
766 or "403" in error_msg
767 ):
768 raise RateLimitError(f"PubMed rate limit hit: {error_msg}")
770 return []
772 def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
773 """
774 Get abstracts for a list of PubMed article IDs.
776 Args:
777 id_list: List of PubMed IDs
779 Returns:
780 Dictionary mapping PubMed IDs to their abstracts
781 """
782 if not id_list:
783 logger.debug("Empty ID list provided to _get_article_abstracts")
784 return {}
786 logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")
788 try:
789 # Prepare parameters
790 params = {
791 "db": "pubmed",
792 "id": ",".join(id_list),
793 "retmode": "xml",
794 "rettype": "abstract",
795 }
797 # Add API key if available
798 if self.api_key:
799 params["api_key"] = self.api_key
801 self._last_wait_time = self.rate_tracker.apply_rate_limit(
802 self.engine_type
803 )
804 logger.debug(
805 f"Applied rate limit wait: {self._last_wait_time:.2f}s"
806 )
808 # Execute request
809 logger.debug(f"Requesting abstracts from: {self.fetch_url}")
810 response = safe_get(self.fetch_url, params=params)
811 response.raise_for_status()
812 logger.debug(
813 f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
814 )
816 # Parse XML response
817 root = ET.fromstring(response.text)
818 logger.debug(
819 f"Parsing abstracts from XML for {len(id_list)} articles"
820 )
822 # Extract abstracts
823 abstracts = {}
825 for article in root.findall(".//PubmedArticle"):
826 pmid_elem = article.find(".//PMID")
827 pmid = pmid_elem.text if pmid_elem is not None else None
829 if pmid is None:
830 continue
832 # Find abstract text
833 abstract_text = ""
834 abstract_elem = article.find(".//AbstractText")
836 if abstract_elem is not None:
837 abstract_text = abstract_elem.text or ""
839 # Some abstracts are split into multiple sections
840 abstract_sections = article.findall(".//AbstractText")
841 if len(abstract_sections) > 1:
842 logger.debug(
843 f"Article {pmid} has {len(abstract_sections)} abstract sections"
844 )
845 abstract_text = ""  # Rebuild from all sections so the first isn't duplicated
846 for section in abstract_sections:
847 # Get section label if it exists
848 label = section.get("Label")
849 section_text = section.text or ""
851 if label and section_text:
852 if abstract_text:
853 abstract_text += f"\n\n{label}: {section_text}"
854 else:
855 abstract_text = f"{label}: {section_text}"
856 elif section_text:
857 if abstract_text:
858 abstract_text += f"\n\n{section_text}"
859 else:
860 abstract_text = section_text
862 # Store in dictionary
863 if pmid and abstract_text:
864 abstracts[pmid] = abstract_text
865 logger.debug(
866 f"Abstract for {pmid}: {len(abstract_text)} chars"
867 )
868 elif pmid:
869 logger.warning(f"No abstract found for PMID {pmid}")
871 logger.info(
872 f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
873 )
874 return abstracts
876 except Exception:
877 logger.exception(
878 f"Error getting article abstracts for {len(id_list)} articles"
879 )
880 return {}
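# Abridged shape of the EFetch XML the parser above consumes (a sketch of
# PubMed's standard structure):
#   <PubmedArticle>
#     <MedlineCitation>
#       <PMID>12345678</PMID>
#       <Article><Abstract>
#         <AbstractText Label="BACKGROUND">...</AbstractText>
#         <AbstractText Label="RESULTS">...</AbstractText>
#       </Abstract></Article>
#     </MedlineCitation>
#   </PubmedArticle>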
882 def _get_article_detailed_metadata(
883 self, id_list: List[str]
884 ) -> Dict[str, Dict[str, Any]]:
885 """
886 Get detailed metadata for PubMed articles including publication types,
887 MeSH terms, keywords, and affiliations.
889 Args:
890 id_list: List of PubMed IDs
892 Returns:
893 Dictionary mapping PubMed IDs to their detailed metadata
894 """
895 if not id_list:
896 return {}
898 try:
899 # Prepare parameters
900 params = {
901 "db": "pubmed",
902 "id": ",".join(id_list),
903 "retmode": "xml",
904 "rettype": "medline",
905 }
907 # Add API key if available
908 if self.api_key:
909 params["api_key"] = self.api_key
911 self._last_wait_time = self.rate_tracker.apply_rate_limit(
912 self.engine_type
913 )
915 # Execute request
916 response = safe_get(self.fetch_url, params=params)
917 response.raise_for_status()
919 # Parse XML response
920 root = ET.fromstring(response.text)
922 metadata = {}
924 for article in root.findall(".//PubmedArticle"):
925 pmid_elem = article.find(".//PMID")
926 pmid = pmid_elem.text if pmid_elem is not None else None
928 if pmid is None:
929 continue
931 article_metadata = {}
933 # Extract publication types
934 pub_types = []
935 for pub_type in article.findall(".//PublicationType"):
936 if pub_type.text:
937 pub_types.append(pub_type.text)
938 if pub_types:
939 article_metadata["publication_types"] = pub_types
941 # Extract MeSH terms
942 mesh_terms = []
943 for mesh in article.findall(".//MeshHeading"):
944 descriptor = mesh.find(".//DescriptorName")
945 if descriptor is not None and descriptor.text:
946 mesh_terms.append(descriptor.text)
947 if mesh_terms:
948 article_metadata["mesh_terms"] = mesh_terms
950 # Extract keywords
951 keywords = []
952 for keyword in article.findall(".//Keyword"):
953 if keyword.text:
954 keywords.append(keyword.text)
955 if keywords:
956 article_metadata["keywords"] = keywords
958 # Extract affiliations
959 affiliations = []
960 for affiliation in article.findall(".//Affiliation"):
961 if affiliation.text:
962 affiliations.append(affiliation.text)
963 if affiliations:
964 article_metadata["affiliations"] = affiliations
966 # Extract grant information
967 grants = []
968 for grant in article.findall(".//Grant"):
969 grant_info = {}
970 grant_id = grant.find(".//GrantID")
971 if grant_id is not None and grant_id.text:
972 grant_info["id"] = grant_id.text
973 agency = grant.find(".//Agency")
974 if agency is not None and agency.text:
975 grant_info["agency"] = agency.text
976 if grant_info:
977 grants.append(grant_info)
978 if grants:
979 article_metadata["grants"] = grants
981 # Check for free full text in PMC
982 pmc_elem = article.find(".//ArticleId[@IdType='pmc']")
983 if pmc_elem is not None:
984 article_metadata["has_free_full_text"] = True
985 article_metadata["pmc_id"] = pmc_elem.text
987 # Extract conflict of interest statement
988 coi_elem = article.find(".//CoiStatement")
989 if coi_elem is not None and coi_elem.text:
990 article_metadata["conflict_of_interest"] = coi_elem.text
992 metadata[pmid] = article_metadata
994 return metadata
996 except Exception:
997 logger.exception("Error getting detailed article metadata")
998 return {}
1000 def _create_enriched_content(
1001 self, result: Dict[str, Any], base_content: str
1002 ) -> str:
1003 """
1004 Create enriched content by adding relevant metadata context to help the LLM.
1006 Args:
1007 result: The result dictionary with metadata
1008 base_content: The base content (abstract or full text)
1010 Returns:
1011 Enriched content string with metadata context
1012 """
1013 enriched_parts = []
1015 # Add study type information
1016 if "publication_types" in result:
1017 pub_types = result["publication_types"]
1018 # Filter for significant types
1019 significant_types = [
1020 pt
1021 for pt in pub_types
1022 if any(
1023 key in pt.lower()
1024 for key in [
1025 "clinical trial",
1026 "randomized",
1027 "meta-analysis",
1028 "systematic review",
1029 "case report",
1030 "guideline",
1031 "comparative study",
1032 "multicenter",
1033 ]
1034 )
1035 ]
1036 if significant_types:
1037 enriched_parts.append(
1038 f"[Study Type: {', '.join(significant_types)}]"
1039 )
1041 # Add the main content
1042 enriched_parts.append(base_content)
1044 # Add metadata footer
1045 metadata_footer = []
1047 # Add ALL MeSH terms
1048 if "mesh_terms" in result and len(result["mesh_terms"]) > 0:
1049 metadata_footer.append(
1050 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"
1051 )
1053 # Add ALL keywords
1054 if "keywords" in result and len(result["keywords"]) > 0:
1055 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")
1057 # Add ALL affiliations
1058 if "affiliations" in result and len(result["affiliations"]) > 0:
1059 if len(result["affiliations"]) == 1:
1060 metadata_footer.append(
1061 f"Institution: {result['affiliations'][0]}"
1062 )
1063 else:
1064 affiliations_text = "\n - " + "\n - ".join(
1065 result["affiliations"]
1066 )
1067 metadata_footer.append(f"Institutions:{affiliations_text}")
1069 # Add ALL funding information with full details
1070 if "grants" in result and len(result["grants"]) > 0:
1071 grant_details = []
1072 for grant in result["grants"]:
1073 grant_text = []
1074 if "agency" in grant:
1075 grant_text.append(grant["agency"])
1076 if "id" in grant:
1077 grant_text.append(f"(Grant ID: {grant['id']})")
1078 if grant_text:
1079 grant_details.append(" ".join(grant_text))
1080 if grant_details:
1081 if len(grant_details) == 1:
1082 metadata_footer.append(f"Funded by: {grant_details[0]}")
1083 else:
1084 funding_text = "\n - " + "\n - ".join(grant_details)
1085 metadata_footer.append(f"Funding Sources:{funding_text}")
1087 # Add FULL conflict of interest statement
1088 if "conflict_of_interest" in result:
1089 coi_text = result["conflict_of_interest"]
1090 if coi_text:
1091 # Still skip trivial "no conflict" statements to reduce noise
1092 if not any(
1093 phrase in coi_text.lower()
1094 for phrase in [
1095 "no conflict",
1096 "no competing",
1097 "nothing to disclose",
1098 "none declared",
1099 "authors declare no",
1100 ]
1101 ):
1102 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1103 elif (
1104 "but" in coi_text.lower()
1105 or "except" in coi_text.lower()
1106 or "however" in coi_text.lower()
1107 ):
1108 # Include if there's a "no conflict BUT..." type statement
1109 metadata_footer.append(f"Conflict of Interest: {coi_text}")
1111 # Combine everything
1112 if metadata_footer:
1113 enriched_parts.append("\n---\nStudy Metadata:")
1114 enriched_parts.extend(metadata_footer)
1116 return "\n".join(enriched_parts)
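# Example of the enriched layout produced above (illustrative values):
#   [Study Type: Randomized Controlled Trial]
#   <abstract or full text>
#   ---
#   Study Metadata:
#   Medical Topics (MeSH): Influenza Vaccines, Adult
#   Keywords: immunogenicity, safety
#   Funded by: NIH (Grant ID: R01-XXXXXX)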
1118 def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
1119 """
1120 Find PMC IDs for the given PubMed IDs (for full-text access).
1122 Args:
1123 pmid_list: List of PubMed IDs
1125 Returns:
1126 Dictionary mapping PubMed IDs to their PMC IDs (if available)
1127 """
1128 if not pmid_list or not self.get_full_text:
1129 return {}
1131 try:
1132 # Prepare parameters
1133 params = {
1134 "dbfrom": "pubmed",
1135 "db": "pmc",
1136 "linkname": "pubmed_pmc",
1137 "id": ",".join(pmid_list),
1138 "retmode": "json",
1139 }
1141 # Add API key if available
1142 if self.api_key:
1143 params["api_key"] = self.api_key
1145 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1146 self.engine_type
1147 )
1149 # Execute request
1150 response = safe_get(self.link_url, params=params)
1151 response.raise_for_status()
1153 # Parse response
1154 data = response.json()
1156 # Map PubMed IDs to PMC IDs
1157 pmid_to_pmcid = {}
1159 for linkset in data.get("linksets", []):
1160 pmid = linkset.get("ids", [None])[0]
1162 if not pmid:
1163 continue
1165 for link in linkset.get("linksetdbs", []):
1166 if link.get("linkname") == "pubmed_pmc":
1167 pmcids = link.get("links", [])
1168 if pmcids:
1169 pmid_to_pmcid[str(pmid)] = f"PMC{pmcids[0]}"
1171 logger.info(
1172 f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
1173 )
1174 return pmid_to_pmcid
1176 except Exception:
1177 logger.exception("Error finding PMC IDs")
1178 return {}
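# Sketch of the ELink request issued above:
#   GET https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi
#       ?dbfrom=pubmed&db=pmc&linkname=pubmed_pmc&id=<pmid,...>&retmode=json
# Numeric PMC links come back under linksets[].linksetdbs[].links and are
# returned here as "PMC<id>" strings.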
1180 def _get_pmc_full_text(self, pmcid: str) -> str:
1181 """
1182 Get full text for a PMC article.
1184 Args:
1185 pmcid: PMC ID of the article
1187 Returns:
1188 Full text content or empty string if not available
1189 """
1190 try:
1191 # Prepare parameters
1192 params = {
1193 "db": "pmc",
1194 "id": pmcid,
1195 "retmode": "xml",
1196 "rettype": "full",
1197 }
1199 # Add API key if available
1200 if self.api_key:
1201 params["api_key"] = self.api_key
1203 self._last_wait_time = self.rate_tracker.apply_rate_limit(
1204 self.engine_type
1205 )
1207 # Execute request
1208 response = safe_get(self.fetch_url, params=params)
1209 response.raise_for_status()
1211 # Parse XML response
1212 root = ET.fromstring(response.text)
1214 # Extract full text
1215 full_text = []
1217 # Extract article title
1218 title_elem = root.find(".//article-title")
1219 if title_elem is not None and title_elem.text:
1220 full_text.append(f"# {title_elem.text}")
1222 # Extract abstract
1223 abstract_paras = root.findall(".//abstract//p")
1224 if abstract_paras:
1225 full_text.append("\n## Abstract\n")
1226 for p in abstract_paras:
1227 text = "".join(p.itertext())
1228 if text:
1229 full_text.append(text)
1231 # Extract body content
1232 body = root.find(".//body")
1233 if body is not None:
1234 for section in body.findall(".//sec"):
1235 # Get section title
1236 title = section.find(".//title")
1237 if title is not None and title.text:
1238 full_text.append(f"\n## {title.text}\n")
1240 # Get paragraphs
1241 for p in section.findall(".//p"):
1242 text = "".join(p.itertext())
1243 if text:
1244 full_text.append(text)
1246 result_text = "\n\n".join(full_text)
1247 logger.debug(
1248 f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
1249 )
1250 return result_text
1252 except Exception:
1253 logger.exception("Error getting PMC full text")
1254 return ""
1256 def _get_previews(self, query: str) -> List[Dict[str, Any]]:
1257 """
1258 Get preview information for PubMed articles.
1260 Args:
1261 query: The search query
1263 Returns:
1264 List of preview dictionaries
1265 """
1266 logger.info(f"Getting PubMed previews for query: {query}")
1268 # Optimize the query for PubMed if LLM is available
1269 optimized_query = self._optimize_query_for_pubmed(query)
1271 # Perform adaptive search
1272 pmid_list, strategy = self._adaptive_search(optimized_query)
1274 # If no results, try a simplified query
1275 if not pmid_list:
1276 logger.warning(
1277 f"No PubMed results found using strategy: {strategy}"
1278 )
1279 simplified_query = self._simplify_query(optimized_query)
1280 if simplified_query != optimized_query:
1281 logger.info(f"Trying with simplified query: {simplified_query}")
1282 pmid_list, strategy = self._adaptive_search(simplified_query)
1283 if pmid_list:
1284 logger.info(
1285 f"Simplified query found {len(pmid_list)} results"
1286 )
1288 if not pmid_list:
1289 logger.warning("No PubMed results found after query simplification")
1290 return []
1292 # Get article summaries
1293 logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
1294 summaries = self._get_article_summaries(pmid_list)
1295 logger.debug(f"Retrieved {len(summaries)} summaries")
1297 # ALWAYS fetch abstracts for snippet-only mode to provide context for LLM
1298 logger.debug(
1299 f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
1300 )
1301 abstracts = self._get_article_abstracts(pmid_list)
1302 logger.debug(f"Retrieved {len(abstracts)} abstracts")
1304 # Format as previews
1305 previews = []
1306 for summary in summaries:
1307 # Build snippet from individual metadata preferences
1308 snippet_parts = []
1310 # Check for publication type from esummary (earlier than detailed metadata)
1311 pub_type_prefix = ""
1312 if self.include_publication_type_in_context and summary.get(
1313 "pubtype"
1314 ):
1315 # Use first publication type from esummary
1316 pub_type_prefix = f"[{summary['pubtype'][0]}] "
1318 # Add authors if enabled
1319 if self.include_authors_in_context and summary.get("authors"):
1320 authors_text = ", ".join(summary.get("authors", []))
1321 if len(authors_text) > 100:
1322 # Truncate long author lists
1323 authors_text = authors_text[:97] + "..."
1324 snippet_parts.append(authors_text)
1326 # Add journal if enabled
1327 if self.include_journal_in_context and summary.get("journal"):
1328 snippet_parts.append(summary["journal"])
1330 # Add date (full or year only)
1331 if summary.get("pubdate"):
1332 if self.include_full_date_in_context:
1333 snippet_parts.append(summary["pubdate"])
1334 elif (
1335 self.include_year_in_context
1336 and len(summary["pubdate"]) >= 4
1337 ):
1338 snippet_parts.append(summary["pubdate"][:4])
1340 # Add citation details if enabled
1341 if self.include_citation_in_context:
1342 citation_parts = []
1343 if summary.get("volume"):
1344 citation_parts.append(f"Vol {summary['volume']}")
1345 if summary.get("issue"):
1346 citation_parts.append(f"Issue {summary['issue']}")
1347 if summary.get("pages"):
1348 citation_parts.append(f"pp {summary['pages']}")
1349 if citation_parts:
1350 snippet_parts.append(f"({', '.join(citation_parts)})")
1352 # Join snippet parts or provide default
1353 if snippet_parts:
1354 # Use different separators based on what's included
1355 if self.include_authors_in_context:
1356 snippet = ". ".join(
1357 snippet_parts
1358 ) # Authors need period separator
1359 else:
1360 snippet = " - ".join(
1361 snippet_parts
1362 ) # Journal and year use dash
1363 else:
1364 snippet = "Research article"
1366 # Add publication type prefix
1367 snippet = pub_type_prefix + snippet
1369 # Add language indicator if not English
1370 if self.include_language_in_context and summary.get("lang"):
1371 langs = summary["lang"]
1372 if langs and langs[0] and langs[0] != "eng":
1373 snippet = f"{snippet} [{langs[0].upper()}]"
1375 # Add identifiers if enabled
1376 identifier_parts = []
1377 if self.include_pmid_in_context and summary.get("id"):
1378 identifier_parts.append(f"PMID: {summary['id']}")
1379 if self.include_doi_in_context and summary.get("doi"):
1380 identifier_parts.append(f"DOI: {summary['doi']}")
1382 if identifier_parts:
1383 snippet = f"{snippet} | {' | '.join(identifier_parts)}"
1385 # ALWAYS include title and abstract in snippet for LLM analysis
1386 pmid = summary["id"]
1387 title = summary["title"]
1388 abstract_text = abstracts.get(pmid, "")
1390 # Truncate abstract if too long
1391 if len(abstract_text) > 500:
1392 abstract_text = abstract_text[:497] + "..."
1394 # Build the enriched snippet with title and abstract
1395 if abstract_text:
1396 enriched_snippet = f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {snippet}"
1397 else:
1398 enriched_snippet = f"Title: {title}\n\nMetadata: {snippet}"
1400 # Log the complete snippet for debugging
1401 logger.debug(f"Complete snippet for PMID {pmid}:")
1402 logger.debug(f" Title: {title[:100]}...")
1403 logger.debug(f" Abstract length: {len(abstract_text)} chars")
1404 logger.debug(f" Metadata: {snippet}")
1405 logger.debug(
1406 f" Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
1407 )
1409 # Create preview with basic information
1410 preview = {
1411 "id": summary["id"],
1412 "title": summary["title"],
1413 "link": summary["link"],
1414 "snippet": enriched_snippet, # Use enriched snippet with title and abstract
1415 "authors": summary.get("authors", []),
1416 "journal": summary.get("journal", ""),
1417 "pubdate": summary.get("pubdate", ""),
1418 "doi": summary.get("doi", ""),
1419 "source": "PubMed",
1420 "_pmid": summary["id"], # Store PMID for later use
1421 "_search_strategy": strategy, # Store search strategy for analytics
1422 }
1424 previews.append(preview)
1426 logger.info(
1427 f"Found {len(previews)} PubMed previews using strategy: {strategy}"
1428 )
1429 if previews:
1430 logger.debug(
1431 f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
1432 )
1433 return previews
1435 def _get_full_content(
1436 self, relevant_items: List[Dict[str, Any]]
1437 ) -> List[Dict[str, Any]]:
1438 """
1439 Get full content for the relevant PubMed articles.
1440 Efficiently manages which content to retrieve (abstracts and/or full text).
1442 Args:
1443 relevant_items: List of relevant preview dictionaries
1445 Returns:
1446 List of result dictionaries with full content
1447 """
1448 # Check if we should add full content
1449 snippets_only_mode = (
1450 hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
1451 and search_config.SEARCH_SNIPPETS_ONLY
1452 )
1454 if snippets_only_mode:
1455 logger.info(
1456 "Snippet-only mode enabled, will fetch abstracts as snippets"
1457 )
1458 # For PubMed, we still need to fetch abstracts as they serve as snippets
1459 # But we'll skip full-text retrieval
1461 logger.info(
1462 f"Getting content for {len(relevant_items)} PubMed articles"
1463 )
1465 # Collect all PMIDs for relevant items
1466 pmids = []
1467 for item in relevant_items:
1468 if "_pmid" in item:
1469 pmids.append(item["_pmid"])
1471 # Get abstracts if requested and PMIDs exist
1472 # In snippet-only mode, always get abstracts as they serve as snippets
1473 abstracts = {}
1474 if (self.get_abstracts or snippets_only_mode) and pmids:
1475 abstracts = self._get_article_abstracts(pmids)
1477 # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
1478 detailed_metadata = {}
1479 if pmids:
1480 detailed_metadata = self._get_article_detailed_metadata(pmids)
1482 # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
1483 pmid_to_pmcid = {}
1484 if self.get_full_text and pmids and not snippets_only_mode:
1485 pmid_to_pmcid = self._find_pmc_ids(pmids)
1487 # Add content to results
1488 results = []
1489 for item in relevant_items:
1490 result = item.copy()
1491 pmid = item.get("_pmid", "")
1493 # Add detailed metadata if available
1494 if pmid in detailed_metadata:
1495 metadata = detailed_metadata[pmid]
1497 # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
1498 if "publication_types" in metadata:
1499 result["publication_types"] = metadata["publication_types"]
1501 # Add first publication type to snippet if enabled
1502 if (
1503 self.include_publication_type_in_context
1504 and metadata["publication_types"]
1505 ):
1506 # Just take the first publication type as is
1507 pub_type = metadata["publication_types"][0]
1508 if "snippet" in result:
1509 result["snippet"] = (
1510 f"[{pub_type}] {result['snippet']}"
1511 )
1513 # Add MeSH terms for medical categorization
1514 if "mesh_terms" in metadata:
1515 result["mesh_terms"] = metadata["mesh_terms"]
1517 # Add MeSH terms to snippet if enabled
1518 if (
1519 self.include_mesh_terms_in_context
1520 and metadata["mesh_terms"]
1521 ):
1522 mesh_to_show = (
1523 metadata["mesh_terms"][: self.max_mesh_terms]
1524 if self.max_mesh_terms > 0
1525 else metadata["mesh_terms"]
1526 )
1527 if mesh_to_show and "snippet" in result:
1528 mesh_text = "MeSH: " + ", ".join(mesh_to_show)
1529 result["snippet"] = (
1530 f"{result['snippet']} | {mesh_text}"
1531 )
1533 # Add keywords
1534 if "keywords" in metadata:
1535 result["keywords"] = metadata["keywords"]
1537 # Add keywords to snippet if enabled
1538 if (
1539 self.include_keywords_in_context
1540 and metadata["keywords"]
1541 ):
1542 keywords_to_show = (
1543 metadata["keywords"][: self.max_keywords]
1544 if self.max_keywords > 0
1545 else metadata["keywords"]
1546 )
1547 if keywords_to_show and "snippet" in result:
1548 keywords_text = "Keywords: " + ", ".join(
1549 keywords_to_show
1550 )
1551 result["snippet"] = (
1552 f"{result['snippet']} | {keywords_text}"
1553 )
1555 # Add affiliations
1556 if "affiliations" in metadata:
1557 result["affiliations"] = metadata["affiliations"]
1559 # Add funding/grant information
1560 if "grants" in metadata:
1561 result["grants"] = metadata["grants"]
1563 # Add conflict of interest statement
1564 if "conflict_of_interest" in metadata:
1565 result["conflict_of_interest"] = metadata[
1566 "conflict_of_interest"
1567 ]
1569 # Add free full text availability
1570 if "has_free_full_text" in metadata:
1571 result["has_free_full_text"] = metadata[
1572 "has_free_full_text"
1573 ]
1574 if "pmc_id" in metadata:
1575 result["pmc_id"] = metadata["pmc_id"]
1577 # Add PMC availability to snippet if enabled
1578 if (
1579 self.include_pmc_availability_in_context
1580 and metadata["has_free_full_text"]
1581 and "snippet" in result
1582 ):
1583 result["snippet"] = (
1584 f"{result['snippet']} | [Free Full Text]"
1585 )
1587 # Add abstract if available
1588 if pmid in abstracts:
1589 result["abstract"] = abstracts[pmid]
1591 # Create enriched content with metadata context
1592 enriched_content = self._create_enriched_content(
1593 result, abstracts[pmid]
1594 )
1596 # ALWAYS include title and abstract in snippet for LLM analysis
1597 # Build comprehensive snippet with title and abstract
1598 title = result.get("title", "")
1599 abstract_text = (
1600 abstracts[pmid][:500]
1601 if len(abstracts[pmid]) > 500
1602 else abstracts[pmid]
1603 )
1605 # Prepend title and abstract to the existing metadata snippet
1606 if "snippet" in result:
1607 # Keep metadata snippet and add content
1608 result["snippet"] = (
1609 f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
1610 )
1611 else:
1612 # No metadata snippet, just title and abstract
1613 result["snippet"] = (
1614 f"Title: {title}\n\nAbstract: {abstract_text}"
1615 )
1617 # In snippet-only mode, use enriched content
1618 if snippets_only_mode:
1619 result["full_content"] = enriched_content
1620 result["content"] = enriched_content
1621 result["content_type"] = "abstract"
1622 # Use abstract as content if no full text
1623 elif pmid not in pmid_to_pmcid:
1624 result["full_content"] = enriched_content
1625 result["content"] = enriched_content
1626 result["content_type"] = "abstract"
1628 # Add full text for a limited number of top articles
1629 if (
1630 pmid in pmid_to_pmcid
1631 and self.get_full_text
1632 and len(
1633 [r for r in results if r.get("content_type") == "full_text"]
1634 )
1635 < self.full_text_limit
1636 ):
1637 # Get full text content
1638 pmcid = pmid_to_pmcid[pmid]
1639 full_text = self._get_pmc_full_text(pmcid)
1641 if full_text:
1642 enriched_full_text = self._create_enriched_content(
1643 result, full_text
1644 )
1645 result["full_content"] = enriched_full_text
1646 result["content"] = enriched_full_text
1647 result["content_type"] = "full_text"
1648 result["pmcid"] = pmcid
1649 elif pmid in abstracts:
1650 # Fall back to abstract if full text retrieval fails
1651 enriched_content = self._create_enriched_content(
1652 result, abstracts[pmid]
1653 )
1654 result["full_content"] = enriched_content
1655 result["content"] = enriched_content
1656 result["content_type"] = "abstract"
1658 # Remove temporary fields
1659 if "_pmid" in result:
1660 del result["_pmid"]
1661 if "_search_strategy" in result:
1662 del result["_search_strategy"]
1664 results.append(result)
1666 return results
1668 def search_by_author(
1669 self, author_name: str, max_results: Optional[int] = None
1670 ) -> List[Dict[str, Any]]:
1671 """
1672 Search for articles by a specific author.
1674 Args:
1675 author_name: Name of the author
1676 max_results: Maximum number of results (defaults to self.max_results)
1678 Returns:
1679 List of articles by the author
1680 """
1681 original_max_results = self.max_results
1683 try:
1684 if max_results:
1685 self.max_results = max_results
1687 query = f"{author_name}[Author]"
1688 return self.run(query)
1690 finally:
1691 # Restore original value
1692 self.max_results = original_max_results
1694 def search_by_journal(
1695 self, journal_name: str, max_results: Optional[int] = None
1696 ) -> List[Dict[str, Any]]:
1697 """
1698 Search for articles in a specific journal.
1700 Args:
1701 journal_name: Name of the journal
1702 max_results: Maximum number of results (defaults to self.max_results)
1704 Returns:
1705 List of articles from the journal
1706 """
1707 original_max_results = self.max_results
1709 try:
1710 if max_results:
1711 self.max_results = max_results
1713 query = f"{journal_name}[Journal]"
1714 return self.run(query)
1716 finally:
1717 # Restore original value
1718 self.max_results = original_max_results
1720 def search_recent(
1721 self, query: str, days: int = 30, max_results: Optional[int] = None
1722 ) -> List[Dict[str, Any]]:
1723 """
1724 Search for recent articles matching the query.
1726 Args:
1727 query: The search query
1728 days: Number of days to look back
1729 max_results: Maximum number of results (defaults to self.max_results)
1731 Returns:
1732 List of recent articles matching the query
1733 """
1734 original_max_results = self.max_results
1735 original_days_limit = self.days_limit
1737 try:
1738 if max_results:
1739 self.max_results = max_results
1741 # Set days limit for this search
1742 self.days_limit = days
1744 return self.run(query)
1746 finally:
1747 # Restore original values
1748 self.max_results = original_max_results
1749 self.days_limit = original_days_limit
1751 def advanced_search(
1752 self, terms: Dict[str, str], max_results: Optional[int] = None
1753 ) -> List[Dict[str, Any]]:
1754 """
1755 Perform an advanced search with field-specific terms.
1757 Args:
1758 terms: Dictionary mapping fields to search terms
1759 Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
1760 max_results: Maximum number of results (defaults to self.max_results)
1762 Returns:
1763 List of articles matching the advanced query
1764 """
1765 original_max_results = self.max_results
1767 try:
1768 if max_results:
1769 self.max_results = max_results
1771 # Build advanced query string
1772 query_parts = []
1773 for field, term in terms.items():
1774 query_parts.append(f"{term}[{field}]")
1776 query = " AND ".join(query_parts)
1777 return self.run(query)
1779 finally:
1780 # Restore original value
1781 self.max_results = original_max_results
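# Illustrative usage of the convenience helpers above (a sketch; `engine` is
# assumed to be an already-configured PubMedSearchEngine instance):
#
#   engine.search_by_author("Smith J", max_results=5)
#   engine.search_by_journal("Nature Medicine")
#   engine.search_recent("mRNA vaccine safety", days=90)
#   engine.advanced_search({"Author": "Smith J", "Title": "vaccine"})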