Coverage for src/local_deep_research/web_search_engines/engines/search_engine

1import re

2from typing import Any, Dict, List, Optional, Tuple

4from defusedxml import ElementTree as ET

6from langchain_core.language_models import BaseLLM

7from loguru import logger

9from ...config import search_config

10from ...constants import SNIPPET_LENGTH_LONG

11from ...security.safe_requests import safe_get

12from ..rate_limiting import RateLimitError

13from ..search_engine_base import BaseSearchEngine

16class PubMedSearchEngine(BaseSearchEngine):

17 """

18 PubMed search engine implementation with two-phase approach and adaptive search.

19 Provides efficient access to biomedical literature while minimizing API usage.

20 """

22 # Mark as public search engine

23 is_public = True

24 # Scientific/medical search engine

25 is_scientific = True

def __init__(
    self,
    max_results: int = 10,
    api_key: Optional[str] = None,
    days_limit: Optional[int] = None,
    get_abstracts: bool = True,
    get_full_text: bool = False,
    full_text_limit: int = 3,
    llm: Optional[BaseLLM] = None,
    max_filtered_results: Optional[int] = None,
    optimize_queries: bool = True,
    include_publication_type_in_context: bool = True,
    include_journal_in_context: bool = True,
    include_year_in_context: bool = True,
    include_authors_in_context: bool = False,
    include_full_date_in_context: bool = False,
    include_mesh_terms_in_context: bool = True,
    include_keywords_in_context: bool = True,
    include_doi_in_context: bool = False,
    include_pmid_in_context: bool = False,
    include_pmc_availability_in_context: bool = False,
    max_mesh_terms: int = 3,
    max_keywords: int = 3,
    include_citation_in_context: bool = False,
    include_language_in_context: bool = False,
):
    """
    Configure a PubMed search engine instance.

    Args:
        max_results: Maximum number of search results; after the base class
            has stored it, the value is raised to at least 25
        api_key: NCBI API key for higher rate limits (optional)
        days_limit: Limit results to the last N days (optional)
        get_abstracts: Whether to fetch abstracts for all results
        get_full_text: Whether to fetch full text content (when available in PMC)
        full_text_limit: Max number of full-text articles to retrieve
        llm: Language model for relevance filtering
        max_filtered_results: Maximum number of results to keep after filtering
        optimize_queries: Whether to optimize natural language queries for PubMed
        include_*_in_context: Boolean flags stored unchanged on the instance.
            NOTE(review): judging by their names they select which metadata
            fields are added to the result context; the consumer is not part
            of this method - confirm there
        max_mesh_terms: Stored unchanged; presumably caps the MeSH terms
            shown per article - confirm against the consumer
        max_keywords: Stored unchanged; presumably caps the keywords shown
            per article - confirm against the consumer
    """
    # The base class receives the LLM and both result limits
    super().__init__(
        llm=llm,
        max_filtered_results=max_filtered_results,
        max_results=max_results,
    )
    # Enforce a floor of 25 on whatever the base class stored
    self.max_results = max(self.max_results, 25)

    # Retrieval behaviour
    self.api_key = api_key
    self.days_limit = days_limit
    self.get_abstracts = get_abstracts
    self.get_full_text = get_full_text
    self.full_text_limit = full_text_limit
    self.optimize_queries = optimize_queries

    # Context flags and limits, stored exactly as passed in
    self.include_publication_type_in_context = (
        include_publication_type_in_context
    )
    self.include_journal_in_context = include_journal_in_context
    self.include_year_in_context = include_year_in_context
    self.include_authors_in_context = include_authors_in_context
    self.include_full_date_in_context = include_full_date_in_context
    self.include_mesh_terms_in_context = include_mesh_terms_in_context
    self.include_keywords_in_context = include_keywords_in_context
    self.include_doi_in_context = include_doi_in_context
    self.include_pmid_in_context = include_pmid_in_context
    self.include_pmc_availability_in_context = (
        include_pmc_availability_in_context
    )
    self.max_mesh_terms = max_mesh_terms
    self.max_keywords = max_keywords
    self.include_citation_in_context = include_citation_in_context
    self.include_language_in_context = include_language_in_context

    # E-utilities endpoints, all derived from one root URL
    eutils_root = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    self.base_url = eutils_root
    self.search_url = f"{eutils_root}/esearch.fcgi"
    self.summary_url = f"{eutils_root}/esummary.fcgi"
    self.fetch_url = f"{eutils_root}/efetch.fcgi"
    self.link_url = f"{eutils_root}/elink.fcgi"

    # PMC base URL for full text
    self.pmc_url = "https://www.ncbi.nlm.nih.gov/pmc/articles/"

108

def _get_result_count(self, query: str) -> int:
    """
    Get the total number of results for a query without retrieving the results themselves.

    Sends an esearch request with ``retmax=0`` and reads the ``count`` field
    of the JSON reply. The rate limiter is applied before the request and
    its return value is stored in ``self._last_wait_time``.

    Args:
        query: The search query

    Returns:
        Total number of matching results, or 0 if the request or the
        parsing of the reply raises
    """
    try:
        params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": 0,  # Don't need actual results, just the count
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute search request
        response = safe_get(self.search_url, params=params)
        response.raise_for_status()

        # The API reports the count as a string
        count = int(response.json()["esearchresult"]["count"])

        # loguru formats with "{}" placeholders, not "%s"; an f-string
        # (as used by the other log calls in this class) interpolates
        # the values directly.
        logger.info(f"Query '{query}' has {count} total results in PubMed")
        return count

    except Exception:
        # Best effort: callers treat 0 as "unknown / rare topic"
        logger.exception("Error getting result count")
        return 0

152

153 def _extract_core_terms(self, query: str) -> str:

154 """

155 Extract core terms from a complex query for volume estimation.

156

157 Args:

158 query: PubMed query string

159

160 Returns:

161 Simplified query with core terms

162 """

163 # Remove field specifications and operators

164 simplified = re.sub(r"\[\w+\]", "", query) # Remove [Field] tags

165 simplified = re.sub(

166 r"\b(AND|OR|NOT)\b", "", simplified

167 ) # Remove operators

168

169 # Remove quotes and parentheses

170 simplified = (

171 simplified.replace('"', "").replace("(", "").replace(")", "")

172 )

173

174 # Split by whitespace and join terms with 4+ chars (likely meaningful)

175 terms = [term for term in simplified.split() if len(term) >= 4]

176

177 # Join with AND to create a basic search

178 return " ".join(terms[:5]) # Limit to top 5 terms

179

180 def _expand_time_window(self, time_filter: str) -> str:

181 """

182 Expand a time window to get more results.

183

184 Args:

185 time_filter: Current time filter

186

187 Returns:

188 Expanded time filter

189 """

190 # Parse current time window

191 import re

192

193 match = re.match(r'"last (\d+) (\w+)"[pdat]', time_filter)

194 if not match:

195 return '"last 10 years"[pdat]'

196

197 amount, unit = int(match.group(1)), match.group(2)

198

199 # Expand based on current unit

200 if unit == "months" or unit == "month":

201 if amount < 6:

202 return '"last 6 months"[pdat]'

203 elif amount < 12: 203 ↛ 206line 203 didn't jump to line 206 because the condition on line 203 was always true

204 return '"last 1 year"[pdat]'

205 else:

206 return '"last 2 years"[pdat]'

207 elif unit == "years" or unit == "year": 207 ↛ 215line 207 didn't jump to line 215 because the condition on line 207 was always true

208 if amount < 2:

209 return '"last 2 years"[pdat]'

210 elif amount < 5: 210 ↛ 213line 210 didn't jump to line 213 because the condition on line 210 was always true

211 return '"last 5 years"[pdat]'

212 else:

213 return '"last 10 years"[pdat]'

214

215 return '"last 10 years"[pdat]'

216

217 def _optimize_query_for_pubmed(self, query: str) -> str:

218 """

219 Optimize a natural language query for PubMed search.

220 Uses LLM to transform questions into effective keyword-based queries.

221

222 Args:

223 query: Natural language query

224

225 Returns:

226 Optimized query string for PubMed

227 """

228 if not self.llm or not self.optimize_queries:

229 # Return original query if no LLM available or optimization disabled

230 return query

231

232 try:

233 # Prompt for query optimization

234 prompt = f"""Transform this natural language question into an optimized PubMed search query.

235

236Original query: "{query}"

237

238CRITICAL RULES:

2391. ONLY RETURN THE EXACT SEARCH QUERY - NO EXPLANATIONS, NO COMMENTS

2402. DO NOT wrap the entire query in quotes

2413. DO NOT include ANY date restrictions or year filters

2424. Use parentheses around OR statements: (term1[Field] OR term2[Field])

2435. Use only BASIC MeSH terms - stick to broad categories like "Vaccines"[Mesh]

2446. KEEP IT SIMPLE - use 2-3 main concepts maximum

2457. Focus on Title/Abstract searches for reliability: term[Title/Abstract]

2468. Use wildcards for variations: vaccin*[Title/Abstract]

247

248EXAMPLE QUERIES:

249✓ GOOD: (mRNA[Title/Abstract] OR "messenger RNA"[Title/Abstract]) AND vaccin*[Title/Abstract]

250✓ GOOD: (influenza[Title/Abstract] OR flu[Title/Abstract]) AND treatment[Title/Abstract]

251✗ BAD: (mRNA[Title/Abstract]) AND "specific disease"[Mesh] AND treatment[Title/Abstract] AND 2023[dp]

252✗ BAD: "Here's a query to find articles about vaccines..."

253

254Return ONLY the search query without any explanations.

255"""

256

257 # Get response from LLM

258 response = self.llm.invoke(prompt)

259 raw_response = response.content.strip()

260

261 # Clean up the query - extract only the actual query and remove any explanations

262 # First check if there are multiple lines and take the first non-empty line

263 lines = raw_response.split("\n")

264 cleaned_lines = [line.strip() for line in lines if line.strip()]

265

266 if cleaned_lines: 266 ↛ 316line 266 didn't jump to line 316 because the condition on line 266 was always true

267 optimized_query = cleaned_lines[0]

268

269 # Remove any quotes that wrap the entire query

270 if optimized_query.startswith('"') and optimized_query.endswith( 270 ↛ 273line 270 didn't jump to line 273 because the condition on line 270 was never true

271 '"'

272 ):

273 optimized_query = optimized_query[1:-1]

274

275 # Remove any explanation phrases that might be at the beginning

276 explanation_starters = [

277 "here is",

278 "here's",

279 "this query",

280 "the following",

281 ]

282 for starter in explanation_starters:

283 if optimized_query.lower().startswith(starter): 283 ↛ 285line 283 didn't jump to line 285 because the condition on line 283 was never true

284 # Find the actual query part - typically after a colon

285 colon_pos = optimized_query.find(":")

286 if colon_pos > 0:

287 optimized_query = optimized_query[

288 colon_pos + 1 :

289 ].strip()

290

291 # Check if the query still seems to contain explanations

292 if ( 292 ↛ 298line 292 didn't jump to line 298 because the condition on line 292 was never true

293 len(optimized_query) > 200

294 or "this query will" in optimized_query.lower()

295 ):

296 # It's probably still an explanation - try to extract just the query part

297 # Look for common patterns in the explanation like parentheses

298 pattern = r"\([^)]+\)\s+AND\s+"

299 import re

300

301 matches = re.findall(pattern, optimized_query)

302 if matches:

303 # Extract just the query syntax parts

304 query_parts = []

305 for part in re.split(r"\.\s+", optimized_query):

306 if (

307 "(" in part

308 and ")" in part

309 and ("AND" in part or "OR" in part)

310 ):

311 query_parts.append(part)

312 if query_parts:

313 optimized_query = " ".join(query_parts)

314 else:

315 # Fall back to original query if cleaning fails

316 logger.warning(

317 "Failed to extract a clean query from LLM response"

318 )

319 optimized_query = query

320

321 # Final safety check - if query looks too much like an explanation, use original

322 if len(optimized_query.split()) > 30: 322 ↛ 323line 322 didn't jump to line 323 because the condition on line 322 was never true

323 logger.warning(

324 "Query too verbose, falling back to simpler form"

325 )

326 # Create a simple query from the original

327 words = [

328 w

329 for w in query.split()

330 if len(w) > 3

331 and w.lower()

332 not in (

333 "what",

334 "are",

335 "the",

336 "and",

337 "for",

338 "with",

339 "from",

340 "have",

341 "been",

342 "recent",

343 )

344 ]

345 optimized_query = " AND ".join(words[:3])

346

347 # Basic cleanup: standardize field tag case for consistency

348 import re

349

350 optimized_query = re.sub(

351 r"\[mesh\]", "[Mesh]", optimized_query, flags=re.IGNORECASE

352 )

353 optimized_query = re.sub(

354 r"\[title/abstract\]",

355 "[Title/Abstract]",

356 optimized_query,

357 flags=re.IGNORECASE,

358 )

359 optimized_query = re.sub(

360 r"\[publication type\]",

361 "[Publication Type]",

362 optimized_query,

363 flags=re.IGNORECASE,

364 )

365

366 # Fix unclosed quotes followed by field tags

367 # Pattern: "term[Field] -> "term"[Field]

368 optimized_query = re.sub(r'"([^"]+)\[', r'"\1"[', optimized_query)

369

370 # Simplify the query if still no results are found

371 self._simplify_query_cache = optimized_query

372

373 # Log original and optimized queries

374 logger.info("Original query: '%s'", query)

375 logger.info(f"Optimized for PubMed: '{optimized_query}'")

376 logger.debug(

377 f"Query optimization complete: '{query[:50]}...' -> '{optimized_query[:100]}...'"

378 )

379

380 return optimized_query

381

382 except Exception:

383 logger.exception("Error optimizing query")

384 logger.debug(f"Falling back to original query: '{query}'")

385 return query # Fall back to original query on error

386

def _simplify_query(self, query: str) -> str:
    """
    Broaden a PubMed query that returned no results.

    Drops the [Mesh] and [Publication Type] field tags (matched
    case-insensitively) so those terms search all fields, keeps
    [Title/Abstract] tags, and collapses runs of whitespace.

    Args:
        query: The original query that returned no results

    Returns:
        Simplified query (whitespace-normalized even when no tag was removed)
    """
    logger.info(f"Simplifying query: {query}")
    logger.debug(f"Query simplification started for: '{query[:100]}...'")

    # Tags whose removal widens the search; [Title/Abstract] is kept on purpose
    broadened = query
    for tag_pattern in (r"\[Mesh\]", r"\[Publication Type\]"):
        broadened = re.sub(tag_pattern, "", broadened, flags=re.IGNORECASE)

    # Removing tags can leave double spaces behind
    broadened = re.sub(r"\s+", " ", broadened).strip()

    if broadened == query:
        logger.debug("No simplification possible, returning original query")

    logger.info(f"Simplified query: {broadened}")
    logger.debug(
        f"Query simplified from {len(query)} to {len(broadened)} chars"
    )
    return broadened

428

429 def _is_historical_focused(self, query: str) -> bool:

430 """

431 Determine if a query is specifically focused on historical/older information using LLM.

432 Default assumption is that queries should prioritize recent information unless

433 explicitly asking for historical content.

434

435 Args:

436 query: The search query

437

438 Returns:

439 Boolean indicating if the query is focused on historical information

440 """

441 if not self.llm:

442 # Fall back to basic keyword check if no LLM available

443 historical_terms = [

444 "history",

445 "historical",

446 "early",

447 "initial",

448 "first",

449 "original",

450 "before",

451 "prior to",

452 "origins",

453 "evolution",

454 "development",

455 ]

456 historical_years = [str(year) for year in range(1900, 2020)]

457

458 query_lower = query.lower()

459 has_historical_term = any(

460 term in query_lower for term in historical_terms

461 )

462 has_past_year = any(year in query for year in historical_years)

463

464 return has_historical_term or has_past_year

465

466 try:

467 # Use LLM to determine if the query is focused on historical information

468 prompt = f"""Determine if this query is specifically asking for HISTORICAL or OLDER information.

469

470Query: "{query}"

471

472Answer ONLY "yes" if the query is clearly asking for historical, early, original, or past information from more than 5 years ago.

473Answer ONLY "no" if the query is asking about recent, current, or new information, or if it's a general query without a specific time focus.

474

475The default assumption should be that medical and scientific queries want RECENT information unless clearly specified otherwise.

476"""

477

478 response = self.llm.invoke(prompt)

479 answer = response.content.strip().lower()

480

481 # Log the determination

482 logger.info(f"Historical focus determination for query: '{query}'")

483 logger.info(f"LLM determined historical focus: {answer}")

484

485 return "yes" in answer

486

487 except Exception:

488 logger.exception("Error determining historical focus")

489 # Fall back to basic keyword check

490 historical_terms = [

491 "history",

492 "historical",

493 "early",

494 "initial",

495 "first",

496 "original",

497 "before",

498 "prior to",

499 "origins",

500 "evolution",

501 "development",

502 ]

503 return any(term in query.lower() for term in historical_terms)

504

def _adaptive_search(self, query: str) -> Tuple[List[str], str]:
    """
    Search PubMed with a recency filter chosen from the topic's volume.

    Historical queries are searched without any date filter. Otherwise
    the estimated result count selects a window (more results -> tighter
    window). If that yields fewer than five PMIDs the window is expanded
    once, and if there are still none the query is re-run unfiltered.

    Args:
        query: The search query (already optimized)

    Returns:
        Tuple of (list of PMIDs, search strategy used)
    """
    widest_filter = '"last 10 years"[pdat]'

    # Both probes run up front, volume first
    volume = self._get_result_count(query)
    wants_history = self._is_historical_focused(query)

    if wants_history:
        # User wants historical information - no date filtering
        logger.info(
            "Using historical search strategy without date filtering"
        )
        return self._search_pubmed(query), "historical_focus"

    # (exclusive volume floor, recency filter, strategy label), busiest first
    tiers = (
        (5000, '"last 1 year"[pdat]', "high_volume"),
        (1000, '"last 3 years"[pdat]', "common_topic"),
        (100, '"last 5 years"[pdat]', "moderate_volume"),
    )
    # Rare topics still get a recency filter, just the widest one
    time_filter, strategy = widest_filter, "rare_topic"
    for floor, candidate_filter, label in tiers:
        if volume > floor:
            time_filter, strategy = candidate_filter, label
            break

    logger.info(
        f"Using adaptive search strategy: {strategy} with filter: {time_filter}"
    )
    results = self._search_pubmed(f"({query}) AND {time_filter}")

    # Too few hits: widen the window once, unless it is already the widest
    if len(results) < 5 and widest_filter not in time_filter:
        logger.info(
            f"Insufficient results ({len(results)}), expanding time window"
        )
        wider_filter = self._expand_time_window(time_filter)
        wider_results = self._search_pubmed(
            f"({query}) AND {wider_filter}"
        )

        if len(wider_results) > len(results):
            logger.info(
                f"Expanded time window yielded {len(wider_results)} results"
            )
            return wider_results, f"{strategy}_expanded"

    # Still nothing: last resort is the unfiltered query
    if not results:
        logger.info(
            "No results with time filter, trying without time restrictions"
        )
        results = self._search_pubmed(query)
        strategy = "no_time_filter"

    return results, strategy

582

def _search_pubmed(self, query: str) -> List[str]:
    """
    Run an esearch request and return the matching article IDs.

    The request asks for at most ``self.max_results`` IDs, adds the API
    key when one is configured and a publication-date window when
    ``self.days_limit`` is set. The rate limiter is applied first and its
    return value is stored in ``self._last_wait_time``.

    Args:
        query: The search query

    Returns:
        List of PubMed IDs matching the query; an empty list if anything
        in the request or parsing raises
    """
    try:
        request_params = {
            "db": "pubmed",
            "term": query,
            "retmode": "json",
            "retmax": self.max_results,
            "usehistory": "y",
        }

        # The API key is optional
        if self.api_key:
            request_params["api_key"] = self.api_key
            logger.debug("Using PubMed API key for higher rate limits")
        else:
            logger.debug("No PubMed API key - using default rate limits")

        # Optional relative date window on the publication date
        if self.days_limit:
            request_params.update(
                reldate=self.days_limit,
                datetype="pdat",
            )
            logger.debug(f"Limiting results to last {self.days_limit} days")

        logger.debug(
            f"PubMed search query: '{query}' with max_results={self.max_results}"
        )

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        logger.debug(f"Sending request to PubMed API: {self.search_url}")
        response = safe_get(self.search_url, params=request_params)
        response.raise_for_status()
        logger.debug(f"PubMed API response status: {response.status_code}")

        # "idlist" is required; "count" is only used for logging
        payload = response.json()
        id_list = payload["esearchresult"]["idlist"]
        total_count = payload["esearchresult"].get("count", "unknown")

        logger.info(
            f"PubMed search for '{query}' found {len(id_list)} results (total available: {total_count})"
        )
        if len(id_list) > 0:
            logger.debug(f"First 5 PMIDs: {id_list[:5]}")
        return id_list

    except Exception:
        logger.exception(f"Error searching PubMed for query '{query}'")
        return []

648

def _get_article_summaries(
    self, id_list: List[str]
) -> List[Dict[str, Any]]:
    """
    Fetch esummary records for the given PubMed IDs.

    Each returned dictionary carries the article's bibliographic fields
    plus a ``link`` to its PubMed page. IDs missing from the reply are
    skipped with a warning.

    Args:
        id_list: List of PubMed IDs

    Returns:
        List of article summary dictionaries, in ``id_list`` order; an
        empty list for empty input or on a non-rate-limit failure

    Raises:
        RateLimitError: When the failure message contains "429", "503",
            "403", "too many requests", "rate limit" or
            "service unavailable"
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_summaries")
        return []

    logger.debug(f"Fetching summaries for {len(id_list)} PubMed articles")

    try:
        request_params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "json",
            "rettype": "summary",
        }
        if self.api_key:
            request_params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        logger.debug(f"Requesting summaries from: {self.summary_url}")
        response = safe_get(self.summary_url, params=request_params)
        response.raise_for_status()
        logger.debug(f"Summary API response status: {response.status_code}")

        data = response.json()
        logger.debug(
            f"PubMed API returned data for {len(id_list)} requested IDs"
        )
        summaries = []

        for pmid in id_list:
            if pmid not in data["result"]:
                logger.warning(
                    f"PMID {pmid} not found in PubMed API response"
                )
                continue

            article = data["result"][pmid]
            field = article.get
            logger.debug(
                f"Processing article {pmid}: {article.get('title', 'NO TITLE')[:50]}"
            )

            # Author names, when the record lists any
            authors = (
                [author["name"] for author in article["authors"]]
                if "authors" in article
                else []
            )

            # Prefer the top-level DOI; otherwise take the first
            # articleids entry typed "doi"
            doi = field("doi", "")
            if not doi and "articleids" in article:
                doi = next(
                    (
                        aid.get("value", "")
                        for aid in article["articleids"]
                        if aid.get("idtype") == "doi"
                    ),
                    doi,
                )

            summaries.append(
                {
                    "id": pmid,
                    "title": field("title", ""),
                    "pubdate": field("pubdate", ""),
                    "epubdate": field("epubdate", ""),
                    "source": field("source", ""),
                    "authors": authors,
                    "lastauthor": field("lastauthor", ""),
                    "journal": field("fulljournalname", ""),
                    "volume": field("volume", ""),
                    "issue": field("issue", ""),
                    "pages": field("pages", ""),
                    "doi": doi,
                    "issn": field("issn", ""),
                    "essn": field("essn", ""),
                    # Publication types from esummary
                    "pubtype": field("pubtype", []),
                    "recordstatus": field("recordstatus", ""),
                    "lang": field("lang", []),
                    "pmcrefcount": field("pmcrefcount", None),
                    "link": f"https://pubmed.ncbi.nlm.nih.gov/{pmid}/",
                }
            )

        return summaries

    except Exception as e:
        error_msg = str(e)
        logger.exception(
            f"Error getting article summaries for {len(id_list)} articles"
        )

        # Markers treated as throttling; digits are unaffected by
        # lower-casing, so one lowered copy serves every check
        lowered = error_msg.lower()
        throttle_markers = (
            "429",
            "too many requests",
            "rate limit",
            "service unavailable",
            "503",
            "403",
        )
        if any(marker in lowered for marker in throttle_markers):
            raise RateLimitError(f"PubMed rate limit hit: {error_msg}")

        return []

773

def _get_article_abstracts(self, id_list: List[str]) -> Dict[str, str]:
    """
    Get abstracts for a list of PubMed article IDs.

    Requests the records as XML from efetch and concatenates every
    ``AbstractText`` element of each article. For abstracts split into
    several sections, each labelled section is rendered as
    ``"LABEL: text"`` and sections are separated by a blank line; a
    single-section abstract is returned as its plain text.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their abstracts. Articles
        without abstract text are omitted; an empty dict is returned for
        empty input or when the request or parsing raises.
    """
    if not id_list:
        logger.debug("Empty ID list provided to _get_article_abstracts")
        return {}

    logger.debug(f"Fetching abstracts for {len(id_list)} PubMed articles")

    try:
        # Prepare parameters
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "abstract",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )
        logger.debug(
            f"Applied rate limit wait: {self._last_wait_time:.2f}s"
        )

        # Execute request
        logger.debug(f"Requesting abstracts from: {self.fetch_url}")
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()
        logger.debug(
            f"Abstract fetch response status: {response.status_code}, size: {len(response.text)} bytes"
        )

        # Parse XML response
        root = ET.fromstring(response.text)
        logger.debug(
            f"Parsing abstracts from XML for {len(id_list)} articles"
        )

        abstracts = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            # Some abstracts are split into multiple sections
            abstract_sections = article.findall(".//AbstractText")
            multi_section = len(abstract_sections) > 1
            if multi_section:
                logger.debug(
                    f"Article {pmid} has {len(abstract_sections)} abstract sections"
                )

            # Build the abstract from the section list only, so the first
            # section is not emitted twice.
            parts = []
            for section in abstract_sections:
                # itertext() also collects text nested inside inline
                # markup (<i>, <sub>, ...); ``.text`` alone would stop at
                # the first child element and truncate the section.
                section_text = "".join(section.itertext()).strip()
                if not section_text:
                    continue

                label = section.get("Label")
                if label and multi_section:
                    parts.append(f"{label}: {section_text}")
                else:
                    parts.append(section_text)

            abstract_text = "\n\n".join(parts)

            # Store in dictionary
            if pmid and abstract_text:
                abstracts[pmid] = abstract_text
                logger.debug(
                    f"Abstract for {pmid}: {len(abstract_text)} chars"
                )
            elif pmid:
                logger.warning(f"No abstract found for PMID {pmid}")

        logger.info(
            f"Successfully retrieved {len(abstracts)} abstracts out of {len(id_list)} requested"
        )
        return abstracts

    except Exception:
        logger.exception(
            f"Error getting article abstracts for {len(id_list)} articles"
        )
        return {}

883

def _get_article_detailed_metadata(
    self, id_list: List[str]
) -> Dict[str, Dict[str, Any]]:
    """
    Get detailed metadata for PubMed articles including publication types,
    MeSH terms, keywords, and affiliations.

    Grant information, the article's own PMC ID and a conflict of interest
    statement are extracted as well. A field is only added to an article's
    dictionary when the response contains a value for it.

    Args:
        id_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their detailed metadata. An empty
        dictionary is returned when ``id_list`` is empty or when the request
        or the parsing fails (the error is logged, not raised).
    """
    if not id_list:
        return {}

    try:
        # Prepare parameters
        params = {
            "db": "pubmed",
            "id": ",".join(id_list),
            "retmode": "xml",
            "rettype": "medline",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute request
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)

        metadata: Dict[str, Dict[str, Any]] = {}

        for article in root.findall(".//PubmedArticle"):
            pmid_elem = article.find(".//PMID")
            pmid = pmid_elem.text if pmid_elem is not None else None

            if pmid is None:
                continue

            article_metadata: Dict[str, Any] = {}

            # Extract publication types
            pub_types = [
                pub_type.text
                for pub_type in article.findall(".//PublicationType")
                if pub_type.text
            ]
            if pub_types:
                article_metadata["publication_types"] = pub_types

            # Extract MeSH terms (descriptor name of every heading)
            mesh_terms = []
            for mesh in article.findall(".//MeshHeading"):
                descriptor = mesh.find(".//DescriptorName")
                if descriptor is not None and descriptor.text:
                    mesh_terms.append(descriptor.text)
            if mesh_terms:
                article_metadata["mesh_terms"] = mesh_terms

            # Extract keywords
            keywords = [
                keyword.text
                for keyword in article.findall(".//Keyword")
                if keyword.text
            ]
            if keywords:
                article_metadata["keywords"] = keywords

            # Extract affiliations
            affiliations = [
                affiliation.text
                for affiliation in article.findall(".//Affiliation")
                if affiliation.text
            ]
            if affiliations:
                article_metadata["affiliations"] = affiliations

            # Extract grant information; a grant is kept when it has at
            # least an ID or an agency
            grants = []
            for grant in article.findall(".//Grant"):
                grant_info = {}
                grant_id = grant.find(".//GrantID")
                if grant_id is not None and grant_id.text:
                    grant_info["id"] = grant_id.text
                agency = grant.find(".//Agency")
                if agency is not None and agency.text:
                    grant_info["agency"] = agency.text
                if grant_info:
                    grants.append(grant_info)
            if grants:
                article_metadata["grants"] = grants

            # Check for free full text in PMC.
            # Only the article's own identifier list is consulted: a
            # descendant search (".//ArticleId") would also match the
            # identifiers of cited works, which the PubMed DTD places under
            # PubmedData/ReferenceList/Reference/ArticleIdList, and would
            # report a cited paper's PMC ID as this article's.
            pmc_elem = article.find(
                "./PubmedData/ArticleIdList/ArticleId[@IdType='pmc']"
            )
            if pmc_elem is not None:
                article_metadata["has_free_full_text"] = True
                article_metadata["pmc_id"] = pmc_elem.text

            # Extract conflict of interest statement
            coi_elem = article.find(".//CoiStatement")
            if coi_elem is not None and coi_elem.text:
                article_metadata["conflict_of_interest"] = coi_elem.text

            metadata[pmid] = article_metadata

        return metadata

    except Exception:
        logger.exception("Error getting detailed article metadata")
        return {}

1001

1002 def _create_enriched_content(

1003 self, result: Dict[str, Any], base_content: str

1004 ) -> str:

1005 """

1006 Create enriched content by adding relevant metadata context to help the LLM.

1007

1008 Args:

1009 result: The result dictionary with metadata

1010 base_content: The base content (abstract or full text)

1011

1012 Returns:

1013 Enriched content string with metadata context

1014 """

1015 enriched_parts = []

1016

1017 # Add study type information

1018 if "publication_types" in result:

1019 pub_types = result["publication_types"]

1020 # Filter for significant types

1021 significant_types = [

1022 pt

1023 for pt in pub_types

1024 if any(

1025 key in pt.lower()

1026 for key in [

1027 "clinical trial",

1028 "randomized",

1029 "meta-analysis",

1030 "systematic review",

1031 "case report",

1032 "guideline",

1033 "comparative study",

1034 "multicenter",

1035 ]

1036 )

1037 ]

1038 if significant_types: 1038 ↛ 1044line 1038 didn't jump to line 1044 because the condition on line 1038 was always true

1039 enriched_parts.append(

1040 f"[Study Type: {', '.join(significant_types)}]"

1041 )

1042

1043 # Add the main content

1044 enriched_parts.append(base_content)

1045

1046 # Add metadata footer

1047 metadata_footer = []

1048

1049 # Add ALL MeSH terms

1050 if "mesh_terms" in result and len(result["mesh_terms"]) > 0:

1051 metadata_footer.append(

1052 f"Medical Topics (MeSH): {', '.join(result['mesh_terms'])}"

1053 )

1054

1055 # Add ALL keywords

1056 if "keywords" in result and len(result["keywords"]) > 0: 1056 ↛ 1057line 1056 didn't jump to line 1057 because the condition on line 1056 was never true

1057 metadata_footer.append(f"Keywords: {', '.join(result['keywords'])}")

1058

1059 # Add ALL affiliations

1060 if "affiliations" in result and len(result["affiliations"]) > 0: 1060 ↛ 1061line 1060 didn't jump to line 1061 because the condition on line 1060 was never true

1061 if len(result["affiliations"]) == 1:

1062 metadata_footer.append(

1063 f"Institution: {result['affiliations'][0]}"

1064 )

1065 else:

1066 affiliations_text = "\n - " + "\n - ".join(

1067 result["affiliations"]

1068 )

1069 metadata_footer.append(f"Institutions:{affiliations_text}")

1070

1071 # Add ALL funding information with full details

1072 if "grants" in result and len(result["grants"]) > 0:

1073 grant_details = []

1074 for grant in result["grants"]:

1075 grant_text = []

1076 if "agency" in grant: 1076 ↛ 1078line 1076 didn't jump to line 1078 because the condition on line 1076 was always true

1077 grant_text.append(grant["agency"])

1078 if "id" in grant: 1078 ↛ 1080line 1078 didn't jump to line 1080 because the condition on line 1078 was always true

1079 grant_text.append(f"(Grant ID: {grant['id']})")

1080 if grant_text: 1080 ↛ 1074line 1080 didn't jump to line 1074 because the condition on line 1080 was always true

1081 grant_details.append(" ".join(grant_text))

1082 if grant_details: 1082 ↛ 1090line 1082 didn't jump to line 1090 because the condition on line 1082 was always true

1083 if len(grant_details) == 1: 1083 ↛ 1086line 1083 didn't jump to line 1086 because the condition on line 1083 was always true

1084 metadata_footer.append(f"Funded by: {grant_details[0]}")

1085 else:

1086 funding_text = "\n - " + "\n - ".join(grant_details)

1087 metadata_footer.append(f"Funding Sources:{funding_text}")

1088

1089 # Add FULL conflict of interest statement

1090 if "conflict_of_interest" in result: 1090 ↛ 1091line 1090 didn't jump to line 1091 because the condition on line 1090 was never true

1091 coi_text = result["conflict_of_interest"]

1092 if coi_text:

1093 # Still skip trivial "no conflict" statements to reduce noise

1094 if not any(

1095 phrase in coi_text.lower()

1096 for phrase in [

1097 "no conflict",

1098 "no competing",

1099 "nothing to disclose",

1100 "none declared",

1101 "authors declare no",

1102 ]

1103 ):

1104 metadata_footer.append(f"Conflict of Interest: {coi_text}")

1105 elif (

1106 "but" in coi_text.lower()

1107 or "except" in coi_text.lower()

1108 or "however" in coi_text.lower()

1109 ):

1110 # Include if there's a "no conflict BUT..." type statement

1111 metadata_footer.append(f"Conflict of Interest: {coi_text}")

1112

1113 # Combine everything

1114 if metadata_footer:

1115 enriched_parts.append("\n---\nStudy Metadata:")

1116 enriched_parts.extend(metadata_footer)

1117

1118 return "\n".join(enriched_parts)

1119

def _find_pmc_ids(self, pmid_list: List[str]) -> Dict[str, str]:
    """
    Find PMC IDs for the given PubMed IDs (for full-text access).

    Args:
        pmid_list: List of PubMed IDs

    Returns:
        Dictionary mapping PubMed IDs to their PMC IDs (if available).
        Empty when ``pmid_list`` is empty, when full-text retrieval is
        disabled (``self.get_full_text`` is falsy) or when the lookup fails
        (the error is logged, not raised).
    """
    if not pmid_list or not self.get_full_text:
        return {}

    try:
        # Prepare parameters
        params = {
            "dbfrom": "pubmed",
            "db": "pmc",
            "linkname": "pubmed_pmc",
            # One "id" parameter per PMID. ELink runs a separate link
            # operation for each "id" parameter, which is what keeps every
            # PMID paired with its own PMC links; a single comma-joined
            # value is answered with one merged linkset, so only the first
            # PMID could be mapped (and possibly to another article's ID).
            # NOTE(review): relies on safe_get handing `params` to requests
            # unchanged, which encodes a list as repeated keys - confirm.
            "id": [str(pmid) for pmid in pmid_list],
            "retmode": "json",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute request
        response = safe_get(self.link_url, params=params)
        response.raise_for_status()

        # Parse response
        data = response.json()

        # Map PubMed IDs to PMC IDs
        pmid_to_pmcid: Dict[str, str] = {}

        for linkset in data.get("linksets", []):
            # An absent or empty "ids" list must not abort the whole batch
            source_ids = linkset.get("ids") or []
            if not source_ids or not source_ids[0]:
                continue
            pmid = str(source_ids[0])

            for link in linkset.get("linksetdbs", []):
                if link.get("linkname") != "pubmed_pmc":
                    continue
                pmcids = link.get("links", [])
                if pmcids:
                    # Use the first linked PMC record
                    pmid_to_pmcid[pmid] = f"PMC{pmcids[0]}"

        logger.info(
            f"Found {len(pmid_to_pmcid)} PMC IDs for full-text access"
        )
        return pmid_to_pmcid

    except Exception:
        logger.exception("Error finding PMC IDs")
        return {}

1181

def _get_pmc_full_text(self, pmcid: str) -> str:
    """
    Get full text for a PMC article.

    The text is assembled as markdown-like chunks joined by blank lines:
    ``# <article title>``, an ``## Abstract`` part, then the body with one
    ``## <section title>`` heading per titled section followed by its
    paragraphs, in document order.

    Args:
        pmcid: PMC ID of the article

    Returns:
        Full text content or empty string if not available (request or
        parsing errors are logged, not raised)
    """
    try:
        # Prepare parameters
        params = {
            "db": "pmc",
            "id": pmcid,
            "retmode": "xml",
            "rettype": "full",
        }

        # Add API key if available
        if self.api_key:
            params["api_key"] = self.api_key

        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Execute request
        response = safe_get(self.fetch_url, params=params)
        response.raise_for_status()

        # Parse XML response
        root = ET.fromstring(response.text)

        # Extract full text
        full_text: List[str] = []

        def heading_text(elem) -> str:
            # itertext() keeps text inside inline markup (e.g. italics),
            # which `.text` alone would cut off at the first child element
            return "".join(elem.itertext()).strip()

        def collect(node) -> None:
            # Walk the tree once, in document order. Each paragraph is
            # emitted exactly once under the heading of its nearest section;
            # running ".//p" for every ".//sec" would repeat the paragraphs
            # of nested sections once per enclosing section.
            for child in node:
                if child.tag == "sec":
                    # Direct child only, so a section without a title of
                    # its own does not borrow a nested section's title
                    heading = child.find("title")
                    if heading is not None:
                        text = heading_text(heading)
                        if text:
                            full_text.append(f"\n## {text}\n")
                    collect(child)
                elif child.tag == "p":
                    text = "".join(child.itertext())
                    if text:
                        full_text.append(text)
                elif child.tag != "title":
                    # Descend into wrappers (lists, boxed text, captions...)
                    # so the paragraphs they contain are still collected
                    collect(child)

        # Extract article title
        title_elem = root.find(".//article-title")
        if title_elem is not None:
            article_title = heading_text(title_elem)
            if article_title:
                full_text.append(f"# {article_title}")

        # Extract abstract
        abstract_paras = root.findall(".//abstract//p")
        if abstract_paras:
            full_text.append("\n## Abstract\n")
            for p in abstract_paras:
                text = "".join(p.itertext())
                if text:
                    full_text.append(text)

        # Extract body content (sections and any paragraphs outside them)
        body = root.find(".//body")
        if body is not None:
            collect(body)

        result_text = "\n\n".join(full_text)
        logger.debug(
            f"Successfully extracted {len(result_text)} chars of PMC full text with {len(full_text)} sections"
        )
        return result_text

    except Exception:
        logger.exception("Error getting PMC full text")
        return ""

1257

def _get_previews(self, query: str) -> List[Dict[str, Any]]:
    """
    Build preview entries for the PubMed articles matching a query.

    The query is first passed through ``_optimize_query_for_pubmed`` and
    searched with ``_adaptive_search``. When that yields no PMIDs, a single
    retry is made with the output of ``_simplify_query`` (if it differs).
    Summaries and abstracts are then fetched and combined into one snippet
    per article of the form ``Title / Abstract / Metadata``.

    Args:
        query: The search query

    Returns:
        List of preview dictionaries (empty when no article was found)
    """
    logger.info(f"Getting PubMed previews for query: {query}")

    # Optimize the query for PubMed, then run the adaptive search
    pubmed_query = self._optimize_query_for_pubmed(query)
    pmid_list, strategy = self._adaptive_search(pubmed_query)

    if not pmid_list:
        logger.warning(f"No PubMed results found using strategy: {strategy}")
        # One retry with a simplified query, if simplification changed it
        fallback_query = self._simplify_query(pubmed_query)
        if fallback_query != pubmed_query:
            logger.info(f"Trying with simplified query: {fallback_query}")
            pmid_list, strategy = self._adaptive_search(fallback_query)
            if pmid_list:
                logger.info(f"Simplified query found {len(pmid_list)} results")

        if not pmid_list:
            logger.warning("No PubMed results found after query simplification")
            return []

    # Article summaries
    logger.debug(f"Fetching article summaries for {len(pmid_list)} PMIDs")
    summaries = self._get_article_summaries(pmid_list)
    logger.debug(f"Retrieved {len(summaries)} summaries")

    # Abstracts are always fetched so every snippet carries context
    logger.debug(
        f"Fetching abstracts for {len(pmid_list)} articles for snippet enrichment"
    )
    abstracts = self._get_article_abstracts(pmid_list)
    logger.debug(f"Retrieved {len(abstracts)} abstracts")

    previews = []
    for summary in summaries:
        # Metadata fields, each gated by its own include_* preference
        fields = []

        # Publication type from esummary becomes a "[type] " prefix
        type_tag = ""
        if self.include_publication_type_in_context and summary.get("pubtype"):
            type_tag = f"[{summary['pubtype'][0]}] "

        if self.include_authors_in_context and summary.get("authors"):
            author_line = ", ".join(summary.get("authors", []))
            if len(author_line) > 100:
                # Cap long author lists at 100 characters
                author_line = author_line[:97] + "..."
            fields.append(author_line)

        if self.include_journal_in_context and summary.get("journal"):
            fields.append(summary["journal"])

        # Full date takes precedence over the year-only form
        pubdate = summary.get("pubdate")
        if pubdate:
            if self.include_full_date_in_context:
                fields.append(pubdate)
            elif self.include_year_in_context and len(pubdate) >= 4:
                fields.append(pubdate[:4])

        if self.include_citation_in_context:
            citation = [
                f"{label} {summary[key]}"
                for label, key in (
                    ("Vol", "volume"),
                    ("Issue", "issue"),
                    ("pp", "pages"),
                )
                if summary.get(key)
            ]
            if citation:
                fields.append(f"({', '.join(citation)})")

        # Join the fields, or fall back to a generic label
        if not fields:
            snippet = "Research article"
        else:
            # Period separator when authors are enabled, dash otherwise
            separator = ". " if self.include_authors_in_context else " - "
            snippet = separator.join(fields)

        snippet = type_tag + snippet

        # Language marker for anything that is not "eng"
        if self.include_language_in_context and summary.get("lang"):
            primary_lang = summary["lang"][0]
            if primary_lang and primary_lang != "eng":
                snippet = f"{snippet} [{primary_lang.upper()}]"

        # Identifiers
        identifiers = []
        if self.include_pmid_in_context and summary.get("id"):
            identifiers.append(f"PMID: {summary['id']}")
        if self.include_doi_in_context and summary.get("doi"):
            identifiers.append(f"DOI: {summary['doi']}")
        if identifiers:
            snippet = f"{snippet} | {' | '.join(identifiers)}"

        # Title and abstract always go into the snippet
        pmid = summary["id"]
        title = summary["title"]
        abstract_text = abstracts.get(pmid, "")

        # Cap the abstract at 500 characters
        if len(abstract_text) > 500:
            abstract_text = abstract_text[:497] + "..."

        sections = [f"Title: {title}"]
        if abstract_text:
            sections.append(f"Abstract: {abstract_text}")
        sections.append(f"Metadata: {snippet}")
        enriched_snippet = "\n\n".join(sections)

        # Log the complete snippet for debugging
        logger.debug(f"Complete snippet for PMID {pmid}:")
        logger.debug(f"  Title: {title[:100]}...")
        logger.debug(f"  Abstract length: {len(abstract_text)} chars")
        logger.debug(f"  Metadata: {snippet}")
        logger.debug(
            f"  Full enriched snippet ({len(enriched_snippet)} chars): {enriched_snippet[:500]}..."
        )

        previews.append(
            {
                "id": summary["id"],
                "title": summary["title"],
                "link": summary["link"],
                "snippet": enriched_snippet,
                "authors": summary.get("authors", []),
                "journal": summary.get("journal", ""),
                "pubdate": summary.get("pubdate", ""),
                "doi": summary.get("doi", ""),
                "source": "PubMed",
                # Underscore-prefixed keys are temporary: the PMID for later
                # lookups and the search strategy for analytics
                "_pmid": summary["id"],
                "_search_strategy": strategy,
            }
        )

    logger.info(
        f"Found {len(previews)} PubMed previews using strategy: {strategy}"
    )
    if previews:
        logger.debug(
            f"Sample preview title: '{previews[0].get('title', 'NO TITLE')[:80]}...'"
        )
    return previews

1436

def _get_full_content(
    self, relevant_items: List[Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Get full content for the relevant PubMed articles.
    Efficiently manages which content to retrieve (abstracts and/or full text).

    For every item a copy is made and enriched with detailed metadata
    (publication types, MeSH terms, keywords, affiliations, grants, conflict
    of interest, PMC availability), the abstract, and - for at most
    ``self.full_text_limit`` articles that have a PMC ID - the PMC full text.
    Whenever an abstract is available it is used as the content unless full
    text was actually retrieved. The temporary ``_pmid`` and
    ``_search_strategy`` keys are removed from the returned dictionaries.

    Args:
        relevant_items: List of relevant preview dictionaries

    Returns:
        List of result dictionaries with full content
    """
    # Check if we should add full content
    snippets_only_mode = (
        hasattr(search_config, "SEARCH_SNIPPETS_ONLY")
        and search_config.SEARCH_SNIPPETS_ONLY
    )

    if snippets_only_mode:
        # Abstracts are still fetched (they serve as snippets); only the
        # full-text retrieval is skipped
        logger.info(
            "Snippet-only mode enabled, will fetch abstracts as snippets"
        )

    logger.info(
        f"Getting content for {len(relevant_items)} PubMed articles"
    )

    # Collect all PMIDs for relevant items
    pmids = [item["_pmid"] for item in relevant_items if "_pmid" in item]

    # Get abstracts if requested and PMIDs exist
    # In snippet-only mode, always get abstracts as they serve as snippets
    abstracts = {}
    if (self.get_abstracts or snippets_only_mode) and pmids:
        abstracts = self._get_article_abstracts(pmids)

    # Get detailed metadata for all articles (publication types, MeSH terms, etc.)
    detailed_metadata = {}
    if pmids:
        detailed_metadata = self._get_article_detailed_metadata(pmids)

    # Find PMC IDs for full-text retrieval (if enabled and not in snippet-only mode)
    pmid_to_pmcid = {}
    if self.get_full_text and pmids and not snippets_only_mode:
        pmid_to_pmcid = self._find_pmc_ids(pmids)

    # Add content to results
    results = []
    # Running count of results whose content is full text; replaces
    # re-scanning `results` for every item
    full_text_count = 0

    for item in relevant_items:
        result = item.copy()
        pmid = item.get("_pmid", "")

        # Add detailed metadata if available
        if pmid in detailed_metadata:
            metadata = detailed_metadata[pmid]

            # Add publication types (e.g., "Clinical Trial", "Meta-Analysis")
            if "publication_types" in metadata:
                result["publication_types"] = metadata["publication_types"]

                # Add first publication type to snippet if enabled
                if (
                    self.include_publication_type_in_context
                    and metadata["publication_types"]
                ):
                    # Just take the first publication type as is
                    pub_type = metadata["publication_types"][0]
                    if "snippet" in result:
                        result["snippet"] = (
                            f"[{pub_type}] {result['snippet']}"
                        )

            # Add MeSH terms for medical categorization
            if "mesh_terms" in metadata:
                result["mesh_terms"] = metadata["mesh_terms"]

                # Add MeSH terms to snippet if enabled
                if (
                    self.include_mesh_terms_in_context
                    and metadata["mesh_terms"]
                ):
                    # max_mesh_terms <= 0 means "no limit"
                    mesh_to_show = (
                        metadata["mesh_terms"][: self.max_mesh_terms]
                        if self.max_mesh_terms > 0
                        else metadata["mesh_terms"]
                    )
                    if mesh_to_show and "snippet" in result:
                        mesh_text = "MeSH: " + ", ".join(mesh_to_show)
                        result["snippet"] = (
                            f"{result['snippet']} | {mesh_text}"
                        )

            # Add keywords
            if "keywords" in metadata:
                result["keywords"] = metadata["keywords"]

                # Add keywords to snippet if enabled
                if (
                    self.include_keywords_in_context
                    and metadata["keywords"]
                ):
                    # max_keywords <= 0 means "no limit"
                    keywords_to_show = (
                        metadata["keywords"][: self.max_keywords]
                        if self.max_keywords > 0
                        else metadata["keywords"]
                    )
                    if keywords_to_show and "snippet" in result:
                        keywords_text = "Keywords: " + ", ".join(
                            keywords_to_show
                        )
                        result["snippet"] = (
                            f"{result['snippet']} | {keywords_text}"
                        )

            # Add affiliations
            if "affiliations" in metadata:
                result["affiliations"] = metadata["affiliations"]

            # Add funding/grant information
            if "grants" in metadata:
                result["grants"] = metadata["grants"]

            # Add conflict of interest statement
            if "conflict_of_interest" in metadata:
                result["conflict_of_interest"] = metadata[
                    "conflict_of_interest"
                ]

            # Add free full text availability
            if "has_free_full_text" in metadata:
                result["has_free_full_text"] = metadata[
                    "has_free_full_text"
                ]
                if "pmc_id" in metadata:
                    result["pmc_id"] = metadata["pmc_id"]

                # Add PMC availability to snippet if enabled
                if (
                    self.include_pmc_availability_in_context
                    and metadata["has_free_full_text"]
                    and "snippet" in result
                ):
                    result["snippet"] = (
                        f"{result['snippet']} | [Free Full Text]"
                    )

        # Add abstract if available
        if pmid in abstracts:
            abstract = abstracts[pmid]
            result["abstract"] = abstract

            # Create enriched content with metadata context
            enriched_content = self._create_enriched_content(result, abstract)

            # ALWAYS include title and abstract in snippet for LLM analysis
            title = result.get("title", "")
            abstract_text = abstract[:SNIPPET_LENGTH_LONG]

            # NOTE(review): snippets produced by _get_previews already have
            # the "Title / Abstract / Metadata" layout, so for those items
            # the title and abstract end up in the snippet twice - confirm
            # whether that is intended.
            if "snippet" in result:
                # Keep metadata snippet and add content
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}\n\nMetadata: {result['snippet']}"
                )
            else:
                # No metadata snippet, just title and abstract
                result["snippet"] = (
                    f"Title: {title}\n\nAbstract: {abstract_text}"
                )

            # Default to the abstract as content. It is replaced below only
            # when full text is actually retrieved, so an article that has a
            # PMC ID but falls beyond full_text_limit (or whose full-text
            # download fails) is never left without content.
            result["full_content"] = enriched_content
            result["content"] = enriched_content
            result["content_type"] = "abstract"

        # Add full text for a limited number of top articles
        if (
            pmid in pmid_to_pmcid
            and self.get_full_text
            and full_text_count < self.full_text_limit
        ):
            # Get full text content
            pmcid = pmid_to_pmcid[pmid]
            full_text = self._get_pmc_full_text(pmcid)

            if full_text:
                enriched_full_text = self._create_enriched_content(
                    result, full_text
                )
                result["full_content"] = enriched_full_text
                result["content"] = enriched_full_text
                result["content_type"] = "full_text"
                result["pmcid"] = pmcid

        if result.get("content_type") == "full_text":
            full_text_count += 1

        # Remove temporary fields
        result.pop("_pmid", None)
        result.pop("_search_strategy", None)

        results.append(result)

    return results

1669

def search_by_author(
    self, author_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search restricted to one author.

    The query sent to ``self.run`` is ``<author_name>[Author]``.
    ``self.max_results`` is overridden for the duration of the call when
    ``max_results`` is truthy and is always restored afterwards.

    Args:
        author_name: Name of the author
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles by the author
    """
    saved_max_results = self.max_results
    if max_results:
        self.max_results = max_results

    try:
        return self.run(f"{author_name}[Author]")
    finally:
        # Put the configured limit back, even if the search raised
        self.max_results = saved_max_results

1695

def search_by_journal(
    self, journal_name: str, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search restricted to one journal.

    The query sent to ``self.run`` is ``<journal_name>[Journal]``.
    ``self.max_results`` is overridden for the duration of the call when
    ``max_results`` is truthy and is always restored afterwards.

    Args:
        journal_name: Name of the journal
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles from the journal
    """
    saved_max_results = self.max_results
    if max_results:
        self.max_results = max_results

    try:
        return self.run(f"{journal_name}[Journal]")
    finally:
        # Put the configured limit back, even if the search raised
        self.max_results = saved_max_results

1721

def search_recent(
    self, query: str, days: int = 30, max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search limited to recently published articles.

    ``self.days_limit`` is set to ``days`` for the duration of the call, and
    ``self.max_results`` is overridden when ``max_results`` is truthy. Both
    attributes are restored to their previous values afterwards.

    Args:
        query: The search query
        days: Number of days to look back
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of recent articles matching the query
    """
    saved_max_results, saved_days_limit = self.max_results, self.days_limit

    if max_results:
        self.max_results = max_results
    # The look-back window applies to this search only
    self.days_limit = days

    try:
        return self.run(query)
    finally:
        # Restore both settings, even if the search raised
        self.max_results = saved_max_results
        self.days_limit = saved_days_limit

1752

def advanced_search(
    self, terms: Dict[str, str], max_results: Optional[int] = None
) -> List[Dict[str, Any]]:
    """
    Run a search built from field-specific terms.

    Every ``field: term`` pair becomes ``term[field]``; the pieces are
    combined with ``AND`` and handed to ``self.run``. ``self.max_results``
    is overridden for the duration of the call when ``max_results`` is
    truthy and is always restored afterwards.

    Args:
        terms: Dictionary mapping fields to search terms
            Valid fields: Author, Journal, Title, MeSH, Affiliation, etc.
        max_results: Maximum number of results (defaults to self.max_results)

    Returns:
        List of articles matching the advanced query
    """
    saved_max_results = self.max_results

    try:
        if max_results:
            self.max_results = max_results

        # e.g. {"Author": "Smith J", "Title": "cancer"}
        #   -> "Smith J[Author] AND cancer[Title]"
        query = " AND ".join(
            f"{term}[{field}]" for field, term in terms.items()
        )
        return self.run(query)
    finally:
        # Put the configured limit back, even if the search raised
        self.max_results = saved_max_results

Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubmed.py: 71%

708 statements