Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubchem.py: 89%
251 statements
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
« prev ^ index » next coverage.py v7.14.1, created at 2026-06-03 23:15 +0000
1"""PubChem search engine for chemical compound information."""
3from typing import Any, Dict, List, Optional
4from urllib.parse import quote
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...constants import USER_AGENT
10from ...security.safe_requests import safe_get
11from ..rate_limiting import RateLimitError
12from ..search_engine_base import BaseSearchEngine
class PubChemSearchEngine(BaseSearchEngine):
    """
    PubChem search engine for chemical compound information.

    Provides access to chemical structures, properties, and bioactivity data.
    No authentication required.

    Compound names are discovered through the PubChem autocomplete endpoint
    and then resolved to CIDs, properties and descriptions through the
    PUG REST endpoint. Every network helper re-raises ``RateLimitError`` and
    logs-and-swallows any other exception, returning an empty value instead.
    """

    # Engine classification flags (consumed outside this class).
    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    def __init__(
        self,
        max_results: int = 10,
        include_synonyms: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the PubChem search engine.

        Args:
            max_results: Maximum number of search results
            include_synonyms: Whether to include compound synonyms
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.include_synonyms = include_synonyms
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete"
        )

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _search_compounds(self, query: str) -> List[str]:
        """
        Search for compound names matching the query.

        Queries the autocomplete endpoint and returns the list stored under
        ``dictionary_terms -> compound`` in the JSON response.

        Note: this helper does not apply rate limiting itself; the visible
        caller (``_get_previews``) does so immediately before calling it.

        Args:
            query: Free-text query; it is fully percent-encoded into the URL

        Returns:
            Matching compound names, or an empty list when the response has
            no such terms or the request fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            url = (
                f"{self.autocomplete_url}/compound/{quote(query, safe='')}/json"
            )
            params = {"limit": self.max_results * 2}  # Get extra for filtering

            response = safe_get(
                url, params=params, headers=self.headers, timeout=30
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            terms: list[str] = data.get("dictionary_terms", {}).get(
                "compound", []
            )
            return terms

        except RateLimitError:
            raise
        except Exception:
            logger.exception("PubChem autocomplete search failed")
            return []

    def _get_compound_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get compound information by name.

        Resolves the name to a list of CIDs, takes the first one and fetches
        its properties and description.

        Args:
            name: Compound name; it is fully percent-encoded into the URL

        Returns:
            Dict with keys ``cid``, ``name``, ``properties`` and
            ``description``, or None when the name is unknown (HTTP 404),
            no CID is returned, or the request fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Get CID first
            url = f"{self.base_url}/compound/name/{quote(name, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            # A 404 is treated as "no such compound", not as an error
            if response.status_code == 404:
                return None
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            if not cids:
                return None

            cid = cids[0]

            # Get compound properties
            properties = self._get_compound_properties(cid)

            # Get compound description
            description = self._get_compound_description(cid)

            return {
                "cid": cid,
                "name": name,
                "properties": properties,
                "description": description,
            }

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound: {name}")
            return None

    def _get_compound_properties(self, cid: int) -> Dict[str, Any]:
        """
        Get properties for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first entry of ``PropertyTable -> Properties`` in the
            response, or an empty dict when there is none or the request fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Property names requested in a single comma-separated URL segment
            properties_list = [
                "MolecularFormula",
                "MolecularWeight",
                "IUPACName",
                "CanonicalSMILES",
                "IsomericSMILES",
                "InChI",
                "InChIKey",
                "XLogP",
                "TPSA",
                "Complexity",
                "Charge",
                "HBondDonorCount",
                "HBondAcceptorCount",
                "RotatableBondCount",
                "HeavyAtomCount",
            ]

            url = f"{self.base_url}/compound/cid/{cid}/property/{','.join(properties_list)}/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            props = data.get("PropertyTable", {}).get("Properties", [])
            return props[0] if props else {}

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem properties for CID {cid}")
            return {}

    def _get_compound_description(self, cid: int) -> str:
        """
        Get description for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first non-empty ``Description`` found in
            ``InformationList -> Information``, or an empty string when there
            is none, the CID has no description (HTTP 404), or the request
            fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/description/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return ""
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            descriptions = data.get("InformationList", {}).get(
                "Information", []
            )
            # Entries without a (non-empty) Description are skipped
            for desc in descriptions:
                if desc.get("Description"):
                    return desc.get("Description", "")  # type: ignore[no-any-return]

            return ""

        except RateLimitError:
            raise
        except Exception:
            logger.exception(
                f"Error fetching PubChem description for CID {cid}"
            )
            return ""

    def _get_compound_synonyms(self, cid: int, limit: int = 10) -> List[str]:
        """
        Get synonyms for a compound by CID.

        Args:
            cid: The PubChem compound ID
            limit: Maximum number of synonyms to return

        Returns:
            Up to ``limit`` entries of the ``Synonym`` list of the first
            information record, or an empty list when there is none, the CID
            has no synonyms (HTTP 404), or the request fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/synonyms/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            info = data.get("InformationList", {}).get("Information", [])
            if info:
                synonyms = info[0].get("Synonym", [])
                return synonyms[:limit]  # type: ignore[no-any-return]
            return []

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem synonyms for CID {cid}")
            return []

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubChem compounds.

        Candidate names come from the autocomplete endpoint; when it yields
        nothing, the query itself is looked up directly as a compound name.
        Each candidate is resolved to a compound, duplicates (same CID) are
        dropped, and at most ``self.max_results`` previews are returned.

        Args:
            query: The search query (compound name)

        Returns:
            List of preview dictionaries

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        logger.info(f"Getting PubChem previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Search for matching compound names
        compound_names = self._search_compounds(query)

        # Compound obtained by the direct-lookup fallback, kept so the loop
        # below does not repeat the same CID/properties/description requests.
        direct_hit: Optional[Dict[str, Any]] = None
        if not compound_names:
            # Try direct lookup
            direct_hit = self._get_compound_by_name(query)
            if not direct_hit:
                logger.info("No PubChem compounds found")
                return []
            compound_names = [query]

        logger.info(f"Found {len(compound_names)} potential compounds")

        previews: list[dict[str, Any]] = []
        seen_cids = set()
        for name in compound_names:
            if len(previews) >= self.max_results:
                break

            try:
                if direct_hit is not None:
                    # Only reachable on the fallback path, where the single
                    # candidate name is the query that was already resolved.
                    compound, direct_hit = direct_hit, None
                else:
                    compound = self._get_compound_by_name(name)
                if not compound:
                    continue

                cid = compound["cid"]

                # Deduplicate by CID (autocomplete may return
                # case variants like "Caffeine" and "caffeine")
                if cid in seen_cids:
                    continue
                seen_cids.add(cid)
                properties = compound.get("properties", {})
                description = compound.get("description", "")

                # Build compound URL
                compound_url = (
                    f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
                )

                # Get key properties
                molecular_formula = properties.get("MolecularFormula", "")
                molecular_weight = properties.get("MolecularWeight", "")
                iupac_name = properties.get("IUPACName", "")
                # First non-empty SMILES-like key wins.
                # NOTE(review): the extra keys presumably cover alternative
                # property names in API responses - confirm against PubChem.
                smiles = (
                    properties.get("CanonicalSMILES", "")
                    or properties.get("SMILES", "")
                    or properties.get("IsomericSMILES", "")
                    or properties.get("ConnectivitySMILES", "")
                )

                # Get drug-relevant properties
                xlogp = properties.get("XLogP")
                hbond_donors = properties.get("HBondDonorCount")
                hbond_acceptors = properties.get("HBondAcceptorCount")

                # Build snippet
                snippet_parts = []
                if molecular_formula:
                    snippet_parts.append(f"Formula: {molecular_formula}")
                if molecular_weight:
                    snippet_parts.append(f"MW: {molecular_weight}")
                # "is not None" so that a legitimate value of 0 is kept
                if xlogp is not None:
                    snippet_parts.append(f"XLogP: {xlogp}")
                if hbond_donors is not None or hbond_acceptors is not None:
                    hbond_info = []
                    if hbond_donors is not None:
                        hbond_info.append(f"H-Donors: {hbond_donors}")
                    if hbond_acceptors is not None:
                        hbond_info.append(f"H-Acceptors: {hbond_acceptors}")
                    snippet_parts.append(", ".join(hbond_info))
                if iupac_name:
                    snippet_parts.append(f"IUPAC: {iupac_name}")
                if description:
                    # Only the first 200 characters go into the snippet
                    snippet_parts.append(description[:200])
                snippet = ". ".join(snippet_parts)

                preview = {
                    "id": str(cid),
                    "cid": cid,
                    "title": name,
                    "link": compound_url,
                    "snippet": snippet,
                    "molecular_formula": molecular_formula,
                    "molecular_weight": molecular_weight,
                    "iupac_name": iupac_name,
                    "smiles": smiles,
                    "inchi_key": properties.get("InChIKey", ""),
                    "description": description,
                    "source": "PubChem",
                    # Consumed and removed again by _get_full_content
                    "_raw": {
                        "properties": properties,
                        "description": description,
                    },
                }

                previews.append(preview)

            except RateLimitError:
                raise
            except Exception:
                logger.exception(f"Error processing PubChem compound: {name}")
                continue

        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubChem compounds.

        Each item is shallow-copied; synonyms are fetched when the item has a
        truthy ``cid`` and ``self.include_synonyms`` is set; a multi-line
        ``content`` summary is built when the item carries a non-empty
        ``_raw`` entry; and the internal ``_raw`` key is removed from the copy.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} PubChem compounds"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            cid = item.get("cid")
            if cid and self.include_synonyms:
                # Get synonyms
                synonyms = self._get_compound_synonyms(cid)
                result["synonyms"] = synonyms

            raw = item.get("_raw", {})
            if raw:
                properties = raw.get("properties", {})
                description = raw.get("description", "")

                # Build content summary
                content_parts = []
                content_parts.append(
                    f"Compound: {result.get('title', 'Unknown')}"
                )
                if cid is not None:
                    content_parts.append(f"CID: {cid}")

                if result.get("molecular_formula"):
                    content_parts.append(
                        f"Molecular Formula: {result['molecular_formula']}"
                    )
                if result.get("molecular_weight"):
                    content_parts.append(
                        f"Molecular Weight: {result['molecular_weight']} g/mol"
                    )
                if result.get("iupac_name"):
                    content_parts.append(f"IUPAC Name: {result['iupac_name']}")
                if result.get("smiles"):
                    content_parts.append(f"SMILES: {result['smiles']}")
                if result.get("inchi_key"):
                    content_parts.append(f"InChIKey: {result['inchi_key']}")

                # Additional properties
                if properties.get("XLogP") is not None:
                    content_parts.append(f"XLogP: {properties['XLogP']}")
                if properties.get("TPSA") is not None:
                    content_parts.append(f"TPSA: {properties['TPSA']} Ų")
                if properties.get("HBondDonorCount") is not None:
                    content_parts.append(
                        f"H-Bond Donors: {properties['HBondDonorCount']}"
                    )
                if properties.get("HBondAcceptorCount") is not None:
                    content_parts.append(
                        f"H-Bond Acceptors: {properties['HBondAcceptorCount']}"
                    )

                if result.get("synonyms"):
                    # Only the first five synonyms are listed in the summary
                    content_parts.append(
                        f"\nSynonyms: {', '.join(result['synonyms'][:5])}"
                    )

                if description:
                    content_parts.append(f"\nDescription: {description}")

                result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_compound(self, cid: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            Dict with keys ``cid``, ``properties``, ``description`` and
            ``synonyms``, or None if an unexpected error occurs

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            properties = self._get_compound_properties(cid)
            description = self._get_compound_description(cid)
            synonyms = self._get_compound_synonyms(cid)

            return {
                "cid": cid,
                "properties": properties,
                "description": description,
                "synonyms": synonyms,
            }
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound {cid}")
            return None

    def search_by_formula(self, formula: str) -> List[Dict[str, Any]]:
        """
        Search compounds by molecular formula.

        Args:
            formula: Molecular formula (e.g., "C6H12O6")

        Returns:
            List of matching compounds (at most ``self.max_results``), or an
            empty list when nothing matches (HTTP 404) or the request fails

        Raises:
            RateLimitError: Re-raised unchanged when a rate limit is detected
        """
        try:
            # Throttle this request like every other PUG REST call in the
            # class; previously this was the only one sent unthrottled.
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/fastformula/{quote(formula, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            results = []
            for cid in cids[: self.max_results]:
                compound = self.get_compound(cid)
                if compound:
                    results.append(compound)

            return results

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error searching by formula: {formula}")
            return []