Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubchem.py: 89%
250 statements
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
« prev ^ index » next coverage.py v7.13.5, created at 2026-04-14 23:55 +0000
1"""PubChem search engine for chemical compound information."""
3from typing import Any, Dict, List, Optional
4from urllib.parse import quote
6from langchain_core.language_models import BaseLLM
7from loguru import logger
9from ...security.safe_requests import safe_get
10from ..rate_limiting import RateLimitError
11from ..search_engine_base import BaseSearchEngine
class PubChemSearchEngine(BaseSearchEngine):
    """
    PubChem search engine for chemical compound information.

    Provides access to chemical structures, properties, and bioactivity data.
    No authentication required.

    Every HTTP request is issued through ``safe_get`` against the PUG REST
    and autocomplete base URLs configured in ``__init__``.
    """

    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Property names requested from the PUG REST ``property`` endpoint, in the
    # order they are placed in the URL. Kept at class level so the constant
    # list is not rebuilt on every lookup.
    _PROPERTY_FIELDS = (
        "MolecularFormula",
        "MolecularWeight",
        "IUPACName",
        "CanonicalSMILES",
        "IsomericSMILES",
        "InChI",
        "InChIKey",
        "XLogP",
        "TPSA",
        "Complexity",
        "Charge",
        "HBondDonorCount",
        "HBondAcceptorCount",
        "RotatableBondCount",
        "HeavyAtomCount",
    )

    def __init__(
        self,
        max_results: int = 10,
        include_synonyms: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the PubChem search engine.

        Args:
            max_results: Maximum number of search results
            include_synonyms: Whether to include compound synonyms
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.include_synonyms = include_synonyms
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete"
        )

        # User-Agent header for API requests
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    def _search_compounds(self, query: str) -> List[str]:
        """
        Search for compound names matching the query.

        Uses the autocomplete endpoint and asks for twice ``max_results``
        names so later filtering and de-duplication still leave enough
        candidates. This method does not apply rate limiting itself; its
        caller ``_get_previews`` does so before invoking it.

        Args:
            query: Free-text compound name (URL-quoted before use)

        Returns:
            Candidate compound names, or an empty list on any failure other
            than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched (presumably produced by
                ``_raise_if_rate_limit``, which is defined outside this class).
        """
        try:
            url = (
                f"{self.autocomplete_url}/compound/{quote(query, safe='')}/json"
            )
            params = {"limit": self.max_results * 2}  # Get extra for filtering

            response = safe_get(
                url, params=params, headers=self.headers, timeout=30
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            terms: list[str] = data.get("dictionary_terms", {}).get(
                "compound", []
            )
            return terms

        except RateLimitError:
            raise
        except Exception:
            logger.exception("PubChem autocomplete search failed")
            return []

    def _get_compound_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get compound information by name.

        Resolves the name to CIDs, takes the first one, then fetches its
        properties and description.

        Args:
            name: Compound name (URL-quoted before use)

        Returns:
            Dict with ``cid``, ``name``, ``properties`` and ``description``,
            or None when the name is unknown (HTTP 404 or empty CID list) or
            the lookup fails for a reason other than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Get CID first
            url = f"{self.base_url}/compound/name/{quote(name, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            # 404 is checked before the rate-limit / error checks so an
            # unknown name is a quiet "not found" rather than a logged error.
            if response.status_code == 404:
                return None
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            if not cids:
                return None

            cid = cids[0]

            # Get compound properties
            properties = self._get_compound_properties(cid)

            # Get compound description
            description = self._get_compound_description(cid)

            return {
                "cid": cid,
                "name": name,
                "properties": properties,
                "description": description,
            }

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound: {name}")
            return None

    def _get_compound_properties(self, cid: int) -> Dict[str, Any]:
        """
        Get properties for a compound by CID.

        Args:
            cid: PubChem compound ID

        Returns:
            The first entry of ``PropertyTable.Properties`` from the response,
            or an empty dict when there is none or the request fails for a
            reason other than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)

            url = f"{self.base_url}/compound/cid/{cid}/property/{','.join(self._PROPERTY_FIELDS)}/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            props = data.get("PropertyTable", {}).get("Properties", [])
            return props[0] if props else {}

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem properties for CID {cid}")
            return {}

    def _get_compound_description(self, cid: int) -> str:
        """
        Get description for a compound by CID.

        Args:
            cid: PubChem compound ID

        Returns:
            The first non-empty ``Description`` found in the response, or an
            empty string when there is none (including HTTP 404) or the
            request fails for a reason other than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/description/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return ""
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            descriptions = data.get("InformationList", {}).get(
                "Information", []
            )
            # Entries without a usable "Description" are skipped; the first
            # entry that has one wins.
            for desc in descriptions:
                if desc.get("Description"):
                    return desc.get("Description", "")  # type: ignore[no-any-return]

            return ""

        except RateLimitError:
            raise
        except Exception:
            logger.exception(
                f"Error fetching PubChem description for CID {cid}"
            )
            return ""

    def _get_compound_synonyms(self, cid: int, limit: int = 10) -> List[str]:
        """
        Get synonyms for a compound by CID.

        Args:
            cid: PubChem compound ID
            limit: Maximum number of synonyms to return

        Returns:
            Up to ``limit`` synonyms from the first information entry, or an
            empty list when there are none (including HTTP 404) or the
            request fails for a reason other than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/synonyms/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            info = data.get("InformationList", {}).get("Information", [])
            if info:
                synonyms = info[0].get("Synonym", [])
                return synonyms[:limit]  # type: ignore[no-any-return]
            return []

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem synonyms for CID {cid}")
            return []

    @staticmethod
    def _build_snippet(properties: Dict[str, Any], description: str) -> str:
        """Join the key properties and a 200-char description into a snippet."""
        molecular_formula = properties.get("MolecularFormula", "")
        molecular_weight = properties.get("MolecularWeight", "")
        iupac_name = properties.get("IUPACName", "")

        # Get drug-relevant properties
        xlogp = properties.get("XLogP")
        hbond_donors = properties.get("HBondDonorCount")
        hbond_acceptors = properties.get("HBondAcceptorCount")

        snippet_parts: List[str] = []
        if molecular_formula:
            snippet_parts.append(f"Formula: {molecular_formula}")
        if molecular_weight:
            snippet_parts.append(f"MW: {molecular_weight}")
        # Numeric properties are tested against None (not truthiness) so a
        # legitimate value of 0 is still reported.
        if xlogp is not None:
            snippet_parts.append(f"XLogP: {xlogp}")
        if hbond_donors is not None or hbond_acceptors is not None:
            hbond_info = []
            if hbond_donors is not None:
                hbond_info.append(f"H-Donors: {hbond_donors}")
            if hbond_acceptors is not None:
                hbond_info.append(f"H-Acceptors: {hbond_acceptors}")
            snippet_parts.append(", ".join(hbond_info))
        if iupac_name:
            snippet_parts.append(f"IUPAC: {iupac_name}")
        if description:
            snippet_parts.append(description[:200])
        return ". ".join(snippet_parts)

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubChem compounds.

        Candidate names come from autocomplete; when that yields nothing the
        query itself is tried as an exact compound name. Each candidate is
        resolved to a compound, de-duplicated by CID, and turned into a
        preview dict until ``max_results`` previews have been collected.

        Args:
            query: The search query (compound name)

        Returns:
            List of preview dictionaries

        Raises:
            RateLimitError: Re-raised untouched from any lookup.
        """
        logger.info(f"Getting PubChem previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Search for matching compound names
        compound_names = self._search_compounds(query)

        # Compounds already fetched during the fallback below, keyed by name,
        # so the main loop does not repeat the same HTTP round trips.
        prefetched: Dict[str, Dict[str, Any]] = {}

        if not compound_names:
            # Try direct lookup
            direct_hit = self._get_compound_by_name(query)
            if not direct_hit:
                logger.info("No PubChem compounds found")
                return []
            compound_names = [query]
            prefetched[query] = direct_hit

        logger.info(f"Found {len(compound_names)} potential compounds")

        previews: list[dict[str, Any]] = []
        seen_cids: set = set()
        for name in compound_names:
            if len(previews) >= self.max_results:
                break

            try:
                compound = prefetched.pop(
                    name, None
                ) or self._get_compound_by_name(name)
                if not compound:
                    continue

                cid = compound["cid"]

                # Deduplicate by CID (autocomplete may return
                # case variants like "Caffeine" and "caffeine")
                if cid in seen_cids:
                    continue
                seen_cids.add(cid)
                properties = compound.get("properties", {})
                description = compound.get("description", "")

                # Build compound URL
                compound_url = (
                    f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"
                )

                # Get key properties
                molecular_formula = properties.get("MolecularFormula", "")
                molecular_weight = properties.get("MolecularWeight", "")
                iupac_name = properties.get("IUPACName", "")
                # Several SMILES property names are tried in turn; the first
                # non-empty one is used.
                smiles = (
                    properties.get("CanonicalSMILES", "")
                    or properties.get("SMILES", "")
                    or properties.get("IsomericSMILES", "")
                    or properties.get("ConnectivitySMILES", "")
                )

                preview = {
                    "id": str(cid),
                    "cid": cid,
                    "title": name,
                    "link": compound_url,
                    "snippet": self._build_snippet(properties, description),
                    "molecular_formula": molecular_formula,
                    "molecular_weight": molecular_weight,
                    "iupac_name": iupac_name,
                    "smiles": smiles,
                    "inchi_key": properties.get("InChIKey", ""),
                    "description": description,
                    "source": "PubChem",
                    # Untruncated data kept for _get_full_content, which
                    # strips this key again before returning results.
                    "_raw": {
                        "properties": properties,
                        "description": description,
                    },
                }

                previews.append(preview)

            except RateLimitError:
                raise
            except Exception:
                logger.exception(f"Error processing PubChem compound: {name}")
                continue

        return previews

    @staticmethod
    def _build_content(
        result: Dict[str, Any],
        cid: Any,
        properties: Dict[str, Any],
        description: str,
    ) -> str:
        """Render the multi-line full-content summary for one compound."""
        content_parts = [f"Compound: {result.get('title', 'Unknown')}"]
        if cid is not None:
            content_parts.append(f"CID: {cid}")

        if result.get("molecular_formula"):
            content_parts.append(
                f"Molecular Formula: {result['molecular_formula']}"
            )
        if result.get("molecular_weight"):
            content_parts.append(
                f"Molecular Weight: {result['molecular_weight']} g/mol"
            )
        if result.get("iupac_name"):
            content_parts.append(f"IUPAC Name: {result['iupac_name']}")
        if result.get("smiles"):
            content_parts.append(f"SMILES: {result['smiles']}")
        if result.get("inchi_key"):
            content_parts.append(f"InChIKey: {result['inchi_key']}")

        # Additional properties
        if properties.get("XLogP") is not None:
            content_parts.append(f"XLogP: {properties['XLogP']}")
        if properties.get("TPSA") is not None:
            content_parts.append(f"TPSA: {properties['TPSA']} Ų")
        if properties.get("HBondDonorCount") is not None:
            content_parts.append(
                f"H-Bond Donors: {properties['HBondDonorCount']}"
            )
        if properties.get("HBondAcceptorCount") is not None:
            content_parts.append(
                f"H-Bond Acceptors: {properties['HBondAcceptorCount']}"
            )

        if result.get("synonyms"):
            content_parts.append(
                f"\nSynonyms: {', '.join(result['synonyms'][:5])}"
            )

        if description:
            content_parts.append(f"\nDescription: {description}")

        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubChem compounds.

        Each item is shallow-copied, optionally enriched with synonyms (when
        ``include_synonyms`` is set and the item has a CID), given a
        ``content`` summary when it carries ``_raw`` data, and stripped of
        the internal ``_raw`` key.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content

        Raises:
            RateLimitError: Propagated from the synonym lookup.
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} PubChem compounds"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            cid = item.get("cid")
            if cid and self.include_synonyms:
                # Get synonyms
                result["synonyms"] = self._get_compound_synonyms(cid)

            raw = item.get("_raw", {})
            if raw:
                # Build content summary
                result["content"] = self._build_content(
                    result,
                    cid,
                    raw.get("properties", {}),
                    raw.get("description", ""),
                )

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_compound(self, cid: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific compound by CID.

        The three lookups used here each handle their own non-rate-limit
        failures by returning an empty value, so in practice a dictionary is
        returned even when PubChem has no data for ``cid``; its
        ``properties``, ``description`` and ``synonyms`` are then empty.

        Args:
            cid: The PubChem compound ID

        Returns:
            Compound dictionary, or None if an unexpected error escapes the
            lookups.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            properties = self._get_compound_properties(cid)
            description = self._get_compound_description(cid)
            synonyms = self._get_compound_synonyms(cid)

            return {
                "cid": cid,
                "properties": properties,
                "description": description,
                "synonyms": synonyms,
            }
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound {cid}")
            return None

    def search_by_formula(self, formula: str) -> List[Dict[str, Any]]:
        """
        Search compounds by molecular formula.

        Args:
            formula: Molecular formula (e.g., "C6H12O6")

        Returns:
            List of matching compounds (at most ``max_results``), or an empty
            list when nothing matches (including HTTP 404) or the search
            fails for a reason other than a rate limit.

        Raises:
            RateLimitError: Re-raised untouched.
        """
        try:
            # Throttle this request like every other PubChem call in the
            # class; no caller applies the rate limit on its behalf.
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/fastformula/{quote(formula, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            results = []
            for cid in cids[: self.max_results]:
                compound = self.get_compound(cid)
                if compound:
                    results.append(compound)

            return results

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error searching by formula: {formula}")
            return []