Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubchem.py: 89%

251 statements  

« prev     ^ index     » next       coverage.py v7.14.1, created at 2026-06-03 23:15 +0000

1"""PubChem search engine for chemical compound information.""" 

2 

3from typing import Any, Dict, List, Optional 

4from urllib.parse import quote 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...constants import USER_AGENT 

10from ...security.safe_requests import safe_get 

11from ..rate_limiting import RateLimitError 

12from ..search_engine_base import BaseSearchEngine 

13 

14 

class PubChemSearchEngine(BaseSearchEngine):
    """
    PubChem search engine for chemical compound information.

    Provides access to chemical structures, properties, and bioactivity data.
    No authentication required.

    Search flow: ``_get_previews`` resolves a query to compound names via the
    autocomplete endpoint (falling back to a direct name lookup), fetches
    properties and a description for each distinct CID, and
    ``_get_full_content`` later expands the selected previews into a textual
    summary, optionally enriched with synonyms.
    """

    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Property names requested from the PUG REST ``property`` endpoint.
    # Kept at class level so the list is built once rather than per request.
    _PROPERTY_NAMES = (
        "MolecularFormula",
        "MolecularWeight",
        "IUPACName",
        "CanonicalSMILES",
        "IsomericSMILES",
        "InChI",
        "InChIKey",
        "XLogP",
        "TPSA",
        "Complexity",
        "Charge",
        "HBondDonorCount",
        "HBondAcceptorCount",
        "RotatableBondCount",
        "HeavyAtomCount",
    )

    def __init__(
        self,
        max_results: int = 10,
        include_synonyms: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the PubChem search engine.

        Args:
            max_results: Maximum number of search results
            include_synonyms: Whether to include compound synonyms
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class initializer
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.include_synonyms = include_synonyms
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete"
        )

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _search_compounds(self, query: str) -> List[str]:
        """
        Search for compound names matching the query.

        Args:
            query: Free-text search term sent to the autocomplete endpoint

        Returns:
            The names found under ``dictionary_terms.compound`` in the
            response, or an empty list for a blank query or on any
            non-rate-limit failure (which is logged).

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            if not query or not query.strip():
                # A blank term would only yield an empty URL path segment.
                return []

            url = (
                f"{self.autocomplete_url}/compound/{quote(query, safe='')}/json"
            )
            params = {"limit": self.max_results * 2}  # Get extra for filtering

            response = safe_get(
                url, params=params, headers=self.headers, timeout=30
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            terms: list[str] = data.get("dictionary_terms", {}).get(
                "compound", []
            )
            return terms

        except RateLimitError:
            raise
        except Exception:
            logger.exception("PubChem autocomplete search failed")
            return []

    def _get_compound_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get compound information by name.

        Resolves the name to CIDs, keeps the first CID, then fetches that
        compound's properties and description.

        Args:
            name: Compound name to resolve

        Returns:
            A dict with ``cid``, ``name``, ``properties`` and ``description``
            keys, or ``None`` when the name is blank, the API answers 404, no
            CID is listed, or a non-rate-limit error occurs (logged).

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            if not name or not name.strip():
                # Nothing to look up; avoid a request with an empty name.
                return None

            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Get CID first
            url = f"{self.base_url}/compound/name/{quote(name, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            # 404 is checked first: it means "unknown name", not a failure.
            if response.status_code == 404:
                return None
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            if not cids:
                return None

            cid = cids[0]

            return {
                "cid": cid,
                "name": name,
                "properties": self._get_compound_properties(cid),
                "description": self._get_compound_description(cid),
            }

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound: {name}")
            return None

    def _get_compound_properties(self, cid: int) -> Dict[str, Any]:
        """
        Get properties for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first entry of ``PropertyTable.Properties`` from the response,
            or an empty dict when there is none or a non-rate-limit error
            occurs (logged).

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            property_csv = ",".join(self._PROPERTY_NAMES)

            url = f"{self.base_url}/compound/cid/{cid}/property/{property_csv}/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            props = data.get("PropertyTable", {}).get("Properties", [])
            return props[0] if props else {}

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem properties for CID {cid}")
            return {}

    def _get_compound_description(self, cid: int) -> str:
        """
        Get description for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first non-empty ``Description`` found in
            ``InformationList.Information``, or ``""`` when the API answers
            404, no entry has a description, or a non-rate-limit error occurs
            (logged).

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/description/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return ""
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            descriptions = data.get("InformationList", {}).get(
                "Information", []
            )
            # Entries without a description text are skipped.
            for desc in descriptions:
                if desc.get("Description"):
                    return desc.get("Description", "")  # type: ignore[no-any-return]

            return ""

        except RateLimitError:
            raise
        except Exception:
            logger.exception(
                f"Error fetching PubChem description for CID {cid}"
            )
            return ""

    def _get_compound_synonyms(self, cid: int, limit: int = 10) -> List[str]:
        """
        Get synonyms for a compound by CID.

        Args:
            cid: The PubChem compound ID
            limit: Maximum number of synonyms to return

        Returns:
            Up to ``limit`` entries of the first information record's
            ``Synonym`` list, or an empty list when the API answers 404, no
            record is present, or a non-rate-limit error occurs (logged).

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/synonyms/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            info = data.get("InformationList", {}).get("Information", [])
            if info:
                synonyms = info[0].get("Synonym", [])
                return synonyms[:limit]  # type: ignore[no-any-return]
            return []

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem synonyms for CID {cid}")
            return []

    @staticmethod
    def _build_snippet(properties: Dict[str, Any], description: str) -> str:
        """Join the key properties and a 200-character description excerpt."""
        molecular_formula = properties.get("MolecularFormula", "")
        molecular_weight = properties.get("MolecularWeight", "")
        iupac_name = properties.get("IUPACName", "")

        # Drug-relevant properties: ``None`` means "absent", 0 is a real value.
        xlogp = properties.get("XLogP")
        hbond_donors = properties.get("HBondDonorCount")
        hbond_acceptors = properties.get("HBondAcceptorCount")

        snippet_parts = []
        if molecular_formula:
            snippet_parts.append(f"Formula: {molecular_formula}")
        if molecular_weight:
            snippet_parts.append(f"MW: {molecular_weight}")
        if xlogp is not None:
            snippet_parts.append(f"XLogP: {xlogp}")
        if hbond_donors is not None or hbond_acceptors is not None:
            hbond_info = []
            if hbond_donors is not None:
                hbond_info.append(f"H-Donors: {hbond_donors}")
            if hbond_acceptors is not None:
                hbond_info.append(f"H-Acceptors: {hbond_acceptors}")
            snippet_parts.append(", ".join(hbond_info))
        if iupac_name:
            snippet_parts.append(f"IUPAC: {iupac_name}")
        if description:
            snippet_parts.append(description[:200])
        return ". ".join(snippet_parts)

    def _build_preview(
        self, name: str, compound: Dict[str, Any]
    ) -> Dict[str, Any]:
        """Turn a ``_get_compound_by_name`` result into a preview dictionary."""
        cid = compound["cid"]
        properties = compound.get("properties", {})
        description = compound.get("description", "")

        # Several SMILES keys are tried in order; only the first two listed
        # here are explicitly requested by ``_PROPERTY_NAMES``.
        # NOTE(review): presumably the API may answer with the other names
        # instead - confirm against the PUG REST property documentation.
        smiles = (
            properties.get("CanonicalSMILES", "")
            or properties.get("SMILES", "")
            or properties.get("IsomericSMILES", "")
            or properties.get("ConnectivitySMILES", "")
        )

        return {
            "id": str(cid),
            "cid": cid,
            "title": name,
            "link": f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}",
            "snippet": self._build_snippet(properties, description),
            "molecular_formula": properties.get("MolecularFormula", ""),
            "molecular_weight": properties.get("MolecularWeight", ""),
            "iupac_name": properties.get("IUPACName", ""),
            "smiles": smiles,
            "inchi_key": properties.get("InChIKey", ""),
            "description": description,
            "source": "PubChem",
            # Kept for ``_get_full_content``, which removes it again.
            "_raw": {
                "properties": properties,
                "description": description,
            },
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubChem compounds.

        Args:
            query: The search query (compound name)

        Returns:
            List of preview dictionaries, at most ``max_results`` long and
            de-duplicated by CID

        Raises:
            RateLimitError: Re-raised untouched from any of the lookups.
        """
        logger.info(f"Getting PubChem previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Search for matching compound names
        compound_names = self._search_compounds(query)

        # Compounds already fetched by the fallback below, keyed by name, so
        # the loop does not repeat the same three HTTP requests for them.
        prefetched: dict[str, dict[str, Any]] = {}

        if not compound_names:
            # Try direct lookup
            compound = self._get_compound_by_name(query)
            if not compound:
                logger.info("No PubChem compounds found")
                return []
            compound_names = [query]
            prefetched[query] = compound

        logger.info(f"Found {len(compound_names)} potential compounds")

        previews: list[dict[str, Any]] = []
        seen_cids: set[Any] = set()
        for name in compound_names:
            if len(previews) >= self.max_results:
                break

            try:
                compound = prefetched.get(name) or self._get_compound_by_name(
                    name
                )
                if not compound:
                    continue

                cid = compound["cid"]

                # Deduplicate by CID (autocomplete may return
                # case variants like "Caffeine" and "caffeine")
                if cid in seen_cids:
                    continue
                seen_cids.add(cid)

                previews.append(self._build_preview(name, compound))

            except RateLimitError:
                raise
            except Exception:
                logger.exception(f"Error processing PubChem compound: {name}")
                continue

        return previews

    @staticmethod
    def _build_content(
        result: Dict[str, Any], cid: Any, raw: Dict[str, Any]
    ) -> str:
        """Render the multi-line text summary stored under ``content``."""
        properties = raw.get("properties", {})
        description = raw.get("description", "")

        content_parts = [f"Compound: {result.get('title', 'Unknown')}"]
        if cid is not None:
            content_parts.append(f"CID: {cid}")

        if result.get("molecular_formula"):
            content_parts.append(
                f"Molecular Formula: {result['molecular_formula']}"
            )
        if result.get("molecular_weight"):
            content_parts.append(
                f"Molecular Weight: {result['molecular_weight']} g/mol"
            )
        if result.get("iupac_name"):
            content_parts.append(f"IUPAC Name: {result['iupac_name']}")
        if result.get("smiles"):
            content_parts.append(f"SMILES: {result['smiles']}")
        if result.get("inchi_key"):
            content_parts.append(f"InChIKey: {result['inchi_key']}")

        # Additional properties (``is not None`` so that 0 is still shown)
        if properties.get("XLogP") is not None:
            content_parts.append(f"XLogP: {properties['XLogP']}")
        if properties.get("TPSA") is not None:
            content_parts.append(f"TPSA: {properties['TPSA']} Ų")
        if properties.get("HBondDonorCount") is not None:
            content_parts.append(
                f"H-Bond Donors: {properties['HBondDonorCount']}"
            )
        if properties.get("HBondAcceptorCount") is not None:
            content_parts.append(
                f"H-Bond Acceptors: {properties['HBondAcceptorCount']}"
            )

        if result.get("synonyms"):
            content_parts.append(
                f"\nSynonyms: {', '.join(result['synonyms'][:5])}"
            )

        if description:
            content_parts.append(f"\nDescription: {description}")

        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubChem compounds.

        Each item is shallow-copied; the input dictionaries are not modified.
        Synonyms are fetched only when ``include_synonyms`` is set and the
        item has a truthy ``cid``. ``content`` is added only for items that
        carry a non-empty ``_raw`` entry, and ``_raw`` itself is dropped from
        every returned dictionary.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content

        Raises:
            RateLimitError: Propagated from the synonym lookup.
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} PubChem compounds"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            cid = item.get("cid")
            if cid and self.include_synonyms:
                result["synonyms"] = self._get_compound_synonyms(cid)

            raw = item.get("_raw", {})
            if raw:
                result["content"] = self._build_content(result, cid, raw)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_compound(self, cid: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            Compound dictionary with ``cid``, ``properties``, ``description``
            and ``synonyms`` keys, or None if an unexpected error occurs

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            return {
                "cid": cid,
                "properties": self._get_compound_properties(cid),
                "description": self._get_compound_description(cid),
                "synonyms": self._get_compound_synonyms(cid),
            }
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound {cid}")
            return None

    def search_by_formula(self, formula: str) -> List[Dict[str, Any]]:
        """
        Search compounds by molecular formula.

        Args:
            formula: Molecular formula (e.g., "C6H12O6")

        Returns:
            List of matching compounds (at most ``max_results``); empty for a
            blank formula, a 404 answer, or a non-rate-limit error (logged)

        Raises:
            RateLimitError: Re-raised untouched so the caller can back off.
        """
        try:
            if not formula or not formula.strip():
                # Nothing to search for; avoid a request with an empty path.
                return []

            # Throttle this request like every other PubChem call here does.
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/fastformula/{quote(formula, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            results = []
            for cid in cids[: self.max_results]:
                compound = self.get_compound(cid)
                if compound:
                    results.append(compound)

            return results

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error searching by formula: {formula}")
            return []