Coverage for src / local_deep_research / web_search_engines / engines / search_engine_pubchem.py: 89%

250 statements  

« prev     ^ index     » next       coverage.py v7.13.5, created at 2026-04-14 23:55 +0000

1"""PubChem search engine for chemical compound information.""" 

2 

3from typing import Any, Dict, List, Optional 

4from urllib.parse import quote 

5 

6from langchain_core.language_models import BaseLLM 

7from loguru import logger 

8 

9from ...security.safe_requests import safe_get 

10from ..rate_limiting import RateLimitError 

11from ..search_engine_base import BaseSearchEngine 

12 

13 

class PubChemSearchEngine(BaseSearchEngine):
    """
    PubChem search engine for chemical compound information.

    Provides access to chemical structures, properties, and bioactivity data.
    No authentication required.

    The search flow is two-phase: ``_get_previews`` resolves a free-text
    query to compound names (autocomplete, with a direct name lookup as a
    fallback) and builds one preview per distinct CID;
    ``_get_full_content`` then enriches the selected previews with synonyms
    and a multi-line ``content`` summary.
    """

    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Property names requested from the PUG REST "property" endpoint.
    # Kept as a class-level constant so the list is not rebuilt per request.
    _PROPERTY_NAMES = (
        "MolecularFormula",
        "MolecularWeight",
        "IUPACName",
        "CanonicalSMILES",
        "IsomericSMILES",
        "InChI",
        "InChIKey",
        "XLogP",
        "TPSA",
        "Complexity",
        "Charge",
        "HBondDonorCount",
        "HBondAcceptorCount",
        "RotatableBondCount",
        "HeavyAtomCount",
    )

    def __init__(
        self,
        max_results: int = 10,
        include_synonyms: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the PubChem search engine.

        Args:
            max_results: Maximum number of search results
            include_synonyms: Whether to include compound synonyms
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to ``BaseSearchEngine.__init__``
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.include_synonyms = include_synonyms
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete"
        )

        # User-Agent header for API requests
        self.headers = {
            "User-Agent": "LocalDeepResearch/1.0 (https://github.com/LearningCircuit/local-deep-research)"
        }

    def _search_compounds(self, query: str) -> List[str]:
        """
        Search for compound names matching the query via autocomplete.

        This method does not apply rate limiting itself; ``_get_previews``
        does so immediately before calling it.

        Args:
            query: Free-text compound name (URL-quoted before use)

        Returns:
            The ``dictionary_terms.compound`` list from the response, or an
            empty list when the key is missing or the request fails.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            url = (
                f"{self.autocomplete_url}/compound/{quote(query, safe='')}/json"
            )
            params = {"limit": self.max_results * 2}  # Get extra for filtering

            response = safe_get(
                url, params=params, headers=self.headers, timeout=30
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            terms: list[str] = data.get("dictionary_terms", {}).get(
                "compound", []
            )
            return terms

        except RateLimitError:
            raise
        except Exception:
            # Best-effort: any other failure degrades to "no suggestions".
            logger.exception("PubChem autocomplete search failed")
            return []

    def _get_compound_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get compound information by name.

        Resolves the name to CIDs, takes the first CID, then fetches its
        properties and description.

        Args:
            name: Compound name (URL-quoted before use)

        Returns:
            Dict with ``cid``, ``name``, ``properties`` and ``description``,
            or None when the name is unknown (HTTP 404), no CID is returned,
            or the request fails.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Get CID first
            url = f"{self.base_url}/compound/name/{quote(name, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            # 404 is checked before the rate-limit/HTTP-error checks so an
            # unknown name is a quiet "not found", not a logged error.
            if response.status_code == 404:
                return None
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            if not cids:
                return None

            cid = cids[0]

            # Get compound properties
            properties = self._get_compound_properties(cid)

            # Get compound description
            description = self._get_compound_description(cid)

            return {
                "cid": cid,
                "name": name,
                "properties": properties,
                "description": description,
            }

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound: {name}")
            return None

    def _get_compound_properties(self, cid: int) -> Dict[str, Any]:
        """
        Get properties for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first entry of ``PropertyTable.Properties`` from the
            response, or an empty dict when absent or on failure.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)

            url = f"{self.base_url}/compound/cid/{cid}/property/{','.join(self._PROPERTY_NAMES)}/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            props = data.get("PropertyTable", {}).get("Properties", [])
            return props[0] if props else {}

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem properties for CID {cid}")
            return {}

    def _get_compound_description(self, cid: int) -> str:
        """
        Get description for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first non-empty ``Description`` found in
            ``InformationList.Information``, or an empty string when there
            is none, the CID is unknown (HTTP 404), or the request fails.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/description/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return ""
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            descriptions = data.get("InformationList", {}).get(
                "Information", []
            )
            # Entries without a description text are skipped.
            for desc in descriptions:
                if desc.get("Description"):
                    return desc.get("Description", "")  # type: ignore[no-any-return]

            return ""

        except RateLimitError:
            raise
        except Exception:
            logger.exception(
                f"Error fetching PubChem description for CID {cid}"
            )
            return ""

    def _get_compound_synonyms(self, cid: int, limit: int = 10) -> List[str]:
        """
        Get synonyms for a compound by CID.

        Args:
            cid: The PubChem compound ID
            limit: Maximum number of synonyms to return

        Returns:
            Up to ``limit`` synonyms from the first ``Information`` entry,
            or an empty list when there are none, the CID is unknown
            (HTTP 404), or the request fails.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/synonyms/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            info = data.get("InformationList", {}).get("Information", [])
            if info:
                synonyms = info[0].get("Synonym", [])
                return synonyms[:limit]  # type: ignore[no-any-return]
            return []

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem synonyms for CID {cid}")
            return []

    @staticmethod
    def _build_preview(name: str, compound: Dict[str, Any]) -> Dict[str, Any]:
        """Build one preview dict from a name and its fetched compound record."""
        cid = compound["cid"]
        properties = compound.get("properties", {})
        description = compound.get("description", "")

        # Build compound URL
        compound_url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"

        # Get key properties
        molecular_formula = properties.get("MolecularFormula", "")
        molecular_weight = properties.get("MolecularWeight", "")
        iupac_name = properties.get("IUPACName", "")
        # The SMILES value is looked up under several alternative keys;
        # the first non-empty one wins.
        smiles = (
            properties.get("CanonicalSMILES", "")
            or properties.get("SMILES", "")
            or properties.get("IsomericSMILES", "")
            or properties.get("ConnectivitySMILES", "")
        )

        # Get drug-relevant properties
        xlogp = properties.get("XLogP")
        hbond_donors = properties.get("HBondDonorCount")
        hbond_acceptors = properties.get("HBondAcceptorCount")

        # Build snippet
        snippet_parts = []
        if molecular_formula:
            snippet_parts.append(f"Formula: {molecular_formula}")
        if molecular_weight:
            snippet_parts.append(f"MW: {molecular_weight}")
        # "is not None" (not truthiness) so a legitimate value of 0 is kept.
        if xlogp is not None:
            snippet_parts.append(f"XLogP: {xlogp}")
        if hbond_donors is not None or hbond_acceptors is not None:
            hbond_info = []
            if hbond_donors is not None:
                hbond_info.append(f"H-Donors: {hbond_donors}")
            if hbond_acceptors is not None:
                hbond_info.append(f"H-Acceptors: {hbond_acceptors}")
            snippet_parts.append(", ".join(hbond_info))
        if iupac_name:
            snippet_parts.append(f"IUPAC: {iupac_name}")
        if description:
            # Only the first 200 characters go into the snippet; the full
            # text is kept under "description".
            snippet_parts.append(description[:200])
        snippet = ". ".join(snippet_parts)

        return {
            "id": str(cid),
            "cid": cid,
            "title": name,
            "link": compound_url,
            "snippet": snippet,
            "molecular_formula": molecular_formula,
            "molecular_weight": molecular_weight,
            "iupac_name": iupac_name,
            "smiles": smiles,
            "inchi_key": properties.get("InChIKey", ""),
            "description": description,
            "source": "PubChem",
            # Internal payload consumed (and removed) by _get_full_content.
            "_raw": {
                "properties": properties,
                "description": description,
            },
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubChem compounds.

        Candidate names come from autocomplete; if that yields nothing the
        query itself is tried as a compound name. Previews are deduplicated
        by CID and capped at ``self.max_results``.

        Args:
            query: The search query (compound name)

        Returns:
            List of preview dictionaries

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        logger.info(f"Getting PubChem previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Search for matching compound names
        compound_names = self._search_compounds(query)

        # Compounds already fetched during the fallback lookup, keyed by
        # name, so the loop below does not repeat the same HTTP requests.
        prefetched: dict[str, dict[str, Any]] = {}

        if not compound_names:
            # Try direct lookup
            compound = self._get_compound_by_name(query)
            if not compound:
                logger.info("No PubChem compounds found")
                return []
            compound_names = [query]
            prefetched[query] = compound

        logger.info(f"Found {len(compound_names)} potential compounds")

        previews: list[dict[str, Any]] = []
        seen_cids = set()
        for name in compound_names:
            if len(previews) >= self.max_results:
                break

            try:
                compound = prefetched.get(name)
                if compound is None:
                    compound = self._get_compound_by_name(name)
                if not compound:
                    continue

                cid = compound["cid"]

                # Deduplicate by CID (autocomplete may return
                # case variants like "Caffeine" and "caffeine")
                if cid in seen_cids:
                    continue
                seen_cids.add(cid)

                previews.append(self._build_preview(name, compound))

            except RateLimitError:
                raise
            except Exception:
                # One bad compound must not abort the whole result set.
                logger.exception(f"Error processing PubChem compound: {name}")
                continue

        return previews

    @staticmethod
    def _build_content(
        result: Dict[str, Any], cid: Any, raw: Dict[str, Any]
    ) -> str:
        """Assemble the multi-line ``content`` summary for one result."""
        properties = raw.get("properties", {})
        description = raw.get("description", "")

        # Build content summary
        content_parts = [f"Compound: {result.get('title', 'Unknown')}"]
        if cid is not None:
            content_parts.append(f"CID: {cid}")

        if result.get("molecular_formula"):
            content_parts.append(
                f"Molecular Formula: {result['molecular_formula']}"
            )
        if result.get("molecular_weight"):
            content_parts.append(
                f"Molecular Weight: {result['molecular_weight']} g/mol"
            )
        if result.get("iupac_name"):
            content_parts.append(f"IUPAC Name: {result['iupac_name']}")
        if result.get("smiles"):
            content_parts.append(f"SMILES: {result['smiles']}")
        if result.get("inchi_key"):
            content_parts.append(f"InChIKey: {result['inchi_key']}")

        # Additional properties
        if properties.get("XLogP") is not None:
            content_parts.append(f"XLogP: {properties['XLogP']}")
        if properties.get("TPSA") is not None:
            content_parts.append(f"TPSA: {properties['TPSA']} Ų")
        if properties.get("HBondDonorCount") is not None:
            content_parts.append(
                f"H-Bond Donors: {properties['HBondDonorCount']}"
            )
        if properties.get("HBondAcceptorCount") is not None:
            content_parts.append(
                f"H-Bond Acceptors: {properties['HBondAcceptorCount']}"
            )

        if result.get("synonyms"):
            # Only the first five synonyms are listed in the summary.
            content_parts.append(
                f"\nSynonyms: {', '.join(result['synonyms'][:5])}"
            )

        if description:
            content_parts.append(f"\nDescription: {description}")

        return "\n".join(content_parts)

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubChem compounds.

        Each item is shallow-copied; the input dictionaries are not
        modified. Synonyms are fetched when ``include_synonyms`` is set and
        the item has a CID. A ``content`` summary is added only for items
        carrying a non-empty ``_raw`` payload, and ``_raw`` is stripped
        from the returned copy.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} PubChem compounds"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            cid = item.get("cid")
            if cid and self.include_synonyms:
                # Get synonyms
                result["synonyms"] = self._get_compound_synonyms(cid)

            raw = item.get("_raw", {})
            if raw:
                result["content"] = self._build_content(result, cid, raw)

            # Clean up internal fields
            result.pop("_raw", None)

            results.append(result)

        return results

    def get_compound(self, cid: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            Dict with ``cid``, ``properties``, ``description`` and
            ``synonyms``, or None if an unexpected error occurs.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            properties = self._get_compound_properties(cid)
            description = self._get_compound_description(cid)
            synonyms = self._get_compound_synonyms(cid)

            return {
                "cid": cid,
                "properties": properties,
                "description": description,
                "synonyms": synonyms,
            }
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound {cid}")
            return None

    def search_by_formula(self, formula: str) -> List[Dict[str, Any]]:
        """
        Search compounds by molecular formula.

        Args:
            formula: Molecular formula (e.g., "C6H12O6")

        Returns:
            List of matching compounds (at most ``self.max_results``), or
            an empty list when nothing matches (HTTP 404) or the request
            fails.

        Raises:
            RateLimitError: Re-raised unchanged so callers can back off.
        """
        try:
            # Throttle this request like every other PubChem call here.
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/fastformula/{quote(formula, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            results = []
            for cid in cids[: self.max_results]:
                compound = self.get_compound(cid)
                if compound:
                    results.append(compound)

            return results

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error searching by formula: {formula}")
            return []