Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubchem.py

1"""PubChem search engine for chemical compound information."""

3from typing import Any, Dict, List, Optional

4from urllib.parse import quote

6from langchain_core.language_models import BaseLLM

7from loguru import logger

9from ...constants import USER_AGENT

10from ...security.safe_requests import safe_get

11from ..rate_limiting import RateLimitError

12from ..search_engine_base import BaseSearchEngine

class PubChemSearchEngine(BaseSearchEngine):
    """
    PubChem search engine for chemical compound information.

    Provides access to chemical structures, properties, and bioactivity data.
    No authentication required.

    Search flow, as implemented below: ``_get_previews`` resolves a query to
    compound names through the autocomplete endpoint (falling back to a
    direct name lookup), fetches properties and a description for each
    name, and returns preview dictionaries. ``_get_full_content`` then
    enriches selected previews with synonyms and a plain-text ``content``
    summary.
    """

    # Engine classification flags.
    # NOTE(review): presumably consumed by the engine-selection logic that
    # lives outside this class -- confirm against BaseSearchEngine.
    is_public = True
    is_generic = False
    is_scientific = True
    is_code = False
    is_lexical = True
    needs_llm_relevance_filter = True

    # Property names requested from the PUG REST "property" endpoint.
    # Kept at class level so the list is not rebuilt on every request.
    _PROPERTY_FIELDS = (
        "MolecularFormula",
        "MolecularWeight",
        "IUPACName",
        "CanonicalSMILES",
        "IsomericSMILES",
        "InChI",
        "InChIKey",
        "XLogP",
        "TPSA",
        "Complexity",
        "Charge",
        "HBondDonorCount",
        "HBondAcceptorCount",
        "RotatableBondCount",
        "HeavyAtomCount",
    )

    def __init__(
        self,
        max_results: int = 10,
        include_synonyms: bool = True,
        llm: Optional[BaseLLM] = None,
        max_filtered_results: Optional[int] = None,
        settings_snapshot: Optional[Dict[str, Any]] = None,
        **kwargs,
    ):
        """
        Initialize the PubChem search engine.

        Args:
            max_results: Maximum number of search results
            include_synonyms: Whether to include compound synonyms
            llm: Language model for relevance filtering
            max_filtered_results: Maximum results after filtering
            settings_snapshot: Settings snapshot for thread context
            **kwargs: Forwarded unchanged to the base class constructor
        """
        super().__init__(
            llm=llm,
            max_filtered_results=max_filtered_results,
            max_results=max_results,
            settings_snapshot=settings_snapshot,
            **kwargs,
        )

        self.include_synonyms = include_synonyms
        self.base_url = "https://pubchem.ncbi.nlm.nih.gov/rest/pug"
        self.autocomplete_url = (
            "https://pubchem.ncbi.nlm.nih.gov/rest/autocomplete"
        )

        # User-Agent header for API requests
        self.headers = {"User-Agent": USER_AGENT}

    def _search_compounds(self, query: str) -> List[str]:
        """
        Search for compound names matching the query.

        Uses the autocomplete endpoint and asks for twice ``max_results``
        suggestions so later filtering/deduplication still has enough
        candidates. This method does not apply rate limiting itself; the
        caller in this class (``_get_previews``) does so beforehand.

        Args:
            query: Free-text compound name (URL-quoted before use)

        Returns:
            Suggested compound names, or an empty list on any failure
            other than a rate limit.

        Raises:
            RateLimitError: Propagated unchanged when raised in the request
                path.
        """
        try:
            url = (
                f"{self.autocomplete_url}/compound/{quote(query, safe='')}/json"
            )
            params = {"limit": self.max_results * 2}  # Get extra for filtering

            response = safe_get(
                url, params=params, headers=self.headers, timeout=30
            )
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            terms: list[str] = data.get("dictionary_terms", {}).get(
                "compound", []
            )
            return terms

        except RateLimitError:
            raise
        except Exception:
            logger.exception("PubChem autocomplete search failed")
            return []

    def _get_compound_by_name(self, name: str) -> Optional[Dict[str, Any]]:
        """
        Get compound information by name.

        Resolves the name to CIDs, takes the first CID, and fetches its
        properties and description.

        Args:
            name: Compound name (URL-quoted before use)

        Returns:
            Dictionary with keys ``cid``, ``name``, ``properties`` and
            ``description``; ``None`` when the name is unknown (HTTP 404 or
            empty CID list) or when a non-rate-limit error occurs.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            # Get CID first
            url = f"{self.base_url}/compound/name/{quote(name, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            # 404 is the normal "no such compound" answer, not an error.
            if response.status_code == 404:
                return None
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            if not cids:
                return None

            cid = cids[0]

            # Get compound properties
            properties = self._get_compound_properties(cid)

            # Get compound description
            description = self._get_compound_description(cid)

            return {
                "cid": cid,
                "name": name,
                "properties": properties,
                "description": description,
            }

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound: {name}")
            return None

    def _get_compound_properties(self, cid: int) -> Dict[str, Any]:
        """
        Get properties for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first entry of ``PropertyTable.Properties`` from the
            response, or an empty dict when there is none or when a
            non-rate-limit error occurs.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)

            url = f"{self.base_url}/compound/cid/{cid}/property/{','.join(self._PROPERTY_FIELDS)}/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)
            self._raise_if_rate_limit(response.status_code)
            response.raise_for_status()
            data = response.json()

            props = data.get("PropertyTable", {}).get("Properties", [])
            return props[0] if props else {}

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem properties for CID {cid}")
            return {}

    def _get_compound_description(self, cid: int) -> str:
        """
        Get description for a compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            The first non-empty ``Description`` found in the response, or
            an empty string when there is none (including HTTP 404) or when
            a non-rate-limit error occurs.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/description/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return ""
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            descriptions = data.get("InformationList", {}).get(
                "Information", []
            )
            # Entries without a "Description" value are skipped.
            for desc in descriptions:
                if desc.get("Description"):
                    return desc.get("Description", "")  # type: ignore[no-any-return]

            return ""

        except RateLimitError:
            raise
        except Exception:
            logger.exception(
                f"Error fetching PubChem description for CID {cid}"
            )
            return ""

    def _get_compound_synonyms(self, cid: int, limit: int = 10) -> List[str]:
        """
        Get synonyms for a compound by CID.

        Args:
            cid: The PubChem compound ID
            limit: Maximum number of synonyms to return

        Returns:
            Up to ``limit`` synonyms from the first information entry, or
            an empty list when there are none (including HTTP 404) or when
            a non-rate-limit error occurs.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/cid/{cid}/synonyms/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()

            info = data.get("InformationList", {}).get("Information", [])
            if info:
                synonyms = info[0].get("Synonym", [])
                return synonyms[:limit]  # type: ignore[no-any-return]
            return []

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem synonyms for CID {cid}")
            return []

    @staticmethod
    def _build_preview(name: str, compound: Dict[str, Any]) -> Dict[str, Any]:
        """Build one preview dictionary from a ``_get_compound_by_name`` result."""
        cid = compound["cid"]
        properties = compound.get("properties", {})
        description = compound.get("description", "")

        # Build compound URL
        compound_url = f"https://pubchem.ncbi.nlm.nih.gov/compound/{cid}"

        # Get key properties
        molecular_formula = properties.get("MolecularFormula", "")
        molecular_weight = properties.get("MolecularWeight", "")
        iupac_name = properties.get("IUPACName", "")
        # Several SMILES keys are tried in order; the first non-empty wins.
        smiles = (
            properties.get("CanonicalSMILES", "")
            or properties.get("SMILES", "")
            or properties.get("IsomericSMILES", "")
            or properties.get("ConnectivitySMILES", "")
        )

        # Get drug-relevant properties
        xlogp = properties.get("XLogP")
        hbond_donors = properties.get("HBondDonorCount")
        hbond_acceptors = properties.get("HBondAcceptorCount")

        # Build snippet
        snippet_parts = []
        if molecular_formula:
            snippet_parts.append(f"Formula: {molecular_formula}")
        if molecular_weight:
            snippet_parts.append(f"MW: {molecular_weight}")
        # "is not None" (not truthiness) so that legitimate zeros are kept.
        if xlogp is not None:
            snippet_parts.append(f"XLogP: {xlogp}")
        if hbond_donors is not None or hbond_acceptors is not None:
            hbond_info = []
            if hbond_donors is not None:
                hbond_info.append(f"H-Donors: {hbond_donors}")
            if hbond_acceptors is not None:
                hbond_info.append(f"H-Acceptors: {hbond_acceptors}")
            snippet_parts.append(", ".join(hbond_info))
        if iupac_name:
            snippet_parts.append(f"IUPAC: {iupac_name}")
        if description:
            # Only the first 200 characters go into the snippet; the full
            # text is kept under "description" and "_raw".
            snippet_parts.append(description[:200])
        snippet = ". ".join(snippet_parts)

        return {
            "id": str(cid),
            "cid": cid,
            "title": name,
            "link": compound_url,
            "snippet": snippet,
            "molecular_formula": molecular_formula,
            "molecular_weight": molecular_weight,
            "iupac_name": iupac_name,
            "smiles": smiles,
            "inchi_key": properties.get("InChIKey", ""),
            "description": description,
            "source": "PubChem",
            # Internal payload consumed (and removed) by _get_full_content.
            "_raw": {
                "properties": properties,
                "description": description,
            },
        }

    def _get_previews(self, query: str) -> List[Dict[str, Any]]:
        """
        Get preview information for PubChem compounds.

        Candidate names come from autocomplete; if that yields nothing the
        query itself is looked up directly. Each candidate is resolved to a
        compound, duplicates (same CID) are dropped, and at most
        ``max_results`` previews are returned.

        Args:
            query: The search query (compound name)

        Returns:
            List of preview dictionaries

        Raises:
            RateLimitError: Propagated unchanged from any lookup.
        """
        logger.info(f"Getting PubChem previews for query: {query}")

        # Apply rate limiting
        self._last_wait_time = self.rate_tracker.apply_rate_limit(
            self.engine_type
        )

        # Search for matching compound names
        compound_names = self._search_compounds(query)

        # Compounds already fetched while resolving names, keyed by name.
        # Reusing them avoids repeating the CID/properties/description
        # requests for the direct-lookup fallback.
        prefetched: Dict[str, Dict[str, Any]] = {}

        if not compound_names:
            # Try direct lookup
            direct_hit = self._get_compound_by_name(query)
            if not direct_hit:
                logger.info("No PubChem compounds found")
                return []
            compound_names = [query]
            prefetched[query] = direct_hit

        logger.info(f"Found {len(compound_names)} potential compounds")

        previews: list[dict[str, Any]] = []
        seen_cids = set()
        for name in compound_names:
            if len(previews) >= self.max_results:
                break

            try:
                compound = prefetched.pop(name, None)
                if compound is None:
                    compound = self._get_compound_by_name(name)
                if not compound:
                    continue

                cid = compound["cid"]

                # Deduplicate by CID (autocomplete may return
                # case variants like "Caffeine" and "caffeine")
                if cid in seen_cids:
                    continue
                seen_cids.add(cid)

                previews.append(self._build_preview(name, compound))

            except RateLimitError:
                raise
            except Exception:
                # One bad compound must not abort the whole search.
                logger.exception(f"Error processing PubChem compound: {name}")
                continue

        return previews

    def _get_full_content(
        self, relevant_items: List[Dict[str, Any]]
    ) -> List[Dict[str, Any]]:
        """
        Get full content for the relevant PubChem compounds.

        Each item is shallow-copied; synonyms are fetched when
        ``include_synonyms`` is set and the item has a truthy ``cid``. When
        the item carries a non-empty ``_raw`` payload, a newline-separated
        ``content`` summary is built from it. The ``_raw`` key is removed
        from every returned result.

        Args:
            relevant_items: List of relevant preview dictionaries

        Returns:
            List of result dictionaries with full content
        """
        logger.info(
            f"Getting full content for {len(relevant_items)} PubChem compounds"
        )

        results = []
        for item in relevant_items:
            result = item.copy()

            cid = item.get("cid")
            if cid and self.include_synonyms:
                # Get synonyms
                synonyms = self._get_compound_synonyms(cid)
                result["synonyms"] = synonyms

            raw = item.get("_raw", {})
            if raw:
                properties = raw.get("properties", {})
                description = raw.get("description", "")

                # Build content summary
                content_parts = []
                content_parts.append(
                    f"Compound: {result.get('title', 'Unknown')}"
                )
                if cid is not None:
                    content_parts.append(f"CID: {cid}")

                if result.get("molecular_formula"):
                    content_parts.append(
                        f"Molecular Formula: {result['molecular_formula']}"
                    )
                if result.get("molecular_weight"):
                    content_parts.append(
                        f"Molecular Weight: {result['molecular_weight']} g/mol"
                    )
                if result.get("iupac_name"):
                    content_parts.append(f"IUPAC Name: {result['iupac_name']}")
                if result.get("smiles"):
                    content_parts.append(f"SMILES: {result['smiles']}")
                if result.get("inchi_key"):
                    content_parts.append(f"InChIKey: {result['inchi_key']}")

                # Additional properties
                if properties.get("XLogP") is not None:
                    content_parts.append(f"XLogP: {properties['XLogP']}")
                if properties.get("TPSA") is not None:
                    content_parts.append(f"TPSA: {properties['TPSA']} Å²")
                if properties.get("HBondDonorCount") is not None:
                    content_parts.append(
                        f"H-Bond Donors: {properties['HBondDonorCount']}"
                    )
                if properties.get("HBondAcceptorCount") is not None:
                    content_parts.append(
                        f"H-Bond Acceptors: {properties['HBondAcceptorCount']}"
                    )

                if result.get("synonyms"):
                    # Only the first five synonyms appear in the summary;
                    # the full list stays under result["synonyms"].
                    content_parts.append(
                        f"\nSynonyms: {', '.join(result['synonyms'][:5])}"
                    )

                if description:
                    content_parts.append(f"\nDescription: {description}")

                result["content"] = "\n".join(content_parts)

            # Clean up internal fields
            if "_raw" in result:
                del result["_raw"]

            results.append(result)

        return results

    def get_compound(self, cid: int) -> Optional[Dict[str, Any]]:
        """
        Get a specific compound by CID.

        Args:
            cid: The PubChem compound ID

        Returns:
            Dictionary with keys ``cid``, ``properties``, ``description``
            and ``synonyms``, or ``None`` if an unexpected error escapes
            the individual lookups.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            properties = self._get_compound_properties(cid)
            description = self._get_compound_description(cid)
            synonyms = self._get_compound_synonyms(cid)

            return {
                "cid": cid,
                "properties": properties,
                "description": description,
                "synonyms": synonyms,
            }
        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error fetching PubChem compound {cid}")
            return None

    def search_by_formula(self, formula: str) -> List[Dict[str, Any]]:
        """
        Search compounds by molecular formula.

        Args:
            formula: Molecular formula (e.g., "C6H12O6")

        Returns:
            List of matching compounds (at most ``max_results``), each in
            the shape returned by ``get_compound``; empty on HTTP 404 or on
            any non-rate-limit error.

        Raises:
            RateLimitError: Propagated unchanged.
        """
        try:
            # Rate-limit this request like every other PubChem call made by
            # this class.
            self.rate_tracker.apply_rate_limit(self.engine_type)
            url = f"{self.base_url}/compound/fastformula/{quote(formula, safe='')}/cids/JSON"
            response = safe_get(url, headers=self.headers, timeout=30)

            if response.status_code == 404:
                return []
            self._raise_if_rate_limit(response.status_code)

            response.raise_for_status()
            data = response.json()
            cids = data.get("IdentifierList", {}).get("CID", [])

            results = []
            for cid in cids[: self.max_results]:
                compound = self.get_compound(cid)
                if compound:
                    results.append(compound)

            return results

        except RateLimitError:
            raise
        except Exception:
            logger.exception(f"Error searching by formula: {formula}")
            return []

Coverage for src/local_deep_research/web_search_engines/engines/search_engine_pubchem.py: 89%

251 statements