Files
2025-11-30 08:30:10 +08:00

298 lines
8.1 KiB
Python

#!/usr/bin/env python3
"""
PubChem Compound Search Utility
This script provides functions for searching and retrieving compound information
from PubChem using the PubChemPy library.
"""
import sys
import json
from typing import List, Dict, Optional, Union
try:
import pubchempy as pcp
except ImportError:
print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
sys.exit(1)
def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
"""
Search for compounds by name.
Args:
name: Chemical name to search for
max_results: Maximum number of results to return
Returns:
List of Compound objects
"""
try:
compounds = pcp.get_compounds(name, 'name')
return compounds[:max_results]
except Exception as e:
print(f"Error searching for '{name}': {e}")
return []
def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
"""
Search for a compound by SMILES string.
Args:
smiles: SMILES string
Returns:
Compound object or None if not found
"""
try:
compounds = pcp.get_compounds(smiles, 'smiles')
return compounds[0] if compounds else None
except Exception as e:
print(f"Error searching for SMILES '{smiles}': {e}")
return None
def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
"""
Retrieve a compound by its CID (Compound ID).
Args:
cid: PubChem Compound ID
Returns:
Compound object or None if not found
"""
try:
return pcp.Compound.from_cid(cid)
except Exception as e:
print(f"Error retrieving CID {cid}: {e}")
return None
def get_compound_properties(
identifier: Union[str, int],
namespace: str = 'name',
properties: Optional[List[str]] = None
) -> Dict:
"""
Get specific properties for a compound.
Args:
identifier: Compound identifier (name, SMILES, CID, etc.)
namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
properties: List of properties to retrieve. If None, returns common properties.
Returns:
Dictionary of properties
"""
if properties is None:
properties = [
'MolecularFormula',
'MolecularWeight',
'CanonicalSMILES',
'IUPACName',
'XLogP',
'TPSA',
'HBondDonorCount',
'HBondAcceptorCount'
]
try:
result = pcp.get_properties(properties, identifier, namespace)
return result[0] if result else {}
except Exception as e:
print(f"Error getting properties for '{identifier}': {e}")
return {}
def similarity_search(
smiles: str,
threshold: int = 90,
max_records: int = 10
) -> List[pcp.Compound]:
"""
Perform similarity search for compounds similar to the query structure.
Args:
smiles: Query SMILES string
threshold: Similarity threshold (0-100)
max_records: Maximum number of results
Returns:
List of similar Compound objects
"""
try:
compounds = pcp.get_compounds(
smiles,
'smiles',
searchtype='similarity',
Threshold=threshold,
MaxRecords=max_records
)
return compounds
except Exception as e:
print(f"Error in similarity search: {e}")
return []
def substructure_search(
smiles: str,
max_records: int = 100
) -> List[pcp.Compound]:
"""
Perform substructure search for compounds containing the query structure.
Args:
smiles: Query SMILES string (substructure)
max_records: Maximum number of results
Returns:
List of Compound objects containing the substructure
"""
try:
compounds = pcp.get_compounds(
smiles,
'smiles',
searchtype='substructure',
MaxRecords=max_records
)
return compounds
except Exception as e:
print(f"Error in substructure search: {e}")
return []
def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
"""
Get all synonyms for a compound.
Args:
identifier: Compound identifier
namespace: Type of identifier
Returns:
List of synonym strings
"""
try:
results = pcp.get_synonyms(identifier, namespace)
if results:
return results[0].get('Synonym', [])
return []
except Exception as e:
print(f"Error getting synonyms: {e}")
return []
def batch_search(
identifiers: List[str],
namespace: str = 'name',
properties: Optional[List[str]] = None
) -> List[Dict]:
"""
Batch search for multiple compounds.
Args:
identifiers: List of compound identifiers
namespace: Type of identifiers
properties: List of properties to retrieve
Returns:
List of dictionaries containing properties for each compound
"""
results = []
for identifier in identifiers:
props = get_compound_properties(identifier, namespace, properties)
if props:
props['query'] = identifier
results.append(props)
return results
def download_structure(
identifier: Union[str, int],
namespace: str = 'name',
format: str = 'SDF',
filename: Optional[str] = None
) -> Optional[str]:
"""
Download compound structure in specified format.
Args:
identifier: Compound identifier
namespace: Type of identifier
format: Output format ('SDF', 'JSON', 'PNG', etc.)
filename: Output filename (if None, returns data as string)
Returns:
Data string if filename is None, else None
"""
try:
if filename:
pcp.download(format, identifier, namespace, filename, overwrite=True)
return None
else:
return pcp.download(format, identifier, namespace)
except Exception as e:
print(f"Error downloading structure: {e}")
return None
def print_compound_info(compound: pcp.Compound) -> None:
"""
Print formatted compound information.
Args:
compound: PubChemPy Compound object
"""
print(f"\n{'='*60}")
print(f"Compound CID: {compound.cid}")
print(f"{'='*60}")
print(f"IUPAC Name: {compound.iupac_name or 'N/A'}")
print(f"Molecular Formula: {compound.molecular_formula or 'N/A'}")
print(f"Molecular Weight: {compound.molecular_weight or 'N/A'} g/mol")
print(f"Canonical SMILES: {compound.canonical_smiles or 'N/A'}")
print(f"InChI: {compound.inchi or 'N/A'}")
print(f"InChI Key: {compound.inchikey or 'N/A'}")
print(f"XLogP: {compound.xlogp or 'N/A'}")
print(f"TPSA: {compound.tpsa or 'N/A'} Ų")
print(f"H-Bond Donors: {compound.h_bond_donor_count or 'N/A'}")
print(f"H-Bond Acceptors: {compound.h_bond_acceptor_count or 'N/A'}")
print(f"{'='*60}\n")
def main():
"""Example usage of PubChem search functions."""
# Example 1: Search by name
print("Example 1: Searching for 'aspirin'...")
compounds = search_by_name('aspirin', max_results=1)
if compounds:
print_compound_info(compounds[0])
# Example 2: Get properties
print("\nExample 2: Getting properties for caffeine...")
props = get_compound_properties('caffeine', 'name')
print(json.dumps(props, indent=2))
# Example 3: Similarity search
print("\nExample 3: Finding compounds similar to benzene...")
benzene_smiles = 'c1ccccc1'
similar = similarity_search(benzene_smiles, threshold=95, max_records=5)
print(f"Found {len(similar)} similar compounds:")
for comp in similar:
print(f" CID {comp.cid}: {comp.iupac_name or 'N/A'}")
# Example 4: Batch search
print("\nExample 4: Batch search for multiple compounds...")
names = ['aspirin', 'ibuprofen', 'paracetamol']
results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
for result in results:
print(f" {result.get('query')}: {result.get('MolecularFormula')} "
f"({result.get('MolecularWeight')} g/mol)")
if __name__ == '__main__':
main()