Initial commit
This commit is contained in:
297
skills/pubchem-database/scripts/compound_search.py
Normal file
297
skills/pubchem-database/scripts/compound_search.py
Normal file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PubChem Compound Search Utility
|
||||
|
||||
This script provides functions for searching and retrieving compound information
|
||||
from PubChem using the PubChemPy library.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import json
|
||||
from typing import List, Dict, Optional, Union
|
||||
|
||||
try:
|
||||
import pubchempy as pcp
|
||||
except ImportError:
|
||||
print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
|
||||
"""
|
||||
Search for compounds by name.
|
||||
|
||||
Args:
|
||||
name: Chemical name to search for
|
||||
max_results: Maximum number of results to return
|
||||
|
||||
Returns:
|
||||
List of Compound objects
|
||||
"""
|
||||
try:
|
||||
compounds = pcp.get_compounds(name, 'name')
|
||||
return compounds[:max_results]
|
||||
except Exception as e:
|
||||
print(f"Error searching for '{name}': {e}")
|
||||
return []
|
||||
|
||||
|
||||
def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
|
||||
"""
|
||||
Search for a compound by SMILES string.
|
||||
|
||||
Args:
|
||||
smiles: SMILES string
|
||||
|
||||
Returns:
|
||||
Compound object or None if not found
|
||||
"""
|
||||
try:
|
||||
compounds = pcp.get_compounds(smiles, 'smiles')
|
||||
return compounds[0] if compounds else None
|
||||
except Exception as e:
|
||||
print(f"Error searching for SMILES '{smiles}': {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
|
||||
"""
|
||||
Retrieve a compound by its CID (Compound ID).
|
||||
|
||||
Args:
|
||||
cid: PubChem Compound ID
|
||||
|
||||
Returns:
|
||||
Compound object or None if not found
|
||||
"""
|
||||
try:
|
||||
return pcp.Compound.from_cid(cid)
|
||||
except Exception as e:
|
||||
print(f"Error retrieving CID {cid}: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def get_compound_properties(
|
||||
identifier: Union[str, int],
|
||||
namespace: str = 'name',
|
||||
properties: Optional[List[str]] = None
|
||||
) -> Dict:
|
||||
"""
|
||||
Get specific properties for a compound.
|
||||
|
||||
Args:
|
||||
identifier: Compound identifier (name, SMILES, CID, etc.)
|
||||
namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
|
||||
properties: List of properties to retrieve. If None, returns common properties.
|
||||
|
||||
Returns:
|
||||
Dictionary of properties
|
||||
"""
|
||||
if properties is None:
|
||||
properties = [
|
||||
'MolecularFormula',
|
||||
'MolecularWeight',
|
||||
'CanonicalSMILES',
|
||||
'IUPACName',
|
||||
'XLogP',
|
||||
'TPSA',
|
||||
'HBondDonorCount',
|
||||
'HBondAcceptorCount'
|
||||
]
|
||||
|
||||
try:
|
||||
result = pcp.get_properties(properties, identifier, namespace)
|
||||
return result[0] if result else {}
|
||||
except Exception as e:
|
||||
print(f"Error getting properties for '{identifier}': {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def similarity_search(
|
||||
smiles: str,
|
||||
threshold: int = 90,
|
||||
max_records: int = 10
|
||||
) -> List[pcp.Compound]:
|
||||
"""
|
||||
Perform similarity search for compounds similar to the query structure.
|
||||
|
||||
Args:
|
||||
smiles: Query SMILES string
|
||||
threshold: Similarity threshold (0-100)
|
||||
max_records: Maximum number of results
|
||||
|
||||
Returns:
|
||||
List of similar Compound objects
|
||||
"""
|
||||
try:
|
||||
compounds = pcp.get_compounds(
|
||||
smiles,
|
||||
'smiles',
|
||||
searchtype='similarity',
|
||||
Threshold=threshold,
|
||||
MaxRecords=max_records
|
||||
)
|
||||
return compounds
|
||||
except Exception as e:
|
||||
print(f"Error in similarity search: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def substructure_search(
|
||||
smiles: str,
|
||||
max_records: int = 100
|
||||
) -> List[pcp.Compound]:
|
||||
"""
|
||||
Perform substructure search for compounds containing the query structure.
|
||||
|
||||
Args:
|
||||
smiles: Query SMILES string (substructure)
|
||||
max_records: Maximum number of results
|
||||
|
||||
Returns:
|
||||
List of Compound objects containing the substructure
|
||||
"""
|
||||
try:
|
||||
compounds = pcp.get_compounds(
|
||||
smiles,
|
||||
'smiles',
|
||||
searchtype='substructure',
|
||||
MaxRecords=max_records
|
||||
)
|
||||
return compounds
|
||||
except Exception as e:
|
||||
print(f"Error in substructure search: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
|
||||
"""
|
||||
Get all synonyms for a compound.
|
||||
|
||||
Args:
|
||||
identifier: Compound identifier
|
||||
namespace: Type of identifier
|
||||
|
||||
Returns:
|
||||
List of synonym strings
|
||||
"""
|
||||
try:
|
||||
results = pcp.get_synonyms(identifier, namespace)
|
||||
if results:
|
||||
return results[0].get('Synonym', [])
|
||||
return []
|
||||
except Exception as e:
|
||||
print(f"Error getting synonyms: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def batch_search(
|
||||
identifiers: List[str],
|
||||
namespace: str = 'name',
|
||||
properties: Optional[List[str]] = None
|
||||
) -> List[Dict]:
|
||||
"""
|
||||
Batch search for multiple compounds.
|
||||
|
||||
Args:
|
||||
identifiers: List of compound identifiers
|
||||
namespace: Type of identifiers
|
||||
properties: List of properties to retrieve
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing properties for each compound
|
||||
"""
|
||||
results = []
|
||||
for identifier in identifiers:
|
||||
props = get_compound_properties(identifier, namespace, properties)
|
||||
if props:
|
||||
props['query'] = identifier
|
||||
results.append(props)
|
||||
return results
|
||||
|
||||
|
||||
def download_structure(
|
||||
identifier: Union[str, int],
|
||||
namespace: str = 'name',
|
||||
format: str = 'SDF',
|
||||
filename: Optional[str] = None
|
||||
) -> Optional[str]:
|
||||
"""
|
||||
Download compound structure in specified format.
|
||||
|
||||
Args:
|
||||
identifier: Compound identifier
|
||||
namespace: Type of identifier
|
||||
format: Output format ('SDF', 'JSON', 'PNG', etc.)
|
||||
filename: Output filename (if None, returns data as string)
|
||||
|
||||
Returns:
|
||||
Data string if filename is None, else None
|
||||
"""
|
||||
try:
|
||||
if filename:
|
||||
pcp.download(format, identifier, namespace, filename, overwrite=True)
|
||||
return None
|
||||
else:
|
||||
return pcp.download(format, identifier, namespace)
|
||||
except Exception as e:
|
||||
print(f"Error downloading structure: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def print_compound_info(compound: pcp.Compound) -> None:
|
||||
"""
|
||||
Print formatted compound information.
|
||||
|
||||
Args:
|
||||
compound: PubChemPy Compound object
|
||||
"""
|
||||
print(f"\n{'='*60}")
|
||||
print(f"Compound CID: {compound.cid}")
|
||||
print(f"{'='*60}")
|
||||
print(f"IUPAC Name: {compound.iupac_name or 'N/A'}")
|
||||
print(f"Molecular Formula: {compound.molecular_formula or 'N/A'}")
|
||||
print(f"Molecular Weight: {compound.molecular_weight or 'N/A'} g/mol")
|
||||
print(f"Canonical SMILES: {compound.canonical_smiles or 'N/A'}")
|
||||
print(f"InChI: {compound.inchi or 'N/A'}")
|
||||
print(f"InChI Key: {compound.inchikey or 'N/A'}")
|
||||
print(f"XLogP: {compound.xlogp or 'N/A'}")
|
||||
print(f"TPSA: {compound.tpsa or 'N/A'} Ų")
|
||||
print(f"H-Bond Donors: {compound.h_bond_donor_count or 'N/A'}")
|
||||
print(f"H-Bond Acceptors: {compound.h_bond_acceptor_count or 'N/A'}")
|
||||
print(f"{'='*60}\n")
|
||||
|
||||
|
||||
def main():
|
||||
"""Example usage of PubChem search functions."""
|
||||
|
||||
# Example 1: Search by name
|
||||
print("Example 1: Searching for 'aspirin'...")
|
||||
compounds = search_by_name('aspirin', max_results=1)
|
||||
if compounds:
|
||||
print_compound_info(compounds[0])
|
||||
|
||||
# Example 2: Get properties
|
||||
print("\nExample 2: Getting properties for caffeine...")
|
||||
props = get_compound_properties('caffeine', 'name')
|
||||
print(json.dumps(props, indent=2))
|
||||
|
||||
# Example 3: Similarity search
|
||||
print("\nExample 3: Finding compounds similar to benzene...")
|
||||
benzene_smiles = 'c1ccccc1'
|
||||
similar = similarity_search(benzene_smiles, threshold=95, max_records=5)
|
||||
print(f"Found {len(similar)} similar compounds:")
|
||||
for comp in similar:
|
||||
print(f" CID {comp.cid}: {comp.iupac_name or 'N/A'}")
|
||||
|
||||
# Example 4: Batch search
|
||||
print("\nExample 4: Batch search for multiple compounds...")
|
||||
names = ['aspirin', 'ibuprofen', 'paracetamol']
|
||||
results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
|
||||
for result in results:
|
||||
print(f" {result.get('query')}: {result.get('MolecularFormula')} "
|
||||
f"({result.get('MolecularWeight')} g/mol)")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
Reference in New Issue
Block a user