#!/usr/bin/env python3
"""
PubChem Compound Search Utility

This script provides functions for searching and retrieving compound
information from PubChem using the PubChemPy library.

All lookups are best-effort: a failed request is reported on stdout and the
function returns an empty/None result instead of raising.
"""

import sys
import json
from typing import List, Dict, Optional, Union

try:
    import pubchempy as pcp
except ImportError:
    print("Error: pubchempy is not installed. Install it with: pip install pubchempy")
    sys.exit(1)


def _or_na(value):
    """Return *value* for display, or 'N/A' when it is missing.

    Only ``None`` and the empty string count as missing. A plain
    ``value or 'N/A'`` would also hide legitimate zeros (e.g. a compound
    with 0 hydrogen-bond donors or an XLogP of 0).
    """
    if value is None or value == '':
        return 'N/A'
    return value


def search_by_name(name: str, max_results: int = 10) -> List[pcp.Compound]:
    """
    Search for compounds by name.

    Args:
        name: Chemical name to search for
        max_results: Maximum number of results to return

    Returns:
        List of Compound objects (empty if nothing matched or the lookup failed)
    """
    try:
        compounds = pcp.get_compounds(name, 'name')
        return compounds[:max_results]
    except Exception as e:
        print(f"Error searching for '{name}': {e}")
        return []


def search_by_smiles(smiles: str) -> Optional[pcp.Compound]:
    """
    Search for a compound by SMILES string.

    Args:
        smiles: SMILES string

    Returns:
        First matching Compound object, or None if not found or on error
    """
    try:
        compounds = pcp.get_compounds(smiles, 'smiles')
        return compounds[0] if compounds else None
    except Exception as e:
        print(f"Error searching for SMILES '{smiles}': {e}")
        return None


def get_compound_by_cid(cid: int) -> Optional[pcp.Compound]:
    """
    Retrieve a compound by its CID (Compound ID).

    Args:
        cid: PubChem Compound ID

    Returns:
        Compound object, or None if the lookup failed
    """
    try:
        return pcp.Compound.from_cid(cid)
    except Exception as e:
        print(f"Error retrieving CID {cid}: {e}")
        return None


def get_compound_properties(
    identifier: Union[str, int],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> Dict:
    """
    Get specific properties for a compound.

    Args:
        identifier: Compound identifier (name, SMILES, CID, etc.)
        namespace: Type of identifier ('name', 'smiles', 'cid', 'inchi', etc.)
        properties: List of properties to retrieve. If None, returns common
            properties.

    Returns:
        Dictionary of properties for the first match (empty dict if there is
        no match or the lookup failed)
    """
    if properties is None:
        properties = [
            'MolecularFormula',
            'MolecularWeight',
            'CanonicalSMILES',
            'IUPACName',
            'XLogP',
            'TPSA',
            'HBondDonorCount',
            'HBondAcceptorCount'
        ]

    try:
        result = pcp.get_properties(properties, identifier, namespace)
        return result[0] if result else {}
    except Exception as e:
        print(f"Error getting properties for '{identifier}': {e}")
        return {}


def similarity_search(
    smiles: str,
    threshold: int = 90,
    max_records: int = 10
) -> List[pcp.Compound]:
    """
    Perform similarity search for compounds similar to the query structure.

    Args:
        smiles: Query SMILES string
        threshold: Similarity threshold (0-100)
        max_records: Maximum number of results

    Returns:
        List of similar Compound objects (empty on error)
    """
    try:
        compounds = pcp.get_compounds(
            smiles,
            'smiles',
            searchtype='similarity',
            Threshold=threshold,
            MaxRecords=max_records
        )
        return compounds
    except Exception as e:
        print(f"Error in similarity search: {e}")
        return []


def substructure_search(
    smiles: str,
    max_records: int = 100
) -> List[pcp.Compound]:
    """
    Perform substructure search for compounds containing the query structure.

    Args:
        smiles: Query SMILES string (substructure)
        max_records: Maximum number of results

    Returns:
        List of Compound objects containing the substructure (empty on error)
    """
    try:
        compounds = pcp.get_compounds(
            smiles,
            'smiles',
            searchtype='substructure',
            MaxRecords=max_records
        )
        return compounds
    except Exception as e:
        print(f"Error in substructure search: {e}")
        return []


def get_synonyms(identifier: Union[str, int], namespace: str = 'name') -> List[str]:
    """
    Get all synonyms for a compound.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier

    Returns:
        List of synonym strings for the first match (empty if none or on error)
    """
    try:
        results = pcp.get_synonyms(identifier, namespace)
        if results:
            return results[0].get('Synonym', [])
        return []
    except Exception as e:
        print(f"Error getting synonyms: {e}")
        return []


def batch_search(
    identifiers: List[str],
    namespace: str = 'name',
    properties: Optional[List[str]] = None
) -> List[Dict]:
    """
    Batch search for multiple compounds.

    Identifiers that yield no properties are omitted from the result, so the
    output may be shorter than the input. Each returned dictionary carries
    the originating identifier under the 'query' key.

    Args:
        identifiers: List of compound identifiers
        namespace: Type of identifiers
        properties: List of properties to retrieve

    Returns:
        List of dictionaries containing properties for each compound
    """
    results = []
    for identifier in identifiers:
        props = get_compound_properties(identifier, namespace, properties)
        if props:
            props['query'] = identifier
            results.append(props)
    return results


def download_structure(
    identifier: Union[str, int],
    namespace: str = 'name',
    format: str = 'SDF',
    filename: Optional[str] = None
) -> Optional[str]:
    """
    Download compound structure in specified format.

    Args:
        identifier: Compound identifier
        namespace: Type of identifier
        format: Output format ('SDF', 'JSON', 'PNG', etc.)
        filename: Output filename (if None, returns data as string)

    Returns:
        Data string if filename is None, else None. None is also returned on
        error, including when a binary format (e.g. 'PNG') is requested
        without a filename, since such data cannot be returned as text.
    """
    try:
        if filename:
            # pcp.download's positional order is (outformat, path, identifier,
            # namespace): the output path comes BEFORE the identifier.
            pcp.download(format, filename, identifier, namespace, overwrite=True)
            return None

        # pcp.download always writes to a file and returns None, so fetch the
        # raw response with pcp.get when the caller wants the data itself.
        data = pcp.get(identifier, namespace, output=format)
        if isinstance(data, bytes):
            return data.decode('utf-8')
        return data
    except Exception as e:
        print(f"Error downloading structure: {e}")
        return None


def print_compound_info(compound: pcp.Compound) -> None:
    """
    Print formatted compound information.

    Missing attributes are shown as 'N/A'; zero values (e.g. 0 H-bond donors)
    are printed as-is.

    Args:
        compound: PubChemPy Compound object
    """
    print(f"\n{'='*60}")
    print(f"Compound CID: {compound.cid}")
    print(f"{'='*60}")
    print(f"IUPAC Name: {_or_na(compound.iupac_name)}")
    print(f"Molecular Formula: {_or_na(compound.molecular_formula)}")
    print(f"Molecular Weight: {_or_na(compound.molecular_weight)} g/mol")
    print(f"Canonical SMILES: {_or_na(compound.canonical_smiles)}")
    print(f"InChI: {_or_na(compound.inchi)}")
    print(f"InChI Key: {_or_na(compound.inchikey)}")
    print(f"XLogP: {_or_na(compound.xlogp)}")
    print(f"TPSA: {_or_na(compound.tpsa)} Ų")
    print(f"H-Bond Donors: {_or_na(compound.h_bond_donor_count)}")
    print(f"H-Bond Acceptors: {_or_na(compound.h_bond_acceptor_count)}")
    print(f"{'='*60}\n")


def main():
    """Example usage of PubChem search functions."""
    # Example 1: Search by name
    print("Example 1: Searching for 'aspirin'...")
    compounds = search_by_name('aspirin', max_results=1)
    if compounds:
        print_compound_info(compounds[0])

    # Example 2: Get properties
    print("\nExample 2: Getting properties for caffeine...")
    props = get_compound_properties('caffeine', 'name')
    print(json.dumps(props, indent=2))

    # Example 3: Similarity search
    print("\nExample 3: Finding compounds similar to benzene...")
    benzene_smiles = 'c1ccccc1'
    similar = similarity_search(benzene_smiles, threshold=95, max_records=5)
    print(f"Found {len(similar)} similar compounds:")
    for comp in similar:
        print(f" CID {comp.cid}: {_or_na(comp.iupac_name)}")

    # Example 4: Batch search
    print("\nExample 4: Batch search for multiple compounds...")
    names = ['aspirin', 'ibuprofen', 'paracetamol']
    results = batch_search(names, properties=['MolecularFormula', 'MolecularWeight'])
    for result in results:
        print(f" {result.get('query')}: {result.get('MolecularFormula')} "
              f"({result.get('MolecularWeight')} g/mol)")


if __name__ == '__main__':
    main()