#!/usr/bin/env python3 """ Compound Cross-Database Search This script searches for a compound by name and retrieves identifiers from multiple databases: - KEGG Compound - ChEBI - ChEMBL (via UniChem) - Basic compound properties Usage: python compound_cross_reference.py COMPOUND_NAME [--output FILE] Examples: python compound_cross_reference.py Geldanamycin python compound_cross_reference.py "Adenosine triphosphate" python compound_cross_reference.py Aspirin --output aspirin_info.txt """ import sys import argparse from bioservices import KEGG, UniChem, ChEBI, ChEMBL def search_kegg_compound(compound_name): """Search KEGG for compound by name.""" print(f"\n{'='*70}") print("STEP 1: KEGG Compound Search") print(f"{'='*70}") k = KEGG() print(f"Searching KEGG for: {compound_name}") try: results = k.find("compound", compound_name) if not results or not results.strip(): print(f"✗ No results found in KEGG") return k, None # Parse results lines = results.strip().split("\n") print(f"✓ Found {len(lines)} result(s):\n") for i, line in enumerate(lines[:5], 1): parts = line.split("\t") kegg_id = parts[0] description = parts[1] if len(parts) > 1 else "No description" print(f" {i}. {kegg_id}: {description}") # Use first result first_result = lines[0].split("\t") kegg_id = first_result[0].replace("cpd:", "") print(f"\nUsing: {kegg_id}") return k, kegg_id except Exception as e: print(f"✗ Error: {e}") return k, None def get_kegg_info(kegg, kegg_id): """Retrieve detailed KEGG compound information.""" print(f"\n{'='*70}") print("STEP 2: KEGG Compound Details") print(f"{'='*70}") try: print(f"Retrieving KEGG entry for {kegg_id}...") entry = kegg.get(f"cpd:{kegg_id}") if not entry: print("✗ Failed to retrieve entry") return None # Parse entry compound_info = { 'kegg_id': kegg_id, 'name': None, 'formula': None, 'exact_mass': None, 'mol_weight': None, 'chebi_id': None, 'pathways': [] } current_section = None for line in entry.split("\n"): if line.startswith("NAME"): compound_info['name'] = line.replace("NAME", "").strip().rstrip(";") elif line.startswith("FORMULA"): compound_info['formula'] = line.replace("FORMULA", "").strip() elif line.startswith("EXACT_MASS"): compound_info['exact_mass'] = line.replace("EXACT_MASS", "").strip() elif line.startswith("MOL_WEIGHT"): compound_info['mol_weight'] = line.replace("MOL_WEIGHT", "").strip() elif "ChEBI:" in line: parts = line.split("ChEBI:") if len(parts) > 1: compound_info['chebi_id'] = parts[1].strip().split()[0] elif line.startswith("PATHWAY"): current_section = "pathway" pathway = line.replace("PATHWAY", "").strip() if pathway: compound_info['pathways'].append(pathway) elif current_section == "pathway" and line.startswith(" "): pathway = line.strip() if pathway: compound_info['pathways'].append(pathway) elif line.startswith(" ") and not line.startswith(" "): current_section = None # Display information print(f"\n✓ KEGG Compound Information:") print(f" ID: {compound_info['kegg_id']}") print(f" Name: {compound_info['name']}") print(f" Formula: {compound_info['formula']}") print(f" Exact Mass: {compound_info['exact_mass']}") print(f" Molecular Weight: {compound_info['mol_weight']}") if compound_info['chebi_id']: print(f" ChEBI ID: {compound_info['chebi_id']}") if compound_info['pathways']: print(f" Pathways: {len(compound_info['pathways'])} found") return compound_info except Exception as e: print(f"✗ Error: {e}") return None def get_chembl_id(kegg_id): """Map KEGG ID to ChEMBL via UniChem.""" print(f"\n{'='*70}") print("STEP 3: ChEMBL Mapping (via UniChem)") print(f"{'='*70}") try: u = UniChem() print(f"Mapping KEGG:{kegg_id} to ChEMBL...") chembl_id = u.get_compound_id_from_kegg(kegg_id) if chembl_id: print(f"✓ ChEMBL ID: {chembl_id}") return chembl_id else: print("✗ No ChEMBL mapping found") return None except Exception as e: print(f"✗ Error: {e}") return None def get_chebi_info(chebi_id): """Retrieve ChEBI compound information.""" print(f"\n{'='*70}") print("STEP 4: ChEBI Details") print(f"{'='*70}") if not chebi_id: print("⊘ No ChEBI ID available") return None try: c = ChEBI() print(f"Retrieving ChEBI entry for {chebi_id}...") # Ensure proper format if not chebi_id.startswith("CHEBI:"): chebi_id = f"CHEBI:{chebi_id}" entity = c.getCompleteEntity(chebi_id) if entity: print(f"\n✓ ChEBI Information:") print(f" ID: {entity.chebiId}") print(f" Name: {entity.chebiAsciiName}") if hasattr(entity, 'Formulae') and entity.Formulae: print(f" Formula: {entity.Formulae}") if hasattr(entity, 'mass') and entity.mass: print(f" Mass: {entity.mass}") if hasattr(entity, 'charge') and entity.charge: print(f" Charge: {entity.charge}") return { 'chebi_id': entity.chebiId, 'name': entity.chebiAsciiName, 'formula': entity.Formulae if hasattr(entity, 'Formulae') else None, 'mass': entity.mass if hasattr(entity, 'mass') else None } else: print("✗ Failed to retrieve ChEBI entry") return None except Exception as e: print(f"✗ Error: {e}") return None def get_chembl_info(chembl_id): """Retrieve ChEMBL compound information.""" print(f"\n{'='*70}") print("STEP 5: ChEMBL Details") print(f"{'='*70}") if not chembl_id: print("⊘ No ChEMBL ID available") return None try: c = ChEMBL() print(f"Retrieving ChEMBL entry for {chembl_id}...") compound = c.get_compound_by_chemblId(chembl_id) if compound: print(f"\n✓ ChEMBL Information:") print(f" ID: {chembl_id}") if 'pref_name' in compound and compound['pref_name']: print(f" Preferred Name: {compound['pref_name']}") if 'molecule_properties' in compound: props = compound['molecule_properties'] if 'full_mwt' in props: print(f" Molecular Weight: {props['full_mwt']}") if 'alogp' in props: print(f" LogP: {props['alogp']}") if 'hba' in props: print(f" H-Bond Acceptors: {props['hba']}") if 'hbd' in props: print(f" H-Bond Donors: {props['hbd']}") if 'molecule_structures' in compound: structs = compound['molecule_structures'] if 'canonical_smiles' in structs: smiles = structs['canonical_smiles'] print(f" SMILES: {smiles[:60]}{'...' if len(smiles) > 60 else ''}") return compound else: print("✗ Failed to retrieve ChEMBL entry") return None except Exception as e: print(f"✗ Error: {e}") return None def save_results(compound_name, kegg_info, chembl_id, output_file): """Save results to file.""" print(f"\n{'='*70}") print(f"Saving results to {output_file}") print(f"{'='*70}") with open(output_file, 'w') as f: f.write("=" * 70 + "\n") f.write(f"Compound Cross-Reference Report: {compound_name}\n") f.write("=" * 70 + "\n\n") # KEGG information if kegg_info: f.write("KEGG Compound\n") f.write("-" * 70 + "\n") f.write(f"ID: {kegg_info['kegg_id']}\n") f.write(f"Name: {kegg_info['name']}\n") f.write(f"Formula: {kegg_info['formula']}\n") f.write(f"Exact Mass: {kegg_info['exact_mass']}\n") f.write(f"Molecular Weight: {kegg_info['mol_weight']}\n") f.write(f"Pathways: {len(kegg_info['pathways'])} found\n") f.write("\n") # Database IDs f.write("Cross-Database Identifiers\n") f.write("-" * 70 + "\n") if kegg_info: f.write(f"KEGG: {kegg_info['kegg_id']}\n") if kegg_info['chebi_id']: f.write(f"ChEBI: {kegg_info['chebi_id']}\n") if chembl_id: f.write(f"ChEMBL: {chembl_id}\n") f.write("\n") print(f"✓ Results saved") def main(): """Main workflow.""" parser = argparse.ArgumentParser( description="Search compound across multiple databases", formatter_class=argparse.RawDescriptionHelpFormatter, epilog=""" Examples: python compound_cross_reference.py Geldanamycin python compound_cross_reference.py "Adenosine triphosphate" python compound_cross_reference.py Aspirin --output aspirin_info.txt """ ) parser.add_argument("compound", help="Compound name to search") parser.add_argument("--output", default=None, help="Output file for results (optional)") args = parser.parse_args() print("=" * 70) print("BIOSERVICES: Compound Cross-Database Search") print("=" * 70) # Step 1: Search KEGG kegg, kegg_id = search_kegg_compound(args.compound) if not kegg_id: print("\n✗ Failed to find compound. Exiting.") sys.exit(1) # Step 2: Get KEGG details kegg_info = get_kegg_info(kegg, kegg_id) # Step 3: Map to ChEMBL chembl_id = get_chembl_id(kegg_id) # Step 4: Get ChEBI details chebi_info = None if kegg_info and kegg_info['chebi_id']: chebi_info = get_chebi_info(kegg_info['chebi_id']) # Step 5: Get ChEMBL details chembl_info = None if chembl_id: chembl_info = get_chembl_info(chembl_id) # Summary print(f"\n{'='*70}") print("SUMMARY") print(f"{'='*70}") print(f" Compound: {args.compound}") if kegg_info: print(f" KEGG ID: {kegg_info['kegg_id']}") if kegg_info['chebi_id']: print(f" ChEBI ID: {kegg_info['chebi_id']}") if chembl_id: print(f" ChEMBL ID: {chembl_id}") print(f"{'='*70}") # Save to file if requested if args.output: save_results(args.compound, kegg_info, chembl_id, args.output) if __name__ == "__main__": main()