379 lines
11 KiB
Python
Executable File
379 lines
11 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Compound Cross-Database Search

This script searches for a compound by name and retrieves identifiers
from multiple databases:
- KEGG Compound
- ChEBI
- ChEMBL (via UniChem)
- Basic compound properties

Usage:
    python compound_cross_reference.py COMPOUND_NAME [--output FILE]

Examples:
    python compound_cross_reference.py Geldanamycin
    python compound_cross_reference.py "Adenosine triphosphate"
    python compound_cross_reference.py Aspirin --output aspirin_info.txt
|
|
"""
|
|
|
|
import sys
|
|
import argparse
|
|
from bioservices import KEGG, UniChem, ChEBI, ChEMBL
|
|
|
|
|
|
def search_kegg_compound(compound_name):
    """Search KEGG for a compound by name and select the first hit.

    Prints a step banner, up to five of the hits, and the identifier that
    was chosen.

    Args:
        compound_name: Free-text compound name passed to KEGG ``find``.

    Returns:
        A ``(kegg, kegg_id)`` tuple. ``kegg`` is the ``KEGG`` client created
        here, returned so the caller can reuse it. ``kegg_id`` is the first
        hit's identifier with the ``cpd:`` prefix removed, or ``None`` when
        nothing was found or the request failed.
    """
    print(f"\n{'='*70}")
    print("STEP 1: KEGG Compound Search")
    print(f"{'='*70}")

    k = KEGG()

    print(f"Searching KEGG for: {compound_name}")

    try:
        results = k.find("compound", compound_name)

        # A rejected request comes back as an integer HTTP status rather
        # than text; report it as such instead of letting ``.strip()`` fail
        # with an unrelated-looking AttributeError.
        if results and not isinstance(results, str):
            print(f"✗ Error: KEGG request failed (response: {results!r})")
            return k, None

        if not results or not results.strip():
            print("✗ No results found in KEGG")
            return k, None

        # One hit per line: "<id>\t<description>"
        lines = results.strip().split("\n")
        print(f"✓ Found {len(lines)} result(s):\n")

        for i, line in enumerate(lines[:5], 1):
            hit_id, tab, description = line.partition("\t")
            if not tab:
                description = "No description"
            print(f" {i}. {hit_id}: {description}")

        # Use first result
        kegg_id = lines[0].split("\t")[0].replace("cpd:", "")

        print(f"\nUsing: {kegg_id}")

        return k, kegg_id

    except Exception as e:
        # Best-effort CLI step: report and let the caller decide to exit.
        print(f"✗ Error: {e}")
        return k, None
|
|
|
|
|
|
def _parse_kegg_compound_entry(entry, kegg_id):
    """Turn a KEGG flat-file compound record into the summary dict."""
    compound_info = {
        'kegg_id': kegg_id,
        'name': None,
        'formula': None,
        'exact_mass': None,
        'mol_weight': None,
        'chebi_id': None,
        'pathways': []
    }
    scalar_fields = {
        'FORMULA': 'formula',
        'EXACT_MASS': 'exact_mass',
        'MOL_WEIGHT': 'mol_weight',
    }

    section = None
    for raw in entry.splitlines():
        if not raw.strip():
            continue

        # A line that starts in column 0 opens a new section; an indented
        # line continues whichever section was opened last.
        is_header = not raw[0].isspace()
        if is_header:
            fields = raw.split(None, 1)
            section = fields[0]
            value = fields[1].strip() if len(fields) > 1 else ""
        else:
            value = raw.strip()

        if section == 'NAME':
            # Only the first name is kept; continuation lines are synonyms.
            if is_header:
                compound_info['name'] = value.rstrip(";")
        elif section in scalar_fields:
            if is_header:
                compound_info[scalar_fields[section]] = value
        elif section == 'PATHWAY':
            if value:
                compound_info['pathways'].append(value)
        elif section == 'DBLINKS' and "ChEBI:" in value:
            ids = value.split("ChEBI:", 1)[1].split()
            if ids:
                compound_info['chebi_id'] = ids[0]

    return compound_info


def get_kegg_info(kegg, kegg_id):
    """Retrieve and summarise a KEGG compound entry.

    Args:
        kegg: Object exposing ``get(query)``; in this script the ``KEGG``
            client returned by the search step.
        kegg_id: Compound identifier without the ``cpd:`` prefix.

    Returns:
        A dict with keys ``kegg_id``, ``name``, ``formula``, ``exact_mass``,
        ``mol_weight``, ``chebi_id`` and ``pathways`` (a list of strings),
        or ``None`` when the entry could not be retrieved or parsed.
    """
    print(f"\n{'='*70}")
    print("STEP 2: KEGG Compound Details")
    print(f"{'='*70}")

    try:
        print(f"Retrieving KEGG entry for {kegg_id}...")

        entry = kegg.get(f"cpd:{kegg_id}")

        # Anything other than non-empty text (e.g. an integer HTTP status
        # returned for a failed request) is not a usable record.
        if not entry or not isinstance(entry, str):
            print("✗ Failed to retrieve entry")
            return None

        compound_info = _parse_kegg_compound_entry(entry, kegg_id)

        # Display information
        print("\n✓ KEGG Compound Information:")
        print(f" ID: {compound_info['kegg_id']}")
        print(f" Name: {compound_info['name']}")
        print(f" Formula: {compound_info['formula']}")
        print(f" Exact Mass: {compound_info['exact_mass']}")
        print(f" Molecular Weight: {compound_info['mol_weight']}")

        if compound_info['chebi_id']:
            print(f" ChEBI ID: {compound_info['chebi_id']}")

        if compound_info['pathways']:
            print(f" Pathways: {len(compound_info['pathways'])} found")

        return compound_info

    except Exception as e:
        # Best-effort CLI step: report and carry on without KEGG details.
        print(f"✗ Error: {e}")
        return None
|
|
|
|
|
|
def get_chembl_id(kegg_id):
    """Look up the ChEMBL identifier for a KEGG compound via UniChem.

    Args:
        kegg_id: KEGG compound identifier.

    Returns:
        Whatever the UniChem lookup returned when it is truthy, otherwise
        ``None``. Any exception is printed and also yields ``None``.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("STEP 3: ChEMBL Mapping (via UniChem)")
    print(rule)

    try:
        unichem = UniChem()

        print(f"Mapping KEGG:{kegg_id} to ChEMBL...")

        # NOTE(review): confirm that `get_compound_id_from_kegg` exists in
        # the installed bioservices release; a missing method is caught by
        # the handler below and reported as an error.
        mapped = unichem.get_compound_id_from_kegg(kegg_id)

        if not mapped:
            print("✗ No ChEMBL mapping found")
            return None

        print(f"✓ ChEMBL ID: {mapped}")
        return mapped

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
|
|
|
|
|
def get_chebi_info(chebi_id):
    """Fetch a ChEBI entity and print its main properties.

    Args:
        chebi_id: ChEBI identifier, with or without the ``CHEBI:`` prefix.
            A falsy value short-circuits without contacting the service.

    Returns:
        A dict with keys ``chebi_id``, ``name``, ``formula`` and ``mass``,
        or ``None`` when no ID was given, nothing was returned, or the
        lookup raised.
    """
    rule = "=" * 70
    print(f"\n{rule}")
    print("STEP 4: ChEBI Details")
    print(rule)

    if not chebi_id:
        print("⊘ No ChEBI ID available")
        return None

    try:
        client = ChEBI()

        print(f"Retrieving ChEBI entry for {chebi_id}...")

        # The service is queried with the prefixed form of the identifier.
        full_id = chebi_id if chebi_id.startswith("CHEBI:") else f"CHEBI:{chebi_id}"

        entity = client.getCompleteEntity(full_id)

        if not entity:
            print("✗ Failed to retrieve ChEBI entry")
            return None

        print("\n✓ ChEBI Information:")
        print(f" ID: {entity.chebiId}")
        print(f" Name: {entity.chebiAsciiName}")

        # Optional attributes: absent ones are treated like empty ones.
        formula = getattr(entity, 'Formulae', None)
        mass = getattr(entity, 'mass', None)
        charge = getattr(entity, 'charge', None)

        if formula:
            print(f" Formula: {formula}")
        if mass:
            print(f" Mass: {mass}")
        if charge:
            print(f" Charge: {charge}")

        return {
            'chebi_id': entity.chebiId,
            'name': entity.chebiAsciiName,
            'formula': formula,
            'mass': mass
        }

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
|
|
|
|
|
def get_chembl_info(chembl_id):
    """Fetch a ChEMBL compound record and print selected properties.

    Args:
        chembl_id: ChEMBL identifier. A falsy value short-circuits without
            contacting the service.

    Returns:
        The compound record as returned by the ChEMBL client, or ``None``
        when no ID was given, nothing usable was returned, or the lookup
        raised.
    """
    print(f"\n{'='*70}")
    print("STEP 5: ChEMBL Details")
    print(f"{'='*70}")

    if not chembl_id:
        print("⊘ No ChEMBL ID available")
        return None

    try:
        c = ChEMBL()

        print(f"Retrieving ChEMBL entry for {chembl_id}...")

        compound = c.get_compound_by_chemblId(chembl_id)

        # An integer here is an HTTP status from a failed request, not a
        # record, so it is reported as a retrieval failure.
        if not compound or isinstance(compound, int):
            print("✗ Failed to retrieve ChEMBL entry")
            return None

        print("\n✓ ChEMBL Information:")
        print(f" ID: {chembl_id}")

        if 'pref_name' in compound and compound['pref_name']:
            print(f" Preferred Name: {compound['pref_name']}")

        # These sub-records may be present but null; treat that like
        # "absent" so a fetched compound is not thrown away by a TypeError.
        props = compound.get('molecule_properties') or {}
        property_labels = (
            ('full_mwt', 'Molecular Weight'),
            ('alogp', 'LogP'),
            ('hba', 'H-Bond Acceptors'),
            ('hbd', 'H-Bond Donors'),
        )
        for key, label in property_labels:
            if key in props:
                print(f" {label}: {props[key]}")

        structs = compound.get('molecule_structures') or {}
        smiles = structs.get('canonical_smiles')
        if smiles is not None:
            # Long SMILES strings are truncated for display only.
            print(f" SMILES: {smiles[:60]}{'...' if len(smiles) > 60 else ''}")

        return compound

    except Exception as e:
        # Best-effort CLI step: report and carry on without ChEMBL details.
        print(f"✗ Error: {e}")
        return None
|
|
|
|
|
|
def save_results(compound_name, kegg_info, chembl_id, output_file):
    """Write the cross-reference report to a text file.

    Args:
        compound_name: Name the user searched for; used in the title.
        kegg_info: Dict produced by the KEGG details step, or ``None``.
            When given, its ``kegg_id``, ``name``, ``formula``,
            ``exact_mass``, ``mol_weight``, ``pathways`` and ``chebi_id``
            entries are read.
        chembl_id: ChEMBL identifier, or a falsy value to omit it.
        output_file: Path of the file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened or written.
    """
    print(f"\n{'='*70}")
    print(f"Saving results to {output_file}")
    print(f"{'='*70}")

    report = [
        "=" * 70 + "\n",
        f"Compound Cross-Reference Report: {compound_name}\n",
        "=" * 70 + "\n\n",
    ]

    # KEGG information
    if kegg_info:
        report += [
            "KEGG Compound\n",
            "-" * 70 + "\n",
            f"ID: {kegg_info['kegg_id']}\n",
            f"Name: {kegg_info['name']}\n",
            f"Formula: {kegg_info['formula']}\n",
            f"Exact Mass: {kegg_info['exact_mass']}\n",
            f"Molecular Weight: {kegg_info['mol_weight']}\n",
            f"Pathways: {len(kegg_info['pathways'])} found\n",
            "\n",
        ]

    # Database IDs
    report += [
        "Cross-Database Identifiers\n",
        "-" * 70 + "\n",
    ]
    if kegg_info:
        report.append(f"KEGG: {kegg_info['kegg_id']}\n")
        if kegg_info['chebi_id']:
            report.append(f"ChEBI: {kegg_info['chebi_id']}\n")
    if chembl_id:
        report.append(f"ChEMBL: {chembl_id}\n")
    report.append("\n")

    # Explicit UTF-8 so compound names with non-ASCII characters are written
    # the same way regardless of the platform's default locale encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("".join(report))

    print("✓ Results saved")
|
|
|
|
|
|
def main():
    """Command-line entry point for the cross-database lookup.

    Parses the compound name and the optional ``--output`` path, runs the
    KEGG search, KEGG details, UniChem mapping, ChEBI and ChEMBL steps in
    that order, prints a summary, and writes a report when ``--output`` is
    given. Exits with status 1 if the KEGG search yields no identifier.
    """
    usage_examples = """
Examples:
python compound_cross_reference.py Geldanamycin
python compound_cross_reference.py "Adenosine triphosphate"
python compound_cross_reference.py Aspirin --output aspirin_info.txt
"""
    cli = argparse.ArgumentParser(
        description="Search compound across multiple databases",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_examples,
    )
    cli.add_argument("compound", help="Compound name to search")
    cli.add_argument("--output", default=None,
                     help="Output file for results (optional)")
    args = cli.parse_args()

    rule = "=" * 70
    print(rule)
    print("BIOSERVICES: Compound Cross-Database Search")
    print(rule)

    # Step 1: the KEGG identifier drives every later step, so stop without it.
    kegg, kegg_id = search_kegg_compound(args.compound)
    if not kegg_id:
        print("\n✗ Failed to find compound. Exiting.")
        sys.exit(1)

    # Step 2: KEGG record details (may be None on failure).
    kegg_info = get_kegg_info(kegg, kegg_id)

    # Step 3: KEGG -> ChEMBL mapping.
    chembl_id = get_chembl_id(kegg_id)

    # Step 4: ChEBI details, only when KEGG supplied a ChEBI cross-reference.
    if kegg_info and kegg_info['chebi_id']:
        get_chebi_info(kegg_info['chebi_id'])

    # Step 5: ChEMBL details, only when a mapping was found.
    if chembl_id:
        get_chembl_info(chembl_id)

    # Summary
    print(f"\n{rule}")
    print("SUMMARY")
    print(rule)
    print(f" Compound: {args.compound}")
    if kegg_info:
        print(f" KEGG ID: {kegg_info['kegg_id']}")
        if kegg_info['chebi_id']:
            print(f" ChEBI ID: {kegg_info['chebi_id']}")
    if chembl_id:
        print(f" ChEMBL ID: {chembl_id}")
    print(rule)

    # Save to file if requested
    if args.output:
        save_results(args.compound, kegg_info, chembl_id, args.output)
|
|
|
|
|
|
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|