Initial commit
This commit is contained in:
347
skills/bioservices/scripts/batch_id_converter.py
Executable file
347
skills/bioservices/scripts/batch_id_converter.py
Executable file
@@ -0,0 +1,347 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch Identifier Converter
|
||||
|
||||
This script converts multiple identifiers between biological databases
|
||||
using UniProt's mapping service. Supports batch processing with
|
||||
automatic chunking and error handling.
|
||||
|
||||
Usage:
|
||||
python batch_id_converter.py INPUT_FILE --from DB1 --to DB2 [options]
|
||||
|
||||
Examples:
|
||||
python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
|
||||
python batch_id_converter.py gene_ids.txt --from GeneID --to UniProtKB --output mapping.csv
|
||||
python batch_id_converter.py ids.txt --from UniProtKB_AC-ID --to Ensembl --chunk-size 50
|
||||
|
||||
Input file format:
|
||||
One identifier per line (plain text)
|
||||
|
||||
Common database codes:
|
||||
UniProtKB_AC-ID - UniProt accession/ID
|
||||
KEGG - KEGG gene IDs
|
||||
GeneID - NCBI Gene (Entrez) IDs
|
||||
Ensembl - Ensembl gene IDs
|
||||
Ensembl_Protein - Ensembl protein IDs
|
||||
RefSeq_Protein - RefSeq protein IDs
|
||||
PDB - Protein Data Bank IDs
|
||||
HGNC - HGNC gene identifiers (e.g. HGNC:11998), not gene symbols
|
||||
GO - Gene Ontology IDs
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
import csv
|
||||
import time
|
||||
from bioservices import UniProt
|
||||
|
||||
|
||||
# Lower-case alias -> official database code, consulted by
# normalize_database_code() and listed by list_common_databases().
DATABASE_CODES = {
    # UniProt
    "uniprot": "UniProtKB_AC-ID",
    "uniprotkb": "UniProtKB_AC-ID",
    # Genes and genomes
    "kegg": "KEGG",
    "geneid": "GeneID",
    "entrez": "GeneID",
    "ensembl": "Ensembl",
    "ensembl_protein": "Ensembl_Protein",
    "ensembl_transcript": "Ensembl_Transcript",
    "refseq": "RefSeq_Protein",
    "refseq_protein": "RefSeq_Protein",
    # Structures and nomenclature
    "pdb": "PDB",
    "hgnc": "HGNC",
    "mgi": "MGI",
    # Annotation, families, pathways and interactions
    "go": "GO",
    "pfam": "Pfam",
    "interpro": "InterPro",
    "reactome": "Reactome",
    "string": "STRING",
    "biogrid": "BioGRID",
}
|
||||
|
||||
|
||||
def normalize_database_code(code):
    """Return the official database code for ``code``.

    A value that already equals one of the official codes in
    ``DATABASE_CODES`` is returned untouched.  Otherwise the lower-cased
    value is looked up as an alias.  Unknown values are returned unchanged,
    because they may still be accepted by the mapping service.
    """
    official_codes = DATABASE_CODES.values()
    if code in official_codes:
        return code

    # Alias lookup is case-insensitive; fall back to the caller's spelling.
    return DATABASE_CODES.get(code.lower(), code)
|
||||
|
||||
|
||||
def read_ids_from_file(filename):
    """Load identifiers from a text file, one per line.

    Surrounding whitespace is stripped from every line; blank lines and
    lines whose first non-blank character is ``#`` are skipped.

    Returns:
        list of identifier strings in file order.
    """
    print(f"Reading identifiers from {filename}...")

    with open(filename, 'r') as handle:
        candidates = (raw.strip() for raw in handle)
        ids = [entry for entry in candidates
               if entry and not entry.startswith('#')]

    print(f"✓ Read {len(ids)} identifier(s)")

    return ids
|
||||
|
||||
|
||||
def _normalize_mapping_result(raw):
    """Coerce a ``UniProt.mapping`` return value to ``{source_id: [targets]}``.

    Two shapes are accepted:

    * a plain ``{source_id: [target, ...]}`` dict, returned as a shallow copy;
    * a dict carrying a ``results`` list of ``{'from': ..., 'to': ...}``
      records and/or a ``failedIds`` list.
      NOTE(review): this second shape is what newer bioservices releases
      appear to return -- confirm against the installed version.

    Anything that is not a dict yields an empty dict.
    """
    if not isinstance(raw, dict):
        return {}

    if "results" not in raw and "failedIds" not in raw:
        return dict(raw)

    normalized = {}
    for record in raw.get("results") or []:
        if not isinstance(record, dict):
            continue
        source = record.get("from")
        target = record.get("to")
        if isinstance(target, dict):
            # presumably a full target entry; keep only its accession --
            # TODO confirm the key name against the service response
            target = target.get("primaryAccession") or target.get("id")
        if source is None or not target:
            continue
        normalized.setdefault(source, []).append(str(target))
    return normalized


def batch_convert(ids, from_db, to_db, chunk_size=100, delay=0.5):
    """Convert IDs with automatic chunking and error handling.

    The IDs are sent to ``UniProt.mapping`` in comma-joined chunks.  When a
    chunk raises, each of its IDs is retried individually.

    Args:
        ids: list of source identifiers.
        from_db: source database code.
        to_db: target database code.
        chunk_size: number of IDs per request; must be at least 1.
        delay: seconds to sleep between chunks (skipped when not positive).

    Returns:
        ``(all_results, failed_ids)``.  ``all_results`` maps every input ID
        to its targets, or to ``None`` when the service returned nothing for
        it.  ``failed_ids`` lists, in input order and without duplicates,
        every input ID that ended up without a target.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(
            f"chunk_size must be a positive integer, got {chunk_size}")

    print(f"\nConverting {len(ids)} IDs:")
    print(f" From: {from_db}")
    print(f" To: {to_db}")
    print(f" Chunk size: {chunk_size}")
    print()

    # No client is needed (and no request is made) for an empty input.
    u = UniProt(verbose=False) if ids else None
    all_results = {}

    total_chunks = (len(ids) + chunk_size - 1) // chunk_size

    for chunk_num, start in enumerate(range(0, len(ids), chunk_size), 1):
        chunk = ids[start:start + chunk_size]
        query = ",".join(chunk)

        print(f" [{chunk_num}/{total_chunks}] Processing {len(chunk)} IDs...",
              end=" ")

        try:
            results = _normalize_mapping_result(
                u.mapping(fr=from_db, to=to_db, query=query))
        except Exception as e:
            print(f"✗ Error: {e}")

            # One bad ID can fail the whole request, so retry one by one.
            print(f" Retrying individual IDs...")
            for single_id in chunk:
                try:
                    result = _normalize_mapping_result(
                        u.mapping(fr=from_db, to=to_db, query=single_id))
                except Exception as e2:
                    print(f" ✗ {single_id} - {e2}")
                else:
                    if any(result.values()):
                        all_results.update(result)
                        print(f" ✓ {single_id}")
                    else:
                        print(f" ✗ {single_id} - no mapping")

                time.sleep(0.2)
        else:
            if results:
                all_results.update(results)
                mapped_count = sum(1 for v in results.values() if v)
                print(f"✓ Mapped: {mapped_count}/{len(chunk)}")
            else:
                print(f"✗ No mappings returned")

        # Rate limiting applies after every chunk except the last one,
        # including chunks that needed the per-ID fallback.
        if delay > 0 and start + chunk_size < len(ids):
            time.sleep(delay)

    # Add missing IDs to results (mark as failed)
    for id_ in ids:
        if id_ not in all_results:
            all_results[id_] = None

    # Derive the failures from the final mapping so that IDs left unmapped
    # inside an otherwise successful chunk are reported too.
    failed_ids = list(dict.fromkeys(
        id_ for id_ in ids if not all_results.get(id_)))

    print(f"\n✓ Conversion complete:")
    print(f" Total: {len(ids)}")
    print(f" Mapped: {sum(1 for v in all_results.values() if v)}")
    print(f" Failed: {len(failed_ids)}")

    return all_results, failed_ids
|
||||
|
||||
|
||||
def save_mapping_csv(mapping, output_file, from_db, to_db):
    """Write the mapping to ``output_file`` as CSV.

    One row is written per source ID, sorted by the mapping's items.  A
    truthy target list is joined with ``;`` and flagged ``Success``; a falsy
    one gets an empty target column and ``Failed``.
    """
    print(f"\nSaving results to {output_file}...")

    with open(output_file, 'w', newline='') as handle:
        out = csv.writer(handle)

        # Header
        out.writerow(['Source_ID', 'Source_DB', 'Target_IDs', 'Target_DB', 'Mapping_Status'])

        # Data
        for source_id, target_ids in sorted(mapping.items()):
            was_mapped = bool(target_ids)
            out.writerow([
                source_id,
                from_db,
                ";".join(target_ids) if was_mapped else "",
                to_db,
                "Success" if was_mapped else "Failed",
            ])

    print(f"✓ Results saved")
|
||||
|
||||
|
||||
def save_failed_ids(failed_ids, output_file):
    """Write the failed IDs to ``output_file``, one per line.

    Nothing is printed or written when ``failed_ids`` is empty.
    """
    if failed_ids:
        print(f"\nSaving failed IDs to {output_file}...")

        with open(output_file, 'w') as handle:
            handle.writelines(f"{failed}\n" for failed in failed_ids)

        print(f"✓ Saved {len(failed_ids)} failed ID(s)")
|
||||
|
||||
|
||||
def print_mapping_summary(mapping, from_db, to_db):
    """Print summary of mapping results.

    Args:
        mapping: dict of source ID -> list of target IDs (falsy when the ID
            could not be mapped).
        from_db: source database code, shown in the report.
        to_db: target database code, shown in the report.

    The report holds totals with percentages, up to five example mappings
    (at most three targets each) and the number of IDs with several targets.
    An empty ``mapping`` is reported as 0 (0.0%) instead of dividing by zero.
    """
    rule = '=' * 70
    print(f"\n{rule}")
    print("MAPPING SUMMARY")
    print(f"{rule}")

    total = len(mapping)
    mapped = sum(1 for targets in mapping.values() if targets)
    failed = total - mapped

    def percent(count):
        # Guard the empty-mapping case: there is nothing to take a share of.
        return count / total * 100 if total else 0.0

    print(f"\nSource database: {from_db}")
    print(f"Target database: {to_db}")
    print(f"\nTotal identifiers: {total}")
    print(f"Successfully mapped: {mapped} ({percent(mapped):.1f}%)")
    print(f"Failed to map: {failed} ({percent(failed):.1f}%)")

    # Show some examples
    if mapped > 0:
        print(f"\nExample mappings (first 5):")
        shown = 0
        for source_id, target_ids in mapping.items():
            if not target_ids:
                continue
            target_str = ", ".join(target_ids[:3])
            if len(target_ids) > 3:
                target_str += f" ... +{len(target_ids)-3} more"
            print(f" {source_id} → {target_str}")
            shown += 1
            if shown >= 5:
                break

    # Show multiple mapping statistics
    multiple_mappings = [v for v in mapping.values() if v and len(v) > 1]
    if multiple_mappings:
        print(f"\nMultiple target mappings: {len(multiple_mappings)} ID(s)")
        print(f" (These source IDs map to multiple target IDs)")

    print(f"{rule}")
|
||||
|
||||
|
||||
def list_common_databases():
    """Print a two-column table of aliases and their official codes.

    Aliases that are merely the lower-cased official code are left out, so
    only the non-trivial aliases from ``DATABASE_CODES`` are shown.
    """
    rule = "-" * 70

    print("\nCommon Database Codes:")
    print(rule)
    print(f"{'Alias':<20} {'Official Code':<30}")
    print(rule)

    for alias in sorted(DATABASE_CODES):
        code = DATABASE_CODES[alias]
        if alias == code.lower():
            continue
        print(f"{alias:<20} {code:<30}")

    print(rule)
    print("\nNote: Many other database codes are supported.")
    print("See UniProt documentation for complete list.")
|
||||
|
||||
|
||||
def main():
    """Main conversion workflow.

    Parses the command line, reads the identifiers, converts them in
    batches, prints a summary and writes the CSV.  With ``--save-failed``
    the unmapped IDs are also written next to the CSV.

    ``--list-databases`` works on its own: the input file, ``--from`` and
    ``--to`` are checked only after that option has been handled.

    Exits with status 1 when the input file cannot be read or holds no IDs,
    and with argparse's usage error when required arguments are missing or
    ``--chunk-size`` is not positive.
    """
    parser = argparse.ArgumentParser(
        description="Batch convert biological identifiers between databases",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
python batch_id_converter.py ids.txt --from GeneID --to UniProtKB -o mapping.csv
python batch_id_converter.py ids.txt --from uniprot --to ensembl --chunk-size 50

Common database codes:
UniProtKB_AC-ID, KEGG, GeneID, Ensembl, Ensembl_Protein,
RefSeq_Protein, PDB, HGNC, GO, Pfam, InterPro, Reactome

Use --list-databases to see all supported aliases.
"""
    )
    # These three are validated by hand below so that --list-databases can
    # be used without them.
    parser.add_argument("input_file", nargs="?", default=None,
                        help="Input file with IDs (one per line)")
    parser.add_argument("--from", dest="from_db", default=None,
                        help="Source database code")
    parser.add_argument("--to", dest="to_db", default=None,
                        help="Target database code")
    parser.add_argument("-o", "--output", default=None,
                        help="Output CSV file (default: mapping_results.csv)")
    parser.add_argument("--chunk-size", type=int, default=100,
                        help="Number of IDs per batch (default: 100)")
    parser.add_argument("--delay", type=float, default=0.5,
                        help="Delay between batches in seconds (default: 0.5)")
    parser.add_argument("--save-failed", action="store_true",
                        help="Save failed IDs to separate file")
    parser.add_argument("--list-databases", action="store_true",
                        help="List common database codes and exit")

    args = parser.parse_args()

    # List databases and exit
    if args.list_databases:
        list_common_databases()
        sys.exit(0)

    missing = [
        label
        for label, value in (("input_file", args.input_file),
                             ("--from", args.from_db),
                             ("--to", args.to_db))
        if value is None
    ]
    if missing:
        parser.error("the following arguments are required: " + ", ".join(missing))
    if args.chunk_size < 1:
        parser.error("--chunk-size must be a positive integer")

    print("=" * 70)
    print("BIOSERVICES: Batch Identifier Converter")
    print("=" * 70)

    # Normalize database codes
    from_db = normalize_database_code(args.from_db)
    to_db = normalize_database_code(args.to_db)

    if from_db != args.from_db:
        print(f"\nNote: Normalized '{args.from_db}' → '{from_db}'")
    if to_db != args.to_db:
        print(f"Note: Normalized '{args.to_db}' → '{to_db}'")

    # Read input IDs
    try:
        ids = read_ids_from_file(args.input_file)
    except Exception as e:
        print(f"\n✗ Error reading input file: {e}")
        sys.exit(1)

    if not ids:
        print("\n✗ No IDs found in input file")
        sys.exit(1)

    # Perform conversion
    mapping, failed_ids = batch_convert(
        ids,
        from_db,
        to_db,
        chunk_size=args.chunk_size,
        delay=args.delay
    )

    # Print summary
    print_mapping_summary(mapping, from_db, to_db)

    # Save results
    output_file = args.output or "mapping_results.csv"
    save_mapping_csv(mapping, output_file, from_db, to_db)

    # Save failed IDs if requested
    if args.save_failed and failed_ids:
        # Strip only a trailing ".csv".  For any other output name the suffix
        # is appended, so the failed-ID file can never be the results file.
        if output_file.lower().endswith(".csv"):
            stem = output_file[:-len(".csv")]
        else:
            stem = output_file
        save_failed_ids(failed_ids, f"{stem}_failed.txt")

    print(f"\n✓ Done!")


if __name__ == "__main__":
    main()
|
||||
378
skills/bioservices/scripts/compound_cross_reference.py
Executable file
378
skills/bioservices/scripts/compound_cross_reference.py
Executable file
@@ -0,0 +1,378 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Compound Cross-Database Search
|
||||
|
||||
This script searches for a compound by name and retrieves identifiers
|
||||
from multiple databases:
|
||||
- KEGG Compound
|
||||
- ChEBI
|
||||
- ChEMBL (via UniChem)
|
||||
- Basic compound properties
|
||||
|
||||
Usage:
|
||||
python compound_cross_reference.py COMPOUND_NAME [--output FILE]
|
||||
|
||||
Examples:
|
||||
python compound_cross_reference.py Geldanamycin
|
||||
python compound_cross_reference.py "Adenosine triphosphate"
|
||||
python compound_cross_reference.py Aspirin --output aspirin_info.txt
|
||||
"""
|
||||
|
||||
import sys
|
||||
import argparse
|
||||
from bioservices import KEGG, UniChem, ChEBI, ChEMBL
|
||||
|
||||
|
||||
def search_kegg_compound(compound_name):
    """Look up a compound by name in KEGG and pick the first hit.

    Up to five hits are printed.  The ID of the first hit is returned with
    its ``cpd:`` prefix removed.

    Returns:
        ``(kegg_client, kegg_id)``; ``kegg_id`` is ``None`` when nothing was
        found or the lookup raised.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("STEP 1: KEGG Compound Search")
    print(f"{banner}")

    k = KEGG()

    print(f"Searching KEGG for: {compound_name}")

    try:
        results = k.find("compound", compound_name)

        if not (results and results.strip()):
            print(f"✗ No results found in KEGG")
            return k, None

        # One hit per line: "<id>\t<description>"
        hits = results.strip().split("\n")
        print(f"✓ Found {len(hits)} result(s):\n")

        for rank, hit in enumerate(hits[:5], 1):
            hit_id, *remainder = hit.split("\t")
            description = remainder[0] if remainder else "No description"
            print(f" {rank}. {hit_id}: {description}")

        # Use first result
        kegg_id = hits[0].split("\t")[0].replace("cpd:", "")

        print(f"\nUsing: {kegg_id}")

        return k, kegg_id

    except Exception as e:
        print(f"✗ Error: {e}")
        return k, None
|
||||
|
||||
|
||||
def get_kegg_info(kegg, kegg_id):
    """Retrieve detailed KEGG compound information.

    Args:
        kegg: object with a ``get(query)`` method returning the entry text.
        kegg_id: compound ID without the ``cpd:`` prefix.

    Returns:
        dict with keys ``kegg_id``, ``name``, ``formula``, ``exact_mass``,
        ``mol_weight``, ``chebi_id`` and ``pathways`` (list of the PATHWAY
        lines), or ``None`` when the entry is empty, is not text, or an
        error occurs.

    The entry is parsed section by section: a line that starts in the first
    column opens a new section named by its first word, and indented lines
    continue the current section.  Only continuation lines of PATHWAY are
    collected as pathways.
    """
    print(f"\n{'='*70}")
    print("STEP 2: KEGG Compound Details")
    print(f"{'='*70}")

    try:
        print(f"Retrieving KEGG entry for {kegg_id}...")

        entry = kegg.get(f"cpd:{kegg_id}")

        # presumably the client may hand back a non-text value (for example
        # a status code) on failure -- TODO confirm; treat it as "no entry"
        if not entry or not isinstance(entry, str):
            print("✗ Failed to retrieve entry")
            return None

        # Parse entry
        compound_info = {
            'kegg_id': kegg_id,
            'name': None,
            'formula': None,
            'exact_mass': None,
            'mol_weight': None,
            'chebi_id': None,
            'pathways': []
        }

        # Sections whose header line carries a single scalar value.
        scalar_fields = {
            "FORMULA": 'formula',
            "EXACT_MASS": 'exact_mass',
            "MOL_WEIGHT": 'mol_weight',
        }

        current_section = None

        for line in entry.split("\n"):
            if not line.strip():
                continue

            is_header = not line[0].isspace()
            if is_header:
                current_section, _, remainder = line.partition(" ")
                value = remainder.strip()
            else:
                value = line.strip()

            if "ChEBI:" in line:
                tokens = line.split("ChEBI:", 1)[1].split()
                if tokens:
                    compound_info['chebi_id'] = tokens[0]

            elif is_header and current_section == "NAME":
                # Only the first name is kept; KEGG ends it with ';' when
                # synonyms follow on continuation lines.
                compound_info['name'] = value.rstrip(";")

            elif is_header and current_section in scalar_fields:
                compound_info[scalar_fields[current_section]] = value

            elif current_section == "PATHWAY" and value:
                compound_info['pathways'].append(value)

        # Display information
        print(f"\n✓ KEGG Compound Information:")
        print(f" ID: {compound_info['kegg_id']}")
        print(f" Name: {compound_info['name']}")
        print(f" Formula: {compound_info['formula']}")
        print(f" Exact Mass: {compound_info['exact_mass']}")
        print(f" Molecular Weight: {compound_info['mol_weight']}")

        if compound_info['chebi_id']:
            print(f" ChEBI ID: {compound_info['chebi_id']}")

        if compound_info['pathways']:
            print(f" Pathways: {len(compound_info['pathways'])} found")

        return compound_info

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
||||
|
||||
|
||||
def get_chembl_id(kegg_id):
    """Ask UniChem for the ChEMBL ID that corresponds to a KEGG ID.

    Returns:
        whatever ``UniChem.get_compound_id_from_kegg`` yields when it is
        truthy, otherwise ``None`` (also on any exception).
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("STEP 3: ChEMBL Mapping (via UniChem)")
    print(f"{banner}")

    try:
        unichem = UniChem()

        print(f"Mapping KEGG:{kegg_id} to ChEMBL...")

        chembl_id = unichem.get_compound_id_from_kegg(kegg_id)

        if not chembl_id:
            print("✗ No ChEMBL mapping found")
            return None

        print(f"✓ ChEMBL ID: {chembl_id}")
        return chembl_id

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
||||
|
||||
|
||||
def get_chebi_info(chebi_id):
    """Fetch and print the ChEBI entity for ``chebi_id``.

    A missing ``CHEBI:`` prefix is added before the lookup.

    Returns:
        dict with ``chebi_id``, ``name``, ``formula`` and ``mass``, or
        ``None`` when no ID was given, nothing was retrieved, or the lookup
        raised.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("STEP 4: ChEBI Details")
    print(f"{banner}")

    if not chebi_id:
        print("⊘ No ChEBI ID available")
        return None

    try:
        service = ChEBI()

        print(f"Retrieving ChEBI entry for {chebi_id}...")

        # Ensure proper format
        accession = chebi_id if chebi_id.startswith("CHEBI:") else f"CHEBI:{chebi_id}"

        entity = service.getCompleteEntity(accession)

        if not entity:
            print("✗ Failed to retrieve ChEBI entry")
            return None

        print(f"\n✓ ChEBI Information:")
        print(f" ID: {entity.chebiId}")
        print(f" Name: {entity.chebiAsciiName}")

        # Optional attributes: print only those that exist and are truthy.
        formula = getattr(entity, 'Formulae', None)
        if formula:
            print(f" Formula: {formula}")

        mass = getattr(entity, 'mass', None)
        if mass:
            print(f" Mass: {mass}")

        charge = getattr(entity, 'charge', None)
        if charge:
            print(f" Charge: {charge}")

        return {
            'chebi_id': entity.chebiId,
            'name': entity.chebiAsciiName,
            'formula': formula,
            'mass': mass,
        }

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
||||
|
||||
|
||||
def get_chembl_info(chembl_id):
    """Retrieve ChEMBL compound information.

    Args:
        chembl_id: ChEMBL identifier; a falsy value skips the lookup.

    Returns:
        the record returned by the ChEMBL client, or ``None`` when no ID was
        given, the client returned nothing usable, or the lookup raised.

    ``molecule_properties`` and ``molecule_structures`` that are present but
    ``None`` are treated as empty, so the rest of the record is still
    printed and returned.
    """
    print(f"\n{'='*70}")
    print("STEP 5: ChEMBL Details")
    print(f"{'='*70}")

    if not chembl_id:
        print("⊘ No ChEMBL ID available")
        return None

    try:
        c = ChEMBL()

        print(f"Retrieving ChEMBL entry for {chembl_id}...")

        # NOTE(review): the keys read below (pref_name, molecule_properties,
        # molecule_structures) look like the molecule resource, which newer
        # bioservices releases appear to expose as get_molecule -- confirm
        # against the installed version.  The original method name is kept
        # as the fallback.
        fetch = getattr(c, "get_molecule", None) or c.get_compound_by_chemblId
        compound = fetch(chembl_id)

        if not compound or not isinstance(compound, dict):
            print("✗ Failed to retrieve ChEMBL entry")
            return None

        print(f"\n✓ ChEMBL Information:")
        print(f" ID: {chembl_id}")

        if compound.get('pref_name'):
            print(f" Preferred Name: {compound['pref_name']}")

        props = compound.get('molecule_properties') or {}
        property_labels = (
            ('full_mwt', "Molecular Weight"),
            ('alogp', "LogP"),
            ('hba', "H-Bond Acceptors"),
            ('hbd', "H-Bond Donors"),
        )
        for key, label in property_labels:
            if key in props:
                print(f" {label}: {props[key]}")

        structs = compound.get('molecule_structures') or {}
        smiles = structs.get('canonical_smiles')
        if smiles:
            # Long SMILES strings are cut to 60 characters for display.
            print(f" SMILES: {smiles[:60]}{'...' if len(smiles) > 60 else ''}")

        return compound

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
||||
|
||||
|
||||
def save_results(compound_name, kegg_info, chembl_id, output_file):
    """Write a plain-text cross-reference report to ``output_file``.

    The report has a title, an optional "KEGG Compound" section (when
    ``kegg_info`` is truthy) and a "Cross-Database Identifiers" section
    listing the KEGG, ChEBI and ChEMBL IDs that are available.
    """
    double_rule = "=" * 70
    single_rule = "-" * 70

    print(f"\n{'='*70}")
    print(f"Saving results to {output_file}")
    print(f"{'='*70}")

    with open(output_file, 'w') as report:
        print(double_rule, file=report)
        print(f"Compound Cross-Reference Report: {compound_name}", file=report)
        print(double_rule, end="\n\n", file=report)

        # KEGG information
        if kegg_info:
            print("KEGG Compound", file=report)
            print(single_rule, file=report)
            print(f"ID: {kegg_info['kegg_id']}", file=report)
            print(f"Name: {kegg_info['name']}", file=report)
            print(f"Formula: {kegg_info['formula']}", file=report)
            print(f"Exact Mass: {kegg_info['exact_mass']}", file=report)
            print(f"Molecular Weight: {kegg_info['mol_weight']}", file=report)
            print(f"Pathways: {len(kegg_info['pathways'])} found", file=report)
            print(file=report)

        # Database IDs
        print("Cross-Database Identifiers", file=report)
        print(single_rule, file=report)
        if kegg_info:
            print(f"KEGG: {kegg_info['kegg_id']}", file=report)
            if kegg_info['chebi_id']:
                print(f"ChEBI: {kegg_info['chebi_id']}", file=report)
        if chembl_id:
            print(f"ChEMBL: {chembl_id}", file=report)
        print(file=report)

    print(f"✓ Results saved")
|
||||
|
||||
|
||||
def main():
    """Run the five-step compound lookup from the command line.

    KEGG search -> KEGG details -> ChEMBL mapping -> ChEBI details ->
    ChEMBL details, followed by a summary and, with ``--output``, a report
    file.  Exits with status 1 when the KEGG search finds nothing.
    """
    parser = argparse.ArgumentParser(
        description="Search compound across multiple databases",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python compound_cross_reference.py Geldanamycin
python compound_cross_reference.py "Adenosine triphosphate"
python compound_cross_reference.py Aspirin --output aspirin_info.txt
"""
    )
    parser.add_argument("compound", help="Compound name to search")
    parser.add_argument("--output", default=None,
                        help="Output file for results (optional)")

    args = parser.parse_args()
    banner = "=" * 70

    print(banner)
    print("BIOSERVICES: Compound Cross-Database Search")
    print(banner)

    # Step 1: Search KEGG
    kegg, kegg_id = search_kegg_compound(args.compound)
    if not kegg_id:
        print("\n✗ Failed to find compound. Exiting.")
        sys.exit(1)

    # Step 2: Get KEGG details
    kegg_info = get_kegg_info(kegg, kegg_id)

    # Step 3: Map to ChEMBL
    chembl_id = get_chembl_id(kegg_id)

    # Step 4: Get ChEBI details (called for its printed output)
    has_chebi = bool(kegg_info and kegg_info['chebi_id'])
    if has_chebi:
        get_chebi_info(kegg_info['chebi_id'])

    # Step 5: Get ChEMBL details (called for its printed output)
    if chembl_id:
        get_chembl_info(chembl_id)

    # Summary
    print(f"\n{banner}")
    print("SUMMARY")
    print(f"{banner}")
    print(f" Compound: {args.compound}")
    if kegg_info:
        print(f" KEGG ID: {kegg_info['kegg_id']}")
        if kegg_info['chebi_id']:
            print(f" ChEBI ID: {kegg_info['chebi_id']}")
    if chembl_id:
        print(f" ChEMBL ID: {chembl_id}")
    print(f"{banner}")

    # Save to file if requested
    if args.output:
        save_results(args.compound, kegg_info, chembl_id, args.output)


if __name__ == "__main__":
    main()
|
||||
309
skills/bioservices/scripts/pathway_analysis.py
Executable file
309
skills/bioservices/scripts/pathway_analysis.py
Executable file
@@ -0,0 +1,309 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
KEGG Pathway Network Analysis
|
||||
|
||||
This script analyzes all pathways for an organism and extracts:
|
||||
- Pathway sizes (number of genes)
|
||||
- Protein-protein interactions
|
||||
- Interaction type distributions
|
||||
- Network data in various formats (CSV, SIF)
|
||||
|
||||
Usage:
|
||||
python pathway_analysis.py ORGANISM OUTPUT_DIR [--limit N]
|
||||
|
||||
Examples:
|
||||
python pathway_analysis.py hsa ./human_pathways
|
||||
python pathway_analysis.py mmu ./mouse_pathways --limit 50
|
||||
|
||||
Organism codes:
|
||||
hsa = Homo sapiens (human)
|
||||
mmu = Mus musculus (mouse)
|
||||
dme = Drosophila melanogaster
|
||||
sce = Saccharomyces cerevisiae (yeast)
|
||||
eco = Escherichia coli
|
||||
"""
|
||||
|
||||
import sys
|
||||
import os
|
||||
import argparse
|
||||
import csv
|
||||
from collections import Counter
|
||||
from bioservices import KEGG
|
||||
|
||||
|
||||
def get_all_pathways(kegg, organism):
    """Select ``organism`` on the KEGG client and return its pathway IDs.

    Sets ``kegg.organism`` and then reads ``kegg.pathwayIds``.
    """
    print(f"\nRetrieving pathways for {organism}...")

    kegg.organism = organism
    found = kegg.pathwayIds

    print(f"✓ Found {len(found)} pathways")
    return found
|
||||
|
||||
|
||||
def analyze_pathway(kegg, pathway_id):
    """Analyze single pathway for size and interactions.

    Args:
        kegg: object providing ``parse_kgml_pathway(pathway_id)`` (returning
            a dict with ``entries`` and ``relations`` lists) and
            ``get(pathway_id)`` (returning the entry text).
        pathway_id: pathway identifier passed to both calls.

    Returns:
        dict with ``pathway_id``, ``pathway_name``, ``num_entries``,
        ``num_relations``, ``relation_types`` (relation name -> count),
        ``entries`` and ``relations``; or ``None`` when parsing fails.

    A failure to fetch or parse the name only yields the name "Unknown".
    ``KeyboardInterrupt`` and ``SystemExit`` are not swallowed, so a long
    run can still be aborted.
    """
    try:
        # Parse KGML pathway
        kgml = kegg.parse_kgml_pathway(pathway_id)

        entries = kgml.get('entries', [])
        relations = kgml.get('relations', [])

        # Count relation types
        relation_types = Counter(
            rel.get('name', 'unknown') for rel in relations)

        # Get pathway name (best effort)
        pathway_name = "Unknown"
        try:
            entry = kegg.get(pathway_id)
            for line in entry.split("\n"):
                if line.startswith("NAME"):
                    # Drop only the leading keyword, not later occurrences.
                    pathway_name = line[len("NAME"):].strip()
                    break
        except Exception:
            pathway_name = "Unknown"

        return {
            'pathway_id': pathway_id,
            'pathway_name': pathway_name,
            'num_entries': len(entries),
            'num_relations': len(relations),
            'relation_types': dict(relation_types),
            'entries': entries,
            'relations': relations
        }

    except Exception as e:
        print(f" ✗ Error analyzing {pathway_id}: {e}")
        return None
|
||||
|
||||
|
||||
def analyze_all_pathways(kegg, pathway_ids, limit=None):
    """Run ``analyze_pathway`` over the pathway IDs and collect the results.

    A truthy ``limit`` restricts the run to the first ``limit`` IDs.
    Pathways whose analysis returns a falsy value are left out.
    """
    if limit:
        pathway_ids = pathway_ids[:limit]
        print(f"\n⚠ Limiting analysis to first {limit} pathways")

    total = len(pathway_ids)
    print(f"\nAnalyzing {total} pathways...")

    results = []
    for position, pathway_id in enumerate(pathway_ids, 1):
        # "\r" keeps the progress indicator on a single terminal line.
        print(f" [{position}/{total}] {pathway_id}", end="\r")

        outcome = analyze_pathway(kegg, pathway_id)
        if outcome:
            results.append(outcome)

    print(f"\n✓ Successfully analyzed {len(results)}/{total} pathways")

    return results
|
||||
|
||||
|
||||
def save_pathway_summary(results, output_file):
    """Write one CSV row per analyzed pathway to ``output_file``.

    Columns: ID, name, entry count, relation count, the counts of four named
    relation types, and the summed count of every other relation type.
    """
    print(f"\nSaving pathway summary to {output_file}...")

    named_types = ['activation', 'inhibition', 'phosphorylation', 'binding/association']

    with open(output_file, 'w', newline='') as handle:
        out = csv.writer(handle)

        # Header
        out.writerow([
            'Pathway_ID',
            'Pathway_Name',
            'Num_Genes',
            'Num_Interactions',
            'Activation',
            'Inhibition',
            'Phosphorylation',
            'Binding',
            'Other',
        ])

        # Data
        for result in results:
            rel_types = result['relation_types']
            named_counts = [rel_types.get(kind, 0) for kind in named_types]
            other_count = sum(
                count for kind, count in rel_types.items()
                if kind not in named_types
            )

            out.writerow([
                result['pathway_id'],
                result['pathway_name'],
                result['num_entries'],
                result['num_relations'],
                *named_counts,
                other_count,
            ])

    print(f"✓ Summary saved")
|
||||
|
||||
|
||||
def save_interactions_sif(results, output_file):
    """Save all interactions in SIF format.

    Args:
        results: list of pathway dicts, each with a ``relations`` list and,
            optionally, an ``entries`` list.
        output_file: path of the SIF file to write.

    Each relation becomes one ``source<TAB>interaction<TAB>target`` line.

    NOTE(review): ``entry1``/``entry2`` look like entry ids that are local
    to one pathway, so writing them raw into a single file would merge
    unrelated nodes from different pathways -- confirm against the KGML
    parser.  They are therefore replaced by the ``name`` of the entry with
    the same ``id`` whenever the pathway's ``entries`` provide one; an
    endpoint that cannot be resolved is written unchanged.
    """
    print(f"\nSaving interactions to {output_file}...")

    with open(output_file, 'w') as f:
        for result in results:
            # Per-pathway lookup: entry id -> entry name.
            labels = {}
            for entry in result.get('entries') or []:
                if not isinstance(entry, dict):
                    continue
                entry_id = entry.get('id')
                entry_name = entry.get('name')
                if entry_id is not None and entry_name:
                    labels[str(entry_id)] = entry_name

            for rel in result['relations']:
                entry1 = rel.get('entry1', '')
                entry2 = rel.get('entry2', '')
                interaction_type = rel.get('name', 'interaction')

                source = labels.get(str(entry1), entry1)
                target = labels.get(str(entry2), entry2)

                # Write SIF format: source\tinteraction\ttarget
                # (fields are tab-delimited, so a name may contain spaces)
                f.write(f"{source}\t{interaction_type}\t{target}\n")

    print(f"✓ Interactions saved")
|
||||
|
||||
|
||||
def save_detailed_pathway_info(results, output_dir):
    """Write one interactions CSV per pathway under ``output_dir/pathways``.

    The directory is created when missing.  Each file is named
    ``<pathway_id>_interactions.csv`` with ``:`` in the ID replaced by ``_``.
    """
    print(f"\nSaving detailed pathway files to {output_dir}/pathways/...")

    pathway_dir = os.path.join(output_dir, "pathways")
    os.makedirs(pathway_dir, exist_ok=True)

    for result in results:
        safe_id = result['pathway_id'].replace(":", "_")
        target_path = os.path.join(pathway_dir, f"{safe_id}_interactions.csv")

        with open(target_path, 'w', newline='') as handle:
            out = csv.writer(handle)
            out.writerow(['Source', 'Target', 'Interaction_Type', 'Link_Type'])
            out.writerows(
                [
                    rel.get('entry1', ''),
                    rel.get('entry2', ''),
                    rel.get('name', 'unknown'),
                    rel.get('link', 'unknown'),
                ]
                for rel in result['relations']
            )

    print(f"✓ Detailed files saved for {len(results)} pathways")
|
||||
|
||||
|
||||
def print_statistics(results):
    """Print overall totals, top-10 rankings and the relation-type mix.

    ``results`` is the list of per-pathway dicts; the function reads their
    ``pathway_id``, ``pathway_name``, ``num_entries``, ``num_relations`` and
    ``relation_types`` keys.
    """
    banner = '=' * 70
    print(f"\n{banner}")
    print("PATHWAY ANALYSIS STATISTICS")
    print(f"{banner}")

    # Total stats
    total_pathways = len(results)
    total_interactions = sum(item['num_relations'] for item in results)
    total_genes = sum(item['num_entries'] for item in results)

    print(f"\nOverall:")
    print(f" Total pathways: {total_pathways}")
    print(f" Total genes/proteins: {total_genes}")
    print(f" Total interactions: {total_interactions}")

    def show_top_ten(field, unit):
        # Rank by the given numeric field, largest first; ties keep input order.
        ranked = sorted(results, key=lambda item: item[field], reverse=True)
        for rank, item in enumerate(ranked[:10], 1):
            print(f" {rank}. {item['pathway_id']}: {item[field]} {unit}")
            print(f" {item['pathway_name']}")

    # Largest pathways
    print(f"\nLargest pathways (by gene count):")
    show_top_ten('num_entries', "genes")

    # Most connected pathways
    print(f"\nMost connected pathways (by interactions):")
    show_top_ten('num_relations', "interactions")

    # Interaction type distribution
    print(f"\nInteraction type distribution:")
    all_types = Counter()
    for item in results:
        all_types.update(item['relation_types'])

    for rel_type, count in all_types.most_common():
        if total_interactions > 0:
            percentage = (count / total_interactions) * 100
        else:
            percentage = 0
        print(f" {rel_type}: {count} ({percentage:.1f}%)")
|
||||
|
||||
|
||||
def main():
    """Run the KEGG pathway analysis from the command line.

    Parses the organism code, the output directory and the optional
    ``--limit``, fetches and analyzes the organism's pathways, prints the
    statistics report and writes the summary CSV, the SIF interaction file
    and the detailed per-pathway output. Exits with status 1 when no
    pathways are found or none could be analyzed.
    """
    usage_notes = """
Examples:
python pathway_analysis.py hsa ./human_pathways
python pathway_analysis.py mmu ./mouse_pathways --limit 50

Organism codes:
hsa = Homo sapiens (human)
mmu = Mus musculus (mouse)
dme = Drosophila melanogaster
sce = Saccharomyces cerevisiae (yeast)
eco = Escherichia coli
"""
    cli = argparse.ArgumentParser(
        description="Analyze KEGG pathways for an organism",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )
    cli.add_argument("organism", help="KEGG organism code (e.g., hsa, mmu)")
    cli.add_argument("output_dir", help="Output directory for results")
    cli.add_argument(
        "--limit",
        type=int,
        default=None,
        help="Limit analysis to first N pathways",
    )
    args = cli.parse_args()

    def abort(message):
        # Report a fatal condition and stop with a non-zero exit status.
        print(message)
        sys.exit(1)

    rule = "=" * 70
    print(rule)
    print("BIOSERVICES: KEGG Pathway Network Analysis")
    print(rule)

    # The output directory must exist before any result file is written.
    os.makedirs(args.output_dir, exist_ok=True)

    kegg = KEGG()

    pathway_ids = get_all_pathways(kegg, args.organism)
    if not pathway_ids:
        abort(f"\n✗ No pathways found for {args.organism}")

    results = analyze_all_pathways(kegg, pathway_ids, args.limit)
    if not results:
        abort("\n✗ No pathways successfully analyzed")

    print_statistics(results)

    # Persist the three kinds of output.
    summary_file = os.path.join(args.output_dir, "pathway_summary.csv")
    save_pathway_summary(results, summary_file)

    sif_file = os.path.join(args.output_dir, "all_interactions.sif")
    save_interactions_sif(results, sif_file)

    save_detailed_pathway_info(results, args.output_dir)

    closing_report = (
        "\n" + rule,
        "OUTPUT FILES",
        rule,
        f" Summary: {summary_file}",
        f" Interactions: {sif_file}",
        f" Detailed: {args.output_dir}/pathways/",
        rule,
    )
    for report_line in closing_report:
        print(report_line)
|
||||
|
||||
|
||||
# Run the pathway analysis only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
408
skills/bioservices/scripts/protein_analysis_workflow.py
Executable file
408
skills/bioservices/scripts/protein_analysis_workflow.py
Executable file
@@ -0,0 +1,408 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete Protein Analysis Workflow
|
||||
|
||||
This script performs a comprehensive protein analysis pipeline:
|
||||
1. UniProt search and identifier retrieval
|
||||
2. FASTA sequence retrieval
|
||||
3. BLAST similarity search
|
||||
4. KEGG pathway discovery
|
||||
5. PSICQUIC interaction mapping
|
||||
6. GO annotation retrieval
|
||||
|
||||
Usage:
|
||||
python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
|
||||
|
||||
Examples:
|
||||
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
|
||||
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
|
||||
|
||||
Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
|
||||
|
||||
|
||||
def search_protein(query):
    """Search UniProt for a protein and choose the entry to analyze.

    When ``query`` has the form of a UniProtKB accession it is retrieved
    directly; otherwise, or when that retrieval fails, a text search is run
    and its first hit is used.

    Args:
        query: Protein name, entry name or UniProtKB accession.

    Returns:
        A ``(uniprot, uniprot_id)`` tuple: the ``UniProt`` client created
        here and the selected accession. ``uniprot_id`` is ``None`` when
        nothing was found.
    """
    import re  # local import: only needed for the accession check below

    print(f"\n{'='*70}")
    print("STEP 1: UniProt Search")
    print(f"{'='*70}")

    u = UniProt(verbose=False)

    print(f"Searching for: {query}")

    # UniProtKB accession pattern; covers the 6-character O/P/Q form the
    # original check handled as well as the other 6- and 10-character forms.
    accession_pattern = (
        r"[OPQ][0-9][A-Z0-9]{3}[0-9]"
        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
    )

    # Try direct retrieval first (if query looks like an accession).
    if re.fullmatch(accession_pattern, query):
        try:
            entry = u.retrieve(query, frmt="tab")
        except Exception:
            # Best effort: fall through to the text search below.
            entry = None
        # NOTE(review): the client appears to return an integer HTTP status
        # on failed requests - confirm. An int is never a valid entry, so it
        # must not be mistaken for a hit.
        if entry and not isinstance(entry, int):
            uniprot_id = query
            print(f"✓ Found UniProt entry: {uniprot_id}")
            return u, uniprot_id

    # Otherwise search.
    results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)

    if not results or isinstance(results, int):
        print("✗ No results found")
        return u, None

    lines = results.strip().split("\n")
    if len(lines) < 2:
        # Only the header row came back.
        print("✗ No entries found")
        return u, None

    # Display results; short rows are padded so a missing column cannot
    # raise IndexError.
    print(f"\n✓ Found {len(lines)-1} result(s):")
    for i, line in enumerate(lines[1:], 1):
        fields = line.split("\t")
        fields += ["N/A"] * (3 - len(fields))
        print(f" {i}. {fields[0]} - {fields[1]} ({fields[2]})")

    # Use first result.
    first_entry = lines[1].split("\t")
    first_entry += ["N/A"] * (5 - len(first_entry))
    uniprot_id, gene_names, organism, length, protein_name = first_entry[:5]

    print("\nUsing first result:")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Gene names: {gene_names}")
    print(f" Organism: {organism}")
    print(f" Length: {length} aa")
    print(f" Protein: {protein_name}")

    return u, uniprot_id
|
||||
|
||||
|
||||
def retrieve_sequence(uniprot, uniprot_id):
    """Retrieve the FASTA record of a protein and return its residues.

    Args:
        uniprot: Client object exposing ``retrieve(id, frmt="fasta")``.
        uniprot_id: Accession of the entry to fetch.

    Returns:
        The sequence as a single string without the header line or line
        breaks, or ``None`` when the response is missing, is not FASTA,
        contains no residues, or the request raises.
    """
    print(f"\n{'='*70}")
    print("STEP 2: FASTA Sequence Retrieval")
    print(f"{'='*70}")

    try:
        sequence = uniprot.retrieve(uniprot_id, frmt="fasta")
    except Exception as e:
        print(f"✗ Error: {e}")
        return None

    # Anything that is not text (e.g. an error code) cannot be a FASTA record.
    if isinstance(sequence, str):
        # splitlines() copes with both LF and CRLF, so no stray "\r" ends up
        # inside the residues; blank lines are dropped.
        fasta_lines = [ln.strip() for ln in sequence.splitlines() if ln.strip()]
    else:
        fasta_lines = []

    # A usable record is a ">" header followed by at least one residue line.
    if len(fasta_lines) < 2 or not fasta_lines[0].startswith(">"):
        print("✗ Failed to retrieve sequence")
        return None

    header = fasta_lines[0]
    seq_only = "".join(fasta_lines[1:])

    print("✓ Retrieved sequence:")
    print(f" Header: {header}")
    print(f" Length: {len(seq_only)} residues")
    print(f" First 60 residues: {seq_only[:60]}...")

    return seq_only
|
||||
|
||||
|
||||
def run_blast(sequence, email, skip=False, *, max_wait=300, poll_interval=5):
    """Run a BLASTP similarity search and wait for its result.

    Args:
        sequence: Protein sequence (residues only) to search with.
        email: Contact address passed to the BLAST service.
        skip: When true, do nothing and return ``None``.
        max_wait: Maximum number of seconds to wait for the job.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The raw result text of the finished job, or ``None`` when the step
        is skipped, the job fails, the wait times out or an error occurs.
    """
    print(f"\n{'='*70}")
    print("STEP 3: BLAST Similarity Search")
    print(f"{'='*70}")

    if skip:
        print("⊘ Skipped (--skip-blast flag)")
        return None

    if not email or "@" not in email:
        print("⊘ Skipped (valid email required for BLAST)")
        return None

    if not sequence:
        # Nothing to search with; avoid submitting an empty job.
        print("⊘ Skipped (no sequence provided)")
        return None

    # Terminal states other than success. Without FAILURE and NOT_FOUND the
    # loop would keep polling a dead job until the timeout expires.
    failed_states = {"ERROR", "FAILURE", "NOT_FOUND"}

    try:
        print("Submitting BLASTP job...")
        print(" Database: uniprotkb")
        print(f" Sequence length: {len(sequence)} aa")

        s = NCBIblast(verbose=False)

        jobid = s.run(
            program="blastp",
            sequence=sequence,
            stype="protein",
            database="uniprotkb",
            email=email
        )

        print(f"✓ Job submitted: {jobid}")
        print(" Waiting for completion...")

        # Poll for completion.
        start_time = time.time()

        while time.time() - start_time < max_wait:
            status = s.getStatus(jobid)
            elapsed = int(time.time() - start_time)
            # "\r" keeps the progress report on a single console line.
            print(f" Status: {status} (elapsed: {elapsed}s)", end="\r")

            if status == "FINISHED":
                print(f"\n✓ BLAST completed in {elapsed}s")

                # Retrieve results.
                results = s.getResult(jobid, "out")

                # Show the first non-empty lines as a preview.
                lines = results.split("\n")
                print("\n Results preview:")
                for line in lines[:20]:
                    if line.strip():
                        print(f" {line}")

                return results

            if status in failed_states:
                print(f"\n✗ BLAST job failed (status: {status})")
                return None

            time.sleep(poll_interval)

        print(f"\n✗ Timeout after {max_wait}s")
        return None

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
||||
|
||||
|
||||
def discover_pathways(uniprot, kegg, uniprot_id):
    """Discover the KEGG pathways a protein takes part in.

    Maps ``uniprot_id`` to KEGG gene identifiers through
    ``uniprot.mapping``, looks up the pathways of the first mapped gene and
    reads each pathway's name from its KEGG entry text.

    Args:
        uniprot: Client object exposing ``mapping(fr=..., to=..., query=...)``.
        kegg: Client object exposing ``get_pathway_by_gene(gene, organism)``
            and ``get(pathway_id)``.
        uniprot_id: Accession to map.

    Returns:
        A list of ``(pathway_id, pathway_name)`` tuples. The list is empty
        when there is no mapping, no pathway, or an error occurs. Pathways
        whose entry cannot be read are reported but left out of the list.
    """
    print(f"\n{'='*70}")
    print("STEP 4: KEGG Pathway Discovery")
    print(f"{'='*70}")

    try:
        # Map UniProt → KEGG
        print(f"Mapping {uniprot_id} to KEGG...")
        kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)

        kegg_ids = _extract_kegg_ids(kegg_mapping, uniprot_id)
        if not kegg_ids:
            print("✗ No KEGG mapping found")
            return []

        print(f"✓ KEGG ID(s): {kegg_ids}")

        # Get pathways for first KEGG ID, expected as "<organism>:<gene>".
        kegg_id = kegg_ids[0]
        organism, separator, gene_id = kegg_id.partition(":")
        if not separator:
            print(f"✗ Error: unexpected KEGG ID format: {kegg_id}")
            return []

        print(f"\nSearching pathways for {kegg_id}...")
        pathways = kegg.get_pathway_by_gene(gene_id, organism)

        if not pathways:
            print("✗ No pathways found")
            return []

        print(f"✓ Found {len(pathways)} pathway(s):\n")

        # Get pathway names
        pathway_info = []
        for pathway_id in pathways:
            try:
                entry = kegg.get(pathway_id)

                # Take the text after the leading NAME tag only; replacing
                # every "NAME" would also corrupt names containing that word.
                pathway_name = "Unknown"
                for line in entry.split("\n"):
                    if line.startswith("NAME"):
                        pathway_name = line[len("NAME"):].strip()
                        break

                pathway_info.append((pathway_id, pathway_name))
                print(f" • {pathway_id}: {pathway_name}")

            except Exception:
                print(f" • {pathway_id}: [Error retrieving name]")

        return pathway_info

    except Exception as e:
        print(f"✗ Error: {e}")
        return []


def _extract_kegg_ids(mapping_result, source_id):
    """Return the target IDs mapped from ``source_id`` for either result layout."""
    if not isinstance(mapping_result, dict):
        return []

    # Legacy layout: {source_id: [target, ...]}
    if source_id in mapping_result:
        targets = mapping_result[source_id]
        if isinstance(targets, str):
            return [targets]
        return list(targets or [])

    # NOTE(review): newer bioservices releases appear to return
    # {"results": [{"from": ..., "to": ...}, ...]} - confirm against the
    # installed version.
    rows = mapping_result.get("results") or []
    return [
        row["to"]
        for row in rows
        if isinstance(row, dict)
        and row.get("from") == source_id
        and isinstance(row.get("to"), str)
    ]
|
||||
|
||||
|
||||
def find_interactions(protein_query):
|
||||
"""Find protein-protein interactions via PSICQUIC."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 5: Protein-Protein Interactions")
|
||||
print(f"{'='*70}")
|
||||
|
||||
try:
|
||||
p = PSICQUIC()
|
||||
|
||||
# Try querying MINT database
|
||||
query = f"{protein_query} AND species:9606"
|
||||
print(f"Querying MINT database...")
|
||||
print(f" Query: {query}")
|
||||
|
||||
results = p.query("mint", query)
|
||||
|
||||
if not results:
|
||||
print("✗ No interactions found in MINT")
|
||||
return []
|
||||
|
||||
# Parse PSI-MI TAB format
|
||||
lines = results.strip().split("\n")
|
||||
print(f"✓ Found {len(lines)} interaction(s):\n")
|
||||
|
||||
# Display first 10 interactions
|
||||
interactions = []
|
||||
for i, line in enumerate(lines[:10], 1):
|
||||
fields = line.split("\t")
|
||||
if len(fields) >= 12:
|
||||
protein_a = fields[4].split(":")[1] if ":" in fields[4] else fields[4]
|
||||
protein_b = fields[5].split(":")[1] if ":" in fields[5] else fields[5]
|
||||
interaction_type = fields[11]
|
||||
|
||||
interactions.append((protein_a, protein_b, interaction_type))
|
||||
print(f" {i}. {protein_a} ↔ {protein_b}")
|
||||
|
||||
if len(lines) > 10:
|
||||
print(f" ... and {len(lines)-10} more")
|
||||
|
||||
return interactions
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def get_go_annotations(uniprot_id):
    """Retrieve GO annotations for a protein and group them by aspect.

    Args:
        uniprot_id: Accession whose annotations are requested.

    Returns:
        A dict mapping the aspect codes ``"P"``, ``"F"`` and ``"C"`` to
        lists of ``(go_id, go_term)`` tuples. An empty dict is returned when
        no annotations exist or an error occurs, so the result is always a
        dict and callers can use ``.values()`` safely.
    """
    print(f"\n{'='*70}")
    print("STEP 6: Gene Ontology Annotations")
    print(f"{'='*70}")

    try:
        g = QuickGO()

        print(f"Retrieving GO annotations for {uniprot_id}...")
        annotations = g.Annotation(protein=uniprot_id, format="tsv")

        if not annotations:
            print("✗ No GO annotations found")
            # Was a list; a dict keeps the return type uniform.
            return {}

        lines = annotations.strip().split("\n")
        print(f"✓ Found {len(lines)-1} annotation(s)\n")

        # Group by aspect. Columns 6, 7 and 8 are read as GO ID, term and
        # aspect code. NOTE(review): confirm these positions against the
        # TSV layout actually returned by the service.
        aspects = {"P": [], "F": [], "C": []}
        for line in lines[1:]:
            fields = line.split("\t")
            if len(fields) >= 9:
                go_id = fields[6]
                go_term = fields[7]
                go_aspect = fields[8]

                if go_aspect in aspects:
                    aspects[go_aspect].append((go_id, go_term))

        # Display summary: one section per aspect, first five terms each.
        sections = (
            ("P", "Biological Process"),
            ("F", "Molecular Function"),
            ("C", "Cellular Component"),
        )
        for position, (code, label) in enumerate(sections):
            terms = aspects[code]
            # Every section after the first is preceded by a blank line.
            lead = "" if position == 0 else "\n"
            print(f"{lead} {label} ({code}): {len(terms)} terms")
            for go_id, go_term in terms[:5]:
                print(f" • {go_id}: {go_term}")
            if len(terms) > 5:
                print(f" ... and {len(terms)-5} more")

        return aspects

    except Exception as e:
        print(f"✗ Error: {e}")
        return {}
|
||||
|
||||
|
||||
def main():
    """Run the complete protein analysis workflow from the command line.

    Steps: UniProt search, FASTA retrieval, optional BLAST search, KEGG
    pathway discovery, PSICQUIC interaction lookup and GO annotation
    retrieval, followed by a summary. Exits with status 1 when the protein
    cannot be found.
    """
    parser = argparse.ArgumentParser(
        description="Complete protein analysis workflow using BioServices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
"""
    )
    parser.add_argument("protein", help="Protein name or UniProt ID")
    parser.add_argument("email", help="Email address (required for BLAST)")
    parser.add_argument("--skip-blast", action="store_true",
                        help="Skip BLAST search (faster)")

    args = parser.parse_args()

    print("=" * 70)
    print("BIOSERVICES: Complete Protein Analysis Workflow")
    print("=" * 70)

    # Step 1: Search protein
    uniprot, uniprot_id = search_protein(args.protein)
    if not uniprot_id:
        print("\n✗ Failed to find protein. Exiting.")
        sys.exit(1)

    # Step 2: Retrieve sequence
    sequence = retrieve_sequence(uniprot, uniprot_id)
    if not sequence:
        print("\n⚠ Warning: Could not retrieve sequence")

    # Step 3: BLAST search (needs a sequence). Initialized up front so the
    # summary can report the real outcome even when the step is not run.
    blast_results = None
    if sequence:
        blast_results = run_blast(sequence, args.email, args.skip_blast)

    # Step 4: Pathway discovery
    kegg = KEGG()
    pathways = discover_pathways(uniprot, kegg, uniprot_id)

    # Step 5: Interaction mapping
    interactions = find_interactions(args.protein)

    # Step 6: GO annotations
    go_terms = get_go_annotations(uniprot_id)

    # BLAST outcome: ✓ results obtained, ⊘ not attempted, ✗ attempted
    # without a result (failure, timeout or rejected email).
    if blast_results:
        blast_mark = '✓'
    elif args.skip_blast or not sequence:
        blast_mark = '⊘'
    else:
        blast_mark = '✗'

    # The annotation step has returned both dicts and lists; count either.
    if isinstance(go_terms, dict):
        go_count = sum(len(v) for v in go_terms.values())
    else:
        go_count = len(go_terms or [])

    # Summary
    print(f"\n{'='*70}")
    print("WORKFLOW SUMMARY")
    print(f"{'='*70}")
    print(f" Protein: {args.protein}")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Sequence: {'✓' if sequence else '✗'}")
    print(f" BLAST: {blast_mark}")
    print(f" Pathways: {len(pathways)} found")
    print(f" Interactions: {len(interactions)} found")
    print(f" GO annotations: {go_count} found")
    print(f"{'='*70}")
|
||||
|
||||
|
||||
# Run the protein analysis workflow only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user