Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,347 @@
#!/usr/bin/env python3
"""
Batch Identifier Converter
This script converts multiple identifiers between biological databases
using UniProt's mapping service. Supports batch processing with
automatic chunking and error handling.
Usage:
python batch_id_converter.py INPUT_FILE --from DB1 --to DB2 [options]
Examples:
python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
python batch_id_converter.py gene_ids.txt --from GeneID --to UniProtKB --output mapping.csv
python batch_id_converter.py ids.txt --from UniProtKB_AC-ID --to Ensembl --chunk-size 50
Input file format:
One identifier per line (plain text)
Common database codes:
UniProtKB_AC-ID - UniProt accession/ID
KEGG - KEGG gene IDs
GeneID - NCBI Gene (Entrez) IDs
Ensembl - Ensembl gene IDs
Ensembl_Protein - Ensembl protein IDs
RefSeq_Protein - RefSeq protein IDs
PDB - Protein Data Bank IDs
HGNC - Human gene symbols
GO - Gene Ontology IDs
"""
import sys
import argparse
import csv
import time
from bioservices import UniProt
# Common database code mappings
DATABASE_CODES = {
'uniprot': 'UniProtKB_AC-ID',
'uniprotkb': 'UniProtKB_AC-ID',
'kegg': 'KEGG',
'geneid': 'GeneID',
'entrez': 'GeneID',
'ensembl': 'Ensembl',
'ensembl_protein': 'Ensembl_Protein',
'ensembl_transcript': 'Ensembl_Transcript',
'refseq': 'RefSeq_Protein',
'refseq_protein': 'RefSeq_Protein',
'pdb': 'PDB',
'hgnc': 'HGNC',
'mgi': 'MGI',
'go': 'GO',
'pfam': 'Pfam',
'interpro': 'InterPro',
'reactome': 'Reactome',
'string': 'STRING',
'biogrid': 'BioGRID'
}
def normalize_database_code(code):
"""Normalize database code to official format."""
# Try exact match first
if code in DATABASE_CODES.values():
return code
# Try lowercase lookup
lowercase = code.lower()
if lowercase in DATABASE_CODES:
return DATABASE_CODES[lowercase]
# Return as-is if not found (may still be valid)
return code
def read_ids_from_file(filename):
"""Read identifiers from file (one per line)."""
print(f"Reading identifiers from {filename}...")
ids = []
with open(filename, 'r') as f:
for line in f:
line = line.strip()
if line and not line.startswith('#'):
ids.append(line)
print(f"✓ Read {len(ids)} identifier(s)")
return ids
def batch_convert(ids, from_db, to_db, chunk_size=100, delay=0.5):
"""Convert IDs with automatic chunking and error handling."""
print(f"\nConverting {len(ids)} IDs:")
print(f" From: {from_db}")
print(f" To: {to_db}")
print(f" Chunk size: {chunk_size}")
print()
u = UniProt(verbose=False)
all_results = {}
failed_ids = []
total_chunks = (len(ids) + chunk_size - 1) // chunk_size
for i in range(0, len(ids), chunk_size):
chunk = ids[i:i+chunk_size]
chunk_num = (i // chunk_size) + 1
query = ",".join(chunk)
try:
print(f" [{chunk_num}/{total_chunks}] Processing {len(chunk)} IDs...", end=" ")
results = u.mapping(fr=from_db, to=to_db, query=query)
if results:
all_results.update(results)
mapped_count = len([v for v in results.values() if v])
print(f"✓ Mapped: {mapped_count}/{len(chunk)}")
else:
print(f"✗ No mappings returned")
failed_ids.extend(chunk)
# Rate limiting
if delay > 0 and i + chunk_size < len(ids):
time.sleep(delay)
except Exception as e:
print(f"✗ Error: {e}")
# Try individual IDs in failed chunk
print(f" Retrying individual IDs...")
for single_id in chunk:
try:
result = u.mapping(fr=from_db, to=to_db, query=single_id)
if result:
all_results.update(result)
print(f"{single_id}")
else:
failed_ids.append(single_id)
print(f"{single_id} - no mapping")
except Exception as e2:
failed_ids.append(single_id)
print(f"{single_id} - {e2}")
time.sleep(0.2)
# Add missing IDs to results (mark as failed)
for id_ in ids:
if id_ not in all_results:
all_results[id_] = None
print(f"\n✓ Conversion complete:")
print(f" Total: {len(ids)}")
print(f" Mapped: {len([v for v in all_results.values() if v])}")
print(f" Failed: {len(failed_ids)}")
return all_results, failed_ids
def save_mapping_csv(mapping, output_file, from_db, to_db):
"""Save mapping results to CSV."""
print(f"\nSaving results to {output_file}...")
with open(output_file, 'w', newline='') as f:
writer = csv.writer(f)
# Header
writer.writerow(['Source_ID', 'Source_DB', 'Target_IDs', 'Target_DB', 'Mapping_Status'])
# Data
for source_id, target_ids in sorted(mapping.items()):
if target_ids:
target_str = ";".join(target_ids)
status = "Success"
else:
target_str = ""
status = "Failed"
writer.writerow([source_id, from_db, target_str, to_db, status])
print(f"✓ Results saved")
def save_failed_ids(failed_ids, output_file):
"""Save failed IDs to file."""
if not failed_ids:
return
print(f"\nSaving failed IDs to {output_file}...")
with open(output_file, 'w') as f:
for id_ in failed_ids:
f.write(f"{id_}\n")
print(f"✓ Saved {len(failed_ids)} failed ID(s)")
def print_mapping_summary(mapping, from_db, to_db):
"""Print summary of mapping results."""
print(f"\n{'='*70}")
print("MAPPING SUMMARY")
print(f"{'='*70}")
total = len(mapping)
mapped = len([v for v in mapping.values() if v])
failed = total - mapped
print(f"\nSource database: {from_db}")
print(f"Target database: {to_db}")
print(f"\nTotal identifiers: {total}")
print(f"Successfully mapped: {mapped} ({mapped/total*100:.1f}%)")
print(f"Failed to map: {failed} ({failed/total*100:.1f}%)")
# Show some examples
if mapped > 0:
print(f"\nExample mappings (first 5):")
count = 0
for source_id, target_ids in mapping.items():
if target_ids:
target_str = ", ".join(target_ids[:3])
if len(target_ids) > 3:
target_str += f" ... +{len(target_ids)-3} more"
print(f" {source_id}{target_str}")
count += 1
if count >= 5:
break
# Show multiple mapping statistics
multiple_mappings = [v for v in mapping.values() if v and len(v) > 1]
if multiple_mappings:
print(f"\nMultiple target mappings: {len(multiple_mappings)} ID(s)")
print(f" (These source IDs map to multiple target IDs)")
print(f"{'='*70}")
def list_common_databases():
"""Print list of common database codes."""
print("\nCommon Database Codes:")
print("-" * 70)
print(f"{'Alias':<20} {'Official Code':<30}")
print("-" * 70)
for alias, code in sorted(DATABASE_CODES.items()):
if alias != code.lower():
print(f"{alias:<20} {code:<30}")
print("-" * 70)
print("\nNote: Many other database codes are supported.")
print("See UniProt documentation for complete list.")
def main():
"""Main conversion workflow."""
parser = argparse.ArgumentParser(
description="Batch convert biological identifiers between databases",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
python batch_id_converter.py ids.txt --from GeneID --to UniProtKB -o mapping.csv
python batch_id_converter.py ids.txt --from uniprot --to ensembl --chunk-size 50
Common database codes:
UniProtKB_AC-ID, KEGG, GeneID, Ensembl, Ensembl_Protein,
RefSeq_Protein, PDB, HGNC, GO, Pfam, InterPro, Reactome
Use --list-databases to see all supported aliases.
"""
)
parser.add_argument("input_file", help="Input file with IDs (one per line)")
parser.add_argument("--from", dest="from_db", required=True,
help="Source database code")
parser.add_argument("--to", dest="to_db", required=True,
help="Target database code")
parser.add_argument("-o", "--output", default=None,
help="Output CSV file (default: mapping_results.csv)")
parser.add_argument("--chunk-size", type=int, default=100,
help="Number of IDs per batch (default: 100)")
parser.add_argument("--delay", type=float, default=0.5,
help="Delay between batches in seconds (default: 0.5)")
parser.add_argument("--save-failed", action="store_true",
help="Save failed IDs to separate file")
parser.add_argument("--list-databases", action="store_true",
help="List common database codes and exit")
args = parser.parse_args()
# List databases and exit
if args.list_databases:
list_common_databases()
sys.exit(0)
print("=" * 70)
print("BIOSERVICES: Batch Identifier Converter")
print("=" * 70)
# Normalize database codes
from_db = normalize_database_code(args.from_db)
to_db = normalize_database_code(args.to_db)
if from_db != args.from_db:
print(f"\nNote: Normalized '{args.from_db}''{from_db}'")
if to_db != args.to_db:
print(f"Note: Normalized '{args.to_db}''{to_db}'")
# Read input IDs
try:
ids = read_ids_from_file(args.input_file)
except Exception as e:
print(f"\n✗ Error reading input file: {e}")
sys.exit(1)
if not ids:
print("\n✗ No IDs found in input file")
sys.exit(1)
# Perform conversion
mapping, failed_ids = batch_convert(
ids,
from_db,
to_db,
chunk_size=args.chunk_size,
delay=args.delay
)
# Print summary
print_mapping_summary(mapping, from_db, to_db)
# Save results
output_file = args.output or "mapping_results.csv"
save_mapping_csv(mapping, output_file, from_db, to_db)
# Save failed IDs if requested
if args.save_failed and failed_ids:
failed_file = output_file.replace(".csv", "_failed.txt")
save_failed_ids(failed_ids, failed_file)
print(f"\n✓ Done!")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,378 @@
#!/usr/bin/env python3
"""
Compound Cross-Database Search
This script searches for a compound by name and retrieves identifiers
from multiple databases:
- KEGG Compound
- ChEBI
- ChEMBL (via UniChem)
- Basic compound properties
Usage:
python compound_cross_reference.py COMPOUND_NAME [--output FILE]
Examples:
python compound_cross_reference.py Geldanamycin
python compound_cross_reference.py "Adenosine triphosphate"
python compound_cross_reference.py Aspirin --output aspirin_info.txt
"""
import sys
import argparse
from bioservices import KEGG, UniChem, ChEBI, ChEMBL
def search_kegg_compound(compound_name):
"""Search KEGG for compound by name."""
print(f"\n{'='*70}")
print("STEP 1: KEGG Compound Search")
print(f"{'='*70}")
k = KEGG()
print(f"Searching KEGG for: {compound_name}")
try:
results = k.find("compound", compound_name)
if not results or not results.strip():
print(f"✗ No results found in KEGG")
return k, None
# Parse results
lines = results.strip().split("\n")
print(f"✓ Found {len(lines)} result(s):\n")
for i, line in enumerate(lines[:5], 1):
parts = line.split("\t")
kegg_id = parts[0]
description = parts[1] if len(parts) > 1 else "No description"
print(f" {i}. {kegg_id}: {description}")
# Use first result
first_result = lines[0].split("\t")
kegg_id = first_result[0].replace("cpd:", "")
print(f"\nUsing: {kegg_id}")
return k, kegg_id
except Exception as e:
print(f"✗ Error: {e}")
return k, None
def get_kegg_info(kegg, kegg_id):
"""Retrieve detailed KEGG compound information."""
print(f"\n{'='*70}")
print("STEP 2: KEGG Compound Details")
print(f"{'='*70}")
try:
print(f"Retrieving KEGG entry for {kegg_id}...")
entry = kegg.get(f"cpd:{kegg_id}")
if not entry:
print("✗ Failed to retrieve entry")
return None
# Parse entry
compound_info = {
'kegg_id': kegg_id,
'name': None,
'formula': None,
'exact_mass': None,
'mol_weight': None,
'chebi_id': None,
'pathways': []
}
current_section = None
for line in entry.split("\n"):
if line.startswith("NAME"):
compound_info['name'] = line.replace("NAME", "").strip().rstrip(";")
elif line.startswith("FORMULA"):
compound_info['formula'] = line.replace("FORMULA", "").strip()
elif line.startswith("EXACT_MASS"):
compound_info['exact_mass'] = line.replace("EXACT_MASS", "").strip()
elif line.startswith("MOL_WEIGHT"):
compound_info['mol_weight'] = line.replace("MOL_WEIGHT", "").strip()
elif "ChEBI:" in line:
parts = line.split("ChEBI:")
if len(parts) > 1:
compound_info['chebi_id'] = parts[1].strip().split()[0]
elif line.startswith("PATHWAY"):
current_section = "pathway"
pathway = line.replace("PATHWAY", "").strip()
if pathway:
compound_info['pathways'].append(pathway)
elif current_section == "pathway" and line.startswith(" "):
pathway = line.strip()
if pathway:
compound_info['pathways'].append(pathway)
elif line.startswith(" ") and not line.startswith(" "):
current_section = None
# Display information
print(f"\n✓ KEGG Compound Information:")
print(f" ID: {compound_info['kegg_id']}")
print(f" Name: {compound_info['name']}")
print(f" Formula: {compound_info['formula']}")
print(f" Exact Mass: {compound_info['exact_mass']}")
print(f" Molecular Weight: {compound_info['mol_weight']}")
if compound_info['chebi_id']:
print(f" ChEBI ID: {compound_info['chebi_id']}")
if compound_info['pathways']:
print(f" Pathways: {len(compound_info['pathways'])} found")
return compound_info
except Exception as e:
print(f"✗ Error: {e}")
return None
def get_chembl_id(kegg_id):
"""Map KEGG ID to ChEMBL via UniChem."""
print(f"\n{'='*70}")
print("STEP 3: ChEMBL Mapping (via UniChem)")
print(f"{'='*70}")
try:
u = UniChem()
print(f"Mapping KEGG:{kegg_id} to ChEMBL...")
chembl_id = u.get_compound_id_from_kegg(kegg_id)
if chembl_id:
print(f"✓ ChEMBL ID: {chembl_id}")
return chembl_id
else:
print("✗ No ChEMBL mapping found")
return None
except Exception as e:
print(f"✗ Error: {e}")
return None
def get_chebi_info(chebi_id):
"""Retrieve ChEBI compound information."""
print(f"\n{'='*70}")
print("STEP 4: ChEBI Details")
print(f"{'='*70}")
if not chebi_id:
print("⊘ No ChEBI ID available")
return None
try:
c = ChEBI()
print(f"Retrieving ChEBI entry for {chebi_id}...")
# Ensure proper format
if not chebi_id.startswith("CHEBI:"):
chebi_id = f"CHEBI:{chebi_id}"
entity = c.getCompleteEntity(chebi_id)
if entity:
print(f"\n✓ ChEBI Information:")
print(f" ID: {entity.chebiId}")
print(f" Name: {entity.chebiAsciiName}")
if hasattr(entity, 'Formulae') and entity.Formulae:
print(f" Formula: {entity.Formulae}")
if hasattr(entity, 'mass') and entity.mass:
print(f" Mass: {entity.mass}")
if hasattr(entity, 'charge') and entity.charge:
print(f" Charge: {entity.charge}")
return {
'chebi_id': entity.chebiId,
'name': entity.chebiAsciiName,
'formula': entity.Formulae if hasattr(entity, 'Formulae') else None,
'mass': entity.mass if hasattr(entity, 'mass') else None
}
else:
print("✗ Failed to retrieve ChEBI entry")
return None
except Exception as e:
print(f"✗ Error: {e}")
return None
def get_chembl_info(chembl_id):
"""Retrieve ChEMBL compound information."""
print(f"\n{'='*70}")
print("STEP 5: ChEMBL Details")
print(f"{'='*70}")
if not chembl_id:
print("⊘ No ChEMBL ID available")
return None
try:
c = ChEMBL()
print(f"Retrieving ChEMBL entry for {chembl_id}...")
compound = c.get_compound_by_chemblId(chembl_id)
if compound:
print(f"\n✓ ChEMBL Information:")
print(f" ID: {chembl_id}")
if 'pref_name' in compound and compound['pref_name']:
print(f" Preferred Name: {compound['pref_name']}")
if 'molecule_properties' in compound:
props = compound['molecule_properties']
if 'full_mwt' in props:
print(f" Molecular Weight: {props['full_mwt']}")
if 'alogp' in props:
print(f" LogP: {props['alogp']}")
if 'hba' in props:
print(f" H-Bond Acceptors: {props['hba']}")
if 'hbd' in props:
print(f" H-Bond Donors: {props['hbd']}")
if 'molecule_structures' in compound:
structs = compound['molecule_structures']
if 'canonical_smiles' in structs:
smiles = structs['canonical_smiles']
print(f" SMILES: {smiles[:60]}{'...' if len(smiles) > 60 else ''}")
return compound
else:
print("✗ Failed to retrieve ChEMBL entry")
return None
except Exception as e:
print(f"✗ Error: {e}")
return None
def save_results(compound_name, kegg_info, chembl_id, output_file):
"""Save results to file."""
print(f"\n{'='*70}")
print(f"Saving results to {output_file}")
print(f"{'='*70}")
with open(output_file, 'w') as f:
f.write("=" * 70 + "\n")
f.write(f"Compound Cross-Reference Report: {compound_name}\n")
f.write("=" * 70 + "\n\n")
# KEGG information
if kegg_info:
f.write("KEGG Compound\n")
f.write("-" * 70 + "\n")
f.write(f"ID: {kegg_info['kegg_id']}\n")
f.write(f"Name: {kegg_info['name']}\n")
f.write(f"Formula: {kegg_info['formula']}\n")
f.write(f"Exact Mass: {kegg_info['exact_mass']}\n")
f.write(f"Molecular Weight: {kegg_info['mol_weight']}\n")
f.write(f"Pathways: {len(kegg_info['pathways'])} found\n")
f.write("\n")
# Database IDs
f.write("Cross-Database Identifiers\n")
f.write("-" * 70 + "\n")
if kegg_info:
f.write(f"KEGG: {kegg_info['kegg_id']}\n")
if kegg_info['chebi_id']:
f.write(f"ChEBI: {kegg_info['chebi_id']}\n")
if chembl_id:
f.write(f"ChEMBL: {chembl_id}\n")
f.write("\n")
print(f"✓ Results saved")
def main():
"""Main workflow."""
parser = argparse.ArgumentParser(
description="Search compound across multiple databases",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python compound_cross_reference.py Geldanamycin
python compound_cross_reference.py "Adenosine triphosphate"
python compound_cross_reference.py Aspirin --output aspirin_info.txt
"""
)
parser.add_argument("compound", help="Compound name to search")
parser.add_argument("--output", default=None,
help="Output file for results (optional)")
args = parser.parse_args()
print("=" * 70)
print("BIOSERVICES: Compound Cross-Database Search")
print("=" * 70)
# Step 1: Search KEGG
kegg, kegg_id = search_kegg_compound(args.compound)
if not kegg_id:
print("\n✗ Failed to find compound. Exiting.")
sys.exit(1)
# Step 2: Get KEGG details
kegg_info = get_kegg_info(kegg, kegg_id)
# Step 3: Map to ChEMBL
chembl_id = get_chembl_id(kegg_id)
# Step 4: Get ChEBI details
chebi_info = None
if kegg_info and kegg_info['chebi_id']:
chebi_info = get_chebi_info(kegg_info['chebi_id'])
# Step 5: Get ChEMBL details
chembl_info = None
if chembl_id:
chembl_info = get_chembl_info(chembl_id)
# Summary
print(f"\n{'='*70}")
print("SUMMARY")
print(f"{'='*70}")
print(f" Compound: {args.compound}")
if kegg_info:
print(f" KEGG ID: {kegg_info['kegg_id']}")
if kegg_info['chebi_id']:
print(f" ChEBI ID: {kegg_info['chebi_id']}")
if chembl_id:
print(f" ChEMBL ID: {chembl_id}")
print(f"{'='*70}")
# Save to file if requested
if args.output:
save_results(args.compound, kegg_info, chembl_id, args.output)
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,309 @@
#!/usr/bin/env python3
"""
KEGG Pathway Network Analysis
This script analyzes all pathways for an organism and extracts:
- Pathway sizes (number of genes)
- Protein-protein interactions
- Interaction type distributions
- Network data in various formats (CSV, SIF)
Usage:
python pathway_analysis.py ORGANISM OUTPUT_DIR [--limit N]
Examples:
python pathway_analysis.py hsa ./human_pathways
python pathway_analysis.py mmu ./mouse_pathways --limit 50
Organism codes:
hsa = Homo sapiens (human)
mmu = Mus musculus (mouse)
dme = Drosophila melanogaster
sce = Saccharomyces cerevisiae (yeast)
eco = Escherichia coli
"""
import sys
import os
import argparse
import csv
from collections import Counter
from bioservices import KEGG
def get_all_pathways(kegg, organism):
"""Get all pathway IDs for organism."""
print(f"\nRetrieving pathways for {organism}...")
kegg.organism = organism
pathway_ids = kegg.pathwayIds
print(f"✓ Found {len(pathway_ids)} pathways")
return pathway_ids
def analyze_pathway(kegg, pathway_id):
"""Analyze single pathway for size and interactions."""
try:
# Parse KGML pathway
kgml = kegg.parse_kgml_pathway(pathway_id)
entries = kgml.get('entries', [])
relations = kgml.get('relations', [])
# Count relation types
relation_types = Counter()
for rel in relations:
rel_type = rel.get('name', 'unknown')
relation_types[rel_type] += 1
# Get pathway name
try:
entry = kegg.get(pathway_id)
pathway_name = "Unknown"
for line in entry.split("\n"):
if line.startswith("NAME"):
pathway_name = line.replace("NAME", "").strip()
break
except:
pathway_name = "Unknown"
result = {
'pathway_id': pathway_id,
'pathway_name': pathway_name,
'num_entries': len(entries),
'num_relations': len(relations),
'relation_types': dict(relation_types),
'entries': entries,
'relations': relations
}
return result
except Exception as e:
print(f" ✗ Error analyzing {pathway_id}: {e}")
return None
def analyze_all_pathways(kegg, pathway_ids, limit=None):
"""Analyze all pathways."""
if limit:
pathway_ids = pathway_ids[:limit]
print(f"\n⚠ Limiting analysis to first {limit} pathways")
print(f"\nAnalyzing {len(pathway_ids)} pathways...")
results = []
for i, pathway_id in enumerate(pathway_ids, 1):
print(f" [{i}/{len(pathway_ids)}] {pathway_id}", end="\r")
result = analyze_pathway(kegg, pathway_id)
if result:
results.append(result)
print(f"\n✓ Successfully analyzed {len(results)}/{len(pathway_ids)} pathways")
return results
def save_pathway_summary(results, output_file):
"""Save pathway summary to CSV."""
print(f"\nSaving pathway summary to {output_file}...")
with open(output_file, 'w', newline='') as f:
writer = csv.writer(f)
# Header
writer.writerow([
'Pathway_ID',
'Pathway_Name',
'Num_Genes',
'Num_Interactions',
'Activation',
'Inhibition',
'Phosphorylation',
'Binding',
'Other'
])
# Data
for result in results:
rel_types = result['relation_types']
writer.writerow([
result['pathway_id'],
result['pathway_name'],
result['num_entries'],
result['num_relations'],
rel_types.get('activation', 0),
rel_types.get('inhibition', 0),
rel_types.get('phosphorylation', 0),
rel_types.get('binding/association', 0),
sum(v for k, v in rel_types.items()
if k not in ['activation', 'inhibition', 'phosphorylation', 'binding/association'])
])
print(f"✓ Summary saved")
def save_interactions_sif(results, output_file):
"""Save all interactions in SIF format."""
print(f"\nSaving interactions to {output_file}...")
with open(output_file, 'w') as f:
for result in results:
pathway_id = result['pathway_id']
for rel in result['relations']:
entry1 = rel.get('entry1', '')
entry2 = rel.get('entry2', '')
interaction_type = rel.get('name', 'interaction')
# Write SIF format: source\tinteraction\ttarget
f.write(f"{entry1}\t{interaction_type}\t{entry2}\n")
print(f"✓ Interactions saved")
def save_detailed_pathway_info(results, output_dir):
"""Save detailed information for each pathway."""
print(f"\nSaving detailed pathway files to {output_dir}/pathways/...")
pathway_dir = os.path.join(output_dir, "pathways")
os.makedirs(pathway_dir, exist_ok=True)
for result in results:
pathway_id = result['pathway_id'].replace(":", "_")
filename = os.path.join(pathway_dir, f"{pathway_id}_interactions.csv")
with open(filename, 'w', newline='') as f:
writer = csv.writer(f)
writer.writerow(['Source', 'Target', 'Interaction_Type', 'Link_Type'])
for rel in result['relations']:
writer.writerow([
rel.get('entry1', ''),
rel.get('entry2', ''),
rel.get('name', 'unknown'),
rel.get('link', 'unknown')
])
print(f"✓ Detailed files saved for {len(results)} pathways")
def print_statistics(results):
"""Print analysis statistics."""
print(f"\n{'='*70}")
print("PATHWAY ANALYSIS STATISTICS")
print(f"{'='*70}")
# Total stats
total_pathways = len(results)
total_interactions = sum(r['num_relations'] for r in results)
total_genes = sum(r['num_entries'] for r in results)
print(f"\nOverall:")
print(f" Total pathways: {total_pathways}")
print(f" Total genes/proteins: {total_genes}")
print(f" Total interactions: {total_interactions}")
# Largest pathways
print(f"\nLargest pathways (by gene count):")
sorted_by_size = sorted(results, key=lambda x: x['num_entries'], reverse=True)
for i, result in enumerate(sorted_by_size[:10], 1):
print(f" {i}. {result['pathway_id']}: {result['num_entries']} genes")
print(f" {result['pathway_name']}")
# Most connected pathways
print(f"\nMost connected pathways (by interactions):")
sorted_by_connections = sorted(results, key=lambda x: x['num_relations'], reverse=True)
for i, result in enumerate(sorted_by_connections[:10], 1):
print(f" {i}. {result['pathway_id']}: {result['num_relations']} interactions")
print(f" {result['pathway_name']}")
# Interaction type distribution
print(f"\nInteraction type distribution:")
all_types = Counter()
for result in results:
for rel_type, count in result['relation_types'].items():
all_types[rel_type] += count
for rel_type, count in all_types.most_common():
percentage = (count / total_interactions) * 100 if total_interactions > 0 else 0
print(f" {rel_type}: {count} ({percentage:.1f}%)")
def main():
"""Main analysis workflow."""
parser = argparse.ArgumentParser(
description="Analyze KEGG pathways for an organism",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python pathway_analysis.py hsa ./human_pathways
python pathway_analysis.py mmu ./mouse_pathways --limit 50
Organism codes:
hsa = Homo sapiens (human)
mmu = Mus musculus (mouse)
dme = Drosophila melanogaster
sce = Saccharomyces cerevisiae (yeast)
eco = Escherichia coli
"""
)
parser.add_argument("organism", help="KEGG organism code (e.g., hsa, mmu)")
parser.add_argument("output_dir", help="Output directory for results")
parser.add_argument("--limit", type=int, default=None,
help="Limit analysis to first N pathways")
args = parser.parse_args()
print("=" * 70)
print("BIOSERVICES: KEGG Pathway Network Analysis")
print("=" * 70)
# Create output directory
os.makedirs(args.output_dir, exist_ok=True)
# Initialize KEGG
kegg = KEGG()
# Get all pathways
pathway_ids = get_all_pathways(kegg, args.organism)
if not pathway_ids:
print(f"\n✗ No pathways found for {args.organism}")
sys.exit(1)
# Analyze pathways
results = analyze_all_pathways(kegg, pathway_ids, args.limit)
if not results:
print("\n✗ No pathways successfully analyzed")
sys.exit(1)
# Print statistics
print_statistics(results)
# Save results
summary_file = os.path.join(args.output_dir, "pathway_summary.csv")
save_pathway_summary(results, summary_file)
sif_file = os.path.join(args.output_dir, "all_interactions.sif")
save_interactions_sif(results, sif_file)
save_detailed_pathway_info(results, args.output_dir)
# Final summary
print(f"\n{'='*70}")
print("OUTPUT FILES")
print(f"{'='*70}")
print(f" Summary: {summary_file}")
print(f" Interactions: {sif_file}")
print(f" Detailed: {args.output_dir}/pathways/")
print(f"{'='*70}")
if __name__ == "__main__":
main()

View File

@@ -0,0 +1,408 @@
#!/usr/bin/env python3
"""
Complete Protein Analysis Workflow
This script performs a comprehensive protein analysis pipeline:
1. UniProt search and identifier retrieval
2. FASTA sequence retrieval
3. BLAST similarity search
4. KEGG pathway discovery
5. PSICQUIC interaction mapping
6. GO annotation retrieval
Usage:
python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
Examples:
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
"""
import sys
import time
import argparse
from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
def search_protein(query):
"""Search UniProt for protein and retrieve basic information."""
print(f"\n{'='*70}")
print("STEP 1: UniProt Search")
print(f"{'='*70}")
u = UniProt(verbose=False)
print(f"Searching for: {query}")
# Try direct retrieval first (if query looks like accession)
if len(query) == 6 and query[0] in "OPQ":
try:
entry = u.retrieve(query, frmt="tab")
if entry:
uniprot_id = query
print(f"✓ Found UniProt entry: {uniprot_id}")
return u, uniprot_id
except:
pass
# Otherwise search
results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)
if not results:
print("✗ No results found")
return u, None
lines = results.strip().split("\n")
if len(lines) < 2:
print("✗ No entries found")
return u, None
# Display results
print(f"\n✓ Found {len(lines)-1} result(s):")
for i, line in enumerate(lines[1:], 1):
fields = line.split("\t")
print(f" {i}. {fields[0]} - {fields[1]} ({fields[2]})")
# Use first result
first_entry = lines[1].split("\t")
uniprot_id = first_entry[0]
gene_names = first_entry[1] if len(first_entry) > 1 else "N/A"
organism = first_entry[2] if len(first_entry) > 2 else "N/A"
length = first_entry[3] if len(first_entry) > 3 else "N/A"
protein_name = first_entry[4] if len(first_entry) > 4 else "N/A"
print(f"\nUsing first result:")
print(f" UniProt ID: {uniprot_id}")
print(f" Gene names: {gene_names}")
print(f" Organism: {organism}")
print(f" Length: {length} aa")
print(f" Protein: {protein_name}")
return u, uniprot_id
def retrieve_sequence(uniprot, uniprot_id):
"""Retrieve FASTA sequence for protein."""
print(f"\n{'='*70}")
print("STEP 2: FASTA Sequence Retrieval")
print(f"{'='*70}")
try:
sequence = uniprot.retrieve(uniprot_id, frmt="fasta")
if sequence:
# Extract sequence only (remove header)
lines = sequence.strip().split("\n")
header = lines[0]
seq_only = "".join(lines[1:])
print(f"✓ Retrieved sequence:")
print(f" Header: {header}")
print(f" Length: {len(seq_only)} residues")
print(f" First 60 residues: {seq_only[:60]}...")
return seq_only
else:
print("✗ Failed to retrieve sequence")
return None
except Exception as e:
print(f"✗ Error: {e}")
return None
def run_blast(sequence, email, skip=False):
"""Run BLAST similarity search."""
print(f"\n{'='*70}")
print("STEP 3: BLAST Similarity Search")
print(f"{'='*70}")
if skip:
print("⊘ Skipped (--skip-blast flag)")
return None
if not email or "@" not in email:
print("⊘ Skipped (valid email required for BLAST)")
return None
try:
print(f"Submitting BLASTP job...")
print(f" Database: uniprotkb")
print(f" Sequence length: {len(sequence)} aa")
s = NCBIblast(verbose=False)
jobid = s.run(
program="blastp",
sequence=sequence,
stype="protein",
database="uniprotkb",
email=email
)
print(f"✓ Job submitted: {jobid}")
print(f" Waiting for completion...")
# Poll for completion
max_wait = 300 # 5 minutes
start_time = time.time()
while time.time() - start_time < max_wait:
status = s.getStatus(jobid)
elapsed = int(time.time() - start_time)
print(f" Status: {status} (elapsed: {elapsed}s)", end="\r")
if status == "FINISHED":
print(f"\n✓ BLAST completed in {elapsed}s")
# Retrieve results
results = s.getResult(jobid, "out")
# Parse and display summary
lines = results.split("\n")
print(f"\n Results preview:")
for line in lines[:20]:
if line.strip():
print(f" {line}")
return results
elif status == "ERROR":
print(f"\n✗ BLAST job failed")
return None
time.sleep(5)
print(f"\n✗ Timeout after {max_wait}s")
return None
except Exception as e:
print(f"✗ Error: {e}")
return None
def discover_pathways(uniprot, kegg, uniprot_id):
"""Discover KEGG pathways for protein."""
print(f"\n{'='*70}")
print("STEP 4: KEGG Pathway Discovery")
print(f"{'='*70}")
try:
# Map UniProt → KEGG
print(f"Mapping {uniprot_id} to KEGG...")
kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)
if not kegg_mapping or uniprot_id not in kegg_mapping:
print("✗ No KEGG mapping found")
return []
kegg_ids = kegg_mapping[uniprot_id]
print(f"✓ KEGG ID(s): {kegg_ids}")
# Get pathways for first KEGG ID
kegg_id = kegg_ids[0]
organism, gene_id = kegg_id.split(":")
print(f"\nSearching pathways for {kegg_id}...")
pathways = kegg.get_pathway_by_gene(gene_id, organism)
if not pathways:
print("✗ No pathways found")
return []
print(f"✓ Found {len(pathways)} pathway(s):\n")
# Get pathway names
pathway_info = []
for pathway_id in pathways:
try:
entry = kegg.get(pathway_id)
# Extract pathway name
pathway_name = "Unknown"
for line in entry.split("\n"):
if line.startswith("NAME"):
pathway_name = line.replace("NAME", "").strip()
break
pathway_info.append((pathway_id, pathway_name))
print(f"{pathway_id}: {pathway_name}")
except Exception as e:
print(f"{pathway_id}: [Error retrieving name]")
return pathway_info
except Exception as e:
print(f"✗ Error: {e}")
return []
def find_interactions(protein_query):
"""Find protein-protein interactions via PSICQUIC."""
print(f"\n{'='*70}")
print("STEP 5: Protein-Protein Interactions")
print(f"{'='*70}")
try:
p = PSICQUIC()
# Try querying MINT database
query = f"{protein_query} AND species:9606"
print(f"Querying MINT database...")
print(f" Query: {query}")
results = p.query("mint", query)
if not results:
print("✗ No interactions found in MINT")
return []
# Parse PSI-MI TAB format
lines = results.strip().split("\n")
print(f"✓ Found {len(lines)} interaction(s):\n")
# Display first 10 interactions
interactions = []
for i, line in enumerate(lines[:10], 1):
fields = line.split("\t")
if len(fields) >= 12:
protein_a = fields[4].split(":")[1] if ":" in fields[4] else fields[4]
protein_b = fields[5].split(":")[1] if ":" in fields[5] else fields[5]
interaction_type = fields[11]
interactions.append((protein_a, protein_b, interaction_type))
print(f" {i}. {protein_a}{protein_b}")
if len(lines) > 10:
print(f" ... and {len(lines)-10} more")
return interactions
except Exception as e:
print(f"✗ Error: {e}")
return []
def get_go_annotations(uniprot_id):
"""Retrieve GO annotations."""
print(f"\n{'='*70}")
print("STEP 6: Gene Ontology Annotations")
print(f"{'='*70}")
try:
g = QuickGO()
print(f"Retrieving GO annotations for {uniprot_id}...")
annotations = g.Annotation(protein=uniprot_id, format="tsv")
if not annotations:
print("✗ No GO annotations found")
return []
lines = annotations.strip().split("\n")
print(f"✓ Found {len(lines)-1} annotation(s)\n")
# Group by aspect
aspects = {"P": [], "F": [], "C": []}
for line in lines[1:]:
fields = line.split("\t")
if len(fields) >= 9:
go_id = fields[6]
go_term = fields[7]
go_aspect = fields[8]
if go_aspect in aspects:
aspects[go_aspect].append((go_id, go_term))
# Display summary
print(f" Biological Process (P): {len(aspects['P'])} terms")
for go_id, go_term in aspects['P'][:5]:
print(f"{go_id}: {go_term}")
if len(aspects['P']) > 5:
print(f" ... and {len(aspects['P'])-5} more")
print(f"\n Molecular Function (F): {len(aspects['F'])} terms")
for go_id, go_term in aspects['F'][:5]:
print(f"{go_id}: {go_term}")
if len(aspects['F']) > 5:
print(f" ... and {len(aspects['F'])-5} more")
print(f"\n Cellular Component (C): {len(aspects['C'])} terms")
for go_id, go_term in aspects['C'][:5]:
print(f"{go_id}: {go_term}")
if len(aspects['C']) > 5:
print(f" ... and {len(aspects['C'])-5} more")
return aspects
except Exception as e:
print(f"✗ Error: {e}")
return {}
def main():
"""Main workflow."""
parser = argparse.ArgumentParser(
description="Complete protein analysis workflow using BioServices",
formatter_class=argparse.RawDescriptionHelpFormatter,
epilog="""
Examples:
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
"""
)
parser.add_argument("protein", help="Protein name or UniProt ID")
parser.add_argument("email", help="Email address (required for BLAST)")
parser.add_argument("--skip-blast", action="store_true",
help="Skip BLAST search (faster)")
args = parser.parse_args()
print("=" * 70)
print("BIOSERVICES: Complete Protein Analysis Workflow")
print("=" * 70)
# Step 1: Search protein
uniprot, uniprot_id = search_protein(args.protein)
if not uniprot_id:
print("\n✗ Failed to find protein. Exiting.")
sys.exit(1)
# Step 2: Retrieve sequence
sequence = retrieve_sequence(uniprot, uniprot_id)
if not sequence:
print("\n⚠ Warning: Could not retrieve sequence")
# Step 3: BLAST search
if sequence:
blast_results = run_blast(sequence, args.email, args.skip_blast)
# Step 4: Pathway discovery
kegg = KEGG()
pathways = discover_pathways(uniprot, kegg, uniprot_id)
# Step 5: Interaction mapping
interactions = find_interactions(args.protein)
# Step 6: GO annotations
go_terms = get_go_annotations(uniprot_id)
# Summary
print(f"\n{'='*70}")
print("WORKFLOW SUMMARY")
print(f"{'='*70}")
print(f" Protein: {args.protein}")
print(f" UniProt ID: {uniprot_id}")
print(f" Sequence: {'' if sequence else ''}")
print(f" BLAST: {'' if not args.skip_blast and sequence else ''}")
print(f" Pathways: {len(pathways)} found")
print(f" Interactions: {len(interactions)} found")
print(f" GO annotations: {sum(len(v) for v in go_terms.values())} found")
print(f"{'='*70}")
if __name__ == "__main__":
main()