Initial commit
This commit is contained in:
408
skills/bioservices/scripts/protein_analysis_workflow.py
Executable file
408
skills/bioservices/scripts/protein_analysis_workflow.py
Executable file
@@ -0,0 +1,408 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete Protein Analysis Workflow
|
||||
|
||||
This script performs a comprehensive protein analysis pipeline:
|
||||
1. UniProt search and identifier retrieval
|
||||
2. FASTA sequence retrieval
|
||||
3. BLAST similarity search
|
||||
4. KEGG pathway discovery
|
||||
5. PSICQUIC interaction mapping
|
||||
6. GO annotation retrieval
|
||||
|
||||
Usage:
|
||||
python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
|
||||
|
||||
Examples:
|
||||
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
|
||||
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
|
||||
|
||||
Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
|
||||
"""
|
||||
|
||||
import sys
|
||||
import time
|
||||
import argparse
|
||||
from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
|
||||
|
||||
|
||||
def search_protein(query):
|
||||
"""Search UniProt for protein and retrieve basic information."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 1: UniProt Search")
|
||||
print(f"{'='*70}")
|
||||
|
||||
u = UniProt(verbose=False)
|
||||
|
||||
print(f"Searching for: {query}")
|
||||
|
||||
# Try direct retrieval first (if query looks like accession)
|
||||
if len(query) == 6 and query[0] in "OPQ":
|
||||
try:
|
||||
entry = u.retrieve(query, frmt="tab")
|
||||
if entry:
|
||||
uniprot_id = query
|
||||
print(f"✓ Found UniProt entry: {uniprot_id}")
|
||||
return u, uniprot_id
|
||||
except:
|
||||
pass
|
||||
|
||||
# Otherwise search
|
||||
results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)
|
||||
|
||||
if not results:
|
||||
print("✗ No results found")
|
||||
return u, None
|
||||
|
||||
lines = results.strip().split("\n")
|
||||
if len(lines) < 2:
|
||||
print("✗ No entries found")
|
||||
return u, None
|
||||
|
||||
# Display results
|
||||
print(f"\n✓ Found {len(lines)-1} result(s):")
|
||||
for i, line in enumerate(lines[1:], 1):
|
||||
fields = line.split("\t")
|
||||
print(f" {i}. {fields[0]} - {fields[1]} ({fields[2]})")
|
||||
|
||||
# Use first result
|
||||
first_entry = lines[1].split("\t")
|
||||
uniprot_id = first_entry[0]
|
||||
gene_names = first_entry[1] if len(first_entry) > 1 else "N/A"
|
||||
organism = first_entry[2] if len(first_entry) > 2 else "N/A"
|
||||
length = first_entry[3] if len(first_entry) > 3 else "N/A"
|
||||
protein_name = first_entry[4] if len(first_entry) > 4 else "N/A"
|
||||
|
||||
print(f"\nUsing first result:")
|
||||
print(f" UniProt ID: {uniprot_id}")
|
||||
print(f" Gene names: {gene_names}")
|
||||
print(f" Organism: {organism}")
|
||||
print(f" Length: {length} aa")
|
||||
print(f" Protein: {protein_name}")
|
||||
|
||||
return u, uniprot_id
|
||||
|
||||
|
||||
def retrieve_sequence(uniprot, uniprot_id):
|
||||
"""Retrieve FASTA sequence for protein."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 2: FASTA Sequence Retrieval")
|
||||
print(f"{'='*70}")
|
||||
|
||||
try:
|
||||
sequence = uniprot.retrieve(uniprot_id, frmt="fasta")
|
||||
|
||||
if sequence:
|
||||
# Extract sequence only (remove header)
|
||||
lines = sequence.strip().split("\n")
|
||||
header = lines[0]
|
||||
seq_only = "".join(lines[1:])
|
||||
|
||||
print(f"✓ Retrieved sequence:")
|
||||
print(f" Header: {header}")
|
||||
print(f" Length: {len(seq_only)} residues")
|
||||
print(f" First 60 residues: {seq_only[:60]}...")
|
||||
|
||||
return seq_only
|
||||
else:
|
||||
print("✗ Failed to retrieve sequence")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def run_blast(sequence, email, skip=False):
|
||||
"""Run BLAST similarity search."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 3: BLAST Similarity Search")
|
||||
print(f"{'='*70}")
|
||||
|
||||
if skip:
|
||||
print("⊘ Skipped (--skip-blast flag)")
|
||||
return None
|
||||
|
||||
if not email or "@" not in email:
|
||||
print("⊘ Skipped (valid email required for BLAST)")
|
||||
return None
|
||||
|
||||
try:
|
||||
print(f"Submitting BLASTP job...")
|
||||
print(f" Database: uniprotkb")
|
||||
print(f" Sequence length: {len(sequence)} aa")
|
||||
|
||||
s = NCBIblast(verbose=False)
|
||||
|
||||
jobid = s.run(
|
||||
program="blastp",
|
||||
sequence=sequence,
|
||||
stype="protein",
|
||||
database="uniprotkb",
|
||||
email=email
|
||||
)
|
||||
|
||||
print(f"✓ Job submitted: {jobid}")
|
||||
print(f" Waiting for completion...")
|
||||
|
||||
# Poll for completion
|
||||
max_wait = 300 # 5 minutes
|
||||
start_time = time.time()
|
||||
|
||||
while time.time() - start_time < max_wait:
|
||||
status = s.getStatus(jobid)
|
||||
elapsed = int(time.time() - start_time)
|
||||
print(f" Status: {status} (elapsed: {elapsed}s)", end="\r")
|
||||
|
||||
if status == "FINISHED":
|
||||
print(f"\n✓ BLAST completed in {elapsed}s")
|
||||
|
||||
# Retrieve results
|
||||
results = s.getResult(jobid, "out")
|
||||
|
||||
# Parse and display summary
|
||||
lines = results.split("\n")
|
||||
print(f"\n Results preview:")
|
||||
for line in lines[:20]:
|
||||
if line.strip():
|
||||
print(f" {line}")
|
||||
|
||||
return results
|
||||
|
||||
elif status == "ERROR":
|
||||
print(f"\n✗ BLAST job failed")
|
||||
return None
|
||||
|
||||
time.sleep(5)
|
||||
|
||||
print(f"\n✗ Timeout after {max_wait}s")
|
||||
return None
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return None
|
||||
|
||||
|
||||
def discover_pathways(uniprot, kegg, uniprot_id):
|
||||
"""Discover KEGG pathways for protein."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 4: KEGG Pathway Discovery")
|
||||
print(f"{'='*70}")
|
||||
|
||||
try:
|
||||
# Map UniProt → KEGG
|
||||
print(f"Mapping {uniprot_id} to KEGG...")
|
||||
kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)
|
||||
|
||||
if not kegg_mapping or uniprot_id not in kegg_mapping:
|
||||
print("✗ No KEGG mapping found")
|
||||
return []
|
||||
|
||||
kegg_ids = kegg_mapping[uniprot_id]
|
||||
print(f"✓ KEGG ID(s): {kegg_ids}")
|
||||
|
||||
# Get pathways for first KEGG ID
|
||||
kegg_id = kegg_ids[0]
|
||||
organism, gene_id = kegg_id.split(":")
|
||||
|
||||
print(f"\nSearching pathways for {kegg_id}...")
|
||||
pathways = kegg.get_pathway_by_gene(gene_id, organism)
|
||||
|
||||
if not pathways:
|
||||
print("✗ No pathways found")
|
||||
return []
|
||||
|
||||
print(f"✓ Found {len(pathways)} pathway(s):\n")
|
||||
|
||||
# Get pathway names
|
||||
pathway_info = []
|
||||
for pathway_id in pathways:
|
||||
try:
|
||||
entry = kegg.get(pathway_id)
|
||||
|
||||
# Extract pathway name
|
||||
pathway_name = "Unknown"
|
||||
for line in entry.split("\n"):
|
||||
if line.startswith("NAME"):
|
||||
pathway_name = line.replace("NAME", "").strip()
|
||||
break
|
||||
|
||||
pathway_info.append((pathway_id, pathway_name))
|
||||
print(f" • {pathway_id}: {pathway_name}")
|
||||
|
||||
except Exception as e:
|
||||
print(f" • {pathway_id}: [Error retrieving name]")
|
||||
|
||||
return pathway_info
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def find_interactions(protein_query):
|
||||
"""Find protein-protein interactions via PSICQUIC."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 5: Protein-Protein Interactions")
|
||||
print(f"{'='*70}")
|
||||
|
||||
try:
|
||||
p = PSICQUIC()
|
||||
|
||||
# Try querying MINT database
|
||||
query = f"{protein_query} AND species:9606"
|
||||
print(f"Querying MINT database...")
|
||||
print(f" Query: {query}")
|
||||
|
||||
results = p.query("mint", query)
|
||||
|
||||
if not results:
|
||||
print("✗ No interactions found in MINT")
|
||||
return []
|
||||
|
||||
# Parse PSI-MI TAB format
|
||||
lines = results.strip().split("\n")
|
||||
print(f"✓ Found {len(lines)} interaction(s):\n")
|
||||
|
||||
# Display first 10 interactions
|
||||
interactions = []
|
||||
for i, line in enumerate(lines[:10], 1):
|
||||
fields = line.split("\t")
|
||||
if len(fields) >= 12:
|
||||
protein_a = fields[4].split(":")[1] if ":" in fields[4] else fields[4]
|
||||
protein_b = fields[5].split(":")[1] if ":" in fields[5] else fields[5]
|
||||
interaction_type = fields[11]
|
||||
|
||||
interactions.append((protein_a, protein_b, interaction_type))
|
||||
print(f" {i}. {protein_a} ↔ {protein_b}")
|
||||
|
||||
if len(lines) > 10:
|
||||
print(f" ... and {len(lines)-10} more")
|
||||
|
||||
return interactions
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return []
|
||||
|
||||
|
||||
def get_go_annotations(uniprot_id):
|
||||
"""Retrieve GO annotations."""
|
||||
print(f"\n{'='*70}")
|
||||
print("STEP 6: Gene Ontology Annotations")
|
||||
print(f"{'='*70}")
|
||||
|
||||
try:
|
||||
g = QuickGO()
|
||||
|
||||
print(f"Retrieving GO annotations for {uniprot_id}...")
|
||||
annotations = g.Annotation(protein=uniprot_id, format="tsv")
|
||||
|
||||
if not annotations:
|
||||
print("✗ No GO annotations found")
|
||||
return []
|
||||
|
||||
lines = annotations.strip().split("\n")
|
||||
print(f"✓ Found {len(lines)-1} annotation(s)\n")
|
||||
|
||||
# Group by aspect
|
||||
aspects = {"P": [], "F": [], "C": []}
|
||||
for line in lines[1:]:
|
||||
fields = line.split("\t")
|
||||
if len(fields) >= 9:
|
||||
go_id = fields[6]
|
||||
go_term = fields[7]
|
||||
go_aspect = fields[8]
|
||||
|
||||
if go_aspect in aspects:
|
||||
aspects[go_aspect].append((go_id, go_term))
|
||||
|
||||
# Display summary
|
||||
print(f" Biological Process (P): {len(aspects['P'])} terms")
|
||||
for go_id, go_term in aspects['P'][:5]:
|
||||
print(f" • {go_id}: {go_term}")
|
||||
if len(aspects['P']) > 5:
|
||||
print(f" ... and {len(aspects['P'])-5} more")
|
||||
|
||||
print(f"\n Molecular Function (F): {len(aspects['F'])} terms")
|
||||
for go_id, go_term in aspects['F'][:5]:
|
||||
print(f" • {go_id}: {go_term}")
|
||||
if len(aspects['F']) > 5:
|
||||
print(f" ... and {len(aspects['F'])-5} more")
|
||||
|
||||
print(f"\n Cellular Component (C): {len(aspects['C'])} terms")
|
||||
for go_id, go_term in aspects['C'][:5]:
|
||||
print(f" • {go_id}: {go_term}")
|
||||
if len(aspects['C']) > 5:
|
||||
print(f" ... and {len(aspects['C'])-5} more")
|
||||
|
||||
return aspects
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ Error: {e}")
|
||||
return {}
|
||||
|
||||
|
||||
def main():
|
||||
"""Main workflow."""
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Complete protein analysis workflow using BioServices",
|
||||
formatter_class=argparse.RawDescriptionHelpFormatter,
|
||||
epilog="""
|
||||
Examples:
|
||||
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
|
||||
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
|
||||
"""
|
||||
)
|
||||
parser.add_argument("protein", help="Protein name or UniProt ID")
|
||||
parser.add_argument("email", help="Email address (required for BLAST)")
|
||||
parser.add_argument("--skip-blast", action="store_true",
|
||||
help="Skip BLAST search (faster)")
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
print("=" * 70)
|
||||
print("BIOSERVICES: Complete Protein Analysis Workflow")
|
||||
print("=" * 70)
|
||||
|
||||
# Step 1: Search protein
|
||||
uniprot, uniprot_id = search_protein(args.protein)
|
||||
if not uniprot_id:
|
||||
print("\n✗ Failed to find protein. Exiting.")
|
||||
sys.exit(1)
|
||||
|
||||
# Step 2: Retrieve sequence
|
||||
sequence = retrieve_sequence(uniprot, uniprot_id)
|
||||
if not sequence:
|
||||
print("\n⚠ Warning: Could not retrieve sequence")
|
||||
|
||||
# Step 3: BLAST search
|
||||
if sequence:
|
||||
blast_results = run_blast(sequence, args.email, args.skip_blast)
|
||||
|
||||
# Step 4: Pathway discovery
|
||||
kegg = KEGG()
|
||||
pathways = discover_pathways(uniprot, kegg, uniprot_id)
|
||||
|
||||
# Step 5: Interaction mapping
|
||||
interactions = find_interactions(args.protein)
|
||||
|
||||
# Step 6: GO annotations
|
||||
go_terms = get_go_annotations(uniprot_id)
|
||||
|
||||
# Summary
|
||||
print(f"\n{'='*70}")
|
||||
print("WORKFLOW SUMMARY")
|
||||
print(f"{'='*70}")
|
||||
print(f" Protein: {args.protein}")
|
||||
print(f" UniProt ID: {uniprot_id}")
|
||||
print(f" Sequence: {'✓' if sequence else '✗'}")
|
||||
print(f" BLAST: {'✓' if not args.skip_blast and sequence else '⊘'}")
|
||||
print(f" Pathways: {len(pathways)} found")
|
||||
print(f" Interactions: {len(interactions)} found")
|
||||
print(f" GO annotations: {sum(len(v) for v in go_terms.values())} found")
|
||||
print(f"{'='*70}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user