409 lines
12 KiB
Python
Executable File
409 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Complete Protein Analysis Workflow
|
|
|
|
This script performs a comprehensive protein analysis pipeline:
|
|
1. UniProt search and identifier retrieval
|
|
2. FASTA sequence retrieval
|
|
3. BLAST similarity search
|
|
4. KEGG pathway discovery
|
|
5. PSICQUIC interaction mapping
|
|
6. GO annotation retrieval
|
|
|
|
Usage:
|
|
python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
|
|
|
|
Examples:
|
|
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
|
|
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
|
|
|
|
Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
|
|
"""
|
|
|
|
import argparse
import re
import sys
import time

from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
|
|
|
|
|
|
# UniProtKB accession format (6 or 10 characters) as published by UniProt.
_UNIPROT_ACCESSION_RE = re.compile(
    r"[OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
)


def _split_result_row(line, width=5):
    """Split one tab-separated result row, padding missing columns with "N/A"."""
    fields = line.split("\t")
    return fields + ["N/A"] * (width - len(fields))


def search_protein(query):
    """Search UniProt for a protein and pick the entry to analyse.

    If ``query`` has the shape of a UniProtKB accession, the entry is fetched
    directly; otherwise (or when that fetch fails) a text search is run and
    its first hit is used. Progress and the chosen entry are printed.

    Args:
        query: Protein name, entry name or UniProt accession.

    Returns:
        tuple: ``(uniprot_client, uniprot_id)``. ``uniprot_id`` is ``None``
        when nothing was found; the client is always returned so callers can
        reuse it.
    """
    print(f"\n{'='*70}")
    print("STEP 1: UniProt Search")
    print(f"{'='*70}")

    u = UniProt(verbose=False)

    print(f"Searching for: {query}")

    # Try direct retrieval first (if query looks like an accession).
    if _UNIPROT_ACCESSION_RE.fullmatch(query):
        try:
            entry = u.retrieve(query, frmt="tab")
        except Exception as exc:
            # Best effort only: report the failure and fall back to searching.
            print(f" Direct retrieval failed ({exc}); falling back to search")
        else:
            # An integer here is assumed to be an HTTP status code signalling
            # failure rather than an entry -- NOTE(review): confirm against
            # the installed bioservices version.
            if entry and not isinstance(entry, int):
                uniprot_id = query
                print(f"✓ Found UniProt entry: {uniprot_id}")
                return u, uniprot_id

    # Otherwise search.
    results = u.search(
        query,
        frmt="tab",
        columns="id,genes,organism,length,protein names",
        limit=5,
    )

    # A non-string response (for example an error code) cannot be parsed.
    if not results or not isinstance(results, str):
        print("✗ No results found")
        return u, None

    lines = results.strip().splitlines()
    if len(lines) < 2:
        # Only the header row (or nothing at all) came back.
        print("✗ No entries found")
        return u, None

    rows = [_split_result_row(line) for line in lines[1:]]

    # Display results.
    print(f"\n✓ Found {len(rows)} result(s):")
    for i, fields in enumerate(rows, 1):
        print(f" {i}. {fields[0]} - {fields[1]} ({fields[2]})")

    # Use first result.
    uniprot_id, gene_names, organism, length, protein_name = rows[0][:5]

    print("\nUsing first result:")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Gene names: {gene_names}")
    print(f" Organism: {organism}")
    print(f" Length: {length} aa")
    print(f" Protein: {protein_name}")

    return u, uniprot_id
|
|
|
|
|
|
def retrieve_sequence(uniprot, uniprot_id):
    """Retrieve the FASTA record for a protein and return its residues.

    Args:
        uniprot: Client object exposing ``retrieve(uniprot_id, frmt="fasta")``.
        uniprot_id: Identifier passed through to ``retrieve``.

    Returns:
        str | None: The sequence with the header line and line breaks removed,
        or ``None`` when the lookup fails, the response is not a FASTA record,
        or the record contains no residues.
    """
    print(f"\n{'='*70}")
    print("STEP 2: FASTA Sequence Retrieval")
    print(f"{'='*70}")

    try:
        record = uniprot.retrieve(uniprot_id, frmt="fasta")
    except Exception as e:
        print(f"✗ Error: {e}")
        return None

    # Anything that is not text starting with ">" (e.g. an error code or an
    # error page) is not a FASTA record and must not be parsed as one.
    if not isinstance(record, str) or not record.lstrip().startswith(">"):
        print("✗ Failed to retrieve sequence")
        return None

    # splitlines() copes with both LF and CRLF line endings.
    lines = record.strip().splitlines()
    header = lines[0]
    seq_only = "".join(part.strip() for part in lines[1:])

    if not seq_only:
        # Header-only record: nothing usable for the downstream steps.
        print("✗ Failed to retrieve sequence")
        return None

    # Only show an ellipsis when the preview really is truncated.
    preview = seq_only[:60] + ("..." if len(seq_only) > 60 else "")

    print("✓ Retrieved sequence:")
    print(f" Header: {header}")
    print(f" Length: {len(seq_only)} residues")
    print(f" First 60 residues: {preview}")

    return seq_only
|
|
|
|
|
|
# Job states after which polling can never succeed.
_BLAST_FAILED_STATES = frozenset({"ERROR", "FAILURE", "NOT_FOUND"})


def run_blast(sequence, email, skip=False, max_wait=300, poll_interval=5):
    """Run a BLASTP similarity search and wait for the result.

    Args:
        sequence: Protein sequence to search with.
        email: Contact address submitted with the job; must contain "@".
        skip: When true, do nothing and return ``None``.
        max_wait: Maximum number of seconds to poll for completion.
        poll_interval: Seconds to sleep between status checks.

    Returns:
        The raw result returned by ``getResult(jobid, "out")``, or ``None``
        when the step was skipped, the job failed, the wait timed out or an
        exception occurred.
    """
    print(f"\n{'='*70}")
    print("STEP 3: BLAST Similarity Search")
    print(f"{'='*70}")

    if skip:
        print("⊘ Skipped (--skip-blast flag)")
        return None

    if not email or "@" not in email:
        print("⊘ Skipped (valid email required for BLAST)")
        return None

    if not sequence:
        print("⊘ Skipped (no sequence available for BLAST)")
        return None

    try:
        print("Submitting BLASTP job...")
        print(" Database: uniprotkb")
        print(f" Sequence length: {len(sequence)} aa")

        s = NCBIblast(verbose=False)

        jobid = s.run(
            program="blastp",
            sequence=sequence,
            stype="protein",
            database="uniprotkb",
            email=email,
        )

        print(f"✓ Job submitted: {jobid}")
        print(" Waiting for completion...")

        # Poll for completion; a monotonic clock is immune to wall-clock
        # adjustments while we wait.
        start_time = time.monotonic()

        while time.monotonic() - start_time < max_wait:
            status = s.getStatus(jobid)
            elapsed = int(time.monotonic() - start_time)
            # "\r" keeps the status on a single, continuously updated line.
            print(f" Status: {status} (elapsed: {elapsed}s)", end="\r")

            if status == "FINISHED":
                print(f"\n✓ BLAST completed in {elapsed}s")

                # Retrieve results.
                results = s.getResult(jobid, "out")

                # Show the first non-blank lines as a preview.
                print("\n Results preview:")
                if isinstance(results, str):
                    for line in results.split("\n")[:20]:
                        if line.strip():
                            print(f" {line}")

                return results

            if status in _BLAST_FAILED_STATES:
                # Terminal failure: further polling cannot succeed.
                print("\n✗ BLAST job failed")
                return None

            time.sleep(poll_interval)

        print(f"\n✗ Timeout after {max_wait}s")
        return None

    except Exception as e:
        print(f"✗ Error: {e}")
        return None
|
|
|
|
|
|
def _kegg_ids_from_mapping(mapping, uniprot_id):
    """Extract KEGG identifiers from either supported mapping response shape."""
    if not mapping:
        return []

    # Result-list shape: {"results": [{"from": ..., "to": ...}, ...]}.
    # Only one identifier is submitted, so every row belongs to it.
    if isinstance(mapping, dict) and isinstance(mapping.get("results"), list):
        return [
            row["to"]
            for row in mapping["results"]
            if isinstance(row, dict) and isinstance(row.get("to"), str)
        ]

    # Legacy shape: {uniprot_id: [kegg_id, ...]}.
    try:
        if uniprot_id not in mapping:
            return []
        hits = mapping[uniprot_id]
    except TypeError:
        return []
    return [hits] if isinstance(hits, str) else list(hits)


def _fetch_pathway_name(kegg, pathway_id):
    """Read the NAME field of a KEGG pathway entry ("Unknown" if absent)."""
    entry = kegg.get(pathway_id)
    for line in entry.split("\n"):
        if line.startswith("NAME"):
            # Drop only the leading field label, not "NAME" inside the title.
            return line[len("NAME"):].strip()
    return "Unknown"


def discover_pathways(uniprot, kegg, uniprot_id):
    """Discover KEGG pathways for a protein.

    The UniProt identifier is mapped to KEGG gene identifiers, and the
    pathways of the first mapped gene are listed and printed.

    Args:
        uniprot: Client exposing ``mapping(fr=..., to=..., query=...)``.
        kegg: Client exposing ``get_pathway_by_gene(gene, organism)`` and
            ``get(pathway_id)``.
        uniprot_id: Identifier to map.

    Returns:
        list[tuple[str, str]]: ``(pathway_id, pathway_name)`` pairs; empty
        when no mapping or pathway exists or an error occurs. A pathway whose
        name cannot be determined is kept with the name ``"Unknown"`` so the
        list length matches the number of pathways reported.
    """
    print(f"\n{'='*70}")
    print("STEP 4: KEGG Pathway Discovery")
    print(f"{'='*70}")

    try:
        # Map UniProt → KEGG
        print(f"Mapping {uniprot_id} to KEGG...")
        kegg_mapping = uniprot.mapping(
            fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id
        )

        kegg_ids = _kegg_ids_from_mapping(kegg_mapping, uniprot_id)
        if not kegg_ids:
            print("✗ No KEGG mapping found")
            return []

        print(f"✓ KEGG ID(s): {kegg_ids}")

        # Get pathways for first KEGG ID ("organism:gene").
        kegg_id = kegg_ids[0]
        organism, separator, gene_id = kegg_id.partition(":")
        if not separator or not gene_id:
            print(f"✗ Error: unexpected KEGG identifier {kegg_id!r}")
            return []

        print(f"\nSearching pathways for {kegg_id}...")
        pathways = kegg.get_pathway_by_gene(gene_id, organism)

        if not pathways:
            print("✗ No pathways found")
            return []

        print(f"✓ Found {len(pathways)} pathway(s):\n")

        # When the lookup already yields an id -> name mapping, reuse those
        # names instead of fetching every pathway entry again.
        known_names = pathways if isinstance(pathways, dict) else {}

        pathway_info = []
        for pathway_id in pathways:
            pathway_name = known_names.get(pathway_id)
            if not pathway_name:
                try:
                    pathway_name = _fetch_pathway_name(kegg, pathway_id)
                except Exception:
                    # Keep the pathway even though its name is unavailable.
                    print(f" • {pathway_id}: [Error retrieving name]")
                    pathway_info.append((pathway_id, "Unknown"))
                    continue

            pathway_info.append((pathway_id, pathway_name))
            print(f" • {pathway_id}: {pathway_name}")

        return pathway_info

    except Exception as e:
        print(f"✗ Error: {e}")
        return []
|
|
|
|
|
|
def _mitab_rows(results):
    """Normalize a PSICQUIC tab response into a list of column lists.

    Accepts raw text, a list of tab-separated lines, or a list of rows that
    have already been split into columns.
    """
    if isinstance(results, str):
        results = results.strip().splitlines()

    rows = []
    for row in results:
        if isinstance(row, str):
            if not row.strip():
                continue
            row = row.split("\t")
        rows.append(list(row))
    return rows


def _first_alias(field):
    """Return the first "|"-separated alias of a field, minus its "db:" prefix."""
    first = field.split("|", 1)[0]
    return first.split(":", 1)[1] if ":" in first else first


def find_interactions(protein_query):
    """Find protein-protein interactions via PSICQUIC (MINT database).

    Args:
        protein_query: Free-text protein query; it is combined with
            ``species:9606`` before being sent.

    Returns:
        list[tuple[str, str, str]]: Up to ten
        ``(protein_a, protein_b, interaction_type)`` tuples taken from columns
        5, 6 and 12 of each row; empty when nothing is found or the query
        fails.
    """
    print(f"\n{'='*70}")
    print("STEP 5: Protein-Protein Interactions")
    print(f"{'='*70}")

    try:
        p = PSICQUIC()

        # Try querying MINT database
        query = f"{protein_query} AND species:9606"
        print("Querying MINT database...")
        print(f" Query: {query}")

        results = p.query("mint", query)

        # An integer is assumed to be an error status rather than data --
        # NOTE(review): confirm against the installed bioservices version.
        if not results or isinstance(results, int):
            print("✗ No interactions found in MINT")
            return []

        # The response may be raw text or rows that are already split, so
        # normalize before parsing instead of assuming a string.
        rows = _mitab_rows(results)
        if not rows:
            print("✗ No interactions found in MINT")
            return []

        print(f"✓ Found {len(rows)} interaction(s):\n")

        # Display first 10 interactions
        interactions = []
        for i, fields in enumerate(rows[:10], 1):
            # Rows with fewer than 12 columns lack the interaction type.
            if len(fields) >= 12:
                protein_a = _first_alias(fields[4])
                protein_b = _first_alias(fields[5])
                interaction_type = fields[11]

                interactions.append((protein_a, protein_b, interaction_type))
                print(f" {i}. {protein_a} ↔ {protein_b}")

        if len(rows) > 10:
            print(f" ... and {len(rows)-10} more")

        return interactions

    except Exception as e:
        print(f"✗ Error: {e}")
        return []
|
|
|
|
|
|
def get_go_annotations(uniprot_id):
    """Retrieve GO annotations for a protein and group them by aspect.

    Args:
        uniprot_id: Identifier passed to the QuickGO annotation query.

    Returns:
        dict[str, list[tuple[str, str]]]: Maps the aspect codes ``"P"``,
        ``"F"`` and ``"C"`` to lists of ``(go_id, go_term)``. An empty dict is
        returned when no annotations are found or the query fails, so callers
        can always use dict methods on the result.
    """
    print(f"\n{'='*70}")
    print("STEP 6: Gene Ontology Annotations")
    print(f"{'='*70}")

    try:
        g = QuickGO()

        print(f"Retrieving GO annotations for {uniprot_id}...")
        # NOTE(review): the keyword names accepted by Annotation() may differ
        # between bioservices releases -- confirm ``protein``/``format``.
        annotations = g.Annotation(protein=uniprot_id, format="tsv")

        if not annotations:
            print("✗ No GO annotations found")
            # Same type as the success and error paths (was a list before).
            return {}

        lines = annotations.strip().split("\n")
        print(f"✓ Found {len(lines)-1} annotation(s)\n")

        # Group by aspect; the first line is the header row.
        aspects = {"P": [], "F": [], "C": []}
        for line in lines[1:]:
            fields = line.split("\t")
            if len(fields) < 9:
                continue
            go_id, go_term, go_aspect = fields[6], fields[7], fields[8]
            if go_aspect in aspects:
                aspects[go_aspect].append((go_id, go_term))

        # Display summary: up to five terms per aspect.
        sections = (
            ("P", "Biological Process"),
            ("F", "Molecular Function"),
            ("C", "Cellular Component"),
        )
        for index, (code, label) in enumerate(sections):
            terms = aspects[code]
            lead = "" if index == 0 else "\n"
            print(f"{lead} {label} ({code}): {len(terms)} terms")
            for go_id, go_term in terms[:5]:
                print(f" • {go_id}: {go_term}")
            if len(terms) > 5:
                print(f" ... and {len(terms)-5} more")

        return aspects

    except Exception as e:
        print(f"✗ Error: {e}")
        return {}
|
|
|
|
|
|
def main():
    """Parse the command line and run the six workflow steps in order.

    Exits with status 1 when the protein cannot be found in UniProt. Every
    other step is best-effort: a failure is reported and the workflow
    continues, ending with a summary of what was obtained.
    """
    parser = argparse.ArgumentParser(
        description="Complete protein analysis workflow using BioServices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
"""
    )
    parser.add_argument("protein", help="Protein name or UniProt ID")
    parser.add_argument("email", help="Email address (required for BLAST)")
    parser.add_argument("--skip-blast", action="store_true",
                        help="Skip BLAST search (faster)")

    args = parser.parse_args()

    print("=" * 70)
    print("BIOSERVICES: Complete Protein Analysis Workflow")
    print("=" * 70)

    # Step 1: Search protein
    uniprot, uniprot_id = search_protein(args.protein)
    if not uniprot_id:
        print("\n✗ Failed to find protein. Exiting.")
        sys.exit(1)

    # Step 2: Retrieve sequence
    sequence = retrieve_sequence(uniprot, uniprot_id)
    if not sequence:
        print("\n⚠ Warning: Could not retrieve sequence")

    # Step 3: BLAST search (needs a sequence to submit)
    blast_results = None
    if sequence:
        blast_results = run_blast(sequence, args.email, args.skip_blast)

    # Step 4: Pathway discovery
    kegg = KEGG()
    pathways = discover_pathways(uniprot, kegg, uniprot_id)

    # Step 5: Interaction mapping
    interactions = find_interactions(args.protein)

    # Step 6: GO annotations
    go_terms = get_go_annotations(uniprot_id)

    # Report BLAST by its actual outcome: done, not attempted, or failed.
    if blast_results:
        blast_mark = "✓"
    elif args.skip_blast or not sequence:
        blast_mark = "⊘"
    else:
        blast_mark = "✗"

    # Tolerate a non-dict result (e.g. an empty list) from the GO step.
    if isinstance(go_terms, dict):
        go_count = sum(len(v) for v in go_terms.values())
    else:
        go_count = 0

    # Summary
    print(f"\n{'='*70}")
    print("WORKFLOW SUMMARY")
    print(f"{'='*70}")
    print(f" Protein: {args.protein}")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Sequence: {'✓' if sequence else '✗'}")
    print(f" BLAST: {blast_mark}")
    print(f" Pathways: {len(pathways)} found")
    print(f" Interactions: {len(interactions)} found")
    print(f" GO annotations: {go_count} found")
    print(f"{'='*70}")
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|