Files
gh-k-dense-ai-claude-scient…/skills/bioservices/scripts/protein_analysis_workflow.py
2025-11-30 08:30:10 +08:00

409 lines
12 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Complete Protein Analysis Workflow

This script performs a comprehensive protein analysis pipeline:
    1. UniProt search and identifier retrieval
    2. FASTA sequence retrieval
    3. BLAST similarity search
    4. KEGG pathway discovery
    5. PSICQUIC interaction mapping
    6. GO annotation retrieval

Usage:
    python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]

Examples:
    python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
    python protein_analysis_workflow.py P43403 user@example.com --skip-blast

Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
"""
import sys
import time
import argparse
from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
def search_protein(query, uniprot=None):
    """Search UniProt for a protein and print basic information about it.

    A six-character query starting with ``O``, ``P`` or ``Q`` is first tried
    as an accession through ``retrieve``. Any other query, or a retrieval
    that yields nothing usable, falls back to a text search limited to five
    hits; the first hit is the one that is used.

    Args:
        query: Protein name, entry name or UniProt accession.
        uniprot: Optional client object to reuse. When omitted, a new
            ``UniProt(verbose=False)`` is created.

    Returns:
        A ``(client, uniprot_id)`` tuple. ``uniprot_id`` is ``None`` when no
        entry could be found.
    """
    print(f"\n{'='*70}")
    print("STEP 1: UniProt Search")
    print(f"{'='*70}")

    u = UniProt(verbose=False) if uniprot is None else uniprot
    print(f"Searching for: {query}")

    # Try direct retrieval first (if query looks like accession). This is a
    # deliberately narrow heuristic; everything else goes through search.
    if len(query) == 6 and query[0] in "OPQ":
        try:
            entry = u.retrieve(query, frmt="tab")
        except Exception as exc:
            # Best effort only: report the failure and fall back to search.
            print(f" Direct retrieval failed ({exc}); falling back to search")
            entry = None
        # NOTE(review): the client appears to hand back a numeric HTTP status
        # instead of raising on some failures - confirm for the installed
        # bioservices version. A number is never treated as a hit here.
        if entry and not isinstance(entry, int):
            uniprot_id = query
            print(f"✓ Found UniProt entry: {uniprot_id}")
            return u, uniprot_id

    # Otherwise search.
    # NOTE(review): "tab" and these column names may depend on the
    # bioservices / UniProt API version in use - confirm.
    results = u.search(
        query,
        frmt="tab",
        columns="id,genes,organism,length,protein names",
        limit=5,
    )
    if not results or not isinstance(results, str):
        print("✗ No results found")
        return u, None

    lines = [line for line in results.strip().splitlines() if line.strip()]
    if len(lines) < 2:
        # Only the header row came back.
        print("✗ No entries found")
        return u, None

    # Pad every row to the five requested columns so short rows cannot
    # raise IndexError while being displayed.
    column_count = 5
    rows = []
    for line in lines[1:]:
        cells = line.split("\t")
        rows.append(cells + ["N/A"] * (column_count - len(cells)))

    # Display results
    print(f"\n✓ Found {len(rows)} result(s):")
    for i, cells in enumerate(rows, 1):
        print(f" {i}. {cells[0]} - {cells[1]} ({cells[2]})")

    # Use first result
    uniprot_id, gene_names, organism, length, protein_name = rows[0][:column_count]
    print("\nUsing first result:")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Gene names: {gene_names}")
    print(f" Organism: {organism}")
    print(f" Length: {length} aa")
    print(f" Protein: {protein_name}")
    return u, uniprot_id
def retrieve_sequence(uniprot, uniprot_id):
    """Retrieve the FASTA record of an entry and return the bare sequence.

    Args:
        uniprot: Client object exposing ``retrieve(identifier, frmt=...)``.
        uniprot_id: Identifier of the entry to fetch.

    Returns:
        The residues of the first FASTA record as one string without the
        header or line breaks, or ``None`` when the request fails, the
        response is not a FASTA record, or the record has no residues.
    """
    print(f"\n{'='*70}")
    print("STEP 2: FASTA Sequence Retrieval")
    print(f"{'='*70}")

    try:
        record = uniprot.retrieve(uniprot_id, frmt="fasta")
    except Exception as e:
        print(f"✗ Error: {e}")
        return None

    # Anything that is not text (None, a numeric status, ...) is a failure.
    if not record or not isinstance(record, str):
        print("✗ Failed to retrieve sequence")
        return None

    # splitlines() + strip() copes with CRLF responses, which would
    # otherwise leave "\r" characters inside the sequence.
    lines = [line.strip() for line in record.strip().splitlines()]
    header = lines[0] if lines else ""
    if not header.startswith(">"):
        # Not FASTA (for example an error page); do not pass it downstream.
        print("✗ Failed to retrieve sequence")
        return None

    # Extract sequence only (remove header); stop at a second record so a
    # later header line is never glued into the residues.
    residues = []
    for line in lines[1:]:
        if line.startswith(">"):
            break
        residues.append(line)
    seq_only = "".join(residues)
    if not seq_only:
        print("✗ Failed to retrieve sequence")
        return None

    print("✓ Retrieved sequence:")
    print(f" Header: {header}")
    print(f" Length: {len(seq_only)} residues")
    print(f" First 60 residues: {seq_only[:60]}...")
    return seq_only
def run_blast(sequence, email, skip=False, *, max_wait=300, poll_interval=5):
    """Submit a BLASTP job against ``uniprotkb`` and wait for its result.

    Args:
        sequence: Protein sequence to search with.
        email: Contact address passed to the service; the search is skipped
            unless it is non-empty and contains ``@``.
        skip: When true, nothing is submitted.
        max_wait: Maximum number of seconds to keep polling the job.
        poll_interval: Seconds to sleep between two status checks.

    Returns:
        The text returned by ``getResult(jobid, "out")`` on success, or
        ``None`` when the search was skipped, failed, timed out or raised.
    """
    print(f"\n{'='*70}")
    print("STEP 3: BLAST Similarity Search")
    print(f"{'='*70}")

    if skip:
        print("⊘ Skipped (--skip-blast flag)")
        return None
    if not email or "@" not in email:
        print("⊘ Skipped (valid email required for BLAST)")
        return None
    if not sequence:
        print("⊘ Skipped (no sequence to search with)")
        return None

    # Terminal states other than FINISHED; without these the loop would keep
    # polling a dead job until the timeout expires.
    failed_states = ("ERROR", "FAILURE", "NOT_FOUND")

    try:
        print("Submitting BLASTP job...")
        print(" Database: uniprotkb")
        print(f" Sequence length: {len(sequence)} aa")
        s = NCBIblast(verbose=False)
        jobid = s.run(
            program="blastp",
            sequence=sequence,
            stype="protein",
            database="uniprotkb",
            email=email,
        )
        print(f"✓ Job submitted: {jobid}")
        print(" Waiting for completion...")

        # Poll for completion. A monotonic clock keeps the elapsed time
        # correct even if the system clock is adjusted meanwhile.
        start_time = time.monotonic()
        while time.monotonic() - start_time < max_wait:
            status = s.getStatus(jobid)
            elapsed = int(time.monotonic() - start_time)
            # "\r" keeps the progress report on a single terminal line.
            print(f" Status: {status} (elapsed: {elapsed}s)", end="\r")

            if status == "FINISHED":
                print(f"\n✓ BLAST completed in {elapsed}s")
                # Retrieve results
                results = s.getResult(jobid, "out")
                # Show the non-blank lines among the first 20 as a preview.
                print("\n Results preview:")
                for line in results.split("\n")[:20]:
                    if line.strip():
                        print(f" {line}")
                return results

            if status in failed_states:
                print(f"\n✗ BLAST job failed (status: {status})")
                return None

            time.sleep(poll_interval)

        print(f"\n✗ Timeout after {max_wait}s")
        return None
    except Exception as e:
        # Service problems must not abort the remaining workflow steps.
        print(f"✗ Error: {e}")
        return None
def _kegg_ids_from_mapping(mapping, uniprot_id):
    """Return the KEGG identifiers listed for ``uniprot_id`` in a mapping reply."""
    if not isinstance(mapping, dict):
        return []
    direct = mapping.get(uniprot_id)
    if direct:
        return [direct] if isinstance(direct, str) else list(direct)
    # NOTE(review): some client versions appear to answer with
    # {"results": [{"from": ..., "to": ...}, ...]} instead of
    # {accession: [ids]} - confirm against the installed bioservices.
    hits = mapping.get("results") or []
    return [
        hit["to"]
        for hit in hits
        if isinstance(hit, dict)
        and hit.get("from") == uniprot_id
        and isinstance(hit.get("to"), str)
    ]


def discover_pathways(uniprot, kegg, uniprot_id):
    """Discover the KEGG pathways of a protein.

    The accession is mapped to KEGG identifiers, the first identifier is
    split into organism and gene, and every pathway reported for that gene
    is looked up to obtain its name.

    Args:
        uniprot: Client object exposing ``mapping(fr=..., to=..., query=...)``.
        kegg: Client object exposing ``get_pathway_by_gene(gene, organism)``
            and ``get(identifier)``.
        uniprot_id: Accession to map.

    Returns:
        A list of ``(pathway_id, pathway_name)`` tuples, one per pathway. The
        name is ``"Unknown"`` when it could not be determined. An empty list
        is returned when no mapping or pathway exists or an error occurs.
    """
    print(f"\n{'='*70}")
    print("STEP 4: KEGG Pathway Discovery")
    print(f"{'='*70}")

    try:
        # Map UniProt → KEGG
        print(f"Mapping {uniprot_id} to KEGG...")
        kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)
        kegg_ids = _kegg_ids_from_mapping(kegg_mapping, uniprot_id)
        if not kegg_ids:
            print("✗ No KEGG mapping found")
            return []
        print(f"✓ KEGG ID(s): {kegg_ids}")

        # Get pathways for first KEGG ID. partition() cannot raise on an
        # identifier without (or with more than one) colon.
        kegg_id = kegg_ids[0]
        organism, colon, gene_id = kegg_id.partition(":")
        if not (colon and organism and gene_id):
            print(f"✗ Unexpected KEGG identifier: {kegg_id}")
            return []

        print(f"\nSearching pathways for {kegg_id}...")
        pathways = kegg.get_pathway_by_gene(gene_id, organism)
        if not pathways:
            print("✗ No pathways found")
            return []
        print(f"✓ Found {len(pathways)} pathway(s):\n")

        # Get pathway names
        pathway_info = []
        for pathway_id in pathways:
            pathway_name = "Unknown"
            try:
                entry = kegg.get(pathway_id)
                for line in entry.split("\n"):
                    if line.startswith("NAME"):
                        # Drop only the leading field label; replace() would
                        # also eat "NAME" occurring inside the name itself.
                        pathway_name = line[len("NAME"):].strip()
                        break
            except Exception:
                print(f"{pathway_id}: [Error retrieving name]")
            else:
                print(f"{pathway_id}: {pathway_name}")
            # The pathway was found even if its name lookup failed, so it
            # stays in the result and the reported count remains correct.
            pathway_info.append((pathway_id, pathway_name))
        return pathway_info
    except Exception as e:
        print(f"✗ Error: {e}")
        return []
def _mitab_rows(results):
    """Normalise a query reply (text or sequence of rows) into field lists."""
    if not results:
        return []
    if isinstance(results, str):
        results = results.strip().splitlines()
    if not isinstance(results, (list, tuple)):
        # For example a numeric status returned instead of data.
        return []
    rows = []
    for row in results:
        if isinstance(row, str):
            row = row.split("\t")
        if isinstance(row, (list, tuple)) and any(row):
            rows.append(list(row))
    return rows


def _mitab_name(cell):
    """Return the first name of a ``db:name(type)|db:name(type)`` cell."""
    first = cell.split("|", 1)[0]
    _, colon, name = first.partition(":")
    if not colon:
        name = first
    # Drop the "(alias type)" qualifier and optional surrounding quotes.
    return name.split("(", 1)[0].strip('"') or first


def find_interactions(protein_query, database="mint", species="9606", psicquic=None):
    """Find protein-protein interactions via PSICQUIC.

    Args:
        protein_query: Search term for the protein.
        database: Name of the service to query.
        species: Taxonomy identifier appended to the query as
            ``AND species:<id>``; pass a falsy value to query all species.
        psicquic: Optional client object to reuse. When omitted, a new
            ``PSICQUIC()`` is created.

    Returns:
        A list of ``(protein_a, protein_b, interaction_type)`` tuples for
        every row with at least 12 fields. Only the first ten are printed,
        but all of them are returned. An empty list is returned when nothing
        is found or an error occurs.
    """
    print(f"\n{'='*70}")
    print("STEP 5: Protein-Protein Interactions")
    print(f"{'='*70}")

    display_limit = 10
    try:
        p = PSICQUIC() if psicquic is None else psicquic
        query = f"{protein_query} AND species:{species}" if species else protein_query
        label = database.upper()
        print(f"Querying {label} database...")
        print(f" Query: {query}")

        # NOTE(review): the client may return either raw tab-separated text
        # or already-split rows depending on its version - both are handled.
        rows = _mitab_rows(p.query(database, query))
        if not rows:
            print(f"✗ No interactions found in {label}")
            return []
        print(f"✓ Found {len(rows)} interaction(s):\n")

        interactions = []
        for fields in rows:
            if len(fields) < 12:
                # Too short to hold the alias and interaction-type fields.
                continue
            protein_a = _mitab_name(fields[4])
            protein_b = _mitab_name(fields[5])
            interactions.append((protein_a, protein_b, fields[11]))
            # Display first 10 interactions
            if len(interactions) <= display_limit:
                print(f" {len(interactions)}. {protein_a} ↔ {protein_b}")

        if len(interactions) > display_limit:
            print(f" ... and {len(interactions)-display_limit} more")
        return interactions
    except Exception as e:
        print(f"✗ Error: {e}")
        return []
def get_go_annotations(uniprot_id, quickgo=None):
    """Retrieve GO annotations and group them by aspect.

    Args:
        uniprot_id: Protein identifier passed to the annotation service.
        quickgo: Optional client object to reuse. When omitted, a new
            ``QuickGO()`` is created.

    Returns:
        A dict with the keys ``"P"``, ``"F"`` and ``"C"``, each holding a
        list of ``(go_id, go_term)`` tuples. An empty dict is returned when
        no annotations exist or an error occurs, so the result can always be
        used as a mapping.
    """
    print(f"\n{'='*70}")
    print("STEP 6: Gene Ontology Annotations")
    print(f"{'='*70}")

    aspect_labels = (
        ("P", "Biological Process"),
        ("F", "Molecular Function"),
        ("C", "Cellular Component"),
    )
    preview = 5

    try:
        g = QuickGO() if quickgo is None else quickgo
        print(f"Retrieving GO annotations for {uniprot_id}...")
        annotations = g.Annotation(protein=uniprot_id, format="tsv")
        if not annotations:
            print("✗ No GO annotations found")
            # Same type as every other return path (was a list before).
            return {}

        lines = annotations.strip().split("\n")
        # The first line is treated as the header row.
        print(f"✓ Found {len(lines)-1} annotation(s)\n")

        # Group by aspect
        aspects = {code: [] for code, _ in aspect_labels}
        for line in lines[1:]:
            fields = line.split("\t")
            if len(fields) < 9:
                continue
            # NOTE(review): fields 6-8 are read as GO id, term name and
            # aspect code - confirm against the service's TSV layout.
            go_id, go_term, go_aspect = fields[6:9]
            if go_aspect in aspects:
                aspects[go_aspect].append((go_id, go_term))

        # Display summary: a count per aspect followed by up to five terms.
        for position, (code, label) in enumerate(aspect_labels):
            terms = aspects[code]
            lead = "" if position == 0 else "\n"
            print(f"{lead} {label} ({code}): {len(terms)} terms")
            for go_id, go_term in terms[:preview]:
                print(f"{go_id}: {go_term}")
            if len(terms) > preview:
                print(f" ... and {len(terms)-preview} more")
        return aspects
    except Exception as e:
        print(f"✗ Error: {e}")
        return {}
def main(argv=None):
    """Run the complete workflow from the command line.

    Args:
        argv: Optional list of command-line arguments. When ``None``,
            argparse reads ``sys.argv[1:]``.

    The six steps run in order and a summary is printed at the end. The
    process exits with status 1 when no UniProt entry can be found; every
    later step is best effort and only affects the summary.
    """
    parser = argparse.ArgumentParser(
        description="Complete protein analysis workflow using BioServices",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
python protein_analysis_workflow.py P43403 user@example.com --skip-blast
"""
    )
    parser.add_argument("protein", help="Protein name or UniProt ID")
    parser.add_argument("email", help="Email address (required for BLAST)")
    parser.add_argument("--skip-blast", action="store_true",
                        help="Skip BLAST search (faster)")
    args = parser.parse_args(argv)

    print("=" * 70)
    print("BIOSERVICES: Complete Protein Analysis Workflow")
    print("=" * 70)

    # Step 1: Search protein
    uniprot, uniprot_id = search_protein(args.protein)
    if not uniprot_id:
        print("\n✗ Failed to find protein. Exiting.")
        sys.exit(1)

    # Step 2: Retrieve sequence
    sequence = retrieve_sequence(uniprot, uniprot_id)
    if not sequence:
        print("\n⚠ Warning: Could not retrieve sequence")

    # Step 3: BLAST search (needs a sequence). Initialised first so the
    # summary can always refer to it.
    blast_results = None
    if sequence:
        blast_results = run_blast(sequence, args.email, args.skip_blast)

    # Step 4: Pathway discovery
    kegg = KEGG()
    pathways = discover_pathways(uniprot, kegg, uniprot_id)

    # Step 5: Interaction mapping
    interactions = find_interactions(args.protein)

    # Step 6: GO annotations
    go_terms = get_go_annotations(uniprot_id)

    # The BLAST marker reflects what actually happened rather than only
    # which flags were given.
    if blast_results:
        blast_mark = "✓"
    elif args.skip_blast:
        blast_mark = "⊘"
    else:
        blast_mark = "✗"

    # Tolerate a non-mapping result so the summary itself can never crash.
    if isinstance(go_terms, dict):
        go_count = sum(len(terms) for terms in go_terms.values())
    else:
        go_count = len(go_terms) if go_terms else 0

    # Summary
    print(f"\n{'='*70}")
    print("WORKFLOW SUMMARY")
    print(f"{'='*70}")
    print(f" Protein: {args.protein}")
    print(f" UniProt ID: {uniprot_id}")
    print(f" Sequence: {'✓' if sequence else '✗'}")
    print(f" BLAST: {blast_mark}")
    print(f" Pathways: {len(pathways)} found")
    print(f" Interactions: {len(interactions)} found")
    print(f" GO annotations: {go_count} found")
    print(f"{'='*70}")
# Run the workflow only when executed as a script, not when imported.
if __name__ == "__main__":
    main()