Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/bioservices/scripts/protein_analysis_workflow.py
+++ b/skills/bioservices/scripts/protein_analysis_workflow.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+"""
+Complete Protein Analysis Workflow
+
+This script performs a comprehensive protein analysis pipeline:
+1. UniProt search and identifier retrieval
+2. FASTA sequence retrieval
+3. BLAST similarity search
+4. KEGG pathway discovery
+5. PSICQUIC interaction mapping
+6. GO annotation retrieval
+
+Usage:
+    python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
+
+Examples:
+    python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
+    python protein_analysis_workflow.py P43403 user@example.com --skip-blast
+
+Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
+"""
+
+import sys
+import time
+import argparse
+from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
+
+
+def search_protein(query):
+    """Search UniProt for protein and retrieve basic information."""
+    print(f"\n{'='*70}")
+    print("STEP 1: UniProt Search")
+    print(f"{'='*70}")
+
+    u = UniProt(verbose=False)
+
+    print(f"Searching for: {query}")
+
+    # Try direct retrieval first (if query looks like accession)
+    if len(query) == 6 and query[0] in "OPQ":
+        try:
+            entry = u.retrieve(query, frmt="tab")
+            if entry:
+                uniprot_id = query
+                print(f"✓ Found UniProt entry: {uniprot_id}")
+                return u, uniprot_id
+        except:
+            pass
+
+    # Otherwise search
+    results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)
+
+    if not results:
+        print("✗ No results found")
+        return u, None
+
+    lines = results.strip().split("\n")
+    if len(lines) < 2:
+        print("✗ No entries found")
+        return u, None
+
+    # Display results
+    print(f"\n✓ Found {len(lines)-1} result(s):")
+    for i, line in enumerate(lines[1:], 1):
+        fields = line.split("\t")
+        print(f"  {i}. {fields[0]} - {fields[1]} ({fields[2]})")
+
+    # Use first result
+    first_entry = lines[1].split("\t")
+    uniprot_id = first_entry[0]
+    gene_names = first_entry[1] if len(first_entry) > 1 else "N/A"
+    organism = first_entry[2] if len(first_entry) > 2 else "N/A"
+    length = first_entry[3] if len(first_entry) > 3 else "N/A"
+    protein_name = first_entry[4] if len(first_entry) > 4 else "N/A"
+
+    print(f"\nUsing first result:")
+    print(f"  UniProt ID: {uniprot_id}")
+    print(f"  Gene names: {gene_names}")
+    print(f"  Organism: {organism}")
+    print(f"  Length: {length} aa")
+    print(f"  Protein: {protein_name}")
+
+    return u, uniprot_id
+
+
+def retrieve_sequence(uniprot, uniprot_id):
+    """Retrieve FASTA sequence for protein."""
+    print(f"\n{'='*70}")
+    print("STEP 2: FASTA Sequence Retrieval")
+    print(f"{'='*70}")
+
+    try:
+        sequence = uniprot.retrieve(uniprot_id, frmt="fasta")
+
+        if sequence:
+            # Extract sequence only (remove header)
+            lines = sequence.strip().split("\n")
+            header = lines[0]
+            seq_only = "".join(lines[1:])
+
+            print(f"✓ Retrieved sequence:")
+            print(f"  Header: {header}")
+            print(f"  Length: {len(seq_only)} residues")
+            print(f"  First 60 residues: {seq_only[:60]}...")
+
+            return seq_only
+        else:
+            print("✗ Failed to retrieve sequence")
+            return None
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return None
+
+
+def run_blast(sequence, email, skip=False):
+    """Run BLAST similarity search."""
+    print(f"\n{'='*70}")
+    print("STEP 3: BLAST Similarity Search")
+    print(f"{'='*70}")
+
+    if skip:
+        print("⊘ Skipped (--skip-blast flag)")
+        return None
+
+    if not email or "@" not in email:
+        print("⊘ Skipped (valid email required for BLAST)")
+        return None
+
+    try:
+        print(f"Submitting BLASTP job...")
+        print(f"  Database: uniprotkb")
+        print(f"  Sequence length: {len(sequence)} aa")
+
+        s = NCBIblast(verbose=False)
+
+        jobid = s.run(
+            program="blastp",
+            sequence=sequence,
+            stype="protein",
+            database="uniprotkb",
+            email=email
+        )
+
+        print(f"✓ Job submitted: {jobid}")
+        print(f"  Waiting for completion...")
+
+        # Poll for completion
+        max_wait = 300  # 5 minutes
+        start_time = time.time()
+
+        while time.time() - start_time < max_wait:
+            status = s.getStatus(jobid)
+            elapsed = int(time.time() - start_time)
+            print(f"  Status: {status} (elapsed: {elapsed}s)", end="\r")
+
+            if status == "FINISHED":
+                print(f"\n✓ BLAST completed in {elapsed}s")
+
+                # Retrieve results
+                results = s.getResult(jobid, "out")
+
+                # Parse and display summary
+                lines = results.split("\n")
+                print(f"\n  Results preview:")
+                for line in lines[:20]:
+                    if line.strip():
+                        print(f"    {line}")
+
+                return results
+
+            elif status == "ERROR":
+                print(f"\n✗ BLAST job failed")
+                return None
+
+            time.sleep(5)
+
+        print(f"\n✗ Timeout after {max_wait}s")
+        return None
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return None
+
+
+def discover_pathways(uniprot, kegg, uniprot_id):
+    """Discover KEGG pathways for protein."""
+    print(f"\n{'='*70}")
+    print("STEP 4: KEGG Pathway Discovery")
+    print(f"{'='*70}")
+
+    try:
+        # Map UniProt → KEGG
+        print(f"Mapping {uniprot_id} to KEGG...")
+        kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)
+
+        if not kegg_mapping or uniprot_id not in kegg_mapping:
+            print("✗ No KEGG mapping found")
+            return []
+
+        kegg_ids = kegg_mapping[uniprot_id]
+        print(f"✓ KEGG ID(s): {kegg_ids}")
+
+        # Get pathways for first KEGG ID
+        kegg_id = kegg_ids[0]
+        organism, gene_id = kegg_id.split(":")
+
+        print(f"\nSearching pathways for {kegg_id}...")
+        pathways = kegg.get_pathway_by_gene(gene_id, organism)
+
+        if not pathways:
+            print("✗ No pathways found")
+            return []
+
+        print(f"✓ Found {len(pathways)} pathway(s):\n")
+
+        # Get pathway names
+        pathway_info = []
+        for pathway_id in pathways:
+            try:
+                entry = kegg.get(pathway_id)
+
+                # Extract pathway name
+                pathway_name = "Unknown"
+                for line in entry.split("\n"):
+                    if line.startswith("NAME"):
+                        pathway_name = line.replace("NAME", "").strip()
+                        break
+
+                pathway_info.append((pathway_id, pathway_name))
+                print(f"  • {pathway_id}: {pathway_name}")
+
+            except Exception as e:
+                print(f"  • {pathway_id}: [Error retrieving name]")
+
+        return pathway_info
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return []
+
+
+def find_interactions(protein_query):
+    """Find protein-protein interactions via PSICQUIC."""
+    print(f"\n{'='*70}")
+    print("STEP 5: Protein-Protein Interactions")
+    print(f"{'='*70}")
+
+    try:
+        p = PSICQUIC()
+
+        # Try querying MINT database
+        query = f"{protein_query} AND species:9606"
+        print(f"Querying MINT database...")
+        print(f"  Query: {query}")
+
+        results = p.query("mint", query)
+
+        if not results:
+            print("✗ No interactions found in MINT")
+            return []
+
+        # Parse PSI-MI TAB format
+        lines = results.strip().split("\n")
+        print(f"✓ Found {len(lines)} interaction(s):\n")
+
+        # Display first 10 interactions
+        interactions = []
+        for i, line in enumerate(lines[:10], 1):
+            fields = line.split("\t")
+            if len(fields) >= 12:
+                protein_a = fields[4].split(":")[1] if ":" in fields[4] else fields[4]
+                protein_b = fields[5].split(":")[1] if ":" in fields[5] else fields[5]
+                interaction_type = fields[11]
+
+                interactions.append((protein_a, protein_b, interaction_type))
+                print(f"  {i}. {protein_a} ↔ {protein_b}")
+
+        if len(lines) > 10:
+            print(f"  ... and {len(lines)-10} more")
+
+        return interactions
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return []
+
+
+def get_go_annotations(uniprot_id):
+    """Retrieve GO annotations."""
+    print(f"\n{'='*70}")
+    print("STEP 6: Gene Ontology Annotations")
+    print(f"{'='*70}")
+
+    try:
+        g = QuickGO()
+
+        print(f"Retrieving GO annotations for {uniprot_id}...")
+        annotations = g.Annotation(protein=uniprot_id, format="tsv")
+
+        if not annotations:
+            print("✗ No GO annotations found")
+            return []
+
+        lines = annotations.strip().split("\n")
+        print(f"✓ Found {len(lines)-1} annotation(s)\n")
+
+        # Group by aspect
+        aspects = {"P": [], "F": [], "C": []}
+        for line in lines[1:]:
+            fields = line.split("\t")
+            if len(fields) >= 9:
+                go_id = fields[6]
+                go_term = fields[7]
+                go_aspect = fields[8]
+
+                if go_aspect in aspects:
+                    aspects[go_aspect].append((go_id, go_term))
+
+        # Display summary
+        print(f"  Biological Process (P): {len(aspects['P'])} terms")
+        for go_id, go_term in aspects['P'][:5]:
+            print(f"    • {go_id}: {go_term}")
+        if len(aspects['P']) > 5:
+            print(f"    ... and {len(aspects['P'])-5} more")
+
+        print(f"\n  Molecular Function (F): {len(aspects['F'])} terms")
+        for go_id, go_term in aspects['F'][:5]:
+            print(f"    • {go_id}: {go_term}")
+        if len(aspects['F']) > 5:
+            print(f"    ... and {len(aspects['F'])-5} more")
+
+        print(f"\n  Cellular Component (C): {len(aspects['C'])} terms")
+        for go_id, go_term in aspects['C'][:5]:
+            print(f"    • {go_id}: {go_term}")
+        if len(aspects['C']) > 5:
+            print(f"    ... and {len(aspects['C'])-5} more")
+
+        return aspects
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return {}
+
+
+def main():
+    """Main workflow."""
+    parser = argparse.ArgumentParser(
+        description="Complete protein analysis workflow using BioServices",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
+  python protein_analysis_workflow.py P43403 user@example.com --skip-blast
+        """
+    )
+    parser.add_argument("protein", help="Protein name or UniProt ID")
+    parser.add_argument("email", help="Email address (required for BLAST)")
+    parser.add_argument("--skip-blast", action="store_true",
+                       help="Skip BLAST search (faster)")
+
+    args = parser.parse_args()
+
+    print("=" * 70)
+    print("BIOSERVICES: Complete Protein Analysis Workflow")
+    print("=" * 70)
+
+    # Step 1: Search protein
+    uniprot, uniprot_id = search_protein(args.protein)
+    if not uniprot_id:
+        print("\n✗ Failed to find protein. Exiting.")
+        sys.exit(1)
+
+    # Step 2: Retrieve sequence
+    sequence = retrieve_sequence(uniprot, uniprot_id)
+    if not sequence:
+        print("\n⚠ Warning: Could not retrieve sequence")
+
+    # Step 3: BLAST search
+    if sequence:
+        blast_results = run_blast(sequence, args.email, args.skip_blast)
+
+    # Step 4: Pathway discovery
+    kegg = KEGG()
+    pathways = discover_pathways(uniprot, kegg, uniprot_id)
+
+    # Step 5: Interaction mapping
+    interactions = find_interactions(args.protein)
+
+    # Step 6: GO annotations
+    go_terms = get_go_annotations(uniprot_id)
+
+    # Summary
+    print(f"\n{'='*70}")
+    print("WORKFLOW SUMMARY")
+    print(f"{'='*70}")
+    print(f"  Protein: {args.protein}")
+    print(f"  UniProt ID: {uniprot_id}")
+    print(f"  Sequence: {'✓' if sequence else '✗'}")
+    print(f"  BLAST: {'✓' if not args.skip_blast and sequence else '⊘'}")
+    print(f"  Pathways: {len(pathways)} found")
+    print(f"  Interactions: {len(interactions)} found")
+    print(f"  GO annotations: {sum(len(v) for v in go_terms.values())} found")
+    print(f"{'='*70}")
+
+
+# Run the workflow only when this file is executed as a script, not on import.
+if __name__ == "__main__":
+    main()