Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/bioservices/scripts/batch_id_converter.py
+++ b/skills/bioservices/scripts/batch_id_converter.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+Batch Identifier Converter
+
+This script converts multiple identifiers between biological databases
+using UniProt's mapping service. Supports batch processing with
+automatic chunking and error handling.
+
+Usage:
+    python batch_id_converter.py INPUT_FILE --from DB1 --to DB2 [options]
+
+Examples:
+    python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
+    python batch_id_converter.py gene_ids.txt --from GeneID --to UniProtKB --output mapping.csv
+    python batch_id_converter.py ids.txt --from UniProtKB_AC-ID --to Ensembl --chunk-size 50
+
+Input file format:
+    One identifier per line (plain text)
+
+Common database codes:
+    UniProtKB_AC-ID  - UniProt accession/ID
+    KEGG             - KEGG gene IDs
+    GeneID           - NCBI Gene (Entrez) IDs
+    Ensembl          - Ensembl gene IDs
+    Ensembl_Protein  - Ensembl protein IDs
+    RefSeq_Protein   - RefSeq protein IDs
+    PDB              - Protein Data Bank IDs
+    HGNC             - HGNC gene identifiers (e.g. HGNC:12858)
+    GO               - Gene Ontology IDs
+"""
+
+import sys
+import argparse
+import csv
+import time
+from bioservices import UniProt
+
+
+# Common database code mappings.
+# Consulted by normalize_database_code() and printed by
+# list_common_databases(). Keys are lowercase shorthand names; values are
+# the official database codes that end up being passed to the mapping
+# call. Several aliases may share one code (e.g. 'uniprot'/'uniprotkb').
+DATABASE_CODES = {
+    'uniprot': 'UniProtKB_AC-ID',
+    'uniprotkb': 'UniProtKB_AC-ID',
+    'kegg': 'KEGG',
+    'geneid': 'GeneID',
+    'entrez': 'GeneID',
+    'ensembl': 'Ensembl',
+    'ensembl_protein': 'Ensembl_Protein',
+    'ensembl_transcript': 'Ensembl_Transcript',
+    'refseq': 'RefSeq_Protein',
+    'refseq_protein': 'RefSeq_Protein',
+    'pdb': 'PDB',
+    'hgnc': 'HGNC',
+    'mgi': 'MGI',
+    'go': 'GO',
+    'pfam': 'Pfam',
+    'interpro': 'InterPro',
+    'reactome': 'Reactome',
+    'string': 'STRING',
+    'biogrid': 'BioGRID'
+}
+
+
+def normalize_database_code(code):
+    """Translate a user-supplied database name into its official code.
+
+    Lookup order: an exact match against the official codes wins, then a
+    lowercase alias lookup; anything unrecognised is handed back untouched
+    because it may still be a code the service accepts.
+    """
+    official_codes = DATABASE_CODES.values()
+    if code not in official_codes:
+        # Alias lookup, defaulting to the caller's own spelling.
+        return DATABASE_CODES.get(code.lower(), code)
+    return code
+
+
+def read_ids_from_file(filename):
+    """Read identifiers from a text file (one per line).
+
+    Blank lines and lines whose first non-blank character is '#' are
+    skipped; surrounding whitespace is stripped from every identifier.
+
+    The file is decoded as UTF-8 regardless of the platform locale, and a
+    leading byte-order mark is discarded so it cannot become part of the
+    first identifier. Undecodable bytes are replaced rather than aborting
+    the read.
+
+    Args:
+        filename: Path of the file to read.
+
+    Returns:
+        List of identifier strings in file order (duplicates are kept).
+
+    Raises:
+        OSError: If the file cannot be opened.
+    """
+    print(f"Reading identifiers from {filename}...")
+
+    with open(filename, 'r', encoding='utf-8-sig', errors='replace') as f:
+        stripped = (line.strip() for line in f)
+        ids = [line for line in stripped if line and not line.startswith('#')]
+
+    print(f"✓ Read {len(ids)} identifier(s)")
+
+    return ids
+
+
+def _normalize_mapping_response(response):
+    """Coerce a ``UniProt.mapping`` return value to ``{source_id: [targets]}``.
+
+    Two shapes are understood:
+
+    * a plain ``{source_id: targets}`` dict, which is passed through (a bare
+      string target is wrapped in a list so it is never joined per
+      character later on);
+    * a ``{'results': [{'from': ..., 'to': ...}], 'failedIds': [...]}``
+      dict, whose records are regrouped by source ID.
+
+    NOTE(review): the second shape is what recent bioservices releases are
+    expected to return -- confirm against the installed version.
+    Anything that is not a dict yields an empty mapping.
+    """
+    if not isinstance(response, dict):
+        return {}
+
+    records = response.get('results')
+    if not isinstance(records, list):
+        return {
+            source: [targets] if isinstance(targets, str) else targets
+            for source, targets in response.items()
+        }
+
+    grouped = {}
+    for record in records:
+        if not isinstance(record, dict) or 'from' not in record:
+            continue
+        target = record.get('to')
+        if isinstance(target, dict):
+            # presumably a full entry record; keep only its accession/ID
+            # -- TODO confirm the key names against the service output
+            target = target.get('primaryAccession') or target.get('id')
+        if target:
+            grouped.setdefault(record['from'], []).append(str(target))
+    return grouped
+
+
+def batch_convert(ids, from_db, to_db, chunk_size=100, delay=0.5):
+    """Convert IDs with automatic chunking and error handling.
+
+    The IDs are sent to ``UniProt.mapping`` in comma-joined chunks. When a
+    chunk raises, each of its IDs is retried on its own. Every input ID ends
+    up as a key of the returned mapping; IDs without any target map to
+    ``None``.
+
+    Args:
+        ids: List of source identifiers.
+        from_db: Source database code.
+        to_db: Target database code.
+        chunk_size: Number of IDs per request; must be at least 1.
+        delay: Seconds to sleep between chunks (no sleep when <= 0).
+
+    Returns:
+        Tuple ``(all_results, failed_ids)`` where ``all_results`` maps each
+        source ID to its target list (or ``None``) and ``failed_ids`` lists,
+        in input order and without duplicates, every input ID that has no
+        target.
+
+    Raises:
+        ValueError: If ``chunk_size`` is smaller than 1.
+    """
+    if chunk_size < 1:
+        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
+
+    print(f"\nConverting {len(ids)} IDs:")
+    print(f"  From: {from_db}")
+    print(f"  To: {to_db}")
+    print(f"  Chunk size: {chunk_size}")
+    print()
+
+    u = UniProt(verbose=False)
+    all_results = {}
+
+    total_chunks = (len(ids) + chunk_size - 1) // chunk_size
+
+    for i in range(0, len(ids), chunk_size):
+        chunk = ids[i:i+chunk_size]
+        chunk_num = (i // chunk_size) + 1
+
+        query = ",".join(chunk)
+
+        try:
+            print(f"  [{chunk_num}/{total_chunks}] Processing {len(chunk)} IDs...", end=" ")
+
+            results = _normalize_mapping_response(
+                u.mapping(fr=from_db, to=to_db, query=query)
+            )
+
+            if results:
+                all_results.update(results)
+                mapped_count = sum(1 for v in results.values() if v)
+                print(f"✓ Mapped: {mapped_count}/{len(chunk)}")
+            else:
+                print(f"✗ No mappings returned")
+
+        except Exception as e:
+            print(f"✗ Error: {e}")
+
+            # Try individual IDs in failed chunk
+            print(f"    Retrying individual IDs...")
+            for single_id in chunk:
+                try:
+                    result = _normalize_mapping_response(
+                        u.mapping(fr=from_db, to=to_db, query=single_id)
+                    )
+                    if result:
+                        all_results.update(result)
+                        print(f"      ✓ {single_id}")
+                    else:
+                        print(f"      ✗ {single_id} - no mapping")
+                except Exception as e2:
+                    print(f"      ✗ {single_id} - {e2}")
+
+                time.sleep(0.2)
+
+        # Rate limiting between chunks, including after a retried chunk
+        if delay > 0 and i + chunk_size < len(ids):
+            time.sleep(delay)
+
+    # Add missing IDs to results (mark as failed)
+    for id_ in ids:
+        all_results.setdefault(id_, None)
+
+    # Derive the failure list from the final table so that it also covers
+    # IDs left unmapped inside an otherwise successful chunk.
+    failed_ids = [id_ for id_ in dict.fromkeys(ids) if not all_results[id_]]
+    mapped_total = sum(1 for v in all_results.values() if v)
+
+    print(f"\n✓ Conversion complete:")
+    print(f"  Total: {len(ids)}")
+    print(f"  Mapped: {mapped_total}")
+    print(f"  Failed: {len(failed_ids)}")
+
+    return all_results, failed_ids
+
+
+def save_mapping_csv(mapping, output_file, from_db, to_db):
+    """Write the mapping table to ``output_file`` as CSV.
+
+    One row is written per source ID, ordered by source ID. Targets are
+    joined with ';' and the status column reads "Success" when at least one
+    target exists, "Failed" otherwise.
+    """
+    print(f"\nSaving results to {output_file}...")
+
+    columns = ['Source_ID', 'Source_DB', 'Target_IDs', 'Target_DB', 'Mapping_Status']
+
+    with open(output_file, 'w', newline='') as f:
+        out = csv.writer(f)
+        out.writerow(columns)
+
+        for source_id in sorted(mapping):
+            target_ids = mapping[source_id]
+            has_targets = bool(target_ids)
+            out.writerow([
+                source_id,
+                from_db,
+                ";".join(target_ids) if has_targets else "",
+                to_db,
+                "Success" if has_targets else "Failed",
+            ])
+
+    print(f"✓ Results saved")
+
+
+def save_failed_ids(failed_ids, output_file):
+    """Write the failed identifiers to ``output_file``, one per line.
+
+    Nothing is written (and no file is created) when the list is empty.
+    """
+    if failed_ids:
+        print(f"\nSaving failed IDs to {output_file}...")
+
+        with open(output_file, 'w') as f:
+            f.writelines(f"{id_}\n" for id_ in failed_ids)
+
+        print(f"✓ Saved {len(failed_ids)} failed ID(s)")
+
+
+def print_mapping_summary(mapping, from_db, to_db):
+    """Print summary of mapping results.
+
+    Args:
+        mapping: Dict of source ID -> list of target IDs (falsy when the
+            source ID has no target).
+        from_db: Source database code, shown in the header.
+        to_db: Target database code, shown in the header.
+
+    An empty mapping is reported with 0.0% rates instead of dividing by
+    zero.
+    """
+    print(f"\n{'='*70}")
+    print("MAPPING SUMMARY")
+    print(f"{'='*70}")
+
+    total = len(mapping)
+    mapped = sum(1 for v in mapping.values() if v)
+    failed = total - mapped
+
+    def percent(count):
+        # Guard the empty-mapping case: there is nothing to take a share of.
+        return count / total * 100 if total else 0.0
+
+    print(f"\nSource database: {from_db}")
+    print(f"Target database: {to_db}")
+    print(f"\nTotal identifiers: {total}")
+    print(f"Successfully mapped: {mapped} ({percent(mapped):.1f}%)")
+    print(f"Failed to map: {failed} ({percent(failed):.1f}%)")
+
+    # Show some examples
+    if mapped > 0:
+        print(f"\nExample mappings (first 5):")
+        count = 0
+        for source_id, target_ids in mapping.items():
+            if target_ids:
+                target_str = ", ".join(target_ids[:3])
+                if len(target_ids) > 3:
+                    target_str += f" ... +{len(target_ids)-3} more"
+                print(f"  {source_id} → {target_str}")
+                count += 1
+                if count >= 5:
+                    break
+
+    # Show multiple mapping statistics
+    multiple_mappings = [v for v in mapping.values() if v and len(v) > 1]
+    if multiple_mappings:
+        print(f"\nMultiple target mappings: {len(multiple_mappings)} ID(s)")
+        print(f"  (These source IDs map to multiple target IDs)")
+
+    print(f"{'='*70}")
+
+
+def list_common_databases():
+    """Print the alias table as two columns: alias and official code.
+
+    Aliases that are merely the lowercase spelling of their own code are
+    left out of the listing.
+    """
+    rule = "-" * 70
+
+    print("\nCommon Database Codes:")
+    print(rule)
+    print(f"{'Alias':<20} {'Official Code':<30}")
+    print(rule)
+
+    for alias in sorted(DATABASE_CODES):
+        code = DATABASE_CODES[alias]
+        if alias == code.lower():
+            continue
+        print(f"{alias:<20} {code:<30}")
+
+    print(rule)
+    print("\nNote: Many other database codes are supported.")
+    print("See UniProt documentation for complete list.")
+
+
+def main():
+    """Main conversion workflow.
+
+    Parses the command line, reads the input IDs, converts them in batches,
+    prints a summary and writes the CSV (plus, with --save-failed, a file
+    listing the IDs that could not be mapped).
+
+    --list-databases works on its own; for a conversion run the input file,
+    --from and --to are mandatory and --chunk-size must be at least 1.
+    """
+    # Local import: only this entry point needs path manipulation.
+    import os
+
+    parser = argparse.ArgumentParser(
+        description="Batch convert biological identifiers between databases",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python batch_id_converter.py uniprot_ids.txt --from UniProtKB_AC-ID --to KEGG
+  python batch_id_converter.py ids.txt --from GeneID --to UniProtKB -o mapping.csv
+  python batch_id_converter.py ids.txt --from uniprot --to ensembl --chunk-size 50
+
+Common database codes:
+  UniProtKB_AC-ID, KEGG, GeneID, Ensembl, Ensembl_Protein,
+  RefSeq_Protein, PDB, HGNC, GO, Pfam, InterPro, Reactome
+
+Use --list-databases to see all supported aliases.
+        """
+    )
+    # The conversion arguments are optional at the argparse level so that
+    # --list-databases can be used alone; they are checked by hand below.
+    parser.add_argument("input_file", nargs="?", default=None,
+                       help="Input file with IDs (one per line)")
+    parser.add_argument("--from", dest="from_db", default=None,
+                       help="Source database code")
+    parser.add_argument("--to", dest="to_db", default=None,
+                       help="Target database code")
+    parser.add_argument("-o", "--output", default=None,
+                       help="Output CSV file (default: mapping_results.csv)")
+    parser.add_argument("--chunk-size", type=int, default=100,
+                       help="Number of IDs per batch (default: 100)")
+    parser.add_argument("--delay", type=float, default=0.5,
+                       help="Delay between batches in seconds (default: 0.5)")
+    parser.add_argument("--save-failed", action="store_true",
+                       help="Save failed IDs to separate file")
+    parser.add_argument("--list-databases", action="store_true",
+                       help="List common database codes and exit")
+
+    args = parser.parse_args()
+
+    # List databases and exit
+    if args.list_databases:
+        list_common_databases()
+        sys.exit(0)
+
+    if not args.input_file or not args.from_db or not args.to_db:
+        parser.error("input_file, --from and --to are required")
+    if args.chunk_size < 1:
+        parser.error("--chunk-size must be at least 1")
+
+    print("=" * 70)
+    print("BIOSERVICES: Batch Identifier Converter")
+    print("=" * 70)
+
+    # Normalize database codes
+    from_db = normalize_database_code(args.from_db)
+    to_db = normalize_database_code(args.to_db)
+
+    if from_db != args.from_db:
+        print(f"\nNote: Normalized '{args.from_db}' → '{from_db}'")
+    if to_db != args.to_db:
+        print(f"Note: Normalized '{args.to_db}' → '{to_db}'")
+
+    # Read input IDs
+    try:
+        ids = read_ids_from_file(args.input_file)
+    except Exception as e:
+        print(f"\n✗ Error reading input file: {e}")
+        sys.exit(1)
+
+    if not ids:
+        print("\n✗ No IDs found in input file")
+        sys.exit(1)
+
+    # Perform conversion
+    mapping, failed_ids = batch_convert(
+        ids,
+        from_db,
+        to_db,
+        chunk_size=args.chunk_size,
+        delay=args.delay
+    )
+
+    # Print summary
+    print_mapping_summary(mapping, from_db, to_db)
+
+    # Save results
+    output_file = args.output or "mapping_results.csv"
+    save_mapping_csv(mapping, output_file, from_db, to_db)
+
+    # Save failed IDs if requested
+    if args.save_failed and failed_ids:
+        # Swap only the final extension: a plain str.replace(".csv", ...)
+        # leaves a non-.csv name unchanged, which would make the failed-ID
+        # file overwrite the results just written.
+        root, _ext = os.path.splitext(output_file)
+        failed_file = f"{root}_failed.txt"
+        save_failed_ids(failed_ids, failed_file)
+
+    print(f"\n✓ Done!")
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/bioservices/scripts/compound_cross_reference.py
+++ b/skills/bioservices/scripts/compound_cross_reference.py
@@ -0,0 +1,378 @@
+#!/usr/bin/env python3
+"""
+Compound Cross-Database Search
+
+This script searches for a compound by name and retrieves identifiers
+from multiple databases:
+- KEGG Compound
+- ChEBI
+- ChEMBL (via UniChem)
+- Basic compound properties
+
+Usage:
+    python compound_cross_reference.py COMPOUND_NAME [--output FILE]
+
+Examples:
+    python compound_cross_reference.py Geldanamycin
+    python compound_cross_reference.py "Adenosine triphosphate"
+    python compound_cross_reference.py Aspirin --output aspirin_info.txt
+"""
+
+import sys
+import argparse
+from bioservices import KEGG, UniChem, ChEBI, ChEMBL
+
+
+def search_kegg_compound(compound_name):
+    """Look a compound up by name in KEGG and pick the first hit.
+
+    Up to five hits are echoed to stdout. Returns ``(kegg, kegg_id)`` where
+    ``kegg`` is the KEGG client created here and ``kegg_id`` is the first
+    hit's ID without its "cpd:" prefix, or ``None`` when nothing was found
+    or the lookup raised.
+    """
+    rule = '=' * 70
+    print(f"\n{rule}")
+    print("STEP 1: KEGG Compound Search")
+    print(rule)
+
+    k = KEGG()
+
+    print(f"Searching KEGG for: {compound_name}")
+
+    try:
+        hits = k.find("compound", compound_name)
+
+        if not (hits and hits.strip()):
+            print(f"✗ No results found in KEGG")
+            return k, None
+
+        # One hit per line, tab-separated: ID, then description
+        rows = hits.strip().split("\n")
+        print(f"✓ Found {len(rows)} result(s):\n")
+
+        for position, row in enumerate(rows[:5], 1):
+            fields = row.split("\t")
+            description = fields[1] if fields[1:] else "No description"
+            print(f"  {position}. {fields[0]}: {description}")
+
+        # The first hit is the one carried forward
+        kegg_id = rows[0].split("\t")[0].replace("cpd:", "")
+
+        print(f"\nUsing: {kegg_id}")
+
+        return k, kegg_id
+
+    except Exception as err:
+        print(f"✗ Error: {err}")
+        return k, None
+
+
+def get_kegg_info(kegg, kegg_id):
+    """Retrieve detailed KEGG compound information.
+
+    The flat-file entry is parsed section by section: a line that starts in
+    the first column opens a new section named by its leading keyword, and
+    indented lines continue the current section. Tracking the section this
+    way keeps continuation lines of later sections out of the pathway list.
+
+    Args:
+        kegg: Client object exposing ``get(entry_id)``.
+        kegg_id: Compound ID without the "cpd:" prefix.
+
+    Returns:
+        Dict with keys 'kegg_id', 'name', 'formula', 'exact_mass',
+        'mol_weight', 'chebi_id' and 'pathways' (a list), or ``None`` when
+        the entry could not be retrieved or parsing raised.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 2: KEGG Compound Details")
+    print(f"{'='*70}")
+
+    try:
+        print(f"Retrieving KEGG entry for {kegg_id}...")
+
+        entry = kegg.get(f"cpd:{kegg_id}")
+
+        # A falsy or non-text reply cannot be parsed.
+        if not entry or not isinstance(entry, str):
+            print("✗ Failed to retrieve entry")
+            return None
+
+        # Parse entry
+        compound_info = {
+            'kegg_id': kegg_id,
+            'name': None,
+            'formula': None,
+            'exact_mass': None,
+            'mol_weight': None,
+            'chebi_id': None,
+            'pathways': []
+        }
+
+        # Single-line sections copied verbatim into the result
+        scalar_fields = {
+            "FORMULA": 'formula',
+            "EXACT_MASS": 'exact_mass',
+            "MOL_WEIGHT": 'mol_weight',
+        }
+
+        section = None
+
+        for line in entry.split("\n"):
+            if not line.strip():
+                continue
+
+            if line[0].isspace():
+                # Continuation of whatever section is currently open
+                is_header = False
+                value = line.strip()
+            else:
+                # Keyword line: opens a new section and closes the old one
+                is_header = True
+                parts = line.split(None, 1)
+                section = parts[0]
+                value = parts[1].strip() if len(parts) > 1 else ""
+
+            if is_header and section == "NAME":
+                # Only the first name is kept; names end with ';'
+                compound_info['name'] = value.rstrip(";")
+            elif is_header and section in scalar_fields:
+                compound_info[scalar_fields[section]] = value
+            elif section == "PATHWAY" and value:
+                compound_info['pathways'].append(value)
+
+            if "ChEBI:" in line:
+                tokens = line.split("ChEBI:", 1)[1].split()
+                if tokens:
+                    compound_info['chebi_id'] = tokens[0]
+
+        # Display information
+        print(f"\n✓ KEGG Compound Information:")
+        print(f"  ID: {compound_info['kegg_id']}")
+        print(f"  Name: {compound_info['name']}")
+        print(f"  Formula: {compound_info['formula']}")
+        print(f"  Exact Mass: {compound_info['exact_mass']}")
+        print(f"  Molecular Weight: {compound_info['mol_weight']}")
+
+        if compound_info['chebi_id']:
+            print(f"  ChEBI ID: {compound_info['chebi_id']}")
+
+        if compound_info['pathways']:
+            print(f"  Pathways: {len(compound_info['pathways'])} found")
+
+        return compound_info
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return None
+
+
+def get_chembl_id(kegg_id):
+    """Ask UniChem for the ChEMBL identifier matching a KEGG compound ID.
+
+    Returns whatever the lookup yields when it is truthy, otherwise
+    ``None``; any exception is reported on stdout and also gives ``None``.
+    """
+    rule = '=' * 70
+    print(f"\n{rule}")
+    print("STEP 3: ChEMBL Mapping (via UniChem)")
+    print(rule)
+
+    try:
+        unichem = UniChem()
+
+        print(f"Mapping KEGG:{kegg_id} to ChEMBL...")
+
+        # NOTE(review): confirm the installed UniChem client really offers
+        # get_compound_id_from_kegg; a missing method would surface here as
+        # an AttributeError and be reported by the handler below.
+        found = unichem.get_compound_id_from_kegg(kegg_id)
+
+        if not found:
+            print("✗ No ChEMBL mapping found")
+            return None
+
+        print(f"✓ ChEMBL ID: {found}")
+        return found
+
+    except Exception as err:
+        print(f"✗ Error: {err}")
+        return None
+
+
+def get_chebi_info(chebi_id):
+    """Fetch and print the ChEBI record for ``chebi_id``.
+
+    A missing "CHEBI:" prefix is added before the lookup. Returns a dict
+    with 'chebi_id', 'name', 'formula' and 'mass', or ``None`` when no ID
+    was given, the record is empty, or the lookup raised.
+    """
+    rule = '=' * 70
+    print(f"\n{rule}")
+    print("STEP 4: ChEBI Details")
+    print(rule)
+
+    if not chebi_id:
+        print("⊘ No ChEBI ID available")
+        return None
+
+    try:
+        client = ChEBI()
+
+        print(f"Retrieving ChEBI entry for {chebi_id}...")
+
+        # Ensure proper format
+        full_id = chebi_id if chebi_id.startswith("CHEBI:") else f"CHEBI:{chebi_id}"
+
+        entity = client.getCompleteEntity(full_id)
+
+        if not entity:
+            print("✗ Failed to retrieve ChEBI entry")
+            return None
+
+        print(f"\n✓ ChEBI Information:")
+        print(f"  ID: {entity.chebiId}")
+        print(f"  Name: {entity.chebiAsciiName}")
+
+        # Optional attributes: shown only when present and truthy
+        optional = {}
+        for attr, label in (('Formulae', 'Formula'), ('mass', 'Mass'), ('charge', 'Charge')):
+            optional[attr] = getattr(entity, attr, None)
+            if optional[attr]:
+                print(f"  {label}: {optional[attr]}")
+
+        return {
+            'chebi_id': entity.chebiId,
+            'name': entity.chebiAsciiName,
+            'formula': optional['Formulae'],
+            'mass': optional['mass']
+        }
+
+    except Exception as err:
+        print(f"✗ Error: {err}")
+        return None
+
+
+def get_chembl_info(chembl_id):
+    """Retrieve ChEMBL compound information.
+
+    Prints the preferred name, a few molecule properties and a shortened
+    SMILES string when the record carries them. Sections that are present
+    but set to ``None`` are treated as absent, so such a record is still
+    returned instead of being dropped by the error handler.
+
+    Args:
+        chembl_id: ChEMBL identifier; a falsy value skips the lookup.
+
+    Returns:
+        The compound record as returned by the client, or ``None`` when no
+        ID was given, nothing was retrieved, or the lookup raised.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 5: ChEMBL Details")
+    print(f"{'='*70}")
+
+    if not chembl_id:
+        print("⊘ No ChEMBL ID available")
+        return None
+
+    try:
+        c = ChEMBL()
+
+        print(f"Retrieving ChEMBL entry for {chembl_id}...")
+
+        # NOTE(review): confirm get_compound_by_chemblId exists in the
+        # installed bioservices release.
+        compound = c.get_compound_by_chemblId(chembl_id)
+
+        if not compound:
+            print("✗ Failed to retrieve ChEMBL entry")
+            return None
+
+        print(f"\n✓ ChEMBL Information:")
+        print(f"  ID: {chembl_id}")
+
+        if 'pref_name' in compound and compound['pref_name']:
+            print(f"  Preferred Name: {compound['pref_name']}")
+
+        # A key may be present with a None value; fall back to an empty
+        # dict so the membership tests below cannot raise.
+        props = (compound['molecule_properties']
+                 if 'molecule_properties' in compound else None) or {}
+        property_labels = (
+            ('full_mwt', 'Molecular Weight'),
+            ('alogp', 'LogP'),
+            ('hba', 'H-Bond Acceptors'),
+            ('hbd', 'H-Bond Donors'),
+        )
+        for key, label in property_labels:
+            if key in props:
+                print(f"  {label}: {props[key]}")
+
+        structs = (compound['molecule_structures']
+                   if 'molecule_structures' in compound else None) or {}
+        smiles = structs['canonical_smiles'] if 'canonical_smiles' in structs else None
+        if smiles:
+            print(f"  SMILES: {smiles[:60]}{'...' if len(smiles) > 60 else ''}")
+
+        return compound
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return None
+
+
+def save_results(compound_name, kegg_info, chembl_id, output_file):
+    """Write a plain-text cross-reference report to ``output_file``.
+
+    The report has a title block, a KEGG section (only when ``kegg_info``
+    is truthy) and a list of the identifiers that are available.
+    """
+    heavy_rule = "=" * 70
+    light_rule = "-" * 70
+
+    print(f"\n{heavy_rule}")
+    print(f"Saving results to {output_file}")
+    print(heavy_rule)
+
+    with open(output_file, 'w') as f:
+        print(heavy_rule, file=f)
+        print(f"Compound Cross-Reference Report: {compound_name}", file=f)
+        print(heavy_rule, file=f)
+        print(file=f)
+
+        # KEGG information
+        if kegg_info:
+            print("KEGG Compound", file=f)
+            print(light_rule, file=f)
+            print(f"ID: {kegg_info['kegg_id']}", file=f)
+            print(f"Name: {kegg_info['name']}", file=f)
+            print(f"Formula: {kegg_info['formula']}", file=f)
+            print(f"Exact Mass: {kegg_info['exact_mass']}", file=f)
+            print(f"Molecular Weight: {kegg_info['mol_weight']}", file=f)
+            print(f"Pathways: {len(kegg_info['pathways'])} found", file=f)
+            print(file=f)
+
+        # Database IDs
+        print("Cross-Database Identifiers", file=f)
+        print(light_rule, file=f)
+        if kegg_info:
+            print(f"KEGG: {kegg_info['kegg_id']}", file=f)
+            if kegg_info['chebi_id']:
+                print(f"ChEBI: {kegg_info['chebi_id']}", file=f)
+        if chembl_id:
+            print(f"ChEMBL: {chembl_id}", file=f)
+        print(file=f)
+
+    print(f"✓ Results saved")
+
+
+def main():
+    """Run the five lookup steps for one compound and print a summary.
+
+    Exits with status 1 when the KEGG search finds nothing. With --output
+    the KEGG details and ChEMBL ID are also written to a report file.
+    """
+    parser = argparse.ArgumentParser(
+        description="Search compound across multiple databases",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python compound_cross_reference.py Geldanamycin
+  python compound_cross_reference.py "Adenosine triphosphate"
+  python compound_cross_reference.py Aspirin --output aspirin_info.txt
+        """
+    )
+    parser.add_argument("compound", help="Compound name to search")
+    parser.add_argument("--output", default=None,
+                       help="Output file for results (optional)")
+
+    cli = parser.parse_args()
+    rule = "=" * 70
+
+    print(rule)
+    print("BIOSERVICES: Compound Cross-Database Search")
+    print(rule)
+
+    # Step 1: Search KEGG
+    kegg, kegg_id = search_kegg_compound(cli.compound)
+    if not kegg_id:
+        print("\n✗ Failed to find compound. Exiting.")
+        sys.exit(1)
+
+    # Step 2: Get KEGG details
+    kegg_info = get_kegg_info(kegg, kegg_id)
+
+    # Step 3: Map to ChEMBL
+    chembl_id = get_chembl_id(kegg_id)
+
+    # Step 4: Get ChEBI details (output is printed by the helper)
+    chebi_id = kegg_info['chebi_id'] if kegg_info else None
+    if chebi_id:
+        get_chebi_info(chebi_id)
+
+    # Step 5: Get ChEMBL details (output is printed by the helper)
+    if chembl_id:
+        get_chembl_info(chembl_id)
+
+    # Summary
+    print(f"\n{rule}")
+    print("SUMMARY")
+    print(rule)
+    print(f"  Compound: {cli.compound}")
+    if kegg_info:
+        print(f"  KEGG ID: {kegg_info['kegg_id']}")
+        if chebi_id:
+            print(f"  ChEBI ID: {chebi_id}")
+    if chembl_id:
+        print(f"  ChEMBL ID: {chembl_id}")
+    print(rule)
+
+    # Save to file if requested
+    if cli.output:
+        save_results(cli.compound, kegg_info, chembl_id, cli.output)
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/bioservices/scripts/pathway_analysis.py
+++ b/skills/bioservices/scripts/pathway_analysis.py
@@ -0,0 +1,309 @@
+#!/usr/bin/env python3
+"""
+KEGG Pathway Network Analysis
+
+This script analyzes all pathways for an organism and extracts:
+- Pathway sizes (number of genes)
+- Protein-protein interactions
+- Interaction type distributions
+- Network data in various formats (CSV, SIF)
+
+Usage:
+    python pathway_analysis.py ORGANISM OUTPUT_DIR [--limit N]
+
+Examples:
+    python pathway_analysis.py hsa ./human_pathways
+    python pathway_analysis.py mmu ./mouse_pathways --limit 50
+
+Organism codes:
+    hsa = Homo sapiens (human)
+    mmu = Mus musculus (mouse)
+    dme = Drosophila melanogaster
+    sce = Saccharomyces cerevisiae (yeast)
+    eco = Escherichia coli
+"""
+
+import sys
+import os
+import argparse
+import csv
+from collections import Counter
+from bioservices import KEGG
+
+
+def get_all_pathways(kegg, organism):
+    """Select ``organism`` on the client and return its pathway IDs.
+
+    The organism is assigned to ``kegg.organism`` before ``kegg.pathwayIds``
+    is read; the number of IDs found is reported on stdout.
+    """
+    print(f"\nRetrieving pathways for {organism}...")
+
+    kegg.organism = organism
+    found = kegg.pathwayIds
+    print(f"✓ Found {len(found)} pathways")
+
+    return found
+
+
+def analyze_pathway(kegg, pathway_id):
+    """Analyze single pathway for size and interactions.
+
+    Args:
+        kegg: Client exposing ``parse_kgml_pathway(pathway_id)`` and
+            ``get(pathway_id)``.
+        pathway_id: Identifier of the pathway to analyse.
+
+    Returns:
+        Dict with 'pathway_id', 'pathway_name', 'num_entries',
+        'num_relations', 'relation_types' (relation name -> count),
+        'entries' and 'relations'; ``None`` when parsing the pathway raised.
+        The name falls back to "Unknown" when the entry lookup fails or has
+        no NAME line.
+
+    Only ``Exception`` subclasses are handled, so Ctrl-C and interpreter
+    exit requests propagate instead of being swallowed.
+    """
+    try:
+        # Parse KGML pathway
+        kgml = kegg.parse_kgml_pathway(pathway_id)
+
+        entries = kgml.get('entries', [])
+        relations = kgml.get('relations', [])
+
+        # Count relation types
+        relation_types = Counter(rel.get('name', 'unknown') for rel in relations)
+
+        # Get pathway name (best effort)
+        pathway_name = "Unknown"
+        try:
+            entry = kegg.get(pathway_id)
+            for line in entry.split("\n"):
+                if line.startswith("NAME"):
+                    # Drop only the leading keyword, not later occurrences
+                    pathway_name = line[len("NAME"):].strip()
+                    break
+        except Exception:
+            pathway_name = "Unknown"
+
+        return {
+            'pathway_id': pathway_id,
+            'pathway_name': pathway_name,
+            'num_entries': len(entries),
+            'num_relations': len(relations),
+            'relation_types': dict(relation_types),
+            'entries': entries,
+            'relations': relations
+        }
+
+    except Exception as e:
+        print(f"  ✗ Error analyzing {pathway_id}: {e}")
+        return None
+
+
+def analyze_all_pathways(kegg, pathway_ids, limit=None):
+    """Run analyze_pathway() over the IDs and collect the successful results.
+
+    A truthy ``limit`` restricts the run to the first ``limit`` IDs.
+    Progress is shown on a single, carriage-return-refreshed line.
+    """
+    if limit:
+        pathway_ids = pathway_ids[:limit]
+        print(f"\n⚠ Limiting analysis to first {limit} pathways")
+
+    total = len(pathway_ids)
+    print(f"\nAnalyzing {total} pathways...")
+
+    analyzed = []
+    for position, pathway_id in enumerate(pathway_ids, start=1):
+        print(f"  [{position}/{total}] {pathway_id}", end="\r")
+
+        outcome = analyze_pathway(kegg, pathway_id)
+        if not outcome:
+            continue
+        analyzed.append(outcome)
+
+    print(f"\n✓ Successfully analyzed {len(analyzed)}/{total} pathways")
+
+    return analyzed
+
+
+def save_pathway_summary(results, output_file):
+    """Write one CSV row per analysed pathway to ``output_file``.
+
+    Four relation types get their own column; every other relation type is
+    summed into the final "Other" column.
+    """
+    print(f"\nSaving pathway summary to {output_file}...")
+
+    named_types = ('activation', 'inhibition', 'phosphorylation', 'binding/association')
+
+    with open(output_file, 'w', newline='') as f:
+        out = csv.writer(f)
+
+        # Header
+        out.writerow([
+            'Pathway_ID', 'Pathway_Name', 'Num_Genes', 'Num_Interactions',
+            'Activation', 'Inhibition', 'Phosphorylation', 'Binding', 'Other',
+        ])
+
+        # Data
+        for result in results:
+            counts = result['relation_types']
+            named_counts = [counts.get(kind, 0) for kind in named_types]
+            other = sum(n for kind, n in counts.items() if kind not in named_types)
+
+            out.writerow([
+                result['pathway_id'],
+                result['pathway_name'],
+                result['num_entries'],
+                result['num_relations'],
+                *named_counts,
+                other,
+            ])
+
+    print(f"✓ Summary saved")
+
+
+def save_interactions_sif(results, output_file):
+    """Save all interactions in SIF format.
+
+    Every relation of every result becomes one line of the form
+    ``source<TAB>interaction<TAB>target``. Missing 'entry1'/'entry2' values
+    are written as empty strings and a missing 'name' as "interaction".
+
+    Only the 'relations' list of each result is read.
+
+    NOTE(review): relations from all pathways are written to one file
+    without any pathway qualifier -- confirm that entry IDs are unique
+    across pathways before merging the network downstream.
+
+    Args:
+        results: Iterable of dicts that each hold a 'relations' list.
+        output_file: Path of the SIF file to write.
+    """
+    print(f"\nSaving interactions to {output_file}...")
+
+    with open(output_file, 'w') as f:
+        for result in results:
+            for rel in result['relations']:
+                entry1 = rel.get('entry1', '')
+                entry2 = rel.get('entry2', '')
+                interaction_type = rel.get('name', 'interaction')
+
+                # Write SIF format: source\tinteraction\ttarget
+                f.write(f"{entry1}\t{interaction_type}\t{entry2}\n")
+
+    print(f"✓ Interactions saved")
+
+
+def save_detailed_pathway_info(results, output_dir):
+    """Write one interactions CSV per pathway under ``output_dir``/pathways.
+
+    The directory is created when missing. File names are built from the
+    pathway ID with ':' replaced by '_'.
+    """
+    print(f"\nSaving detailed pathway files to {output_dir}/pathways/...")
+
+    target_dir = os.path.join(output_dir, "pathways")
+    os.makedirs(target_dir, exist_ok=True)
+
+    columns = ['Source', 'Target', 'Interaction_Type', 'Link_Type']
+
+    for result in results:
+        safe_id = result['pathway_id'].replace(":", "_")
+        csv_path = os.path.join(target_dir, f"{safe_id}_interactions.csv")
+
+        with open(csv_path, 'w', newline='') as f:
+            out = csv.writer(f)
+            out.writerow(columns)
+            out.writerows(
+                [
+                    rel.get('entry1', ''),
+                    rel.get('entry2', ''),
+                    rel.get('name', 'unknown'),
+                    rel.get('link', 'unknown'),
+                ]
+                for rel in result['relations']
+            )
+
+    print(f"✓ Detailed files saved for {len(results)} pathways")
+
+
+def print_statistics(results):
+    """Print overall totals, two top-10 rankings and the relation-type mix."""
+    rule = '=' * 70
+    print(f"\n{rule}")
+    print("PATHWAY ANALYSIS STATISTICS")
+    print(rule)
+
+    # Total stats
+    total_interactions = sum(r['num_relations'] for r in results)
+    total_genes = sum(r['num_entries'] for r in results)
+
+    print(f"\nOverall:")
+    print(f"  Total pathways: {len(results)}")
+    print(f"  Total genes/proteins: {total_genes}")
+    print(f"  Total interactions: {total_interactions}")
+
+    def show_top_ten(field, unit):
+        # Descending by ``field``; ties keep their input order.
+        ranked = sorted(results, key=lambda r: r[field], reverse=True)[:10]
+        for rank, item in enumerate(ranked, 1):
+            print(f"  {rank}. {item['pathway_id']}: {item[field]} {unit}")
+            print(f"     {item['pathway_name']}")
+
+    # Largest pathways
+    print(f"\nLargest pathways (by gene count):")
+    show_top_ten('num_entries', 'genes')
+
+    # Most connected pathways
+    print(f"\nMost connected pathways (by interactions):")
+    show_top_ten('num_relations', 'interactions')
+
+    # Interaction type distribution
+    print(f"\nInteraction type distribution:")
+    all_types = Counter()
+    for result in results:
+        all_types.update(result['relation_types'])
+
+    for rel_type, count in all_types.most_common():
+        if total_interactions > 0:
+            percentage = (count / total_interactions) * 100
+        else:
+            percentage = 0
+        print(f"  {rel_type}: {count} ({percentage:.1f}%)")
+
+
+def main():
+    """Analyse an organism's KEGG pathways and write the result files.
+
+    Exits with status 1 when no pathway IDs are found or when none of the
+    pathways could be analysed.
+    """
+    parser = argparse.ArgumentParser(
+        description="Analyze KEGG pathways for an organism",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python pathway_analysis.py hsa ./human_pathways
+  python pathway_analysis.py mmu ./mouse_pathways --limit 50
+
+Organism codes:
+  hsa = Homo sapiens (human)
+  mmu = Mus musculus (mouse)
+  dme = Drosophila melanogaster
+  sce = Saccharomyces cerevisiae (yeast)
+  eco = Escherichia coli
+        """
+    )
+    parser.add_argument("organism", help="KEGG organism code (e.g., hsa, mmu)")
+    parser.add_argument("output_dir", help="Output directory for results")
+    parser.add_argument("--limit", type=int, default=None,
+                       help="Limit analysis to first N pathways")
+
+    cli = parser.parse_args()
+    rule = "=" * 70
+
+    print(rule)
+    print("BIOSERVICES: KEGG Pathway Network Analysis")
+    print(rule)
+
+    # Output directory first, then the KEGG client
+    os.makedirs(cli.output_dir, exist_ok=True)
+    kegg = KEGG()
+
+    # Get all pathways
+    pathway_ids = get_all_pathways(kegg, cli.organism)
+    if not pathway_ids:
+        print(f"\n✗ No pathways found for {cli.organism}")
+        sys.exit(1)
+
+    # Analyze pathways
+    results = analyze_all_pathways(kegg, pathway_ids, cli.limit)
+    if not results:
+        print("\n✗ No pathways successfully analyzed")
+        sys.exit(1)
+
+    # Print statistics
+    print_statistics(results)
+
+    # Save results
+    summary_file = os.path.join(cli.output_dir, "pathway_summary.csv")
+    save_pathway_summary(results, summary_file)
+
+    sif_file = os.path.join(cli.output_dir, "all_interactions.sif")
+    save_interactions_sif(results, sif_file)
+
+    save_detailed_pathway_info(results, cli.output_dir)
+
+    # Final summary
+    print(f"\n{rule}")
+    print("OUTPUT FILES")
+    print(rule)
+    print(f"  Summary: {summary_file}")
+    print(f"  Interactions: {sif_file}")
+    print(f"  Detailed: {cli.output_dir}/pathways/")
+    print(rule)
+
+
+if __name__ == "__main__":
+    main()
--- a/skills/bioservices/scripts/protein_analysis_workflow.py
+++ b/skills/bioservices/scripts/protein_analysis_workflow.py
@@ -0,0 +1,408 @@
+#!/usr/bin/env python3
+"""
+Complete Protein Analysis Workflow
+
+This script performs a comprehensive protein analysis pipeline:
+1. UniProt search and identifier retrieval
+2. FASTA sequence retrieval
+3. BLAST similarity search
+4. KEGG pathway discovery
+5. PSICQUIC interaction mapping
+6. GO annotation retrieval
+
+Usage:
+    python protein_analysis_workflow.py PROTEIN_NAME EMAIL [--skip-blast]
+
+Examples:
+    python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
+    python protein_analysis_workflow.py P43403 user@example.com --skip-blast
+
+Note: BLAST searches can take several minutes. Use --skip-blast to skip this step.
+"""
+
+import sys
+import time
+import argparse
+from bioservices import UniProt, KEGG, NCBIblast, PSICQUIC, QuickGO
+
+
+def _looks_like_accession(query):
+    """Return True if *query* matches the UniProtKB accession number format."""
+    # Local import: the file's top-level imports do not include ``re``.
+    import re
+
+    # Accession pattern published in the UniProt help pages; it covers both
+    # the 6-character and the 10-character accession forms.
+    pattern = (
+        r"[OPQ][0-9][A-Z0-9]{3}[0-9]"
+        r"|[A-NR-Z][0-9](?:[A-Z][A-Z0-9]{2}[0-9]){1,2}"
+    )
+    return re.fullmatch(pattern, query) is not None
+
+
+def search_protein(query):
+    """Search UniProt for a protein and print basic information about it.
+
+    If *query* looks like a UniProtKB accession, a direct retrieval is tried
+    first; otherwise (or if that fails) a text search is run and the first
+    hit is used.
+
+    Args:
+        query: Protein name, entry name, or UniProtKB accession.
+
+    Returns:
+        A ``(client, uniprot_id)`` tuple where ``client`` is the ``UniProt``
+        instance created here and ``uniprot_id`` is the selected identifier,
+        or ``None`` when nothing was found.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 1: UniProt Search")
+    print(f"{'='*70}")
+
+    u = UniProt(verbose=False)
+
+    print(f"Searching for: {query}")
+
+    # Try direct retrieval first (if query looks like accession)
+    if _looks_like_accession(query):
+        try:
+            entry = u.retrieve(query, frmt="tab")
+        except Exception as e:
+            # Best effort only: report the problem and fall back to search.
+            print(f"  Direct retrieval failed ({e}); falling back to search")
+        else:
+            # NOTE(review): bioservices appears to return the HTTP status
+            # code (an int) on a failed request, which is truthy - confirm.
+            if entry and not isinstance(entry, int):
+                uniprot_id = query
+                print(f"✓ Found UniProt entry: {uniprot_id}")
+                return u, uniprot_id
+
+    # Otherwise search
+    results = u.search(query, frmt="tab", columns="id,genes,organism,length,protein names", limit=5)
+
+    if not results or isinstance(results, int):
+        print("✗ No results found")
+        return u, None
+
+    lines = results.strip().split("\n")
+    if len(lines) < 2:
+        print("✗ No entries found")
+        return u, None
+
+    # Pad every row to the five requested columns so that short rows cannot
+    # raise IndexError while being displayed or unpacked.
+    n_columns = 5
+    rows = []
+    for line in lines[1:]:
+        fields = line.split("\t")
+        fields += ["N/A"] * (n_columns - len(fields))
+        rows.append(fields)
+
+    # Display results
+    print(f"\n✓ Found {len(rows)} result(s):")
+    for i, fields in enumerate(rows, 1):
+        print(f"  {i}. {fields[0]} - {fields[1]} ({fields[2]})")
+
+    # Use first result
+    uniprot_id, gene_names, organism, length, protein_name = rows[0][:n_columns]
+
+    print("\nUsing first result:")
+    print(f"  UniProt ID: {uniprot_id}")
+    print(f"  Gene names: {gene_names}")
+    print(f"  Organism: {organism}")
+    print(f"  Length: {length} aa")
+    print(f"  Protein: {protein_name}")
+
+    return u, uniprot_id
+
+
+def retrieve_sequence(uniprot, uniprot_id):
+    """Fetch the FASTA record for *uniprot_id* and return only its residues.
+
+    Args:
+        uniprot: Client object exposing ``retrieve(id, frmt=...)``.
+        uniprot_id: Identifier of the entry to download.
+
+    Returns:
+        The sequence with the FASTA header and line breaks removed, or
+        ``None`` if nothing was returned or an exception was raised.
+    """
+    banner = "=" * 70
+    print(f"\n{banner}")
+    print("STEP 2: FASTA Sequence Retrieval")
+    print(banner)
+
+    try:
+        fasta = uniprot.retrieve(uniprot_id, frmt="fasta")
+
+        # Guard clause: an empty/falsy reply means there is nothing to parse.
+        if not fasta:
+            print("✗ Failed to retrieve sequence")
+            return None
+
+        # First line is the FASTA header; the rest are sequence lines.
+        header, *body = fasta.strip().split("\n")
+        residues = "".join(body)
+
+        print("✓ Retrieved sequence:")
+        print(f"  Header: {header}")
+        print(f"  Length: {len(residues)} residues")
+        print(f"  First 60 residues: {residues[:60]}...")
+
+        return residues
+
+    except Exception as err:
+        print(f"✗ Error: {err}")
+        return None
+
+
+def run_blast(sequence, email, skip=False):
+    """Submit a BLASTP job for *sequence* and wait for its result.
+
+    Args:
+        sequence: Protein sequence to search with.
+        email: Contact address passed to the BLAST service; the search is
+            skipped when it is empty or contains no ``@``.
+        skip: When true, do nothing except print that the step was skipped.
+
+    Returns:
+        The raw result text of the finished job, or ``None`` when the step
+        was skipped, the job failed, the wait timed out (5 minutes), or an
+        exception was raised.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 3: BLAST Similarity Search")
+    print(f"{'='*70}")
+
+    if skip:
+        print("⊘ Skipped (--skip-blast flag)")
+        return None
+
+    if not email or "@" not in email:
+        print("⊘ Skipped (valid email required for BLAST)")
+        return None
+
+    # Job states after which polling can never succeed. Previously only
+    # "ERROR" was recognised, so the other states polled until the timeout.
+    failed_states = {"ERROR", "FAILURE", "NOT_FOUND"}
+
+    try:
+        print("Submitting BLASTP job...")
+        print("  Database: uniprotkb")
+        print(f"  Sequence length: {len(sequence)} aa")
+
+        s = NCBIblast(verbose=False)
+
+        jobid = s.run(
+            program="blastp",
+            sequence=sequence,
+            stype="protein",
+            database="uniprotkb",
+            email=email
+        )
+
+        # NOTE(review): bioservices appears to return the HTTP status code
+        # (an int) instead of a job ID when submission fails - confirm.
+        if not jobid or isinstance(jobid, int):
+            print(f"✗ Job submission failed: {jobid}")
+            return None
+
+        print(f"✓ Job submitted: {jobid}")
+        print("  Waiting for completion...")
+
+        # Poll for completion
+        max_wait = 300  # 5 minutes
+        poll_interval = 5  # seconds between status checks
+        start_time = time.time()
+
+        while time.time() - start_time < max_wait:
+            status = s.getStatus(jobid)
+            elapsed = int(time.time() - start_time)
+            # "\r" keeps the progress report on a single terminal line.
+            print(f"  Status: {status} (elapsed: {elapsed}s)", end="\r")
+
+            if status == "FINISHED":
+                print(f"\n✓ BLAST completed in {elapsed}s")
+
+                # Retrieve results
+                results = s.getResult(jobid, "out")
+
+                # Show the non-empty lines among the first 20 as a preview
+                print("\n  Results preview:")
+                for line in results.split("\n")[:20]:
+                    if line.strip():
+                        print(f"    {line}")
+
+                return results
+
+            if status in failed_states:
+                print(f"\n✗ BLAST job failed (status: {status})")
+                return None
+
+            time.sleep(poll_interval)
+
+        print(f"\n✗ Timeout after {max_wait}s")
+        return None
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return None
+
+
+def _kegg_ids_from_mapping(mapping, uniprot_id):
+    """Extract the KEGG IDs for *uniprot_id* from a UniProt mapping result."""
+    if not isinstance(mapping, dict):
+        return []
+
+    # Shape 1: {accession: [kegg_id, ...]}
+    if uniprot_id in mapping:
+        ids = mapping[uniprot_id]
+        return [ids] if isinstance(ids, str) else list(ids)
+
+    # Shape 2: {"results": [{"from": accession, "to": kegg_id}, ...], ...}
+    # NOTE(review): assumed to be what the ID-mapping API behind the
+    # "UniProtKB_AC-ID" database name returns - confirm against the
+    # installed bioservices version.
+    return [
+        item["to"]
+        for item in mapping.get("results") or []
+        if isinstance(item, dict)
+        and item.get("from") == uniprot_id
+        and isinstance(item.get("to"), str)
+    ]
+
+
+def discover_pathways(uniprot, kegg, uniprot_id):
+    """Discover the KEGG pathways that contain a protein.
+
+    Args:
+        uniprot: Client exposing ``mapping(fr=..., to=..., query=...)``.
+        kegg: Client exposing ``get_pathway_by_gene(gene, organism)`` and
+            ``get(entry_id)``.
+        uniprot_id: UniProt identifier to map to KEGG.
+
+    Returns:
+        A list of ``(pathway_id, pathway_name)`` tuples, one per pathway
+        found; the name is ``"Unknown"`` when it could not be retrieved.
+        An empty list is returned when there is no mapping, no pathway, or
+        an exception was raised.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 4: KEGG Pathway Discovery")
+    print(f"{'='*70}")
+
+    try:
+        # Map UniProt → KEGG
+        print(f"Mapping {uniprot_id} to KEGG...")
+        kegg_mapping = uniprot.mapping(fr="UniProtKB_AC-ID", to="KEGG", query=uniprot_id)
+
+        kegg_ids = _kegg_ids_from_mapping(kegg_mapping, uniprot_id)
+        if not kegg_ids:
+            print("✗ No KEGG mapping found")
+            return []
+
+        print(f"✓ KEGG ID(s): {kegg_ids}")
+
+        # Get pathways for first KEGG ID ("organism:gene")
+        kegg_id = kegg_ids[0]
+        organism, gene_id = kegg_id.split(":", 1)
+
+        print(f"\nSearching pathways for {kegg_id}...")
+        pathways = kegg.get_pathway_by_gene(gene_id, organism)
+
+        if not pathways:
+            print("✗ No pathways found")
+            return []
+
+        print(f"✓ Found {len(pathways)} pathway(s):\n")
+
+        # Get pathway names
+        pathway_info = []
+        for pathway_id in pathways:
+            pathway_name = "Unknown"
+            try:
+                entry = kegg.get(pathway_id)
+
+                # The name is on the line that starts with the NAME label;
+                # strip only that leading label, not every "NAME" substring.
+                for line in entry.split("\n"):
+                    if line.startswith("NAME"):
+                        pathway_name = line[len("NAME"):].strip()
+                        break
+
+                print(f"  • {pathway_id}: {pathway_name}")
+
+            except Exception:
+                print(f"  • {pathway_id}: [Error retrieving name]")
+
+            # Keep the pathway even if its name lookup failed, so the
+            # returned list matches the number of pathways found.
+            pathway_info.append((pathway_id, pathway_name))
+
+        return pathway_info
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return []
+
+
+def _first_alias_name(field):
+    """Return the name part of the first ``db:name`` entry in a MITAB field."""
+    # Multiple entries are separated by "|"; only the first one is used.
+    first = field.split("|", 1)[0]
+    _, sep, name = first.partition(":")
+    return name if sep else first
+
+
+def find_interactions(protein_query):
+    """Find protein-protein interactions via PSICQUIC (MINT, human only).
+
+    Args:
+        protein_query: Free-text protein query; it is combined with
+            ``species:9606`` before being sent to the service.
+
+    Returns:
+        A list of ``(protein_a, protein_b, interaction_type)`` tuples built
+        from at most the first 10 result rows, or an empty list when there
+        are no results or an exception was raised.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 5: Protein-Protein Interactions")
+    print(f"{'='*70}")
+
+    try:
+        p = PSICQUIC()
+
+        # Try querying MINT database
+        query = f"{protein_query} AND species:9606"
+        print("Querying MINT database...")
+        print(f"  Query: {query}")
+
+        results = p.query("mint", query)
+
+        if not results or isinstance(results, int):
+            print("✗ No interactions found in MINT")
+            return []
+
+        # Normalise to a list of column lists.
+        # NOTE(review): bioservices appears to return rows that are already
+        # split on tabs for tab output; raw text is handled as well - confirm.
+        if isinstance(results, str):
+            rows = [line.split("\t") for line in results.strip().split("\n")]
+        else:
+            rows = [
+                row.split("\t") if isinstance(row, str) else list(row)
+                for row in results
+            ]
+
+        print(f"✓ Found {len(rows)} interaction(s):\n")
+
+        # Display first 10 interactions
+        interactions = []
+        for fields in rows[:10]:
+            # Rows with fewer than 12 columns lack the fields read below.
+            if len(fields) >= 12:
+                protein_a = _first_alias_name(fields[4])
+                protein_b = _first_alias_name(fields[5])
+                interaction_type = fields[11]
+
+                interactions.append((protein_a, protein_b, interaction_type))
+                # Number by accepted rows so skipped rows leave no gaps.
+                print(f"  {len(interactions)}. {protein_a} ↔ {protein_b}")
+
+        if len(rows) > 10:
+            print(f"  ... and {len(rows)-10} more")
+
+        return interactions
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return []
+
+
+def get_go_annotations(uniprot_id):
+    """Retrieve GO annotations for a protein and print a per-aspect summary.
+
+    Args:
+        uniprot_id: Protein identifier passed to QuickGO.
+
+    Returns:
+        A dict mapping the aspect codes ``"P"``, ``"F"`` and ``"C"`` to lists
+        of ``(go_id, go_term)`` tuples. An empty dict is returned when no
+        annotations were found or an exception was raised, so callers can
+        always treat the result as a mapping.
+    """
+    print(f"\n{'='*70}")
+    print("STEP 6: Gene Ontology Annotations")
+    print(f"{'='*70}")
+
+    try:
+        g = QuickGO()
+
+        print(f"Retrieving GO annotations for {uniprot_id}...")
+        # NOTE(review): the keyword arguments accepted by QuickGO.Annotation
+        # and the TSV column order used below are assumed - confirm against
+        # the installed bioservices version.
+        annotations = g.Annotation(protein=uniprot_id, format="tsv")
+
+        if not annotations or isinstance(annotations, int):
+            print("✗ No GO annotations found")
+            # Same type as the other return paths (was a list before).
+            return {}
+
+        lines = annotations.strip().split("\n")
+        print(f"✓ Found {len(lines)-1} annotation(s)\n")
+
+        # Group by aspect (first line is the header row)
+        aspects = {"P": [], "F": [], "C": []}
+        for line in lines[1:]:
+            fields = line.split("\t")
+            if len(fields) >= 9:
+                go_id = fields[6]
+                go_term = fields[7]
+                go_aspect = fields[8]
+
+                if go_aspect in aspects:
+                    aspects[go_aspect].append((go_id, go_term))
+
+        # Display summary: up to five terms per aspect
+        labels = (
+            ("P", "Biological Process"),
+            ("F", "Molecular Function"),
+            ("C", "Cellular Component"),
+        )
+        for index, (code, label) in enumerate(labels):
+            terms = aspects[code]
+            # Blank line between sections, but not before the first one.
+            lead = "\n" if index else ""
+            print(f"{lead}  {label} ({code}): {len(terms)} terms")
+            for go_id, go_term in terms[:5]:
+                print(f"    • {go_id}: {go_term}")
+            if len(terms) > 5:
+                print(f"    ... and {len(terms)-5} more")
+
+        return aspects
+
+    except Exception as e:
+        print(f"✗ Error: {e}")
+        return {}
+
+
+def main():
+    """Run the six-step protein analysis workflow from the command line.
+
+    Parses the ``protein`` and ``email`` positional arguments and the
+    ``--skip-blast`` flag, runs each step in order, and prints a summary.
+    Exits with status 1 when the protein cannot be found in UniProt.
+    """
+    parser = argparse.ArgumentParser(
+        description="Complete protein analysis workflow using BioServices",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  python protein_analysis_workflow.py ZAP70_HUMAN user@example.com
+  python protein_analysis_workflow.py P43403 user@example.com --skip-blast
+        """
+    )
+    parser.add_argument("protein", help="Protein name or UniProt ID")
+    parser.add_argument("email", help="Email address (required for BLAST)")
+    parser.add_argument("--skip-blast", action="store_true",
+                       help="Skip BLAST search (faster)")
+
+    args = parser.parse_args()
+
+    print("=" * 70)
+    print("BIOSERVICES: Complete Protein Analysis Workflow")
+    print("=" * 70)
+
+    # Step 1: Search protein
+    uniprot, uniprot_id = search_protein(args.protein)
+    if not uniprot_id:
+        print("\n✗ Failed to find protein. Exiting.")
+        sys.exit(1)
+
+    # Step 2: Retrieve sequence
+    sequence = retrieve_sequence(uniprot, uniprot_id)
+    if not sequence:
+        print("\n⚠ Warning: Could not retrieve sequence")
+
+    # Step 3: BLAST search (needs a sequence); default to None so the
+    # summary can tell "not run" apart from "ran and produced results".
+    blast_results = None
+    if sequence:
+        blast_results = run_blast(sequence, args.email, args.skip_blast)
+
+    # Step 4: Pathway discovery
+    kegg = KEGG()
+    pathways = discover_pathways(uniprot, kegg, uniprot_id)
+
+    # Step 5: Interaction mapping
+    interactions = find_interactions(args.protein)
+
+    # Step 6: GO annotations
+    go_terms = get_go_annotations(uniprot_id)
+
+    # BLAST marker: ✓ results obtained, ✗ attempted without results,
+    # ⊘ not attempted (flag given or no sequence available).
+    blast_attempted = bool(sequence) and not args.skip_blast
+    if blast_results:
+        blast_mark = '✓'
+    elif blast_attempted:
+        blast_mark = '✗'
+    else:
+        blast_mark = '⊘'
+
+    # The GO result is normally a dict of lists; tolerate a plain sequence
+    # so an empty result cannot crash the summary.
+    if isinstance(go_terms, dict):
+        go_count = sum(len(v) for v in go_terms.values())
+    else:
+        go_count = len(go_terms)
+
+    # Summary
+    print(f"\n{'='*70}")
+    print("WORKFLOW SUMMARY")
+    print(f"{'='*70}")
+    print(f"  Protein: {args.protein}")
+    print(f"  UniProt ID: {uniprot_id}")
+    print(f"  Sequence: {'✓' if sequence else '✗'}")
+    print(f"  BLAST: {blast_mark}")
+    print(f"  Pathways: {len(pathways)} found")
+    print(f"  Interactions: {len(interactions)} found")
+    print(f"  GO annotations: {go_count} found")
+    print(f"{'='*70}")
+
+
+# Run the workflow only when this file is executed directly as a script.
+if __name__ == "__main__":
+    main()