Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/gget/scripts/batch_sequence_analysis.py
+++ b/skills/gget/scripts/batch_sequence_analysis.py
@@ -0,0 +1,191 @@
+#!/usr/bin/env python3
+"""
+Batch Sequence Analysis Script
+Analyze multiple sequences: BLAST, alignment, and structure prediction
+"""
+
+import argparse
+import sys
+from pathlib import Path
+import gget
+
+
+def read_fasta(fasta_file):
+    """Read sequences from FASTA file."""
+    sequences = []
+    current_id = None
+    current_seq = []
+
+    with open(fasta_file, "r") as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith(">"):
+                if current_id:
+                    sequences.append({"id": current_id, "seq": "".join(current_seq)})
+                current_id = line[1:]
+                current_seq = []
+            else:
+                current_seq.append(line)
+
+        if current_id:
+            sequences.append({"id": current_id, "seq": "".join(current_seq)})
+
+    return sequences
+
+
+def analyze_sequences(
+    fasta_file,
+    blast_db="nr",
+    align=True,
+    predict_structure=False,
+    output_dir="output",
+):
+    """
+    Perform batch sequence analysis.
+
+    Args:
+        fasta_file: Path to FASTA file with sequences
+        blast_db: BLAST database to search (default: nr)
+        align: Whether to perform multiple sequence alignment
+        predict_structure: Whether to predict structures with AlphaFold
+        output_dir: Output directory for results
+    """
+    output_path = Path(output_dir)
+    output_path.mkdir(exist_ok=True)
+
+    print(f"Batch Sequence Analysis")
+    print("=" * 60)
+    print(f"Input file: {fasta_file}")
+    print(f"Output directory: {output_dir}")
+    print("")
+
+    # Read sequences
+    print("Reading sequences...")
+    sequences = read_fasta(fasta_file)
+    print(f"Found {len(sequences)} sequences\n")
+
+    # Step 1: BLAST each sequence
+    print("Step 1: Running BLAST searches...")
+    print("-" * 60)
+    for i, seq_data in enumerate(sequences):
+        print(f"\n{i+1}. BLASTing {seq_data['id']}...")
+        try:
+            blast_results = gget.blast(
+                seq_data["seq"], database=blast_db, limit=10, save=False
+            )
+
+            output_file = output_path / f"{seq_data['id']}_blast.csv"
+            blast_results.to_csv(output_file, index=False)
+            print(f"   Results saved to: {output_file}")
+
+            if len(blast_results) > 0:
+                print(f"   Top hit: {blast_results.iloc[0]['Description']}")
+                print(
+                    f"   Max Score: {blast_results.iloc[0]['Max Score']}, "
+                    f"Query Coverage: {blast_results.iloc[0]['Query Coverage']}"
+                )
+        except Exception as e:
+            print(f"   Error: {e}")
+
+    # Step 2: Multiple sequence alignment
+    if align and len(sequences) > 1:
+        print("\n\nStep 2: Multiple sequence alignment...")
+        print("-" * 60)
+        try:
+            alignment = gget.muscle(fasta_file)
+            alignment_file = output_path / "alignment.afa"
+            with open(alignment_file, "w") as f:
+                f.write(alignment)
+            print(f"Alignment saved to: {alignment_file}")
+        except Exception as e:
+            print(f"Error in alignment: {e}")
+    else:
+        print("\n\nStep 2: Skipping alignment (only 1 sequence or disabled)")
+
+    # Step 3: Structure prediction (optional)
+    if predict_structure:
+        print("\n\nStep 3: Predicting structures with AlphaFold...")
+        print("-" * 60)
+        print(
+            "Note: This requires 'gget setup alphafold' and is computationally intensive"
+        )
+
+        for i, seq_data in enumerate(sequences):
+            print(f"\n{i+1}. Predicting structure for {seq_data['id']}...")
+            try:
+                structure_dir = output_path / f"structure_{seq_data['id']}"
+                # Uncomment to run AlphaFold prediction:
+                # gget.alphafold(seq_data['seq'], out=str(structure_dir))
+                # print(f"   Structure saved to: {structure_dir}")
+                print(
+                    "   (Prediction skipped - uncomment code to run AlphaFold prediction)"
+                )
+            except Exception as e:
+                print(f"   Error: {e}")
+    else:
+        print("\n\nStep 3: Structure prediction disabled")
+
+    # Summary
+    print("\n" + "=" * 60)
+    print("Batch analysis complete!")
+    print(f"\nResults saved to: {output_dir}/")
+    print(f"  - BLAST results: *_blast.csv")
+    if align and len(sequences) > 1:
+        print(f"  - Alignment: alignment.afa")
+    if predict_structure:
+        print(f"  - Structures: structure_*/")
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Perform batch sequence analysis using gget"
+    )
+    parser.add_argument("fasta", help="Input FASTA file with sequences")
+    parser.add_argument(
+        "-db",
+        "--database",
+        default="nr",
+        help="BLAST database (default: nr for proteins, nt for nucleotides)",
+    )
+    parser.add_argument(
+        "--no-align", action="store_true", help="Skip multiple sequence alignment"
+    )
+    parser.add_argument(
+        "--predict-structure",
+        action="store_true",
+        help="Predict structures with AlphaFold (requires setup)",
+    )
+    parser.add_argument(
+        "-o", "--output", default="output", help="Output directory (default: output)"
+    )
+
+    args = parser.parse_args()
+
+    if not Path(args.fasta).exists():
+        print(f"Error: File not found: {args.fasta}")
+        sys.exit(1)
+
+    try:
+        success = analyze_sequences(
+            args.fasta,
+            blast_db=args.database,
+            align=not args.no_align,
+            predict_structure=args.predict_structure,
+            output_dir=args.output,
+        )
+        sys.exit(0 if success else 1)
+    except KeyboardInterrupt:
+        print("\n\nAnalysis interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\nError: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
--- a/skills/gget/scripts/enrichment_pipeline.py
+++ b/skills/gget/scripts/enrichment_pipeline.py
@@ -0,0 +1,235 @@
+#!/usr/bin/env python3
+"""
+Enrichment Analysis Pipeline
+Perform comprehensive enrichment analysis on a gene list
+"""
+
+import argparse
+import sys
+from pathlib import Path
+import gget
+import pandas as pd
+
+
+def read_gene_list(file_path):
+    """Read gene list from file (one gene per line or CSV)."""
+    file_path = Path(file_path)
+
+    if file_path.suffix == ".csv":
+        df = pd.read_csv(file_path)
+        # Assume first column contains gene names
+        genes = df.iloc[:, 0].tolist()
+    else:
+        # Plain text file
+        with open(file_path, "r") as f:
+            genes = [line.strip() for line in f if line.strip()]
+
+    return genes
+
+
+def enrichment_pipeline(
+    gene_list,
+    species="human",
+    background=None,
+    output_prefix="enrichment",
+    plot=True,
+):
+    """
+    Perform comprehensive enrichment analysis.
+
+    Args:
+        gene_list: List of gene symbols
+        species: Species for analysis
+        background: Background gene list (optional)
+        output_prefix: Prefix for output files
+        plot: Whether to generate plots
+    """
+    print("Enrichment Analysis Pipeline")
+    print("=" * 60)
+    print(f"Analyzing {len(gene_list)} genes")
+    print(f"Species: {species}\n")
+
+    # Database categories to analyze
+    databases = {
+        "pathway": "KEGG Pathways",
+        "ontology": "Gene Ontology (Biological Process)",
+        "transcription": "Transcription Factors (ChEA)",
+        "diseases_drugs": "Disease Associations (GWAS)",
+        "celltypes": "Cell Type Markers (PanglaoDB)",
+    }
+
+    results = {}
+
+    for db_key, db_name in databases.items():
+        print(f"\nAnalyzing: {db_name}")
+        print("-" * 60)
+
+        try:
+            enrichment = gget.enrichr(
+                gene_list,
+                database=db_key,
+                species=species,
+                background_list=background,
+                plot=plot,
+            )
+
+            if enrichment is not None and len(enrichment) > 0:
+                # Save results
+                output_file = f"{output_prefix}_{db_key}.csv"
+                enrichment.to_csv(output_file, index=False)
+                print(f"Results saved to: {output_file}")
+
+                # Show top 5 results
+                print(f"\nTop 5 enriched terms:")
+                for i, row in enrichment.head(5).iterrows():
+                    term = row.get("name", row.get("term", "Unknown"))
+                    p_val = row.get(
+                        "adjusted_p_value",
+                        row.get("p_value", row.get("Adjusted P-value", 1)),
+                    )
+                    print(f"  {i+1}. {term}")
+                    print(f"     P-value: {p_val:.2e}")
+
+                results[db_key] = enrichment
+            else:
+                print("No significant results found")
+
+        except Exception as e:
+            print(f"Error: {e}")
+
+    # Generate summary report
+    print("\n" + "=" * 60)
+    print("Generating summary report...")
+
+    summary = []
+    for db_key, db_name in databases.items():
+        if db_key in results and len(results[db_key]) > 0:
+            summary.append(
+                {
+                    "Database": db_name,
+                    "Total Terms": len(results[db_key]),
+                    "Top Term": results[db_key].iloc[0].get(
+                        "name", results[db_key].iloc[0].get("term", "N/A")
+                    ),
+                }
+            )
+
+    if summary:
+        summary_df = pd.DataFrame(summary)
+        summary_file = f"{output_prefix}_summary.csv"
+        summary_df.to_csv(summary_file, index=False)
+        print(f"\nSummary saved to: {summary_file}")
+        print("\n" + summary_df.to_string(index=False))
+    else:
+        print("\nNo enrichment results to summarize")
+
+    # Get expression data for genes
+    print("\n" + "=" * 60)
+    print("Getting expression data for input genes...")
+
+    try:
+        # Get tissue expression for first few genes
+        expr_data = []
+        for gene in gene_list[:5]:  # Limit to first 5
+            print(f"  Getting expression for {gene}...")
+            try:
+                tissue_expr = gget.archs4(gene, which="tissue")
+                top_tissue = tissue_expr.nlargest(1, "median").iloc[0]
+                expr_data.append(
+                    {
+                        "Gene": gene,
+                        "Top Tissue": top_tissue["tissue"],
+                        "Median Expression": top_tissue["median"],
+                    }
+                )
+            except Exception as e:
+                print(f"    Warning: {e}")
+
+        if expr_data:
+            expr_df = pd.DataFrame(expr_data)
+            expr_file = f"{output_prefix}_expression.csv"
+            expr_df.to_csv(expr_file, index=False)
+            print(f"\nExpression data saved to: {expr_file}")
+
+    except Exception as e:
+        print(f"Error getting expression data: {e}")
+
+    print("\n" + "=" * 60)
+    print("Enrichment analysis complete!")
+    print(f"\nOutput files (prefix: {output_prefix}):")
+    for db_key in databases.keys():
+        if db_key in results:
+            print(f"  - {output_prefix}_{db_key}.csv")
+    print(f"  - {output_prefix}_summary.csv")
+    print(f"  - {output_prefix}_expression.csv")
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Perform comprehensive enrichment analysis using gget"
+    )
+    parser.add_argument(
+        "genes",
+        help="Gene list file (one gene per line or CSV with genes in first column)",
+    )
+    parser.add_argument(
+        "-s",
+        "--species",
+        default="human",
+        help="Species (human, mouse, fly, yeast, worm, fish)",
+    )
+    parser.add_argument(
+        "-b", "--background", help="Background gene list file (optional)"
+    )
+    parser.add_argument(
+        "-o", "--output", default="enrichment", help="Output prefix (default: enrichment)"
+    )
+    parser.add_argument(
+        "--no-plot", action="store_true", help="Disable plotting"
+    )
+
+    args = parser.parse_args()
+
+    # Read gene list
+    if not Path(args.genes).exists():
+        print(f"Error: File not found: {args.genes}")
+        sys.exit(1)
+
+    try:
+        gene_list = read_gene_list(args.genes)
+        print(f"Read {len(gene_list)} genes from {args.genes}")
+
+        # Read background if provided
+        background = None
+        if args.background:
+            if Path(args.background).exists():
+                background = read_gene_list(args.background)
+                print(f"Read {len(background)} background genes from {args.background}")
+            else:
+                print(f"Warning: Background file not found: {args.background}")
+
+        success = enrichment_pipeline(
+            gene_list,
+            species=args.species,
+            background=background,
+            output_prefix=args.output,
+            plot=not args.no_plot,
+        )
+
+        sys.exit(0 if success else 1)
+
+    except KeyboardInterrupt:
+        print("\n\nAnalysis interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\nError: {e}")
+        import traceback
+
+        traceback.print_exc()
+        sys.exit(1)
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
--- a/skills/gget/scripts/gene_analysis.py
+++ b/skills/gget/scripts/gene_analysis.py
@@ -0,0 +1,161 @@
+#!/usr/bin/env python3
+"""
+Gene Analysis Script
+Quick analysis of a gene: search, info, sequences, expression, and enrichment
+"""
+
+import argparse
+import sys
+import gget
+
+
+def analyze_gene(gene_name, species="homo_sapiens", output_prefix=None):
+    """
+    Perform comprehensive analysis of a gene.
+
+    Args:
+        gene_name: Gene symbol to analyze
+        species: Species name (default: homo_sapiens)
+        output_prefix: Prefix for output files (default: gene_name)
+    """
+    if output_prefix is None:
+        output_prefix = gene_name.lower()
+
+    print(f"Analyzing gene: {gene_name}")
+    print("=" * 60)
+
+    # Step 1: Search for the gene
+    print("\n1. Searching for gene...")
+    search_results = gget.search([gene_name], species=species, limit=1)
+
+    if len(search_results) == 0:
+        print(f"Error: Gene '{gene_name}' not found in {species}")
+        return False
+
+    gene_id = search_results["ensembl_id"].iloc[0]
+    print(f"   Found: {gene_id}")
+    print(f"   Description: {search_results['ensembl_description'].iloc[0]}")
+
+    # Step 2: Get detailed information
+    print("\n2. Getting detailed information...")
+    gene_info = gget.info([gene_id], pdb=True)
+    gene_info.to_csv(f"{output_prefix}_info.csv", index=False)
+    print(f"   Saved to: {output_prefix}_info.csv")
+
+    if "uniprot_id" in gene_info.columns and gene_info["uniprot_id"].iloc[0]:
+        print(f"   UniProt ID: {gene_info['uniprot_id'].iloc[0]}")
+    if "pdb_id" in gene_info.columns and gene_info["pdb_id"].iloc[0]:
+        print(f"   PDB IDs: {gene_info['pdb_id'].iloc[0]}")
+
+    # Step 3: Get sequences
+    print("\n3. Retrieving sequences...")
+    nucleotide_seq = gget.seq([gene_id])
+    protein_seq = gget.seq([gene_id], translate=True)
+
+    with open(f"{output_prefix}_nucleotide.fasta", "w") as f:
+        f.write(nucleotide_seq)
+    print(f"   Nucleotide sequence saved to: {output_prefix}_nucleotide.fasta")
+
+    with open(f"{output_prefix}_protein.fasta", "w") as f:
+        f.write(protein_seq)
+    print(f"   Protein sequence saved to: {output_prefix}_protein.fasta")
+
+    # Step 4: Get tissue expression
+    print("\n4. Getting tissue expression...")
+    try:
+        tissue_expr = gget.archs4(gene_name, which="tissue")
+        tissue_expr.to_csv(f"{output_prefix}_tissue_expression.csv", index=False)
+        print(f"   Saved to: {output_prefix}_tissue_expression.csv")
+
+        # Show top tissues
+        top_tissues = tissue_expr.nlargest(5, "median")
+        print("\n   Top expressing tissues:")
+        for _, row in top_tissues.iterrows():
+            print(f"     {row['tissue']}: median = {row['median']:.2f}")
+    except Exception as e:
+        print(f"   Warning: Could not retrieve ARCHS4 data: {e}")
+
+    # Step 5: Find correlated genes
+    print("\n5. Finding correlated genes...")
+    try:
+        correlated = gget.archs4(gene_name, which="correlation")
+        correlated.to_csv(f"{output_prefix}_correlated_genes.csv", index=False)
+        print(f"   Saved to: {output_prefix}_correlated_genes.csv")
+
+        # Show top correlated
+        print("\n   Top 10 correlated genes:")
+        for _, row in correlated.head(10).iterrows():
+            print(f"     {row['gene_symbol']}: r = {row['correlation']:.3f}")
+    except Exception as e:
+        print(f"   Warning: Could not retrieve correlation data: {e}")
+
+    # Step 6: Get disease associations
+    print("\n6. Getting disease associations...")
+    try:
+        diseases = gget.opentargets(gene_id, resource="diseases", limit=10)
+        diseases.to_csv(f"{output_prefix}_diseases.csv", index=False)
+        print(f"   Saved to: {output_prefix}_diseases.csv")
+
+        print("\n   Top 5 disease associations:")
+        for _, row in diseases.head(5).iterrows():
+            print(f"     {row['disease_name']}: score = {row['overall_score']:.3f}")
+    except Exception as e:
+        print(f"   Warning: Could not retrieve disease data: {e}")
+
+    # Step 7: Get drug associations
+    print("\n7. Getting drug associations...")
+    try:
+        drugs = gget.opentargets(gene_id, resource="drugs", limit=10)
+        if len(drugs) > 0:
+            drugs.to_csv(f"{output_prefix}_drugs.csv", index=False)
+            print(f"   Saved to: {output_prefix}_drugs.csv")
+            print(f"\n   Found {len(drugs)} drug associations")
+        else:
+            print("   No drug associations found")
+    except Exception as e:
+        print(f"   Warning: Could not retrieve drug data: {e}")
+
+    print("\n" + "=" * 60)
+    print("Analysis complete!")
+    print(f"\nOutput files (prefix: {output_prefix}):")
+    print(f"  - {output_prefix}_info.csv")
+    print(f"  - {output_prefix}_nucleotide.fasta")
+    print(f"  - {output_prefix}_protein.fasta")
+    print(f"  - {output_prefix}_tissue_expression.csv")
+    print(f"  - {output_prefix}_correlated_genes.csv")
+    print(f"  - {output_prefix}_diseases.csv")
+    print(f"  - {output_prefix}_drugs.csv (if available)")
+
+    return True
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description="Perform comprehensive analysis of a gene using gget"
+    )
+    parser.add_argument("gene", help="Gene symbol to analyze")
+    parser.add_argument(
+        "-s",
+        "--species",
+        default="homo_sapiens",
+        help="Species (default: homo_sapiens)",
+    )
+    parser.add_argument(
+        "-o", "--output", help="Output prefix for files (default: gene name)"
+    )
+
+    args = parser.parse_args()
+
+    try:
+        success = analyze_gene(args.gene, args.species, args.output)
+        sys.exit(0 if success else 1)
+    except KeyboardInterrupt:
+        print("\n\nAnalysis interrupted by user")
+        sys.exit(1)
+    except Exception as e:
+        print(f"\n\nError: {e}")
+        sys.exit(1)
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()