Files
gh-k-dense-ai-claude-scient…/skills/gget/references/workflows.md
2025-11-30 08:30:10 +08:00

25 KiB

gget Workflow Examples

Extended workflow examples demonstrating how to combine multiple gget modules for common bioinformatics tasks.

Table of Contents

  1. Complete Gene Analysis Pipeline
  2. Comparative Structural Biology
  3. Cancer Genomics Analysis
  4. Single-Cell Expression Analysis
  5. Building Reference Transcriptomes
  6. Mutation Impact Assessment
  7. Drug Target Discovery

Complete Gene Analysis Pipeline

Comprehensive analysis of a gene from discovery to functional annotation.

import gget
import pandas as pd

# Step 1: Search for genes of interest
print("Step 1: Searching for GABA receptor genes...")
search_results = gget.search(["GABA", "receptor", "alpha"],
                             species="homo_sapiens",
                             andor="and")
print(f"Found {len(search_results)} genes")

# Step 2: Get detailed information
print("\nStep 2: Getting detailed information...")
gene_ids = search_results["ensembl_id"].tolist()[:5]  # Top 5 genes
gene_info = gget.info(gene_ids, pdb=True)
print(gene_info[["ensembl_id", "gene_name", "uniprot_id", "description"]])

# Step 3: Retrieve sequences
print("\nStep 3: Retrieving sequences...")
nucleotide_seqs = gget.seq(gene_ids)
protein_seqs = gget.seq(gene_ids, translate=True)

# Save sequences
with open("gaba_receptors_nt.fasta", "w") as f:
    f.write(nucleotide_seqs)
with open("gaba_receptors_aa.fasta", "w") as f:
    f.write(protein_seqs)

# Step 4: Get expression data
print("\nStep 4: Getting tissue expression...")
for gene_id, gene_name in zip(gene_ids, gene_info["gene_name"]):
    expr_data = gget.archs4(gene_name, which="tissue")
    print(f"\n{gene_name} expression:")
    print(expr_data.head())

# Step 5: Find correlated genes
print("\nStep 5: Finding correlated genes...")
correlated = gget.archs4(gene_info["gene_name"].iloc[0], which="correlation")
correlated_top = correlated.head(20)
print(correlated_top)

# Step 6: Enrichment analysis on correlated genes
print("\nStep 6: Performing enrichment analysis...")
gene_list = correlated_top["gene_symbol"].tolist()
enrichment = gget.enrichr(gene_list, database="ontology", plot=True)
print(enrichment.head(10))

# Step 7: Get disease associations
print("\nStep 7: Getting disease associations...")
for gene_id, gene_name in zip(gene_ids[:3], gene_info["gene_name"][:3]):
    diseases = gget.opentargets(gene_id, resource="diseases", limit=5)
    print(f"\n{gene_name} disease associations:")
    print(diseases)

# Step 8: Check for orthologs
print("\nStep 8: Finding orthologs...")
orthologs = gget.bgee(gene_ids[0], type="orthologs")
print(orthologs)

print("\nComplete gene analysis pipeline finished!")

Comparative Structural Biology

Compare protein structures across species and analyze functional motifs.

import gget

# Define genes for comparison
human_gene = "ENSG00000169174"  # PCSK9
mouse_gene = "ENSMUSG00000044254"  # Pcsk9

print("Comparative Structural Biology Workflow")
print("=" * 50)

# Step 1: Get gene information
print("\n1. Getting gene information...")
human_info = gget.info([human_gene])
mouse_info = gget.info([mouse_gene])

print(f"Human: {human_info['gene_name'].iloc[0]}")
print(f"Mouse: {mouse_info['gene_name'].iloc[0]}")

# Step 2: Retrieve protein sequences
print("\n2. Retrieving protein sequences...")
human_seq = gget.seq(human_gene, translate=True)
mouse_seq = gget.seq(mouse_gene, translate=True)

# Save to file for alignment
with open("pcsk9_sequences.fasta", "w") as f:
    f.write(human_seq)
    f.write("\n")
    f.write(mouse_seq)

# Step 3: Align sequences
print("\n3. Aligning sequences...")
alignment = gget.muscle("pcsk9_sequences.fasta")
print("Alignment completed. Visualizing in ClustalW format:")
print(alignment)

# Step 4: Get existing structures from PDB
print("\n4. Searching PDB for existing structures...")
# Search by sequence using BLAST
pdb_results = gget.blast(human_seq, database="pdbaa", limit=5)
print("Top PDB matches:")
print(pdb_results[["Description", "Max Score", "Query Coverage"]])

# Download top structure
if len(pdb_results) > 0:
    # Extract PDB ID from description (usually format: "PDB|XXXX|...")
    pdb_id = pdb_results.iloc[0]["Description"].split("|")[1]
    print(f"\nDownloading PDB structure: {pdb_id}")
    gget.pdb(pdb_id, save=True)

# Step 5: Predict AlphaFold structures
print("\n5. Predicting structures with AlphaFold...")
# Note: This requires gget setup alphafold and is computationally intensive
# Uncomment to run:
# human_structure = gget.alphafold(human_seq, plot=True)
# mouse_structure = gget.alphafold(mouse_seq, plot=True)
print("(AlphaFold prediction skipped - uncomment to run)")

# Step 6: Identify functional motifs
print("\n6. Identifying functional motifs with ELM...")
# Note: Requires gget setup elm
# Uncomment to run:
# human_ortholog_df, human_regex_df = gget.elm(human_seq)
# print("Human PCSK9 functional motifs:")
# print(human_regex_df)
print("(ELM analysis skipped - uncomment to run)")

# Step 7: Get orthology information
print("\n7. Getting orthology information from Bgee...")
orthologs = gget.bgee(human_gene, type="orthologs")
print("PCSK9 orthologs:")
print(orthologs)

print("\nComparative structural biology workflow completed!")

Cancer Genomics Analysis

Analyze cancer-associated genes and their mutations.

import gget
import matplotlib.pyplot as plt

print("Cancer Genomics Analysis Workflow")
print("=" * 50)

# Step 1: Search for cancer-related genes
print("\n1. Searching for breast cancer genes...")
genes = gget.search(["breast", "cancer", "BRCA"],
                    species="homo_sapiens",
                    andor="or",
                    limit=20)
print(f"Found {len(genes)} genes")

# Focus on specific genes
target_genes = ["BRCA1", "BRCA2", "TP53", "PIK3CA", "ESR1"]
print(f"\nAnalyzing: {', '.join(target_genes)}")

# Step 2: Get gene information
print("\n2. Getting gene information...")
gene_search = []
for gene in target_genes:
    result = gget.search([gene], species="homo_sapiens", limit=1)
    if len(result) > 0:
        gene_search.append(result.iloc[0])

gene_df = pd.DataFrame(gene_search)
gene_ids = gene_df["ensembl_id"].tolist()

# Step 3: Get disease associations
print("\n3. Getting disease associations from OpenTargets...")
for gene_id, gene_name in zip(gene_ids, target_genes):
    print(f"\n{gene_name} disease associations:")
    diseases = gget.opentargets(gene_id, resource="diseases", limit=3)
    print(diseases[["disease_name", "overall_score"]])

# Step 4: Get drug associations
print("\n4. Getting drug associations...")
for gene_id, gene_name in zip(gene_ids[:3], target_genes[:3]):
    print(f"\n{gene_name} drug associations:")
    drugs = gget.opentargets(gene_id, resource="drugs", limit=3)
    if len(drugs) > 0:
        print(drugs[["drug_name", "drug_type", "max_phase_for_all_diseases"]])

# Step 5: Search cBioPortal for studies
print("\n5. Searching cBioPortal for breast cancer studies...")
studies = gget.cbio_search(["breast", "cancer"])
print(f"Found {len(studies)} studies")
print(studies[:5])

# Step 6: Create cancer genomics heatmap
print("\n6. Creating cancer genomics heatmap...")
if len(studies) > 0:
    # Select relevant studies
    selected_studies = studies[:2]  # Top 2 studies

    gget.cbio_plot(
        selected_studies,
        target_genes,
        stratification="cancer_type",
        variation_type="mutation_occurrences",
        show=False
    )
    print("Heatmap saved to ./gget_cbio_figures/")

# Step 7: Query COSMIC database (requires setup)
print("\n7. Querying COSMIC database...")
# Note: Requires COSMIC account and database download
# Uncomment to run:
# for gene in target_genes[:2]:
#     cosmic_results = gget.cosmic(
#         gene,
#         cosmic_tsv_path="cosmic_cancer.tsv",
#         limit=10
#     )
#     print(f"\n{gene} mutations in COSMIC:")
#     print(cosmic_results)
print("(COSMIC query skipped - requires database download)")

# Step 8: Enrichment analysis
print("\n8. Performing pathway enrichment...")
enrichment = gget.enrichr(target_genes, database="pathway", plot=True)
print("\nTop enriched pathways:")
print(enrichment.head(10))

print("\nCancer genomics analysis completed!")

Single-Cell Expression Analysis

Analyze single-cell RNA-seq data for specific cell types and tissues.

import gget
import scanpy as sc

print("Single-Cell Expression Analysis Workflow")
print("=" * 50)

# Note: Requires gget setup cellxgene

# Step 1: Define genes and cell types of interest
genes_of_interest = ["ACE2", "TMPRSS2", "CD4", "CD8A"]
tissue = "lung"
cell_types = ["type ii pneumocyte", "macrophage", "t cell"]

print(f"\nAnalyzing genes: {', '.join(genes_of_interest)}")
print(f"Tissue: {tissue}")
print(f"Cell types: {', '.join(cell_types)}")

# Step 2: Get metadata first
print("\n1. Retrieving metadata...")
metadata = gget.cellxgene(
    gene=genes_of_interest,
    tissue=tissue,
    species="homo_sapiens",
    meta_only=True
)
print(f"Found {len(metadata)} datasets")
print(metadata.head())

# Step 3: Download count matrices
print("\n2. Downloading single-cell data...")
# Note: This can be a large download
adata = gget.cellxgene(
    gene=genes_of_interest,
    tissue=tissue,
    species="homo_sapiens",
    census_version="stable"
)
print(f"AnnData shape: {adata.shape}")
print(f"Genes: {adata.n_vars}")
print(f"Cells: {adata.n_obs}")

# Step 4: Basic QC and filtering with scanpy
print("\n3. Performing quality control...")
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
print(f"After QC - Cells: {adata.n_obs}, Genes: {adata.n_vars}")

# Step 5: Normalize and log-transform
print("\n4. Normalizing data...")
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Step 6: Calculate gene expression statistics
print("\n5. Calculating expression statistics...")
for gene in genes_of_interest:
    if gene in adata.var_names:
        expr = adata[:, gene].X.toarray().flatten()
        print(f"\n{gene} expression:")
        print(f"  Mean: {expr.mean():.3f}")
        print(f"  Median: {np.median(expr):.3f}")
        print(f"  % expressing: {(expr > 0).sum() / len(expr) * 100:.1f}%")

# Step 7: Get tissue expression from ARCHS4 for comparison
print("\n6. Getting bulk tissue expression from ARCHS4...")
for gene in genes_of_interest:
    tissue_expr = gget.archs4(gene, which="tissue")
    lung_expr = tissue_expr[tissue_expr["tissue"] == "lung"]
    if len(lung_expr) > 0:
        print(f"\n{gene} in lung (ARCHS4):")
        print(f"  Median: {lung_expr['median'].iloc[0]:.3f}")

# Step 8: Enrichment analysis
print("\n7. Performing enrichment analysis...")
enrichment = gget.enrichr(genes_of_interest, database="celltypes", plot=True)
print("\nTop cell type associations:")
print(enrichment.head(10))

# Step 9: Get disease associations
print("\n8. Getting disease associations...")
for gene in genes_of_interest:
    gene_search = gget.search([gene], species="homo_sapiens", limit=1)
    if len(gene_search) > 0:
        gene_id = gene_search["ensembl_id"].iloc[0]
        diseases = gget.opentargets(gene_id, resource="diseases", limit=3)
        print(f"\n{gene} disease associations:")
        print(diseases[["disease_name", "overall_score"]])

print("\nSingle-cell expression analysis completed!")

Building Reference Transcriptomes

Prepare reference data for RNA-seq analysis pipelines.

#!/bin/bash
# Reference transcriptome building workflow

echo "Reference Transcriptome Building Workflow"
echo "=========================================="

# Step 1: List available species
echo -e "\n1. Listing available species..."
gget ref --list_species > available_species.txt
echo "Available species saved to available_species.txt"

# Step 2: Download reference files for human
echo -e "\n2. Downloading human reference files..."
SPECIES="homo_sapiens"
RELEASE=110  # Specify release for reproducibility

# Download GTF annotation
echo "Downloading GTF annotation..."
gget ref -w gtf -r $RELEASE -d $SPECIES -o human_ref_gtf.json

# Download cDNA sequences
echo "Downloading cDNA sequences..."
gget ref -w cdna -r $RELEASE -d $SPECIES -o human_ref_cdna.json

# Download protein sequences
echo "Downloading protein sequences..."
gget ref -w pep -r $RELEASE -d $SPECIES -o human_ref_pep.json

# Step 3: Build kallisto index (if kallisto is installed)
echo -e "\n3. Building kallisto index..."
if command -v kallisto &> /dev/null; then
    # Get cDNA FASTA file from download
    CDNA_FILE=$(ls *.cdna.all.fa.gz)
    if [ -f "$CDNA_FILE" ]; then
        kallisto index -i transcriptome.idx $CDNA_FILE
        echo "Kallisto index created: transcriptome.idx"
    else
        echo "cDNA FASTA file not found"
    fi
else
    echo "kallisto not installed, skipping index building"
fi

# Step 4: Download genome for alignment-based methods
echo -e "\n4. Downloading genome sequence..."
gget ref -w dna -r $RELEASE -d $SPECIES -o human_ref_dna.json

# Step 5: Get gene information for genes of interest
echo -e "\n5. Getting information for specific genes..."
gget search -s $SPECIES "TP53 BRCA1 BRCA2" -o key_genes.csv

echo -e "\nReference transcriptome building completed!"
# Python version
import gget
import json

print("Reference Transcriptome Building Workflow")
print("=" * 50)

# Configuration
species = "homo_sapiens"
release = 110
genes_of_interest = ["TP53", "BRCA1", "BRCA2", "MYC", "EGFR"]

# Step 1: Get reference information
print("\n1. Getting reference information...")
ref_info = gget.ref(species, release=release)

# Save reference information
with open("reference_info.json", "w") as f:
    json.dump(ref_info, f, indent=2)
print("Reference information saved to reference_info.json")

# Step 2: Download specific files
print("\n2. Downloading reference files...")
# GTF annotation
gget.ref(species, which="gtf", release=release, download=True)
# cDNA sequences
gget.ref(species, which="cdna", release=release, download=True)

# Step 3: Get information for genes of interest
print(f"\n3. Getting information for {len(genes_of_interest)} genes...")
gene_data = []
for gene in genes_of_interest:
    result = gget.search([gene], species=species, limit=1)
    if len(result) > 0:
        gene_data.append(result.iloc[0])

# Get detailed info
if gene_data:
    gene_ids = [g["ensembl_id"] for g in gene_data]
    detailed_info = gget.info(gene_ids)
    detailed_info.to_csv("genes_of_interest_info.csv", index=False)
    print("Gene information saved to genes_of_interest_info.csv")

# Step 4: Get sequences
print("\n4. Retrieving sequences...")
sequences_nt = gget.seq(gene_ids)
sequences_aa = gget.seq(gene_ids, translate=True)

with open("key_genes_nucleotide.fasta", "w") as f:
    f.write(sequences_nt)
with open("key_genes_protein.fasta", "w") as f:
    f.write(sequences_aa)

print("\nReference transcriptome building completed!")
print(f"Files created:")
print("  - reference_info.json")
print("  - genes_of_interest_info.csv")
print("  - key_genes_nucleotide.fasta")
print("  - key_genes_protein.fasta")

Mutation Impact Assessment

Analyze the impact of genetic mutations on protein structure and function.

import gget
import pandas as pd

print("Mutation Impact Assessment Workflow")
print("=" * 50)

# Define mutations to analyze
mutations = [
    {"gene": "TP53", "mutation": "c.818G>A", "description": "R273H hotspot"},
    {"gene": "EGFR", "mutation": "c.2573T>G", "description": "L858R activating"},
]

# Step 1: Get gene information
print("\n1. Getting gene information...")
for mut in mutations:
    results = gget.search([mut["gene"]], species="homo_sapiens", limit=1)
    if len(results) > 0:
        mut["ensembl_id"] = results["ensembl_id"].iloc[0]
        print(f"{mut['gene']}: {mut['ensembl_id']}")

# Step 2: Get sequences
print("\n2. Retrieving wild-type sequences...")
for mut in mutations:
    # Get nucleotide sequence
    nt_seq = gget.seq(mut["ensembl_id"])
    mut["wt_sequence"] = nt_seq

    # Get protein sequence
    aa_seq = gget.seq(mut["ensembl_id"], translate=True)
    mut["wt_protein"] = aa_seq

# Step 3: Generate mutated sequences
print("\n3. Generating mutated sequences...")
# Create mutation dataframe for gget mutate
mut_df = pd.DataFrame({
    "seq_ID": [m["gene"] for m in mutations],
    "mutation": [m["mutation"] for m in mutations]
})

# For each mutation
for mut in mutations:
    # Extract sequence from FASTA
    lines = mut["wt_sequence"].split("\n")
    seq = "".join(lines[1:])

    # Create single mutation df
    single_mut = pd.DataFrame({
        "seq_ID": [mut["gene"]],
        "mutation": [mut["mutation"]]
    })

    # Generate mutated sequence
    mutated = gget.mutate([seq], mutations=single_mut)
    mut["mutated_sequence"] = mutated

print("Mutated sequences generated")

# Step 4: Get existing structure information
print("\n4. Getting structure information...")
for mut in mutations:
    # Get info with PDB IDs
    info = gget.info([mut["ensembl_id"]], pdb=True)

    if "pdb_id" in info.columns and pd.notna(info["pdb_id"].iloc[0]):
        pdb_ids = info["pdb_id"].iloc[0].split(";")
        print(f"\n{mut['gene']} PDB structures: {', '.join(pdb_ids[:3])}")

        # Download first structure
        if len(pdb_ids) > 0:
            pdb_id = pdb_ids[0].strip()
            mut["pdb_id"] = pdb_id
            gget.pdb(pdb_id, save=True)
    else:
        print(f"\n{mut['gene']}: No PDB structure available")
        mut["pdb_id"] = None

# Step 5: Predict structures with AlphaFold (optional)
print("\n5. Predicting structures with AlphaFold...")
# Note: Requires gget setup alphafold and is computationally intensive
# Uncomment to run:
# for mut in mutations:
#     print(f"Predicting {mut['gene']} wild-type structure...")
#     wt_structure = gget.alphafold(mut["wt_protein"])
#
#     print(f"Predicting {mut['gene']} mutant structure...")
#     # Would need to translate mutated sequence first
#     # mutant_structure = gget.alphafold(mutated_protein)
print("(AlphaFold prediction skipped - uncomment to run)")

# Step 6: Find functional motifs
print("\n6. Identifying functional motifs...")
# Note: Requires gget setup elm
# Uncomment to run:
# for mut in mutations:
#     ortholog_df, regex_df = gget.elm(mut["wt_protein"])
#     print(f"\n{mut['gene']} functional motifs:")
#     print(regex_df)
print("(ELM analysis skipped - uncomment to run)")

# Step 7: Get disease associations
print("\n7. Getting disease associations...")
for mut in mutations:
    diseases = gget.opentargets(
        mut["ensembl_id"],
        resource="diseases",
        limit=5
    )
    print(f"\n{mut['gene']} ({mut['description']}) disease associations:")
    print(diseases[["disease_name", "overall_score"]])

# Step 8: Query COSMIC for mutation frequency
print("\n8. Querying COSMIC database...")
# Note: Requires COSMIC database download
# Uncomment to run:
# for mut in mutations:
#     cosmic_results = gget.cosmic(
#         mut["mutation"],
#         cosmic_tsv_path="cosmic_cancer.tsv",
#         limit=10
#     )
#     print(f"\n{mut['gene']} {mut['mutation']} in COSMIC:")
#     print(cosmic_results)
print("(COSMIC query skipped - requires database download)")

print("\nMutation impact assessment completed!")

Drug Target Discovery

Identify and validate potential drug targets for specific diseases.

import gget
import pandas as pd

print("Drug Target Discovery Workflow")
print("=" * 50)

# Step 1: Search for disease-related genes
disease = "alzheimer"
print(f"\n1. Searching for {disease} disease genes...")
genes = gget.search([disease], species="homo_sapiens", limit=50)
print(f"Found {len(genes)} potential genes")

# Step 2: Get detailed information
print("\n2. Getting detailed gene information...")
gene_ids = genes["ensembl_id"].tolist()[:20]  # Top 20
gene_info = gget.info(gene_ids[:10])  # Limit to avoid timeout

# Step 3: Get disease associations from OpenTargets
print("\n3. Getting disease associations...")
disease_scores = []
for gene_id, gene_name in zip(gene_info["ensembl_id"], gene_info["gene_name"]):
    diseases = gget.opentargets(gene_id, resource="diseases", limit=10)

    # Filter for Alzheimer's disease
    alzheimer = diseases[diseases["disease_name"].str.contains("Alzheimer", case=False, na=False)]

    if len(alzheimer) > 0:
        disease_scores.append({
            "ensembl_id": gene_id,
            "gene_name": gene_name,
            "disease_score": alzheimer["overall_score"].max()
        })

disease_df = pd.DataFrame(disease_scores).sort_values("disease_score", ascending=False)
print("\nTop disease-associated genes:")
print(disease_df.head(10))

# Step 4: Get tractability information
print("\n4. Assessing target tractability...")
top_targets = disease_df.head(5)
for _, row in top_targets.iterrows():
    tractability = gget.opentargets(
        row["ensembl_id"],
        resource="tractability"
    )
    print(f"\n{row['gene_name']} tractability:")
    print(tractability)

# Step 5: Get expression data
print("\n5. Getting tissue expression data...")
for _, row in top_targets.iterrows():
    # Brain expression from OpenTargets
    expression = gget.opentargets(
        row["ensembl_id"],
        resource="expression",
        filter_tissue="brain"
    )
    print(f"\n{row['gene_name']} brain expression:")
    print(expression)

    # Tissue expression from ARCHS4
    tissue_expr = gget.archs4(row["gene_name"], which="tissue")
    brain_expr = tissue_expr[tissue_expr["tissue"].str.contains("brain", case=False, na=False)]
    print(f"ARCHS4 brain expression:")
    print(brain_expr)

# Step 6: Check for existing drugs
print("\n6. Checking for existing drugs...")
for _, row in top_targets.iterrows():
    drugs = gget.opentargets(row["ensembl_id"], resource="drugs", limit=5)
    print(f"\n{row['gene_name']} drug associations:")
    if len(drugs) > 0:
        print(drugs[["drug_name", "drug_type", "max_phase_for_all_diseases"]])
    else:
        print("No drugs found")

# Step 7: Get protein-protein interactions
print("\n7. Getting protein-protein interactions...")
for _, row in top_targets.iterrows():
    interactions = gget.opentargets(
        row["ensembl_id"],
        resource="interactions",
        limit=10
    )
    print(f"\n{row['gene_name']} interacts with:")
    if len(interactions) > 0:
        print(interactions[["gene_b_symbol", "interaction_score"]])

# Step 8: Enrichment analysis
print("\n8. Performing pathway enrichment...")
gene_list = top_targets["gene_name"].tolist()
enrichment = gget.enrichr(gene_list, database="pathway", plot=True)
print("\nTop enriched pathways:")
print(enrichment.head(10))

# Step 9: Get structure information
print("\n9. Getting structure information...")
for _, row in top_targets.iterrows():
    info = gget.info([row["ensembl_id"]], pdb=True)

    if "pdb_id" in info.columns and pd.notna(info["pdb_id"].iloc[0]):
        pdb_ids = info["pdb_id"].iloc[0].split(";")
        print(f"\n{row['gene_name']} PDB structures: {', '.join(pdb_ids[:3])}")
    else:
        print(f"\n{row['gene_name']}: No PDB structure available")
        # Could predict with AlphaFold
        print(f"  Consider AlphaFold prediction")

# Step 10: Generate target summary report
print("\n10. Generating target summary report...")
report = []
for _, row in top_targets.iterrows():
    report.append({
        "Gene": row["gene_name"],
        "Ensembl ID": row["ensembl_id"],
        "Disease Score": row["disease_score"],
        "Target Status": "High Priority"
    })

report_df = pd.DataFrame(report)
report_df.to_csv("drug_targets_report.csv", index=False)
print("\nTarget report saved to drug_targets_report.csv")

print("\nDrug target discovery workflow completed!")

Tips for Workflow Development

Error Handling

import gget

def safe_gget_call(func, *args, **kwargs):
    """Wrapper for gget calls with error handling"""
    try:
        result = func(*args, **kwargs)
        return result
    except Exception as e:
        print(f"Error in {func.__name__}: {str(e)}")
        return None

# Usage
result = safe_gget_call(gget.search, ["ACE2"], species="homo_sapiens")
if result is not None:
    print(result)

Rate Limiting

import time
import gget

def rate_limited_queries(gene_ids, delay=1):
    """Query multiple genes with rate limiting"""
    results = []
    for i, gene_id in enumerate(gene_ids):
        print(f"Querying {i+1}/{len(gene_ids)}: {gene_id}")
        result = gget.info([gene_id])
        results.append(result)

        if i < len(gene_ids) - 1:  # Don't sleep after last query
            time.sleep(delay)

    return pd.concat(results, ignore_index=True)

Caching Results

import os
import pickle
import gget

def cached_gget(cache_file, func, *args, **kwargs):
    """Cache gget results to avoid repeated queries"""
    if os.path.exists(cache_file):
        print(f"Loading from cache: {cache_file}")
        with open(cache_file, "rb") as f:
            return pickle.load(f)

    result = func(*args, **kwargs)

    with open(cache_file, "wb") as f:
        pickle.dump(result, f)
    print(f"Saved to cache: {cache_file}")

    return result

# Usage
result = cached_gget("ace2_info.pkl", gget.info, ["ENSG00000130234"])

These workflows demonstrate how to combine multiple gget modules for comprehensive bioinformatics analyses. Adapt them to your specific research questions and data types.