Initial commit
This commit is contained in:
191
skills/gget/scripts/batch_sequence_analysis.py
Executable file
191
skills/gget/scripts/batch_sequence_analysis.py
Executable file
@@ -0,0 +1,191 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Batch Sequence Analysis Script
|
||||
Analyze multiple sequences: BLAST, alignment, and structure prediction
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import gget
|
||||
|
||||
|
||||
def read_fasta(fasta_file):
    """Parse a FASTA file into a list of records.

    Args:
        fasta_file: Path of the FASTA file to read.

    Returns:
        A list of ``{"id": ..., "seq": ...}`` dicts in file order. ``id`` is
        the header line without the leading ``>`` (surrounding whitespace
        removed); ``seq`` is the concatenation of all sequence lines that
        follow that header. Lines that appear before the first header are
        ignored.
    """
    sequences = []
    current_id = None
    current_seq = []

    with open(fasta_file, "r") as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no sequence data.
                continue

            if line.startswith(">"):
                # Compare against None rather than relying on truthiness: a
                # bare ">" header produces an empty ID, and that record must
                # not be silently dropped.
                if current_id is not None:
                    sequences.append({"id": current_id, "seq": "".join(current_seq)})
                current_id = line[1:].strip()
                current_seq = []
            elif current_id is not None:
                current_seq.append(line)

    # Flush the final record, which has no following header to trigger it.
    if current_id is not None:
        sequences.append({"id": current_id, "seq": "".join(current_seq)})

    return sequences
|
||||
|
||||
|
||||
def _safe_filename(name, fallback):
    """Make *name* safe to embed in a file name.

    Every character that is not alphanumeric, ``.``, ``_`` or ``-`` is
    replaced by ``_``. *fallback* is returned when *name* is empty.
    """
    cleaned = "".join(ch if ch.isalnum() or ch in "._-" else "_" for ch in name)
    return cleaned or fallback


def analyze_sequences(
    fasta_file,
    blast_db="nr",
    align=True,
    predict_structure=False,
    output_dir="output",
):
    """
    Perform batch sequence analysis.

    Runs a BLAST search per sequence, optionally aligns all sequences, and
    optionally walks the (disabled by default) structure prediction step.
    A failure in any single BLAST search or in the alignment is reported and
    does not stop the remaining steps.

    Args:
        fasta_file: Path to FASTA file with sequences
        blast_db: BLAST database to search (default: nr)
        align: Whether to perform multiple sequence alignment
        predict_structure: Whether to predict structures with AlphaFold
        output_dir: Output directory for results (created, including missing
            parent directories, if it does not exist)

    Returns:
        True once all requested steps have been attempted.
    """
    output_path = Path(output_dir)
    # parents=True so a nested output directory such as "results/run1" works.
    output_path.mkdir(parents=True, exist_ok=True)

    print("Batch Sequence Analysis")
    print("=" * 60)
    print(f"Input file: {fasta_file}")
    print(f"Output directory: {output_dir}")
    print("")

    # Read sequences
    print("Reading sequences...")
    sequences = read_fasta(fasta_file)
    print(f"Found {len(sequences)} sequences\n")

    # Step 1: BLAST each sequence
    print("Step 1: Running BLAST searches...")
    print("-" * 60)
    for number, seq_data in enumerate(sequences, start=1):
        print(f"\n{number}. BLASTing {seq_data['id']}...")
        # FASTA headers routinely contain "|", "/" and spaces, none of which
        # belong in a file name; IDs that are already safe are left unchanged.
        file_stem = _safe_filename(seq_data["id"], f"sequence_{number}")
        try:
            blast_results = gget.blast(
                seq_data["seq"], database=blast_db, limit=10, save=False
            )

            output_file = output_path / f"{file_stem}_blast.csv"
            blast_results.to_csv(output_file, index=False)
            print(f" Results saved to: {output_file}")

            if len(blast_results) > 0:
                top_hit = blast_results.iloc[0]
                # NOTE(review): the BLAST table produced by gget appears to
                # name this column "Query Cover"; the previous key is kept as
                # a fallback. Lookups use .get() so that a renamed column
                # degrades to "N/A" instead of reporting a failed search.
                coverage = top_hit.get(
                    "Query Cover", top_hit.get("Query Coverage", "N/A")
                )
                print(f" Top hit: {top_hit.get('Description', 'N/A')}")
                print(
                    f" Max Score: {top_hit.get('Max Score', 'N/A')}, "
                    f"Query Coverage: {coverage}"
                )
        except Exception as e:
            print(f" Error: {e}")

    # Step 2: Multiple sequence alignment
    if align and len(sequences) > 1:
        print("\n\nStep 2: Multiple sequence alignment...")
        print("-" * 60)
        try:
            alignment_file = output_path / "alignment.afa"
            # Let gget write the aligned FASTA itself. NOTE(review): per the
            # gget documentation, muscle() without `out` prints the alignment
            # and returns nothing, so writing its return value to a file
            # fails - confirm against the installed gget version.
            gget.muscle(fasta_file, out=str(alignment_file))
            print(f"Alignment saved to: {alignment_file}")
        except Exception as e:
            print(f"Error in alignment: {e}")
    else:
        print("\n\nStep 2: Skipping alignment (only 1 sequence or disabled)")

    # Step 3: Structure prediction (optional)
    if predict_structure:
        print("\n\nStep 3: Predicting structures with AlphaFold...")
        print("-" * 60)
        print(
            "Note: This requires 'gget setup alphafold' and is computationally intensive"
        )

        for number, seq_data in enumerate(sequences, start=1):
            print(f"\n{number}. Predicting structure for {seq_data['id']}...")
            try:
                file_stem = _safe_filename(seq_data["id"], f"sequence_{number}")
                structure_dir = output_path / f"structure_{file_stem}"
                # Uncomment to run AlphaFold prediction:
                # gget.alphafold(seq_data['seq'], out=str(structure_dir))
                # print(f" Structure saved to: {structure_dir}")
                print(
                    " (Prediction skipped - uncomment code to run AlphaFold prediction)"
                )
            except Exception as e:
                print(f" Error: {e}")
    else:
        print("\n\nStep 3: Structure prediction disabled")

    # Summary
    print("\n" + "=" * 60)
    print("Batch analysis complete!")
    print(f"\nResults saved to: {output_dir}/")
    print(" - BLAST results: *_blast.csv")
    if align and len(sequences) > 1:
        print(" - Alignment: alignment.afa")
    if predict_structure:
        print(" - Structures: structure_*/")

    return True
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the batch sequence analysis.

    Parses the arguments, checks that the FASTA file exists, and hands off to
    ``analyze_sequences``. The process exits with status 0 when the analysis
    returns a truthy value and with status 1 otherwise (missing input file,
    Ctrl-C, or any other exception, whose traceback is printed).
    """
    cli = argparse.ArgumentParser(
        description="Perform batch sequence analysis using gget"
    )
    cli.add_argument("fasta", help="Input FASTA file with sequences")
    cli.add_argument(
        "-db",
        "--database",
        default="nr",
        help="BLAST database (default: nr for proteins, nt for nucleotides)",
    )
    cli.add_argument(
        "--no-align", action="store_true", help="Skip multiple sequence alignment"
    )
    cli.add_argument(
        "--predict-structure",
        action="store_true",
        help="Predict structures with AlphaFold (requires setup)",
    )
    cli.add_argument(
        "-o", "--output", default="output", help="Output directory (default: output)"
    )
    opts = cli.parse_args()

    # Fail early, before any analysis output is produced.
    fasta_path = Path(opts.fasta)
    if not fasta_path.exists():
        print(f"Error: File not found: {opts.fasta}")
        sys.exit(1)

    try:
        ok = analyze_sequences(
            opts.fasta,
            blast_db=opts.database,
            align=not opts.no_align,
            predict_structure=opts.predict_structure,
            output_dir=opts.output,
        )
    except KeyboardInterrupt:
        print("\n\nAnalysis interrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"\n\nError: {exc}")
        import traceback

        traceback.print_exc()
        sys.exit(1)

    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
|
||||
235
skills/gget/scripts/enrichment_pipeline.py
Executable file
235
skills/gget/scripts/enrichment_pipeline.py
Executable file
@@ -0,0 +1,235 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Enrichment Analysis Pipeline
|
||||
Perform comprehensive enrichment analysis on a gene list
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import gget
|
||||
import pandas as pd
|
||||
|
||||
|
||||
def read_gene_list(file_path):
    """Read gene list from file (one gene per line or CSV).

    Args:
        file_path: Path to the gene list. A file whose extension is ``.csv``
            (compared case-insensitively) is parsed with pandas, treating the
            first row as a header, and the first column is taken as the gene
            names. Any other file is read as plain text, one gene per line.

    Returns:
        A list of gene names as strings, in file order, with surrounding
        whitespace removed and empty entries skipped.
    """
    file_path = Path(file_path)

    if file_path.suffix.lower() == ".csv":
        df = pd.read_csv(file_path)
        # Assume first column contains gene names. Missing cells are dropped
        # and values are coerced to stripped strings so the result has the
        # same shape as the plain-text branch.
        first_column = df.iloc[:, 0].dropna().astype(str).str.strip()
        genes = [gene for gene in first_column.tolist() if gene]
    else:
        # Plain text file
        with open(file_path, "r") as f:
            genes = [line.strip() for line in f if line.strip()]

    return genes
|
||||
|
||||
|
||||
def _first_available(row, candidates, default):
    """Return ``row[c]`` for the first ``c`` in *candidates* that *row* contains, else *default*."""
    for column in candidates:
        if column in row:
            return row[column]
    return default


def enrichment_pipeline(
    gene_list,
    species="human",
    background=None,
    output_prefix="enrichment",
    plot=True,
):
    """
    Perform comprehensive enrichment analysis.

    Queries ``gget.enrichr`` once per database category, saves each non-empty
    result to ``{output_prefix}_{category}.csv``, writes a summary CSV, and
    looks up ARCHS4 tissue expression for the first five genes. A failure in
    any single query is printed and the pipeline continues.

    Args:
        gene_list: List of gene symbols
        species: Species for analysis
        background: Background gene list (optional)
        output_prefix: Prefix for output files
        plot: Whether to generate plots

    Returns:
        True once all steps have been attempted.
    """
    print("Enrichment Analysis Pipeline")
    print("=" * 60)
    print(f"Analyzing {len(gene_list)} genes")
    print(f"Species: {species}\n")

    # Database categories to analyze
    databases = {
        "pathway": "KEGG Pathways",
        "ontology": "Gene Ontology (Biological Process)",
        "transcription": "Transcription Factors (ChEA)",
        "diseases_drugs": "Disease Associations (GWAS)",
        "celltypes": "Cell Type Markers (PanglaoDB)",
    }

    # NOTE(review): gget's enrichr result appears to use "path_name",
    # "adj_p_val" and "p_val"; the previously used names are tried first so
    # either schema works - confirm against the installed gget version.
    term_columns = ("name", "term", "path_name")
    p_value_columns = (
        "adjusted_p_value",
        "p_value",
        "Adjusted P-value",
        "adj_p_val",
        "p_val",
    )

    results = {}
    written_files = []  # only files that were actually created

    for db_key, db_name in databases.items():
        print(f"\nAnalyzing: {db_name}")
        print("-" * 60)

        try:
            enrichment = gget.enrichr(
                gene_list,
                database=db_key,
                species=species,
                background_list=background,
                plot=plot,
            )

            if enrichment is not None and len(enrichment) > 0:
                # Save results
                output_file = f"{output_prefix}_{db_key}.csv"
                enrichment.to_csv(output_file, index=False)
                print(f"Results saved to: {output_file}")

                # Record the result as soon as it is on disk so that a
                # display problem below cannot drop it from the summary.
                results[db_key] = enrichment
                written_files.append(output_file)

                # Show top 5 results, numbered by position rather than by
                # whatever index labels the DataFrame happens to carry.
                print("\nTop 5 enriched terms:")
                for rank, (_, row) in enumerate(
                    enrichment.head(5).iterrows(), start=1
                ):
                    term = _first_available(row, term_columns, "Unknown")
                    p_val = _first_available(row, p_value_columns, 1)
                    print(f" {rank}. {term}")
                    print(f" P-value: {float(p_val):.2e}")
            else:
                print("No significant results found")

        except Exception as e:
            print(f"Error: {e}")

    # Generate summary report
    print("\n" + "=" * 60)
    print("Generating summary report...")

    summary = []
    for db_key, db_name in databases.items():
        if db_key in results and len(results[db_key]) > 0:
            top_row = results[db_key].iloc[0]
            summary.append(
                {
                    "Database": db_name,
                    "Total Terms": len(results[db_key]),
                    "Top Term": _first_available(top_row, term_columns, "N/A"),
                }
            )

    if summary:
        summary_df = pd.DataFrame(summary)
        summary_file = f"{output_prefix}_summary.csv"
        summary_df.to_csv(summary_file, index=False)
        written_files.append(summary_file)
        print(f"\nSummary saved to: {summary_file}")
        print("\n" + summary_df.to_string(index=False))
    else:
        print("\nNo enrichment results to summarize")

    # Get expression data for genes
    print("\n" + "=" * 60)
    print("Getting expression data for input genes...")

    try:
        # Get tissue expression for first few genes
        expr_data = []
        for gene in gene_list[:5]:  # Limit to first 5
            print(f" Getting expression for {gene}...")
            try:
                tissue_expr = gget.archs4(gene, which="tissue")
                # NOTE(review): gget's ARCHS4 tissue table appears to label
                # tissues in an "id" column; "tissue" is still preferred when
                # present - confirm against the installed gget version.
                tissue_column = "tissue" if "tissue" in tissue_expr.columns else "id"
                top_tissue = tissue_expr.nlargest(1, "median").iloc[0]
                expr_data.append(
                    {
                        "Gene": gene,
                        "Top Tissue": top_tissue[tissue_column],
                        "Median Expression": top_tissue["median"],
                    }
                )
            except Exception as e:
                print(f" Warning: {e}")

        if expr_data:
            expr_df = pd.DataFrame(expr_data)
            expr_file = f"{output_prefix}_expression.csv"
            expr_df.to_csv(expr_file, index=False)
            written_files.append(expr_file)
            print(f"\nExpression data saved to: {expr_file}")

    except Exception as e:
        print(f"Error getting expression data: {e}")

    print("\n" + "=" * 60)
    print("Enrichment analysis complete!")
    print(f"\nOutput files (prefix: {output_prefix}):")
    # List only what was really written; a failed step leaves no file behind.
    for written in written_files:
        print(f" - {written}")

    return True
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the enrichment pipeline.

    Parses the arguments, loads the gene list (and the optional background
    list), and runs ``enrichment_pipeline``. A missing background file only
    produces a warning. The process exits with status 0 when the pipeline
    returns a truthy value and with status 1 otherwise (missing gene file,
    Ctrl-C, or any other exception, whose traceback is printed).
    """
    cli = argparse.ArgumentParser(
        description="Perform comprehensive enrichment analysis using gget"
    )
    cli.add_argument(
        "genes",
        help="Gene list file (one gene per line or CSV with genes in first column)",
    )
    cli.add_argument(
        "-s",
        "--species",
        default="human",
        help="Species (human, mouse, fly, yeast, worm, fish)",
    )
    cli.add_argument("-b", "--background", help="Background gene list file (optional)")
    cli.add_argument(
        "-o", "--output", default="enrichment", help="Output prefix (default: enrichment)"
    )
    cli.add_argument("--no-plot", action="store_true", help="Disable plotting")
    opts = cli.parse_args()

    # The gene list is mandatory: stop before doing any work if it is absent.
    genes_path = Path(opts.genes)
    if not genes_path.exists():
        print(f"Error: File not found: {opts.genes}")
        sys.exit(1)

    try:
        gene_list = read_gene_list(opts.genes)
        print(f"Read {len(gene_list)} genes from {opts.genes}")

        # The background list is optional; an unreadable path is not fatal.
        background = None
        if opts.background:
            if not Path(opts.background).exists():
                print(f"Warning: Background file not found: {opts.background}")
            else:
                background = read_gene_list(opts.background)
                print(f"Read {len(background)} background genes from {opts.background}")

        ok = enrichment_pipeline(
            gene_list,
            species=opts.species,
            background=background,
            output_prefix=opts.output,
            plot=not opts.no_plot,
        )
    except KeyboardInterrupt:
        print("\n\nAnalysis interrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"\n\nError: {exc}")
        import traceback

        traceback.print_exc()
        sys.exit(1)

    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
|
||||
161
skills/gget/scripts/gene_analysis.py
Executable file
161
skills/gget/scripts/gene_analysis.py
Executable file
@@ -0,0 +1,161 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Gene Analysis Script
|
||||
Quick analysis of a gene: search, info, sequences, expression, and enrichment
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import sys
|
||||
import gget
|
||||
|
||||
|
||||
def _first_available(row, candidates, default):
    """Return ``row[c]`` for the first ``c`` in *candidates* that *row* contains, else *default*."""
    for column in candidates:
        if column in row:
            return row[column]
    return default


def _to_fasta_text(seq_result):
    """Turn a sequence result (a string, a sequence of lines, or None) into newline-terminated text."""
    if seq_result is None:
        return ""
    if isinstance(seq_result, str):
        text = seq_result
    else:
        text = "\n".join(str(line) for line in seq_result)
    if text and not text.endswith("\n"):
        text += "\n"
    return text


def analyze_gene(gene_name, species="homo_sapiens", output_prefix=None):
    """
    Perform comprehensive analysis of a gene.

    Steps: Ensembl search, detailed info, nucleotide/protein sequences,
    ARCHS4 tissue expression and correlated genes, OpenTargets disease and
    drug associations. Steps 4-7 are best-effort: a failure prints a warning
    and the analysis continues. Exceptions from steps 1-3 propagate to the
    caller.

    Args:
        gene_name: Gene symbol to analyze
        species: Species name (default: homo_sapiens)
        output_prefix: Prefix for output files (default: gene_name, lower-cased)

    Returns:
        False when the gene is not found, True otherwise.
    """
    if output_prefix is None:
        output_prefix = gene_name.lower()

    written_files = []  # only files that were actually created

    print(f"Analyzing gene: {gene_name}")
    print("=" * 60)

    # Step 1: Search for the gene
    print("\n1. Searching for gene...")
    search_results = gget.search([gene_name], species=species, limit=1)

    if search_results is None or len(search_results) == 0:
        print(f"Error: Gene '{gene_name}' not found in {species}")
        return False

    gene_id = search_results["ensembl_id"].iloc[0]
    print(f" Found: {gene_id}")
    print(f" Description: {search_results['ensembl_description'].iloc[0]}")

    # Step 2: Get detailed information
    print("\n2. Getting detailed information...")
    gene_info = gget.info([gene_id], pdb=True)
    info_file = f"{output_prefix}_info.csv"
    gene_info.to_csv(info_file, index=False)
    written_files.append(info_file)
    print(f" Saved to: {info_file}")

    if "uniprot_id" in gene_info.columns and gene_info["uniprot_id"].iloc[0]:
        print(f" UniProt ID: {gene_info['uniprot_id'].iloc[0]}")
    if "pdb_id" in gene_info.columns and gene_info["pdb_id"].iloc[0]:
        print(f" PDB IDs: {gene_info['pdb_id'].iloc[0]}")

    # Step 3: Get sequences
    print("\n3. Retrieving sequences...")
    nucleotide_seq = gget.seq([gene_id])
    protein_seq = gget.seq([gene_id], translate=True)

    # NOTE(review): gget.seq is documented to return a list of FASTA lines in
    # Python, which file.write() rejects; _to_fasta_text accepts either a
    # list or a ready-made string - confirm against the installed gget version.
    nucleotide_file = f"{output_prefix}_nucleotide.fasta"
    with open(nucleotide_file, "w") as f:
        f.write(_to_fasta_text(nucleotide_seq))
    written_files.append(nucleotide_file)
    print(f" Nucleotide sequence saved to: {nucleotide_file}")

    protein_file = f"{output_prefix}_protein.fasta"
    with open(protein_file, "w") as f:
        f.write(_to_fasta_text(protein_seq))
    written_files.append(protein_file)
    print(f" Protein sequence saved to: {protein_file}")

    # The display loops below try the originally used column name first and
    # then the name gget appears to use (NOTE(review): confirm), so a schema
    # difference no longer turns a successful query into a warning.

    # Step 4: Get tissue expression
    print("\n4. Getting tissue expression...")
    try:
        tissue_expr = gget.archs4(gene_name, which="tissue")
        tissue_file = f"{output_prefix}_tissue_expression.csv"
        tissue_expr.to_csv(tissue_file, index=False)
        written_files.append(tissue_file)
        print(f" Saved to: {tissue_file}")

        # Show top tissues
        top_tissues = tissue_expr.nlargest(5, "median")
        print("\n Top expressing tissues:")
        for _, row in top_tissues.iterrows():
            tissue = _first_available(row, ("tissue", "id"), "Unknown")
            print(f" {tissue}: median = {row['median']:.2f}")
    except Exception as e:
        print(f" Warning: Could not retrieve ARCHS4 data: {e}")

    # Step 5: Find correlated genes
    print("\n5. Finding correlated genes...")
    try:
        correlated = gget.archs4(gene_name, which="correlation")
        correlated_file = f"{output_prefix}_correlated_genes.csv"
        correlated.to_csv(correlated_file, index=False)
        written_files.append(correlated_file)
        print(f" Saved to: {correlated_file}")

        # Show top correlated
        print("\n Top 10 correlated genes:")
        for _, row in correlated.head(10).iterrows():
            r_value = _first_available(
                row, ("correlation", "pearson_correlation"), float("nan")
            )
            print(f" {row['gene_symbol']}: r = {float(r_value):.3f}")
    except Exception as e:
        print(f" Warning: Could not retrieve correlation data: {e}")

    # Step 6: Get disease associations
    print("\n6. Getting disease associations...")
    try:
        diseases = gget.opentargets(gene_id, resource="diseases", limit=10)
        diseases_file = f"{output_prefix}_diseases.csv"
        diseases.to_csv(diseases_file, index=False)
        written_files.append(diseases_file)
        print(f" Saved to: {diseases_file}")

        print("\n Top 5 disease associations:")
        for _, row in diseases.head(5).iterrows():
            disease = _first_available(row, ("disease_name", "name"), "Unknown")
            score = _first_available(row, ("overall_score", "score"), float("nan"))
            print(f" {disease}: score = {float(score):.3f}")
    except Exception as e:
        print(f" Warning: Could not retrieve disease data: {e}")

    # Step 7: Get drug associations
    print("\n7. Getting drug associations...")
    try:
        drugs = gget.opentargets(gene_id, resource="drugs", limit=10)
        if len(drugs) > 0:
            drugs_file = f"{output_prefix}_drugs.csv"
            drugs.to_csv(drugs_file, index=False)
            written_files.append(drugs_file)
            print(f" Saved to: {drugs_file}")
            print(f"\n Found {len(drugs)} drug associations")
        else:
            print(" No drug associations found")
    except Exception as e:
        print(f" Warning: Could not retrieve drug data: {e}")

    print("\n" + "=" * 60)
    print("Analysis complete!")
    print(f"\nOutput files (prefix: {output_prefix}):")
    # List only what was really written; a failed step leaves no file behind.
    for written in written_files:
        print(f" - {written}")

    return True
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the single-gene analysis.

    Parses the arguments and runs ``analyze_gene``. The process exits with
    status 0 when the analysis returns a truthy value and with status 1
    otherwise (gene not found, Ctrl-C, or any other exception, whose message
    is printed).
    """
    cli = argparse.ArgumentParser(
        description="Perform comprehensive analysis of a gene using gget"
    )
    cli.add_argument("gene", help="Gene symbol to analyze")
    cli.add_argument(
        "-s",
        "--species",
        default="homo_sapiens",
        help="Species (default: homo_sapiens)",
    )
    cli.add_argument(
        "-o", "--output", help="Output prefix for files (default: gene name)"
    )
    opts = cli.parse_args()

    try:
        ok = analyze_gene(opts.gene, opts.species, opts.output)
    except KeyboardInterrupt:
        print("\n\nAnalysis interrupted by user")
        sys.exit(1)
    except Exception as exc:
        print(f"\n\nError: {exc}")
        sys.exit(1)

    sys.exit(0 if ok else 1)


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user