236 lines
7.0 KiB
Python
Executable File
236 lines
7.0 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
"""
|
|
Enrichment Analysis Pipeline
|
|
Perform comprehensive enrichment analysis on a gene list
|
|
"""
|
|
|
|
import argparse
|
|
import sys
|
|
from pathlib import Path
|
|
import gget
|
|
import pandas as pd
|
|
|
|
|
|
def read_gene_list(file_path):
    """Read gene identifiers from a CSV or plain-text file.

    A file whose extension is ``.csv`` (matched case-insensitively) is parsed
    with ``pandas.read_csv`` using its defaults, so the first row is treated
    as a header and the genes are taken from the first column. Any other file
    is read as plain text with one gene per line.

    Args:
        file_path: Path (``str`` or ``pathlib.Path``) of the gene list file.

    Returns:
        A list of non-empty gene identifiers as strings, with surrounding
        whitespace removed, in file order.
    """
    file_path = Path(file_path)

    if file_path.suffix.lower() == ".csv":
        df = pd.read_csv(file_path)
        if df.shape[1] == 0:
            return []
        # Empty cells are parsed as NaN; drop them so they do not end up in
        # the gene list as float values.
        first_column = df.iloc[:, 0].dropna()
        cleaned = (str(value).strip() for value in first_column)
        genes = [gene for gene in cleaned if gene]
    else:
        # "utf-8-sig" reads plain UTF-8 and also discards a leading byte-order
        # mark, which would otherwise be glued to the first gene symbol.
        with open(file_path, "r", encoding="utf-8-sig") as f:
            genes = [line.strip() for line in f if line.strip()]

    return genes
|
|
|
|
|
|
# Column names tried, in order, when reading values out of gget result
# frames. The gget-style names come first; the remaining names are kept as
# fallbacks for frames that use a different naming scheme.
_TERM_COLUMNS = ("path_name", "name", "term")
_P_VALUE_COLUMNS = (
    "adj_p_val",
    "adjusted_p_value",
    "p_val",
    "p_value",
    "Adjusted P-value",
)
_TISSUE_COLUMNS = ("tissue", "id")


def _first_present(row, keys, default):
    """Return the value of the first key in *keys* found in *row*, else *default*."""
    for key in keys:
        if key in row:
            return row[key]
    return default


def _print_top_terms(enrichment, limit=5):
    """Print the first *limit* rows of an enrichment frame, numbered from 1."""
    print(f"\nTop {limit} enriched terms:")
    # Number by position: the frame's index labels are not guaranteed to be
    # 0..n-1, so they cannot be used as ranks.
    for rank, (_, row) in enumerate(enrichment.head(limit).iterrows(), start=1):
        term = _first_present(row, _TERM_COLUMNS, "Unknown")
        p_val = _first_present(row, _P_VALUE_COLUMNS, 1)
        print(f" {rank}. {term}")
        print(f" P-value: {p_val:.2e}")


def _summarize_results(databases, results):
    """Build one summary record per database that produced results."""
    records = []
    for db_key, db_name in databases.items():
        enrichment = results.get(db_key)
        if enrichment is None or len(enrichment) == 0:
            continue
        records.append(
            {
                "Database": db_name,
                "Total Terms": len(enrichment),
                "Top Term": _first_present(
                    enrichment.iloc[0], _TERM_COLUMNS, "N/A"
                ),
            }
        )
    return records


def _collect_expression(gene_list, limit=5):
    """Return the highest-median tissue record for each of the first *limit* genes."""
    records = []
    for gene in gene_list[:limit]:
        print(f" Getting expression for {gene}...")
        try:
            tissue_expr = gget.archs4(gene, which="tissue")
            top_tissue = tissue_expr.nlargest(1, "median").iloc[0]
            records.append(
                {
                    "Gene": gene,
                    "Top Tissue": _first_present(
                        top_tissue, _TISSUE_COLUMNS, "Unknown"
                    ),
                    "Median Expression": top_tissue["median"],
                }
            )
        except Exception as e:
            # Best effort: one failing gene must not stop the others.
            print(f" Warning: {e}")
    return records


def enrichment_pipeline(
    gene_list,
    species="human",
    background=None,
    output_prefix="enrichment",
    plot=True,
):
    """
    Perform comprehensive enrichment analysis.

    Runs ``gget.enrichr`` once per database category, writes each non-empty
    result to ``<output_prefix>_<category>.csv``, writes a summary CSV, and
    finally writes the top tissue (by median expression, from
    ``gget.archs4``) for the first five input genes.

    Args:
        gene_list: List of gene symbols
        species: Species for analysis
        background: Background gene list (optional)
        output_prefix: Prefix for output files
        plot: Whether to generate plots

    Returns:
        True if at least one database was processed without raising an
        error (an empty result still counts as processed). False if the
        gene list is empty or every database raised an error.
    """
    print("Enrichment Analysis Pipeline")
    print("=" * 60)
    print(f"Analyzing {len(gene_list)} genes")
    print(f"Species: {species}\n")

    if len(gene_list) == 0:
        # Nothing to analyze; avoid issuing queries that cannot succeed.
        print("No genes provided; nothing to analyze")
        return False

    # Database categories to analyze
    databases = {
        "pathway": "KEGG Pathways",
        "ontology": "Gene Ontology (Biological Process)",
        "transcription": "Transcription Factors (ChEA)",
        "diseases_drugs": "Disease Associations (GWAS)",
        "celltypes": "Cell Type Markers (PanglaoDB)",
    }

    results = {}
    written_files = []  # Only files that were actually created
    failed = []  # Database keys whose processing raised an error

    for db_key, db_name in databases.items():
        print(f"\nAnalyzing: {db_name}")
        print("-" * 60)

        try:
            enrichment = gget.enrichr(
                gene_list,
                database=db_key,
                species=species,
                background_list=background,
                plot=plot,
            )

            if enrichment is None or len(enrichment) == 0:
                print("No significant results found")
                continue

            # Record the result first so a later save/display problem does
            # not drop it from the summary.
            results[db_key] = enrichment

            output_file = f"{output_prefix}_{db_key}.csv"
            enrichment.to_csv(output_file, index=False)
            written_files.append(output_file)
            print(f"Results saved to: {output_file}")

            _print_top_terms(enrichment)

        except Exception as e:
            print(f"Error: {e}")
            failed.append(db_key)

    # Generate summary report
    print("\n" + "=" * 60)
    print("Generating summary report...")

    summary = _summarize_results(databases, results)

    if summary:
        summary_df = pd.DataFrame(summary)
        summary_file = f"{output_prefix}_summary.csv"
        summary_df.to_csv(summary_file, index=False)
        written_files.append(summary_file)
        print(f"\nSummary saved to: {summary_file}")
        print("\n" + summary_df.to_string(index=False))
    else:
        print("\nNo enrichment results to summarize")

    # Get expression data for genes
    print("\n" + "=" * 60)
    print("Getting expression data for input genes...")

    try:
        expr_data = _collect_expression(gene_list)

        if expr_data:
            expr_df = pd.DataFrame(expr_data)
            expr_file = f"{output_prefix}_expression.csv"
            expr_df.to_csv(expr_file, index=False)
            written_files.append(expr_file)
            print(f"\nExpression data saved to: {expr_file}")

    except Exception as e:
        print(f"Error getting expression data: {e}")

    print("\n" + "=" * 60)
    print("Enrichment analysis complete!")
    print(f"\nOutput files (prefix: {output_prefix}):")
    if written_files:
        for path in written_files:
            print(f" - {path}")
    else:
        print(" (none)")

    # Report failure only when nothing could be processed at all.
    return len(failed) < len(databases)
|
|
|
|
|
|
def main():
    """Command-line entry point.

    Parses the arguments, loads the gene list (and the optional background
    list), runs ``enrichment_pipeline`` and exits with status 0 when it
    reports success, 1 otherwise. A missing gene list file, a keyboard
    interrupt, or any other error also exits with status 1.
    """
    cli = argparse.ArgumentParser(
        description="Perform comprehensive enrichment analysis using gget"
    )

    # (flags, keyword arguments) for every argument, in display order.
    argument_specs = (
        (
            ("genes",),
            {
                "help": "Gene list file (one gene per line or CSV with genes in first column)"
            },
        ),
        (
            ("-s", "--species"),
            {
                "default": "human",
                "help": "Species (human, mouse, fly, yeast, worm, fish)",
            },
        ),
        (
            ("-b", "--background"),
            {"help": "Background gene list file (optional)"},
        ),
        (
            ("-o", "--output"),
            {
                "default": "enrichment",
                "help": "Output prefix (default: enrichment)",
            },
        ),
        (
            ("--no-plot",),
            {"action": "store_true", "help": "Disable plotting"},
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    opts = cli.parse_args()

    # The gene list is mandatory: bail out early when it is missing.
    genes_file = opts.genes
    if not Path(genes_file).exists():
        print(f"Error: File not found: {genes_file}")
        sys.exit(1)

    try:
        gene_list = read_gene_list(genes_file)
        print(f"Read {len(gene_list)} genes from {genes_file}")

        # The background list is optional; a missing file only warns.
        background = None
        background_file = opts.background
        if background_file and not Path(background_file).exists():
            print(f"Warning: Background file not found: {background_file}")
        elif background_file:
            background = read_gene_list(background_file)
            print(f"Read {len(background)} background genes from {background_file}")

        success = enrichment_pipeline(
            gene_list,
            species=opts.species,
            background=background,
            output_prefix=opts.output,
            plot=not opts.no_plot,
        )

        exit_code = 0 if success else 1
        sys.exit(exit_code)

    except KeyboardInterrupt:
        print("\n\nAnalysis interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n\nError: {e}")
        import traceback

        traceback.print_exc()
        sys.exit(1)
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a
# script; importing it as a module does not call main().
if __name__ == "__main__":
    main()
|