#!/usr/bin/env python3
"""
Complete Single-Cell Analysis Template

This template provides a complete workflow for single-cell RNA-seq analysis
using scanpy, from data loading through clustering and cell type annotation.

Customize the parameters and sections as needed for your specific dataset.
"""

import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc

# ============================================================================
# CONFIGURATION
# ============================================================================

# --- File paths -------------------------------------------------------------
# Input AnnData file; point this at your own dataset.
INPUT_FILE = "data/raw_counts.h5ad"
# Directory that receives the processed object and the exported tables.
OUTPUT_DIR = "results/"
# Directory handed to scanpy as its figure output location.
FIGURES_DIR = "figures/"

# --- QC parameters ----------------------------------------------------------
# Minimum number of genes a cell must express to be kept.
MIN_GENES = 200
# Minimum number of cells a gene must be detected in to be kept.
MIN_CELLS = 3
# Cells are kept only if their mitochondrial percentage is below this value.
MT_THRESHOLD = 5

# --- Analysis parameters ----------------------------------------------------
# Number of highly variable genes to select.
N_TOP_GENES = 2000
# Number of principal components used when building the neighbor graph.
N_PCS = 40
# Number of neighbors used when building the neighbor graph.
N_NEIGHBORS = 10
# Resolution passed to Leiden clustering.
LEIDEN_RESOLUTION = 0.5

# --- Scanpy settings --------------------------------------------------------
sc.settings.verbosity = 3
sc.settings.figdir = FIGURES_DIR
sc.settings.set_figure_params(facecolor="white", dpi=80)

# ============================================================================
# 1. LOAD DATA
# ============================================================================

print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

# Read the input matrix. The default reader expects an .h5ad file; swap in
# one of the alternative readers below if your data is in another format.
adata = sc.read_h5ad(INPUT_FILE)
# adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/')  # For 10X data
# adata = sc.read_csv('data/counts.csv')  # For CSV data

# adata.shape is (n_obs, n_vars), i.e. (cells, genes).
print(f"Loaded: {adata.shape[0]} cells x {adata.shape[1]} genes")

# ============================================================================
# 2. QUALITY CONTROL
# ============================================================================

print("\n" + "=" * 80)
print("QUALITY CONTROL")
print("=" * 80)

# Flag mitochondrial genes. The prefix match is case-insensitive so that both
# upper-case ('MT-') and lower-case ('mt-') gene naming are recognised;
# a case-sensitive match would silently flag nothing for the latter.
adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')

# Compute QC metrics in place. With qc_vars=['mt'] this adds the
# 'pct_counts_mt' column that is plotted and filtered on below.
sc.pp.calculate_qc_metrics(
    adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True
)

# Visualize QC metrics before filtering
sc.pl.violin(
    adata,
    ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
    jitter=0.4,
    multi_panel=True,
    save='_qc_before_filtering',
)

sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes')

# Filter cells and genes
print(f"\nBefore filtering: {adata.n_obs} cells, {adata.n_vars} genes")

sc.pp.filter_cells(adata, min_genes=MIN_GENES)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
# Indexing an AnnData returns a view; take an explicit copy so the in-place
# normalisation steps that follow operate on real data instead of triggering
# an implicit view-to-copy conversion.
adata = adata[adata.obs.pct_counts_mt < MT_THRESHOLD, :].copy()

print(f"After filtering: {adata.n_obs} cells, {adata.n_vars} genes")

# ============================================================================
# 3. NORMALIZATION
# ============================================================================

print("", "=" * 80, "NORMALIZATION", "=" * 80, sep="\n")

# Rescale every cell to a total of 10,000 counts, then log1p-transform.
# Both calls modify `adata` in place.
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the normalized, log-transformed data (all genes) in `.raw` before the
# later steps subset and rescale the main matrix.
adata.raw = adata

# ============================================================================
# 4. FEATURE SELECTION
# ============================================================================

print("\n" + "=" * 80)
print("FEATURE SELECTION")
print("=" * 80)

# Identify highly variable genes; this adds the boolean
# `adata.var['highly_variable']` column used below.
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)

# Visualize
sc.pl.highly_variable_genes(adata, save='_hvg')

# Use the vectorised Series.sum() rather than the builtin sum(), which would
# iterate the column element by element.
print(f"Selected {adata.var.highly_variable.sum()} highly variable genes")

# Subset to highly variable genes. Indexing returns a view, so copy it:
# the regression and scaling steps that follow modify the matrix in place.
adata = adata[:, adata.var.highly_variable].copy()

# ============================================================================
# 5. SCALING AND REGRESSION
# ============================================================================

print("", "=" * 80, "SCALING AND REGRESSION", "=" * 80, sep="\n")

# Remove variation explained by the total counts per cell and by the
# mitochondrial percentage (both are per-cell columns in `adata.obs`).
sc.pp.regress_out(adata, keys=['total_counts', 'pct_counts_mt'])

# Scale the data, clipping values at 10.
sc.pp.scale(adata, max_value=10)

# ============================================================================
# 6. DIMENSIONALITY REDUCTION
# ============================================================================

print("", "=" * 80, "DIMENSIONALITY REDUCTION", "=" * 80, sep="\n")

# Principal component analysis, plus a log-scale plot of the variance ratio
# per component to help judge how many PCs to keep.
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, save='_pca_variance', log=True)

# Neighborhood graph built on the first N_PCS principal components.
sc.pp.neighbors(adata, n_pcs=N_PCS, n_neighbors=N_NEIGHBORS)

# UMAP embedding computed from the neighborhood graph.
sc.tl.umap(adata)

# ============================================================================
# 7. CLUSTERING
# ============================================================================

print("", "=" * 80, "CLUSTERING", "=" * 80, sep="\n")

# Leiden clustering; labels are stored in `adata.obs['leiden']`.
sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION)

# Show the clusters on the UMAP with labels drawn on top of the points.
sc.pl.umap(adata, color='leiden', save='_leiden', legend_loc='on data')

print(f"Identified {adata.obs['leiden'].nunique()} clusters")

# ============================================================================
# 8. MARKER GENE IDENTIFICATION
# ============================================================================

print("\n" + "=" * 80)
print("MARKER GENE IDENTIFICATION")
print("=" * 80)

# Rank genes for each Leiden cluster with the Wilcoxon rank-sum test.
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# Visualize top markers
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markers')
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, save='_markers_heatmap')
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, save='_markers_dotplot')

# Print the top 10 markers for each cluster. Iterate the categorical's
# categories rather than .unique(): unique() yields clusters in order of first
# appearance among the cells, which makes the report order arbitrary.
for cluster in adata.obs['leiden'].cat.categories:
    print(f"\nCluster {cluster} top markers:")
    markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(10)
    print(markers[['names', 'scores', 'pvals_adj']].to_string(index=False))

# ============================================================================
# 9. CELL TYPE ANNOTATION (CUSTOMIZE THIS SECTION)
# ============================================================================

print("\n" + "=" * 80)
print("CELL TYPE ANNOTATION")
print("=" * 80)

# Example marker genes for common cell types (customize for your data)
marker_genes = {
    'T cells': ['CD3D', 'CD3E', 'CD3G'],
    'B cells': ['MS4A1', 'CD79A', 'CD79B'],
    'Monocytes': ['CD14', 'LYZ', 'S100A8'],
    'NK cells': ['NKG7', 'GNLY', 'KLRD1'],
    'Dendritic cells': ['FCER1A', 'CST3'],
}

# Plot each marker set on the UMAP, one figure per cell type. Genes are looked
# up in `.raw` (all genes), and sets with no gene present are skipped.
for cell_type, genes in marker_genes.items():
    available_genes = [g for g in genes if g in adata.raw.var_names]
    if available_genes:
        sc.pl.umap(
            adata,
            color=available_genes,
            use_raw=True,
            save=f'_{cell_type.replace(" ", "_")}',
        )

# Manual annotation based on marker expression (customize this mapping)
cluster_to_celltype = {
    '0': 'CD4 T cells',
    '1': 'CD14+ Monocytes',
    '2': 'B cells',
    '3': 'CD8 T cells',
    '4': 'NK cells',
    # Add more mappings based on your marker analysis
}

# Apply annotations. The cluster labels are cast to plain strings before
# mapping: mapping the categorical column directly yields a Categorical when
# every cluster has a distinct label, and fillna('Unknown') then raises
# because 'Unknown' is not one of its categories. Clusters missing from the
# mapping become 'Unknown'; the result is stored as a categorical column.
adata.obs['cell_type'] = (
    adata.obs['leiden']
    .astype(str)
    .map(cluster_to_celltype)
    .fillna('Unknown')
    .astype('category')
)

# Visualize annotated cell types
sc.pl.umap(adata, color='cell_type', legend_loc='on data', save='_celltypes')

# ============================================================================
# 10. ADDITIONAL ANALYSES (OPTIONAL)
# ============================================================================

print("", "=" * 80, "ADDITIONAL ANALYSES", "=" * 80, sep="\n")

# Optional: PAGA graph over the Leiden clusters, followed by its plot.
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata, save='_paga', color='leiden')

# Optional: gene set scoring. Uncomment and adapt the gene list to use it.
# example_gene_set = ['CD3D', 'CD3E', 'CD3G']
# sc.tl.score_genes(adata, example_gene_set, score_name='T_cell_score')
# sc.pl.umap(adata, color='T_cell_score', save='_gene_set_score')

# ============================================================================
# 11. SAVE RESULTS
# ============================================================================

print("\n" + "=" * 80)
print("SAVING RESULTS")
print("=" * 80)

os.makedirs(OUTPUT_DIR, exist_ok=True)

# Build paths with os.path.join so that an OUTPUT_DIR with or without a
# trailing slash produces a clean path (no 'results//file').
processed_path = os.path.join(OUTPUT_DIR, 'processed_data.h5ad')

# Save processed AnnData object
adata.write(processed_path)
print(f"Saved processed data to {processed_path}")

# Export per-cell and per-gene metadata tables
adata.obs.to_csv(os.path.join(OUTPUT_DIR, 'cell_metadata.csv'))
adata.var.to_csv(os.path.join(OUTPUT_DIR, 'gene_metadata.csv'))
print(f"Saved metadata to {OUTPUT_DIR}")

# Export the full marker table of every cluster, one CSV per cluster,
# visiting clusters in their category order.
for cluster in adata.obs['leiden'].cat.categories:
    markers = sc.get.rank_genes_groups_df(adata, group=cluster)
    markers.to_csv(
        os.path.join(OUTPUT_DIR, f'markers_cluster_{cluster}.csv'),
        index=False,
    )
print(f"Saved marker genes to {OUTPUT_DIR}")

# ============================================================================
# 12. SUMMARY
# ============================================================================

print("", "=" * 80, "ANALYSIS SUMMARY", "=" * 80, sep="\n")

# Size of the final object and number of Leiden clusters found.
print(
    "\nFinal dataset:",
    f" Cells: {adata.n_obs}",
    f" Genes: {adata.n_vars}",
    f" Clusters: {adata.obs['leiden'].nunique()}",
    sep="\n",
)

# Number of cells assigned to each annotated cell type.
print("\nCell type distribution:")
print(adata.obs['cell_type'].value_counts())

print("", "=" * 80, "ANALYSIS COMPLETE", "=" * 80, sep="\n")