Initial commit
This commit is contained in:
380
skills/scanpy/SKILL.md
Normal file
380
skills/scanpy/SKILL.md
Normal file
@@ -0,0 +1,380 @@
|
||||
---
|
||||
name: scanpy
|
||||
description: "Single-cell RNA-seq analysis. Load .h5ad/10X data, QC, normalization, PCA/UMAP/t-SNE, Leiden clustering, marker genes, cell type annotation, trajectory, for scRNA-seq analysis."
|
||||
---
|
||||
|
||||
# Scanpy: Single-Cell Analysis
|
||||
|
||||
## Overview
|
||||
|
||||
Scanpy is a scalable Python toolkit for analyzing single-cell RNA-seq data, built on AnnData. Apply this skill for complete single-cell workflows including quality control, normalization, dimensionality reduction, clustering, marker gene identification, visualization, and trajectory analysis.
|
||||
|
||||
## When to Use This Skill
|
||||
|
||||
This skill should be used when:
|
||||
- Analyzing single-cell RNA-seq data (.h5ad, 10X, CSV formats)
|
||||
- Performing quality control on scRNA-seq datasets
|
||||
- Creating UMAP, t-SNE, or PCA visualizations
|
||||
- Identifying cell clusters and finding marker genes
|
||||
- Annotating cell types based on gene expression
|
||||
- Conducting trajectory inference or pseudotime analysis
|
||||
- Generating publication-quality single-cell plots
|
||||
|
||||
## Quick Start
|
||||
|
||||
### Basic Import and Setup
|
||||
|
||||
```python
|
||||
import scanpy as sc
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Configure settings
|
||||
sc.settings.verbosity = 3
|
||||
sc.settings.set_figure_params(dpi=80, facecolor='white')
|
||||
sc.settings.figdir = './figures/'
|
||||
```
|
||||
|
||||
### Loading Data
|
||||
|
||||
```python
|
||||
# From 10X Genomics
|
||||
adata = sc.read_10x_mtx('path/to/data/')
|
||||
adata = sc.read_10x_h5('path/to/data.h5')
|
||||
|
||||
# From h5ad (AnnData format)
|
||||
adata = sc.read_h5ad('path/to/data.h5ad')
|
||||
|
||||
# From CSV
|
||||
adata = sc.read_csv('path/to/data.csv')
|
||||
```
|
||||
|
||||
### Understanding AnnData Structure
|
||||
|
||||
The AnnData object is the core data structure in scanpy:
|
||||
|
||||
```python
|
||||
adata.X # Expression matrix (cells × genes)
|
||||
adata.obs # Cell metadata (DataFrame)
|
||||
adata.var # Gene metadata (DataFrame)
|
||||
adata.uns # Unstructured annotations (dict)
|
||||
adata.obsm # Multi-dimensional cell data (PCA, UMAP)
|
||||
adata.raw # Raw data backup
|
||||
|
||||
# Access cell and gene names
|
||||
adata.obs_names # Cell barcodes
|
||||
adata.var_names # Gene names
|
||||
```
|
||||
|
||||
## Standard Analysis Workflow
|
||||
|
||||
### 1. Quality Control
|
||||
|
||||
Identify and filter low-quality cells and genes:
|
||||
|
||||
```python
|
||||
# Identify mitochondrial genes
|
||||
adata.var['mt'] = adata.var_names.str.startswith('MT-')
|
||||
|
||||
# Calculate QC metrics
|
||||
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
|
||||
|
||||
# Visualize QC metrics
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
|
||||
jitter=0.4, multi_panel=True)
|
||||
|
||||
# Filter cells and genes
|
||||
sc.pp.filter_cells(adata, min_genes=200)
|
||||
sc.pp.filter_genes(adata, min_cells=3)
|
||||
adata = adata[adata.obs.pct_counts_mt < 5, :] # Remove high MT% cells
|
||||
```
|
||||
|
||||
**Use the QC script for automated analysis:**
|
||||
```bash
|
||||
python scripts/qc_analysis.py input_file.h5ad --output filtered.h5ad
|
||||
```
|
||||
|
||||
### 2. Normalization and Preprocessing
|
||||
|
||||
```python
|
||||
# Normalize to 10,000 counts per cell
|
||||
sc.pp.normalize_total(adata, target_sum=1e4)
|
||||
|
||||
# Log-transform
|
||||
sc.pp.log1p(adata)
|
||||
|
||||
# Save raw counts for later
|
||||
adata.raw = adata
|
||||
|
||||
# Identify highly variable genes
|
||||
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
|
||||
sc.pl.highly_variable_genes(adata)
|
||||
|
||||
# Subset to highly variable genes
|
||||
adata = adata[:, adata.var.highly_variable]
|
||||
|
||||
# Regress out unwanted variation
|
||||
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
|
||||
|
||||
# Scale data
|
||||
sc.pp.scale(adata, max_value=10)
|
||||
```
|
||||
|
||||
### 3. Dimensionality Reduction
|
||||
|
||||
```python
|
||||
# PCA
|
||||
sc.tl.pca(adata, svd_solver='arpack')
|
||||
sc.pl.pca_variance_ratio(adata, log=True) # Check elbow plot
|
||||
|
||||
# Compute neighborhood graph
|
||||
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
|
||||
|
||||
# UMAP for visualization
|
||||
sc.tl.umap(adata)
|
||||
sc.pl.umap(adata, color='leiden')
|
||||
|
||||
# Alternative: t-SNE
|
||||
sc.tl.tsne(adata)
|
||||
```
|
||||
|
||||
### 4. Clustering
|
||||
|
||||
```python
|
||||
# Leiden clustering (recommended)
|
||||
sc.tl.leiden(adata, resolution=0.5)
|
||||
sc.pl.umap(adata, color='leiden', legend_loc='on data')
|
||||
|
||||
# Try multiple resolutions to find optimal granularity
|
||||
for res in [0.3, 0.5, 0.8, 1.0]:
|
||||
sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{res}')
|
||||
```
|
||||
|
||||
### 5. Marker Gene Identification
|
||||
|
||||
```python
|
||||
# Find marker genes for each cluster
|
||||
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
|
||||
|
||||
# Visualize results
|
||||
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
|
||||
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
|
||||
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
|
||||
|
||||
# Get results as DataFrame
|
||||
markers = sc.get.rank_genes_groups_df(adata, group='0')
|
||||
```
|
||||
|
||||
### 6. Cell Type Annotation
|
||||
|
||||
```python
|
||||
# Define marker genes for known cell types
|
||||
marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
|
||||
|
||||
# Visualize markers
|
||||
sc.pl.umap(adata, color=marker_genes, use_raw=True)
|
||||
sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden')
|
||||
|
||||
# Manual annotation
|
||||
cluster_to_celltype = {
|
||||
'0': 'CD4 T cells',
|
||||
'1': 'CD14+ Monocytes',
|
||||
'2': 'B cells',
|
||||
'3': 'CD8 T cells',
|
||||
}
|
||||
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_to_celltype)
|
||||
|
||||
# Visualize annotated types
|
||||
sc.pl.umap(adata, color='cell_type', legend_loc='on data')
|
||||
```
|
||||
|
||||
### 7. Save Results
|
||||
|
||||
```python
|
||||
# Save processed data
|
||||
adata.write('results/processed_data.h5ad')
|
||||
|
||||
# Export metadata
|
||||
adata.obs.to_csv('results/cell_metadata.csv')
|
||||
adata.var.to_csv('results/gene_metadata.csv')
|
||||
```
|
||||
|
||||
## Common Tasks
|
||||
|
||||
### Creating Publication-Quality Plots
|
||||
|
||||
```python
|
||||
# Set high-quality defaults
|
||||
sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5))
|
||||
sc.settings.file_format_figs = 'pdf'
|
||||
|
||||
# UMAP with custom styling
|
||||
sc.pl.umap(adata, color='cell_type',
|
||||
palette='Set2',
|
||||
legend_loc='on data',
|
||||
legend_fontsize=12,
|
||||
legend_fontoutline=2,
|
||||
frameon=False,
|
||||
save='_publication.pdf')
|
||||
|
||||
# Heatmap of marker genes
|
||||
sc.pl.heatmap(adata, var_names=genes, groupby='cell_type',
|
||||
swap_axes=True, show_gene_labels=True,
|
||||
save='_markers.pdf')
|
||||
|
||||
# Dot plot
|
||||
sc.pl.dotplot(adata, var_names=genes, groupby='cell_type',
|
||||
save='_dotplot.pdf')
|
||||
```
|
||||
|
||||
Refer to `references/plotting_guide.md` for comprehensive visualization examples.
|
||||
|
||||
### Trajectory Inference
|
||||
|
||||
```python
|
||||
# PAGA (Partition-based graph abstraction)
|
||||
sc.tl.paga(adata, groups='leiden')
|
||||
sc.pl.paga(adata, color='leiden')
|
||||
|
||||
# Diffusion pseudotime
|
||||
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
|
||||
sc.tl.dpt(adata)
|
||||
sc.pl.umap(adata, color='dpt_pseudotime')
|
||||
```
|
||||
|
||||
### Differential Expression Between Conditions
|
||||
|
||||
```python
|
||||
# Compare treated vs control within cell types
|
||||
adata_subset = adata[adata.obs['cell_type'] == 'T cells']
|
||||
sc.tl.rank_genes_groups(adata_subset, groupby='condition',
|
||||
groups=['treated'], reference='control')
|
||||
sc.pl.rank_genes_groups(adata_subset, groups=['treated'])
|
||||
```
|
||||
|
||||
### Gene Set Scoring
|
||||
|
||||
```python
|
||||
# Score cells for gene set expression
|
||||
gene_set = ['CD3D', 'CD3E', 'CD3G']
|
||||
sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
|
||||
sc.pl.umap(adata, color='T_cell_score')
|
||||
```
|
||||
|
||||
### Batch Correction
|
||||
|
||||
```python
|
||||
# ComBat batch correction
|
||||
sc.pp.combat(adata, key='batch')
|
||||
|
||||
# Alternative: use Harmony or scVI (separate packages)
|
||||
```
|
||||
|
||||
## Key Parameters to Adjust
|
||||
|
||||
### Quality Control
|
||||
- `min_genes`: Minimum genes per cell (typically 200-500)
|
||||
- `min_cells`: Minimum cells per gene (typically 3-10)
|
||||
- `pct_counts_mt`: Mitochondrial threshold (typically 5-20%)
|
||||
|
||||
### Normalization
|
||||
- `target_sum`: Target counts per cell (default 1e4)
|
||||
|
||||
### Feature Selection
|
||||
- `n_top_genes`: Number of HVGs (typically 2000-3000)
|
||||
- `min_mean`, `max_mean`, `min_disp`: HVG selection parameters
|
||||
|
||||
### Dimensionality Reduction
|
||||
- `n_pcs`: Number of principal components (check variance ratio plot)
|
||||
- `n_neighbors`: Number of neighbors (typically 10-30)
|
||||
|
||||
### Clustering
|
||||
- `resolution`: Clustering granularity (0.4-1.2, higher = more clusters)
|
||||
|
||||
## Common Pitfalls and Best Practices
|
||||
|
||||
1. **Always save raw counts**: `adata.raw = adata` before filtering genes
|
||||
2. **Check QC plots carefully**: Adjust thresholds based on dataset quality
|
||||
3. **Use Leiden over Louvain**: More efficient and better results
|
||||
4. **Try multiple clustering resolutions**: Find optimal granularity
|
||||
5. **Validate cell type annotations**: Use multiple marker genes
|
||||
6. **Use `use_raw=True` for gene expression plots**: Shows original counts
|
||||
7. **Check PCA variance ratio**: Determine optimal number of PCs
|
||||
8. **Save intermediate results**: Long workflows can fail partway through
|
||||
|
||||
## Bundled Resources
|
||||
|
||||
### scripts/qc_analysis.py
|
||||
Automated quality control script that calculates metrics, generates plots, and filters data:
|
||||
|
||||
```bash
|
||||
python scripts/qc_analysis.py input.h5ad --output filtered.h5ad \
|
||||
--mt-threshold 5 --min-genes 200 --min-cells 3
|
||||
```
|
||||
|
||||
### references/standard_workflow.md
|
||||
Complete step-by-step workflow with detailed explanations and code examples for:
|
||||
- Data loading and setup
|
||||
- Quality control with visualization
|
||||
- Normalization and scaling
|
||||
- Feature selection
|
||||
- Dimensionality reduction (PCA, UMAP, t-SNE)
|
||||
- Clustering (Leiden, Louvain)
|
||||
- Marker gene identification
|
||||
- Cell type annotation
|
||||
- Trajectory inference
|
||||
- Differential expression
|
||||
|
||||
Read this reference when performing a complete analysis from scratch.
|
||||
|
||||
### references/api_reference.md
|
||||
Quick reference guide for scanpy functions organized by module:
|
||||
- Reading/writing data (`sc.read_*`, `adata.write_*`)
|
||||
- Preprocessing (`sc.pp.*`)
|
||||
- Tools (`sc.tl.*`)
|
||||
- Plotting (`sc.pl.*`)
|
||||
- AnnData structure and manipulation
|
||||
- Settings and utilities
|
||||
|
||||
Use this for quick lookup of function signatures and common parameters.
|
||||
|
||||
### references/plotting_guide.md
|
||||
Comprehensive visualization guide including:
|
||||
- Quality control plots
|
||||
- Dimensionality reduction visualizations
|
||||
- Clustering visualizations
|
||||
- Marker gene plots (heatmaps, dot plots, violin plots)
|
||||
- Trajectory and pseudotime plots
|
||||
- Publication-quality customization
|
||||
- Multi-panel figures
|
||||
- Color palettes and styling
|
||||
|
||||
Consult this when creating publication-ready figures.
|
||||
|
||||
### assets/analysis_template.py
|
||||
Complete analysis template providing a full workflow from data loading through cell type annotation. Copy and customize this template for new analyses:
|
||||
|
||||
```bash
|
||||
cp assets/analysis_template.py my_analysis.py
|
||||
# Edit parameters and run
|
||||
python my_analysis.py
|
||||
```
|
||||
|
||||
The template includes all standard steps with configurable parameters and helpful comments.
|
||||
|
||||
## Additional Resources
|
||||
|
||||
- **Official scanpy documentation**: https://scanpy.readthedocs.io/
|
||||
- **Scanpy tutorials**: https://scanpy-tutorials.readthedocs.io/
|
||||
- **scverse ecosystem**: https://scverse.org/ (related tools: squidpy, scvi-tools, cellrank)
|
||||
- **Best practices**: Luecken & Theis (2019) "Current best practices in single-cell RNA-seq"
|
||||
|
||||
## Tips for Effective Analysis
|
||||
|
||||
1. **Start with the template**: Use `assets/analysis_template.py` as a starting point
|
||||
2. **Run QC script first**: Use `scripts/qc_analysis.py` for initial filtering
|
||||
3. **Consult references as needed**: Load workflow and API references into context
|
||||
4. **Iterate on clustering**: Try multiple resolutions and visualization methods
|
||||
5. **Validate biologically**: Check marker genes match expected cell types
|
||||
6. **Document parameters**: Record QC thresholds and analysis settings
|
||||
7. **Save checkpoints**: Write intermediate results at key steps
|
||||
295
skills/scanpy/assets/analysis_template.py
Normal file
295
skills/scanpy/assets/analysis_template.py
Normal file
@@ -0,0 +1,295 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Complete Single-Cell Analysis Template
|
||||
|
||||
This template provides a complete workflow for single-cell RNA-seq analysis
|
||||
using scanpy, from data loading through clustering and cell type annotation.
|
||||
|
||||
Customize the parameters and sections as needed for your specific dataset.
|
||||
"""
|
||||
|
||||
import scanpy as sc
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# ============================================================================
|
||||
# CONFIGURATION
|
||||
# ============================================================================
|
||||
|
||||
# File paths
|
||||
INPUT_FILE = 'data/raw_counts.h5ad' # Change to your input file
|
||||
OUTPUT_DIR = 'results/'
|
||||
FIGURES_DIR = 'figures/'
|
||||
|
||||
# QC parameters
|
||||
MIN_GENES = 200 # Minimum genes per cell
|
||||
MIN_CELLS = 3 # Minimum cells per gene
|
||||
MT_THRESHOLD = 5 # Maximum mitochondrial percentage
|
||||
|
||||
# Analysis parameters
|
||||
N_TOP_GENES = 2000 # Number of highly variable genes
|
||||
N_PCS = 40 # Number of principal components
|
||||
N_NEIGHBORS = 10 # Number of neighbors for graph
|
||||
LEIDEN_RESOLUTION = 0.5 # Clustering resolution
|
||||
|
||||
# Scanpy settings
|
||||
sc.settings.verbosity = 3
|
||||
sc.settings.set_figure_params(dpi=80, facecolor='white')
|
||||
sc.settings.figdir = FIGURES_DIR
|
||||
|
||||
# ============================================================================
|
||||
# 1. LOAD DATA
|
||||
# ============================================================================
|
||||
|
||||
print("=" * 80)
|
||||
print("LOADING DATA")
|
||||
print("=" * 80)
|
||||
|
||||
# Load data (adjust based on your file format)
|
||||
adata = sc.read_h5ad(INPUT_FILE)
|
||||
# adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/') # For 10X data
|
||||
# adata = sc.read_csv('data/counts.csv') # For CSV data
|
||||
|
||||
print(f"Loaded: {adata.n_obs} cells x {adata.n_vars} genes")
|
||||
|
||||
# ============================================================================
|
||||
# 2. QUALITY CONTROL
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("QUALITY CONTROL")
|
||||
print("=" * 80)
|
||||
|
||||
# Identify mitochondrial genes
|
||||
adata.var['mt'] = adata.var_names.str.startswith('MT-')
|
||||
|
||||
# Calculate QC metrics
|
||||
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
|
||||
log1p=False, inplace=True)
|
||||
|
||||
# Visualize QC metrics before filtering
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
|
||||
jitter=0.4, multi_panel=True, save='_qc_before_filtering')
|
||||
|
||||
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt')
|
||||
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes')
|
||||
|
||||
# Filter cells and genes
|
||||
print(f"\nBefore filtering: {adata.n_obs} cells, {adata.n_vars} genes")
|
||||
|
||||
sc.pp.filter_cells(adata, min_genes=MIN_GENES)
|
||||
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
|
||||
adata = adata[adata.obs.pct_counts_mt < MT_THRESHOLD, :]
|
||||
|
||||
print(f"After filtering: {adata.n_obs} cells, {adata.n_vars} genes")
|
||||
|
||||
# ============================================================================
|
||||
# 3. NORMALIZATION
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("NORMALIZATION")
|
||||
print("=" * 80)
|
||||
|
||||
# Normalize to 10,000 counts per cell
|
||||
sc.pp.normalize_total(adata, target_sum=1e4)
|
||||
|
||||
# Log-transform
|
||||
sc.pp.log1p(adata)
|
||||
|
||||
# Store normalized data
|
||||
adata.raw = adata
|
||||
|
||||
# ============================================================================
|
||||
# 4. FEATURE SELECTION
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("FEATURE SELECTION")
|
||||
print("=" * 80)
|
||||
|
||||
# Identify highly variable genes
|
||||
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)
|
||||
|
||||
# Visualize
|
||||
sc.pl.highly_variable_genes(adata, save='_hvg')
|
||||
|
||||
print(f"Selected {sum(adata.var.highly_variable)} highly variable genes")
|
||||
|
||||
# Subset to highly variable genes
|
||||
adata = adata[:, adata.var.highly_variable]
|
||||
|
||||
# ============================================================================
|
||||
# 5. SCALING AND REGRESSION
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("SCALING AND REGRESSION")
|
||||
print("=" * 80)
|
||||
|
||||
# Regress out unwanted sources of variation
|
||||
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
|
||||
|
||||
# Scale data
|
||||
sc.pp.scale(adata, max_value=10)
|
||||
|
||||
# ============================================================================
|
||||
# 6. DIMENSIONALITY REDUCTION
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("DIMENSIONALITY REDUCTION")
|
||||
print("=" * 80)
|
||||
|
||||
# PCA
|
||||
sc.tl.pca(adata, svd_solver='arpack')
|
||||
sc.pl.pca_variance_ratio(adata, log=True, save='_pca_variance')
|
||||
|
||||
# Compute neighborhood graph
|
||||
sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, n_pcs=N_PCS)
|
||||
|
||||
# UMAP
|
||||
sc.tl.umap(adata)
|
||||
|
||||
# ============================================================================
|
||||
# 7. CLUSTERING
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("CLUSTERING")
|
||||
print("=" * 80)
|
||||
|
||||
# Leiden clustering
|
||||
sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION)
|
||||
|
||||
# Visualize
|
||||
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_leiden')
|
||||
|
||||
print(f"Identified {len(adata.obs['leiden'].unique())} clusters")
|
||||
|
||||
# ============================================================================
|
||||
# 8. MARKER GENE IDENTIFICATION
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("MARKER GENE IDENTIFICATION")
|
||||
print("=" * 80)
|
||||
|
||||
# Find marker genes
|
||||
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
|
||||
|
||||
# Visualize top markers
|
||||
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markers')
|
||||
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, save='_markers_heatmap')
|
||||
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, save='_markers_dotplot')
|
||||
|
||||
# Get top markers for each cluster
|
||||
for cluster in adata.obs['leiden'].unique():
|
||||
print(f"\nCluster {cluster} top markers:")
|
||||
markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(10)
|
||||
print(markers[['names', 'scores', 'pvals_adj']].to_string(index=False))
|
||||
|
||||
# ============================================================================
|
||||
# 9. CELL TYPE ANNOTATION (CUSTOMIZE THIS SECTION)
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("CELL TYPE ANNOTATION")
|
||||
print("=" * 80)
|
||||
|
||||
# Example marker genes for common cell types (customize for your data)
|
||||
marker_genes = {
|
||||
'T cells': ['CD3D', 'CD3E', 'CD3G'],
|
||||
'B cells': ['MS4A1', 'CD79A', 'CD79B'],
|
||||
'Monocytes': ['CD14', 'LYZ', 'S100A8'],
|
||||
'NK cells': ['NKG7', 'GNLY', 'KLRD1'],
|
||||
'Dendritic cells': ['FCER1A', 'CST3'],
|
||||
}
|
||||
|
||||
# Visualize marker genes
|
||||
for cell_type, genes in marker_genes.items():
|
||||
available_genes = [g for g in genes if g in adata.raw.var_names]
|
||||
if available_genes:
|
||||
sc.pl.umap(adata, color=available_genes, use_raw=True,
|
||||
save=f'_{cell_type.replace(" ", "_")}')
|
||||
|
||||
# Manual annotation based on marker expression (customize this mapping)
|
||||
cluster_to_celltype = {
|
||||
'0': 'CD4 T cells',
|
||||
'1': 'CD14+ Monocytes',
|
||||
'2': 'B cells',
|
||||
'3': 'CD8 T cells',
|
||||
'4': 'NK cells',
|
||||
# Add more mappings based on your marker analysis
|
||||
}
|
||||
|
||||
# Apply annotations
|
||||
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_to_celltype)
|
||||
adata.obs['cell_type'] = adata.obs['cell_type'].fillna('Unknown')
|
||||
|
||||
# Visualize annotated cell types
|
||||
sc.pl.umap(adata, color='cell_type', legend_loc='on data', save='_celltypes')
|
||||
|
||||
# ============================================================================
|
||||
# 10. ADDITIONAL ANALYSES (OPTIONAL)
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("ADDITIONAL ANALYSES")
|
||||
print("=" * 80)
|
||||
|
||||
# PAGA trajectory analysis (optional)
|
||||
sc.tl.paga(adata, groups='leiden')
|
||||
sc.pl.paga(adata, color='leiden', save='_paga')
|
||||
|
||||
# Gene set scoring (optional)
|
||||
# example_gene_set = ['CD3D', 'CD3E', 'CD3G']
|
||||
# sc.tl.score_genes(adata, example_gene_set, score_name='T_cell_score')
|
||||
# sc.pl.umap(adata, color='T_cell_score', save='_gene_set_score')
|
||||
|
||||
# ============================================================================
|
||||
# 11. SAVE RESULTS
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("SAVING RESULTS")
|
||||
print("=" * 80)
|
||||
|
||||
import os
|
||||
os.makedirs(OUTPUT_DIR, exist_ok=True)
|
||||
|
||||
# Save processed AnnData object
|
||||
adata.write(f'{OUTPUT_DIR}/processed_data.h5ad')
|
||||
print(f"Saved processed data to {OUTPUT_DIR}/processed_data.h5ad")
|
||||
|
||||
# Export metadata
|
||||
adata.obs.to_csv(f'{OUTPUT_DIR}/cell_metadata.csv')
|
||||
adata.var.to_csv(f'{OUTPUT_DIR}/gene_metadata.csv')
|
||||
print(f"Saved metadata to {OUTPUT_DIR}/")
|
||||
|
||||
# Export marker genes
|
||||
for cluster in adata.obs['leiden'].unique():
|
||||
markers = sc.get.rank_genes_groups_df(adata, group=cluster)
|
||||
markers.to_csv(f'{OUTPUT_DIR}/markers_cluster_{cluster}.csv', index=False)
|
||||
print(f"Saved marker genes to {OUTPUT_DIR}/")
|
||||
|
||||
# ============================================================================
|
||||
# 12. SUMMARY
|
||||
# ============================================================================
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("ANALYSIS SUMMARY")
|
||||
print("=" * 80)
|
||||
|
||||
print(f"\nFinal dataset:")
|
||||
print(f" Cells: {adata.n_obs}")
|
||||
print(f" Genes: {adata.n_vars}")
|
||||
print(f" Clusters: {len(adata.obs['leiden'].unique())}")
|
||||
|
||||
print(f"\nCell type distribution:")
|
||||
print(adata.obs['cell_type'].value_counts())
|
||||
|
||||
print("\n" + "=" * 80)
|
||||
print("ANALYSIS COMPLETE")
|
||||
print("=" * 80)
|
||||
251
skills/scanpy/references/api_reference.md
Normal file
251
skills/scanpy/references/api_reference.md
Normal file
@@ -0,0 +1,251 @@
|
||||
# Scanpy API Quick Reference
|
||||
|
||||
Quick reference for commonly used scanpy functions organized by module.
|
||||
|
||||
## Import Convention
|
||||
|
||||
```python
|
||||
import scanpy as sc
|
||||
```
|
||||
|
||||
## Reading and Writing Data (sc.read_*)
|
||||
|
||||
### Reading Functions
|
||||
|
||||
```python
|
||||
sc.read_10x_h5(filename) # Read 10X HDF5 file
|
||||
sc.read_10x_mtx(path) # Read 10X mtx directory
|
||||
sc.read_h5ad(filename) # Read h5ad (AnnData) file
|
||||
sc.read_csv(filename) # Read CSV file
|
||||
sc.read_excel(filename) # Read Excel file
|
||||
sc.read_loom(filename) # Read loom file
|
||||
sc.read_text(filename) # Read text file
|
||||
sc.read_visium(path) # Read Visium spatial data
|
||||
```
|
||||
|
||||
### Writing Functions
|
||||
|
||||
```python
|
||||
adata.write_h5ad(filename) # Write to h5ad format
|
||||
adata.write_csvs(dirname) # Write to CSV files
|
||||
adata.write_loom(filename) # Write to loom format
|
||||
adata.write_zarr(filename) # Write to zarr format
|
||||
```
|
||||
|
||||
## Preprocessing (sc.pp.*)
|
||||
|
||||
### Quality Control
|
||||
|
||||
```python
|
||||
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
|
||||
sc.pp.filter_cells(adata, min_genes=200)
|
||||
sc.pp.filter_genes(adata, min_cells=3)
|
||||
```
|
||||
|
||||
### Normalization and Transformation
|
||||
|
||||
```python
|
||||
sc.pp.normalize_total(adata, target_sum=1e4) # Normalize to target sum
|
||||
sc.pp.log1p(adata) # Log(x + 1) transformation
|
||||
sc.pp.sqrt(adata) # Square root transformation
|
||||
```
|
||||
|
||||
### Feature Selection
|
||||
|
||||
```python
|
||||
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
|
||||
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
|
||||
```
|
||||
|
||||
### Scaling and Regression
|
||||
|
||||
```python
|
||||
sc.pp.scale(adata, max_value=10) # Scale to unit variance
|
||||
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) # Regress out unwanted variation
|
||||
```
|
||||
|
||||
### Dimensionality Reduction (Preprocessing)
|
||||
|
||||
```python
|
||||
sc.pp.pca(adata, n_comps=50) # Principal component analysis
|
||||
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) # Compute neighborhood graph
|
||||
```
|
||||
|
||||
### Batch Correction
|
||||
|
||||
```python
|
||||
sc.pp.combat(adata, key='batch') # ComBat batch correction
|
||||
```
|
||||
|
||||
## Tools (sc.tl.*)
|
||||
|
||||
### Dimensionality Reduction
|
||||
|
||||
```python
|
||||
sc.tl.pca(adata, svd_solver='arpack') # PCA
|
||||
sc.tl.umap(adata) # UMAP embedding
|
||||
sc.tl.tsne(adata) # t-SNE embedding
|
||||
sc.tl.diffmap(adata) # Diffusion map
|
||||
sc.tl.draw_graph(adata, layout='fa') # Force-directed graph
|
||||
```
|
||||
|
||||
### Clustering
|
||||
|
||||
```python
|
||||
sc.tl.leiden(adata, resolution=0.5) # Leiden clustering (recommended)
|
||||
sc.tl.louvain(adata, resolution=0.5) # Louvain clustering
|
||||
sc.tl.kmeans(adata, n_clusters=10) # K-means clustering
|
||||
```
|
||||
|
||||
### Marker Genes and Differential Expression
|
||||
|
||||
```python
|
||||
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
|
||||
sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
|
||||
sc.tl.rank_genes_groups(adata, groupby='leiden', method='logreg')
|
||||
|
||||
# Get results as dataframe
|
||||
sc.get.rank_genes_groups_df(adata, group='0')
|
||||
```
|
||||
|
||||
### Trajectory Inference
|
||||
|
||||
```python
|
||||
sc.tl.paga(adata, groups='leiden') # PAGA trajectory
|
||||
sc.tl.dpt(adata) # Diffusion pseudotime
|
||||
```
|
||||
|
||||
### Gene Scoring
|
||||
|
||||
```python
|
||||
sc.tl.score_genes(adata, gene_list, score_name='score')
|
||||
sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
|
||||
```
|
||||
|
||||
### Embeddings and Projections
|
||||
|
||||
```python
|
||||
sc.tl.ingest(adata, adata_ref) # Map to reference
|
||||
sc.tl.embedding_density(adata, basis='umap', groupby='leiden')
|
||||
```
|
||||
|
||||
## Plotting (sc.pl.*)
|
||||
|
||||
### Basic Embeddings
|
||||
|
||||
```python
|
||||
sc.pl.umap(adata, color='leiden') # UMAP plot
|
||||
sc.pl.tsne(adata, color='gene_name') # t-SNE plot
|
||||
sc.pl.pca(adata, color='leiden') # PCA plot
|
||||
sc.pl.diffmap(adata, color='leiden') # Diffusion map plot
|
||||
```
|
||||
|
||||
### Heatmaps and Dot Plots
|
||||
|
||||
```python
|
||||
sc.pl.heatmap(adata, var_names=genes, groupby='leiden')
|
||||
sc.pl.dotplot(adata, var_names=genes, groupby='leiden')
|
||||
sc.pl.matrixplot(adata, var_names=genes, groupby='leiden')
|
||||
sc.pl.stacked_violin(adata, var_names=genes, groupby='leiden')
|
||||
```
|
||||
|
||||
### Violin and Scatter Plots
|
||||
|
||||
```python
|
||||
sc.pl.violin(adata, keys=['gene1', 'gene2'], groupby='leiden')
|
||||
sc.pl.scatter(adata, x='gene1', y='gene2', color='leiden')
|
||||
```
|
||||
|
||||
### Marker Gene Visualization
|
||||
|
||||
```python
|
||||
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
|
||||
sc.pl.rank_genes_groups_violin(adata, groups='0')
|
||||
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
|
||||
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
|
||||
```
|
||||
|
||||
### Trajectory Visualization
|
||||
|
||||
```python
|
||||
sc.pl.paga(adata, color='leiden') # PAGA graph
|
||||
sc.pl.dpt_timeseries(adata) # DPT timeseries
|
||||
```
|
||||
|
||||
### QC Plots
|
||||
|
||||
```python
|
||||
sc.pl.highest_expr_genes(adata, n_top=20)
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'])
|
||||
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
|
||||
```
|
||||
|
||||
### Advanced Plots
|
||||
|
||||
```python
|
||||
sc.pl.dendrogram(adata, groupby='leiden')
|
||||
sc.pl.correlation_matrix(adata, groupby='leiden')
|
||||
sc.pl.tracksplot(adata, var_names=genes, groupby='leiden')
|
||||
```
|
||||
|
||||
## Common Parameters
|
||||
|
||||
### Color Parameters
|
||||
- `color`: Variable(s) to color by (gene name, obs column)
|
||||
- `use_raw`: Use `.raw` attribute of adata
|
||||
- `palette`: Color palette to use
|
||||
- `vmin`, `vmax`: Color scale limits
|
||||
|
||||
### Layout Parameters
|
||||
- `basis`: Embedding basis ('umap', 'tsne', 'pca', etc.)
|
||||
- `legend_loc`: Legend location ('on data', 'right margin', etc.)
|
||||
- `size`: Point size
|
||||
- `alpha`: Point transparency
|
||||
|
||||
### Saving Parameters
|
||||
- `save`: Filename to save plot
|
||||
- `show`: Whether to show plot
|
||||
|
||||
## AnnData Structure
|
||||
|
||||
```python
|
||||
adata.X # Expression matrix (cells × genes)
|
||||
adata.obs # Cell annotations (DataFrame)
|
||||
adata.var # Gene annotations (DataFrame)
|
||||
adata.uns # Unstructured annotations (dict)
|
||||
adata.obsm # Multi-dimensional cell annotations (e.g., PCA, UMAP)
|
||||
adata.varm # Multi-dimensional gene annotations
|
||||
adata.layers # Additional data layers
|
||||
adata.raw # Raw data backup
|
||||
|
||||
# Access
|
||||
adata.obs_names # Cell barcodes
|
||||
adata.var_names # Gene names
|
||||
adata.shape # (n_cells, n_genes)
|
||||
|
||||
# Slicing
|
||||
adata[cell_indices, gene_indices]
|
||||
adata[:, adata.var_names.isin(gene_list)]
|
||||
adata[adata.obs['leiden'] == '0', :]
|
||||
```
|
||||
|
||||
## Settings
|
||||
|
||||
```python
|
||||
sc.settings.verbosity = 3 # 0=error, 1=warning, 2=info, 3=hint
|
||||
sc.settings.set_figure_params(dpi=80, facecolor='white')
|
||||
sc.settings.autoshow = False # Don't show plots automatically
|
||||
sc.settings.autosave = True # Autosave figures
|
||||
sc.settings.figdir = './figures/' # Figure directory
|
||||
sc.settings.cachedir = './cache/' # Cache directory
|
||||
sc.settings.n_jobs = 8 # Number of parallel jobs
|
||||
```
|
||||
|
||||
## Useful Utilities
|
||||
|
||||
```python
|
||||
sc.logging.print_versions() # Print version information
|
||||
sc.logging.print_memory_usage() # Print memory usage
|
||||
adata.copy() # Create a copy of AnnData object
|
||||
adata.concatenate([adata1, adata2]) # Concatenate AnnData objects
|
||||
```
|
||||
352
skills/scanpy/references/plotting_guide.md
Normal file
352
skills/scanpy/references/plotting_guide.md
Normal file
@@ -0,0 +1,352 @@
|
||||
# Scanpy Plotting Guide
|
||||
|
||||
Comprehensive guide for creating publication-quality visualizations with scanpy.
|
||||
|
||||
## General Plotting Principles
|
||||
|
||||
All scanpy plotting functions follow consistent patterns:
|
||||
- Functions in `sc.pl.*` mirror analysis functions in `sc.tl.*`
|
||||
- Most accept `color` parameter for gene names or metadata columns
|
||||
- Results are saved via `save` parameter
|
||||
- Multiple plots can be generated in a single call
|
||||
|
||||
## Essential Quality Control Plots
|
||||
|
||||
### Visualize QC Metrics
|
||||
|
||||
```python
|
||||
# Violin plots for QC metrics
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
|
||||
jitter=0.4, multi_panel=True, save='_qc_violin.pdf')
|
||||
|
||||
# Scatter plots to identify outliers
|
||||
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt.pdf')
|
||||
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes.pdf')
|
||||
|
||||
# Highest expressing genes
|
||||
sc.pl.highest_expr_genes(adata, n_top=20, save='_highest_expr.pdf')
|
||||
```
|
||||
|
||||
### Post-filtering QC
|
||||
|
||||
```python
|
||||
# Compare before and after filtering
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
|
||||
groupby='sample', save='_post_filter.pdf')
|
||||
```
|
||||
|
||||
## Dimensionality Reduction Visualizations
|
||||
|
||||
### PCA Plots
|
||||
|
||||
```python
|
||||
# Basic PCA
|
||||
sc.pl.pca(adata, color='leiden', save='_pca.pdf')
|
||||
|
||||
# PCA colored by gene expression
|
||||
sc.pl.pca(adata, color=['gene1', 'gene2', 'gene3'], save='_pca_genes.pdf')
|
||||
|
||||
# Variance ratio plot (elbow plot)
|
||||
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save='_variance.pdf')
|
||||
|
||||
# PCA loadings
|
||||
sc.pl.pca_loadings(adata, components=[1, 2, 3], save='_loadings.pdf')
|
||||
```
|
||||
|
||||
### UMAP Plots
|
||||
|
||||
```python
|
||||
# Basic UMAP with clusters
|
||||
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_umap_leiden.pdf')
|
||||
|
||||
# UMAP colored by multiple variables
|
||||
sc.pl.umap(adata, color=['leiden', 'cell_type', 'batch'],
|
||||
save='_umap_multi.pdf')
|
||||
|
||||
# UMAP with gene expression
|
||||
sc.pl.umap(adata, color=['CD3D', 'CD14', 'MS4A1'],
|
||||
use_raw=False, save='_umap_genes.pdf')
|
||||
|
||||
# Customize appearance
|
||||
sc.pl.umap(adata, color='leiden',
|
||||
palette='Set2',
|
||||
size=50,
|
||||
alpha=0.8,
|
||||
frameon=False,
|
||||
title='Cell Types',
|
||||
save='_umap_custom.pdf')
|
||||
```
|
||||
|
||||
### t-SNE Plots
|
||||
|
||||
```python
|
||||
# t-SNE with clusters
|
||||
sc.pl.tsne(adata, color='leiden', legend_loc='right margin', save='_tsne.pdf')
|
||||
|
||||
# Multiple t-SNE perplexities (if computed)
|
||||
sc.pl.tsne(adata, color='leiden', save='_tsne_default.pdf')
|
||||
```
|
||||
|
||||
## Clustering Visualizations
|
||||
|
||||
### Basic Cluster Plots
|
||||
|
||||
```python
|
||||
# UMAP with cluster annotations
|
||||
sc.pl.umap(adata, color='leiden', add_outline=True,
|
||||
legend_loc='on data', legend_fontsize=12,
|
||||
legend_fontoutline=2, frameon=False,
|
||||
save='_clusters.pdf')
|
||||
|
||||
# Show cluster proportions
|
||||
sc.pl.umap(adata, color='leiden', size=50, edges=True,
|
||||
edges_width=0.1, save='_clusters_edges.pdf')
|
||||
```
|
||||
|
||||
### Cluster Comparison
|
||||
|
||||
```python
|
||||
# Compare clustering results
|
||||
sc.pl.umap(adata, color=['leiden', 'louvain'],
|
||||
save='_cluster_comparison.pdf')
|
||||
|
||||
# Cluster dendrogram
|
||||
sc.tl.dendrogram(adata, groupby='leiden')
|
||||
sc.pl.dendrogram(adata, groupby='leiden', save='_dendrogram.pdf')
|
||||
```
|
||||
|
||||
## Marker Gene Visualizations
|
||||
|
||||
### Ranked Marker Genes
|
||||
|
||||
```python
|
||||
# Overview of top markers per cluster
|
||||
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
|
||||
save='_marker_overview.pdf')
|
||||
|
||||
# Heatmap of top markers
|
||||
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, groupby='leiden',
|
||||
show_gene_labels=True,
|
||||
save='_marker_heatmap.pdf')
|
||||
|
||||
# Dot plot of markers
|
||||
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5,
|
||||
save='_marker_dotplot.pdf')
|
||||
|
||||
# Stacked violin plots
|
||||
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5,
|
||||
save='_marker_violin.pdf')
|
||||
|
||||
# Matrix plot
|
||||
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5,
|
||||
save='_marker_matrix.pdf')
|
||||
```
|
||||
|
||||
### Specific Gene Expression
|
||||
|
||||
```python
|
||||
# Violin plots for specific genes
|
||||
marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
|
||||
sc.pl.violin(adata, keys=marker_genes, groupby='leiden',
|
||||
save='_markers_violin.pdf')
|
||||
|
||||
# Dot plot for curated markers
|
||||
sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden',
|
||||
save='_markers_dotplot.pdf')
|
||||
|
||||
# Heatmap for specific genes
|
||||
sc.pl.heatmap(adata, var_names=marker_genes, groupby='leiden',
|
||||
swap_axes=True, save='_markers_heatmap.pdf')
|
||||
|
||||
# Stacked violin for gene sets
|
||||
sc.pl.stacked_violin(adata, var_names=marker_genes, groupby='leiden',
|
||||
save='_markers_stacked.pdf')
|
||||
```
|
||||
|
||||
### Gene Expression on Embeddings
|
||||
|
||||
```python
|
||||
# Multiple genes on UMAP
|
||||
genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7']
|
||||
sc.pl.umap(adata, color=genes, cmap='viridis',
|
||||
save='_umap_markers.pdf')
|
||||
|
||||
# Gene expression with custom colormap
|
||||
sc.pl.umap(adata, color='CD3D', cmap='Reds',
|
||||
vmin=0, vmax=3, save='_umap_cd3d.pdf')
|
||||
```
|
||||
|
||||
## Trajectory and Pseudotime Visualizations
|
||||
|
||||
### PAGA Plots
|
||||
|
||||
```python
|
||||
# PAGA graph
|
||||
sc.pl.paga(adata, color='leiden', save='_paga.pdf')
|
||||
|
||||
# PAGA with gene expression
|
||||
sc.pl.paga(adata, color=['leiden', 'dpt_pseudotime'],
|
||||
save='_paga_pseudotime.pdf')
|
||||
|
||||
# PAGA overlaid on UMAP
|
||||
sc.pl.umap(adata, color='leiden', save='_umap_with_paga.pdf',
|
||||
edges=True, edges_color='gray')
|
||||
```
|
||||
|
||||
### Pseudotime Plots
|
||||
|
||||
```python
|
||||
# DPT pseudotime on UMAP
|
||||
sc.pl.umap(adata, color='dpt_pseudotime', save='_umap_dpt.pdf')
|
||||
|
||||
# Gene expression along pseudotime
|
||||
sc.pl.dpt_timeseries(adata, save='_dpt_timeseries.pdf')
|
||||
|
||||
# Heatmap ordered by pseudotime
|
||||
sc.pl.heatmap(adata, var_names=genes, groupby='leiden',
|
||||
use_raw=False, show_gene_labels=True,
|
||||
save='_pseudotime_heatmap.pdf')
|
||||
```
|
||||
|
||||
## Advanced Visualizations
|
||||
|
||||
### Tracks Plot (Gene Expression Trends)
|
||||
|
||||
```python
|
||||
# Show gene expression across cell types
|
||||
sc.pl.tracksplot(adata, var_names=marker_genes, groupby='leiden',
|
||||
save='_tracks.pdf')
|
||||
```
|
||||
|
||||
### Correlation Matrix
|
||||
|
||||
```python
|
||||
# Correlation between clusters
|
||||
sc.pl.correlation_matrix(adata, groupby='leiden',
|
||||
save='_correlation.pdf')
|
||||
```
|
||||
|
||||
### Embedding Density
|
||||
|
||||
```python
|
||||
# Cell density on UMAP
|
||||
sc.tl.embedding_density(adata, basis='umap', groupby='cell_type')
|
||||
sc.pl.embedding_density(adata, basis='umap', key='umap_density_cell_type',
|
||||
save='_density.pdf')
|
||||
```
|
||||
|
||||
## Multi-Panel Figures
|
||||
|
||||
### Creating Panel Figures
|
||||
|
||||
```python
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Create multi-panel figure
|
||||
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
|
||||
|
||||
# Plot on specific axes
|
||||
sc.pl.umap(adata, color='leiden', ax=axes[0, 0], show=False)
|
||||
sc.pl.umap(adata, color='CD3D', ax=axes[0, 1], show=False)
|
||||
sc.pl.umap(adata, color='CD14', ax=axes[1, 0], show=False)
|
||||
sc.pl.umap(adata, color='MS4A1', ax=axes[1, 1], show=False)
|
||||
|
||||
plt.tight_layout()
|
||||
plt.savefig('figures/multi_panel.pdf')
|
||||
plt.show()
|
||||
```
|
||||
|
||||
## Publication-Quality Customization
|
||||
|
||||
### High-Quality Settings
|
||||
|
||||
```python
|
||||
# Set publication-quality defaults
|
||||
sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5),
|
||||
facecolor='white')
|
||||
|
||||
# Vector graphics output
|
||||
sc.settings.figdir = './figures/'
|
||||
sc.settings.file_format_figs = 'pdf' # or 'svg'
|
||||
```
|
||||
|
||||
### Custom Color Palettes
|
||||
|
||||
```python
|
||||
# Use custom colors
|
||||
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
|
||||
sc.pl.umap(adata, color='leiden', palette=custom_colors,
|
||||
save='_custom_colors.pdf')
|
||||
|
||||
# Continuous color maps
|
||||
sc.pl.umap(adata, color='CD3D', cmap='viridis', save='_viridis.pdf')
|
||||
sc.pl.umap(adata, color='CD3D', cmap='RdBu_r', save='_rdbu.pdf')
|
||||
```
|
||||
|
||||
### Remove Axes and Frames
|
||||
|
||||
```python
|
||||
# Clean plot without axes
|
||||
sc.pl.umap(adata, color='leiden', frameon=False,
|
||||
save='_clean.pdf')
|
||||
|
||||
# No legend
|
||||
sc.pl.umap(adata, color='leiden', legend_loc=None,
|
||||
save='_no_legend.pdf')
|
||||
```
|
||||
|
||||
## Exporting Plots
|
||||
|
||||
### Save Individual Plots
|
||||
|
||||
```python
|
||||
# Automatic saving with save parameter
|
||||
sc.pl.umap(adata, color='leiden', save='_leiden.pdf')
|
||||
# Saves to: sc.settings.figdir + 'umap_leiden.pdf'
|
||||
|
||||
# Manual saving
|
||||
import matplotlib.pyplot as plt
|
||||
fig = sc.pl.umap(adata, color='leiden', show=False, return_fig=True)
|
||||
fig.savefig('figures/my_umap.pdf', dpi=300, bbox_inches='tight')
|
||||
```
|
||||
|
||||
### Batch Export
|
||||
|
||||
```python
|
||||
# Save multiple versions
|
||||
for gene in ['CD3D', 'CD14', 'MS4A1']:
|
||||
sc.pl.umap(adata, color=gene, save=f'_{gene}.pdf')
|
||||
```
|
||||
|
||||
## Common Customization Parameters
|
||||
|
||||
### Layout Parameters
|
||||
- `figsize`: Figure size (width, height)
|
||||
- `frameon`: Show frame around plot
|
||||
- `title`: Plot title
|
||||
- `legend_loc`: 'right margin', 'on data', 'best', or None
|
||||
- `legend_fontsize`: Font size for legend
|
||||
- `size`: Point size
|
||||
|
||||
### Color Parameters
|
||||
- `color`: Variable(s) to color by
|
||||
- `palette`: Color palette (e.g., 'Set1', 'viridis')
|
||||
- `cmap`: Colormap for continuous variables
|
||||
- `vmin`, `vmax`: Color scale limits
|
||||
- `use_raw`: Use raw counts for gene expression
|
||||
|
||||
### Saving Parameters
|
||||
- `save`: Filename suffix for saving
|
||||
- `show`: Whether to display plot
|
||||
- `dpi`: Resolution for raster formats
|
||||
|
||||
## Tips for Publication Figures
|
||||
|
||||
1. **Use vector formats**: PDF or SVG for scalable graphics
|
||||
2. **High DPI**: Set dpi=300 or higher for raster images
|
||||
3. **Consistent styling**: Use the same color palette across figures
|
||||
4. **Clear labels**: Ensure gene names and cell types are readable
|
||||
5. **White background**: Use `facecolor='white'` for publications
|
||||
6. **Remove clutter**: Set `frameon=False` for cleaner appearance
|
||||
7. **Legend placement**: Use 'on data' for compact figures
|
||||
8. **Color blind friendly**: Consider palettes like 'colorblind' or 'Set2'
|
||||
206
skills/scanpy/references/standard_workflow.md
Normal file
206
skills/scanpy/references/standard_workflow.md
Normal file
@@ -0,0 +1,206 @@
|
||||
# Standard Scanpy Workflow for Single-Cell Analysis
|
||||
|
||||
This document outlines the standard workflow for analyzing single-cell RNA-seq data using scanpy.
|
||||
|
||||
## Complete Analysis Pipeline
|
||||
|
||||
### 1. Data Loading and Initial Setup
|
||||
|
||||
```python
|
||||
import scanpy as sc
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
|
||||
# Configure scanpy settings
|
||||
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
|
||||
sc.settings.set_figure_params(dpi=80, facecolor='white')
|
||||
|
||||
# Load data (various formats)
|
||||
adata = sc.read_10x_mtx('path/to/data/') # For 10X data
|
||||
# adata = sc.read_h5ad('path/to/data.h5ad') # For h5ad format
|
||||
# adata = sc.read_csv('path/to/data.csv') # For CSV format
|
||||
```
|
||||
|
||||
### 2. Quality Control (QC)
|
||||
|
||||
```python
|
||||
# Calculate QC metrics
|
||||
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)
|
||||
|
||||
# Common filtering thresholds (adjust based on dataset)
|
||||
sc.pp.filter_cells(adata, min_genes=200)
|
||||
sc.pp.filter_genes(adata, min_cells=3)
|
||||
|
||||
# Remove cells with high mitochondrial content
|
||||
adata = adata[adata.obs.pct_counts_mt < 5, :]
|
||||
|
||||
# Visualize QC metrics
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
|
||||
jitter=0.4, multi_panel=True)
|
||||
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
|
||||
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
|
||||
```
|
||||
|
||||
### 3. Normalization
|
||||
|
||||
```python
|
||||
# Normalize to 10,000 counts per cell
|
||||
sc.pp.normalize_total(adata, target_sum=1e4)
|
||||
|
||||
# Log-transform the data
|
||||
sc.pp.log1p(adata)
|
||||
|
||||
# Store normalized data in raw for later use
|
||||
adata.raw = adata
|
||||
```
|
||||
|
||||
### 4. Feature Selection
|
||||
|
||||
```python
|
||||
# Identify highly variable genes
|
||||
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
|
||||
|
||||
# Visualize highly variable genes
|
||||
sc.pl.highly_variable_genes(adata)
|
||||
|
||||
# Subset to highly variable genes
|
||||
adata = adata[:, adata.var.highly_variable]
|
||||
```
|
||||
|
||||
### 5. Scaling and Regression
|
||||
|
||||
```python
|
||||
# Regress out effects of total counts per cell and percent mitochondrial genes
|
||||
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
|
||||
|
||||
# Scale data to unit variance and zero mean
|
||||
sc.pp.scale(adata, max_value=10)
|
||||
```
|
||||
|
||||
### 6. Dimensionality Reduction
|
||||
|
||||
```python
|
||||
# Principal Component Analysis (PCA)
|
||||
sc.tl.pca(adata, svd_solver='arpack')
|
||||
|
||||
# Visualize PCA results
|
||||
sc.pl.pca(adata, color='CST3')
|
||||
sc.pl.pca_variance_ratio(adata, log=True)
|
||||
|
||||
# Computing neighborhood graph
|
||||
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
|
||||
|
||||
# UMAP for visualization
|
||||
sc.tl.umap(adata)
|
||||
|
||||
# t-SNE (alternative to UMAP)
|
||||
# sc.tl.tsne(adata)
|
||||
```
|
||||
|
||||
### 7. Clustering
|
||||
|
||||
```python
|
||||
# Leiden clustering (recommended)
|
||||
sc.tl.leiden(adata, resolution=0.5)
|
||||
|
||||
# Alternative: Louvain clustering
|
||||
# sc.tl.louvain(adata, resolution=0.5)
|
||||
|
||||
# Visualize clustering results
|
||||
sc.pl.umap(adata, color=['leiden'], legend_loc='on data')
|
||||
```
|
||||
|
||||
### 8. Marker Gene Identification
|
||||
|
||||
```python
|
||||
# Find marker genes for each cluster
|
||||
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
|
||||
|
||||
# Visualize top marker genes
|
||||
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
|
||||
|
||||
# Get marker gene dataframe
|
||||
marker_genes = sc.get.rank_genes_groups_df(adata, group='0')
|
||||
|
||||
# Visualize specific markers
|
||||
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
|
||||
```
|
||||
|
||||
### 9. Cell Type Annotation
|
||||
|
||||
```python
|
||||
# Manual annotation based on marker genes
|
||||
cluster_annotations = {
|
||||
'0': 'CD4 T cells',
|
||||
'1': 'CD14+ Monocytes',
|
||||
'2': 'B cells',
|
||||
'3': 'CD8 T cells',
|
||||
# ... add more annotations
|
||||
}
|
||||
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_annotations)
|
||||
|
||||
# Visualize annotated cell types
|
||||
sc.pl.umap(adata, color='cell_type', legend_loc='on data')
|
||||
```
|
||||
|
||||
### 10. Saving Results
|
||||
|
||||
```python
|
||||
# Save the processed AnnData object
|
||||
adata.write('results/processed_data.h5ad')
|
||||
|
||||
# Export results to CSV
|
||||
adata.obs.to_csv('results/cell_metadata.csv')
|
||||
adata.var.to_csv('results/gene_metadata.csv')
|
||||
```
|
||||
|
||||
## Additional Analysis Options
|
||||
|
||||
### Trajectory Inference
|
||||
|
||||
```python
|
||||
# PAGA (Partition-based graph abstraction)
|
||||
sc.tl.paga(adata, groups='leiden')
|
||||
sc.pl.paga(adata, color=['leiden'])
|
||||
|
||||
# Diffusion pseudotime (DPT)
|
||||
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
|
||||
sc.tl.dpt(adata)
|
||||
sc.pl.umap(adata, color=['dpt_pseudotime'])
|
||||
```
|
||||
|
||||
### Differential Expression Between Conditions
|
||||
|
||||
```python
|
||||
# Compare conditions within a cell type
|
||||
sc.tl.rank_genes_groups(adata, groupby='condition', groups=['treated'],
|
||||
reference='control', method='wilcoxon')
|
||||
sc.pl.rank_genes_groups(adata, groups=['treated'])
|
||||
```
|
||||
|
||||
### Gene Set Scoring
|
||||
|
||||
```python
|
||||
# Score cells for gene set expression
|
||||
gene_set = ['CD3D', 'CD3E', 'CD3G']
|
||||
sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
|
||||
sc.pl.umap(adata, color='T_cell_score')
|
||||
```
|
||||
|
||||
## Common Parameters to Adjust
|
||||
|
||||
- **QC thresholds**: `min_genes`, `min_cells`, `pct_counts_mt` - depends on dataset quality
|
||||
- **Normalization target**: Usually 1e4, but can be adjusted
|
||||
- **HVG parameters**: Affects feature selection stringency
|
||||
- **PCA components**: Check variance ratio plot to determine optimal number
|
||||
- **Clustering resolution**: Higher values give more clusters (typically 0.4-1.2)
|
||||
- **n_neighbors**: Affects granularity of UMAP and clustering (typically 10-30)
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. Always visualize QC metrics before filtering
|
||||
2. Save raw counts before normalization (`adata.raw = adata`)
|
||||
3. Use Leiden instead of Louvain for clustering (more efficient)
|
||||
4. Try multiple clustering resolutions to find optimal granularity
|
||||
5. Validate cell type annotations with known marker genes
|
||||
6. Save intermediate results at key steps
|
||||
200
skills/scanpy/scripts/qc_analysis.py
Executable file
200
skills/scanpy/scripts/qc_analysis.py
Executable file
@@ -0,0 +1,200 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Quality Control Analysis Script for Scanpy
|
||||
|
||||
Performs comprehensive quality control on single-cell RNA-seq data,
|
||||
including calculating metrics, generating QC plots, and filtering cells.
|
||||
|
||||
Usage:
|
||||
python qc_analysis.py <input_file> [--output <output_file>]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import scanpy as sc
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
|
||||
def calculate_qc_metrics(adata, mt_threshold=5, min_genes=200, min_cells=3):
|
||||
"""
|
||||
Calculate QC metrics and filter cells/genes.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
adata : AnnData
|
||||
Annotated data matrix
|
||||
mt_threshold : float
|
||||
Maximum percentage of mitochondrial genes (default: 5)
|
||||
min_genes : int
|
||||
Minimum number of genes per cell (default: 200)
|
||||
min_cells : int
|
||||
Minimum number of cells per gene (default: 3)
|
||||
|
||||
Returns:
|
||||
--------
|
||||
AnnData
|
||||
Filtered annotated data matrix
|
||||
"""
|
||||
# Identify mitochondrial genes (assumes gene names follow standard conventions)
|
||||
adata.var['mt'] = adata.var_names.str.startswith(('MT-', 'mt-', 'Mt-'))
|
||||
|
||||
# Calculate QC metrics
|
||||
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
|
||||
log1p=False, inplace=True)
|
||||
|
||||
print("\n=== QC Metrics Summary ===")
|
||||
print(f"Total cells: {adata.n_obs}")
|
||||
print(f"Total genes: {adata.n_vars}")
|
||||
print(f"Mean genes per cell: {adata.obs['n_genes_by_counts'].mean():.2f}")
|
||||
print(f"Mean counts per cell: {adata.obs['total_counts'].mean():.2f}")
|
||||
print(f"Mean mitochondrial %: {adata.obs['pct_counts_mt'].mean():.2f}")
|
||||
|
||||
return adata
|
||||
|
||||
|
||||
def generate_qc_plots(adata, output_prefix='qc'):
|
||||
"""
|
||||
Generate comprehensive QC plots.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
adata : AnnData
|
||||
Annotated data matrix
|
||||
output_prefix : str
|
||||
Prefix for saved figure files
|
||||
"""
|
||||
# Create figure directory if it doesn't exist
|
||||
import os
|
||||
os.makedirs('figures', exist_ok=True)
|
||||
|
||||
# Violin plots for QC metrics
|
||||
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
|
||||
jitter=0.4, multi_panel=True, save=f'_{output_prefix}_violin.pdf')
|
||||
|
||||
# Scatter plots
|
||||
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt',
|
||||
save=f'_{output_prefix}_mt_scatter.pdf')
|
||||
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts',
|
||||
save=f'_{output_prefix}_genes_scatter.pdf')
|
||||
|
||||
# Highest expressing genes
|
||||
sc.pl.highest_expr_genes(adata, n_top=20,
|
||||
save=f'_{output_prefix}_highest_expr.pdf')
|
||||
|
||||
print(f"\nQC plots saved to figures/ directory with prefix '{output_prefix}'")
|
||||
|
||||
|
||||
def filter_data(adata, mt_threshold=5, min_genes=200, max_genes=None,
|
||||
min_counts=None, max_counts=None, min_cells=3):
|
||||
"""
|
||||
Filter cells and genes based on QC thresholds.
|
||||
|
||||
Parameters:
|
||||
-----------
|
||||
adata : AnnData
|
||||
Annotated data matrix
|
||||
mt_threshold : float
|
||||
Maximum percentage of mitochondrial genes
|
||||
min_genes : int
|
||||
Minimum number of genes per cell
|
||||
max_genes : int, optional
|
||||
Maximum number of genes per cell
|
||||
min_counts : int, optional
|
||||
Minimum number of counts per cell
|
||||
max_counts : int, optional
|
||||
Maximum number of counts per cell
|
||||
min_cells : int
|
||||
Minimum number of cells per gene
|
||||
|
||||
Returns:
|
||||
--------
|
||||
AnnData
|
||||
Filtered annotated data matrix
|
||||
"""
|
||||
n_cells_before = adata.n_obs
|
||||
n_genes_before = adata.n_vars
|
||||
|
||||
# Filter cells
|
||||
sc.pp.filter_cells(adata, min_genes=min_genes)
|
||||
if max_genes:
|
||||
adata = adata[adata.obs['n_genes_by_counts'] < max_genes, :]
|
||||
if min_counts:
|
||||
adata = adata[adata.obs['total_counts'] >= min_counts, :]
|
||||
if max_counts:
|
||||
adata = adata[adata.obs['total_counts'] < max_counts, :]
|
||||
|
||||
# Filter by mitochondrial percentage
|
||||
adata = adata[adata.obs['pct_counts_mt'] < mt_threshold, :]
|
||||
|
||||
# Filter genes
|
||||
sc.pp.filter_genes(adata, min_cells=min_cells)
|
||||
|
||||
print(f"\n=== Filtering Results ===")
|
||||
print(f"Cells: {n_cells_before} -> {adata.n_obs} ({adata.n_obs/n_cells_before*100:.1f}% retained)")
|
||||
print(f"Genes: {n_genes_before} -> {adata.n_vars} ({adata.n_vars/n_genes_before*100:.1f}% retained)")
|
||||
|
||||
return adata
|
||||
|
||||
|
||||
def main():
|
||||
parser = argparse.ArgumentParser(description='QC analysis for single-cell data')
|
||||
parser.add_argument('input', help='Input file (h5ad, 10X mtx, csv, etc.)')
|
||||
parser.add_argument('--output', default='qc_filtered.h5ad',
|
||||
help='Output file name (default: qc_filtered.h5ad)')
|
||||
parser.add_argument('--mt-threshold', type=float, default=5,
|
||||
help='Max mitochondrial percentage (default: 5)')
|
||||
parser.add_argument('--min-genes', type=int, default=200,
|
||||
help='Min genes per cell (default: 200)')
|
||||
parser.add_argument('--min-cells', type=int, default=3,
|
||||
help='Min cells per gene (default: 3)')
|
||||
parser.add_argument('--skip-plots', action='store_true',
|
||||
help='Skip generating QC plots')
|
||||
|
||||
args = parser.parse_args()
|
||||
|
||||
# Configure scanpy
|
||||
sc.settings.verbosity = 2
|
||||
sc.settings.set_figure_params(dpi=300, facecolor='white')
|
||||
sc.settings.figdir = './figures/'
|
||||
|
||||
print(f"Loading data from: {args.input}")
|
||||
|
||||
# Load data based on file extension
|
||||
if args.input.endswith('.h5ad'):
|
||||
adata = sc.read_h5ad(args.input)
|
||||
elif args.input.endswith('.h5'):
|
||||
adata = sc.read_10x_h5(args.input)
|
||||
elif args.input.endswith('.csv'):
|
||||
adata = sc.read_csv(args.input)
|
||||
else:
|
||||
# Try reading as 10X mtx directory
|
||||
adata = sc.read_10x_mtx(args.input)
|
||||
|
||||
print(f"Loaded data: {adata.n_obs} cells x {adata.n_vars} genes")
|
||||
|
||||
# Calculate QC metrics
|
||||
adata = calculate_qc_metrics(adata, mt_threshold=args.mt_threshold,
|
||||
min_genes=args.min_genes, min_cells=args.min_cells)
|
||||
|
||||
# Generate QC plots (before filtering)
|
||||
if not args.skip_plots:
|
||||
print("\nGenerating QC plots (before filtering)...")
|
||||
generate_qc_plots(adata, output_prefix='qc_before')
|
||||
|
||||
# Filter data
|
||||
adata = filter_data(adata, mt_threshold=args.mt_threshold,
|
||||
min_genes=args.min_genes, min_cells=args.min_cells)
|
||||
|
||||
# Generate QC plots (after filtering)
|
||||
if not args.skip_plots:
|
||||
print("\nGenerating QC plots (after filtering)...")
|
||||
generate_qc_plots(adata, output_prefix='qc_after')
|
||||
|
||||
# Save filtered data
|
||||
print(f"\nSaving filtered data to: {args.output}")
|
||||
adata.write_h5ad(args.output)
|
||||
|
||||
print("\n=== QC Analysis Complete ===")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
Reference in New Issue
Block a user