Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

380
skills/scanpy/SKILL.md Normal file
View File

@@ -0,0 +1,380 @@
---
name: scanpy
description: "Single-cell RNA-seq analysis. Load .h5ad/10X data, QC, normalization, PCA/UMAP/t-SNE, Leiden clustering, marker genes, cell type annotation, trajectory, for scRNA-seq analysis."
---
# Scanpy: Single-Cell Analysis
## Overview
Scanpy is a scalable Python toolkit for analyzing single-cell RNA-seq data, built on AnnData. Apply this skill for complete single-cell workflows including quality control, normalization, dimensionality reduction, clustering, marker gene identification, visualization, and trajectory analysis.
## When to Use This Skill
This skill should be used when:
- Analyzing single-cell RNA-seq data (.h5ad, 10X, CSV formats)
- Performing quality control on scRNA-seq datasets
- Creating UMAP, t-SNE, or PCA visualizations
- Identifying cell clusters and finding marker genes
- Annotating cell types based on gene expression
- Conducting trajectory inference or pseudotime analysis
- Generating publication-quality single-cell plots
## Quick Start
### Basic Import and Setup
```python
import scanpy as sc
import pandas as pd
import numpy as np
# Configure settings
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.figdir = './figures/'
```
### Loading Data
```python
# From 10X Genomics
adata = sc.read_10x_mtx('path/to/data/')
adata = sc.read_10x_h5('path/to/data.h5')
# From h5ad (AnnData format)
adata = sc.read_h5ad('path/to/data.h5ad')
# From CSV
adata = sc.read_csv('path/to/data.csv')
```
### Understanding AnnData Structure
The AnnData object is the core data structure in scanpy:
```python
adata.X # Expression matrix (cells × genes)
adata.obs # Cell metadata (DataFrame)
adata.var # Gene metadata (DataFrame)
adata.uns # Unstructured annotations (dict)
adata.obsm # Multi-dimensional cell data (PCA, UMAP)
adata.raw # Raw data backup
# Access cell and gene names
adata.obs_names # Cell barcodes
adata.var_names # Gene names
```
## Standard Analysis Workflow
### 1. Quality Control
Identify and filter low-quality cells and genes:
```python
# Identify mitochondrial genes
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Calculate QC metrics
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
# Visualize QC metrics
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
# Filter cells and genes
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
adata = adata[adata.obs.pct_counts_mt < 5, :].copy() # Remove high MT% cells (copy so later in-place steps do not act on a view)
```
**Use the QC script for automated analysis:**
```bash
python scripts/qc_analysis.py input_file.h5ad --output filtered.h5ad
```
### 2. Normalization and Preprocessing
```python
# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)
# Log-transform
sc.pp.log1p(adata)
# Freeze the normalized, log-transformed data (all genes) in .raw for later plotting and marker tests
adata.raw = adata
# Identify highly variable genes
sc.pp.highly_variable_genes(adata, n_top_genes=2000)
sc.pl.highly_variable_genes(adata)
# Subset to highly variable genes
adata = adata[:, adata.var.highly_variable].copy()
# Regress out unwanted variation
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
# Scale data
sc.pp.scale(adata, max_value=10)
```
### 3. Dimensionality Reduction
```python
# PCA
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True) # Check elbow plot
# Compute neighborhood graph
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
# UMAP for visualization
sc.tl.umap(adata)
sc.pl.umap(adata) # Color by 'leiden' once clustering (step 4) has been run
# Alternative: t-SNE
sc.tl.tsne(adata)
```
### 4. Clustering
```python
# Leiden clustering (recommended)
sc.tl.leiden(adata, resolution=0.5)
sc.pl.umap(adata, color='leiden', legend_loc='on data')
# Try multiple resolutions to find optimal granularity
for res in [0.3, 0.5, 0.8, 1.0]:
    sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{res}')
```
### 5. Marker Gene Identification
```python
# Find marker genes for each cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# Visualize results
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
# Get results as DataFrame
markers = sc.get.rank_genes_groups_df(adata, group='0')
```
### 6. Cell Type Annotation
```python
# Define marker genes for known cell types
marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
# Visualize markers
sc.pl.umap(adata, color=marker_genes, use_raw=True)
sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden')
# Manual annotation
cluster_to_celltype = {
'0': 'CD4 T cells',
'1': 'CD14+ Monocytes',
'2': 'B cells',
'3': 'CD8 T cells',
}
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_to_celltype)
# Visualize annotated types
sc.pl.umap(adata, color='cell_type', legend_loc='on data')
```
### 7. Save Results
```python
# Save processed data
adata.write('results/processed_data.h5ad')
# Export metadata
adata.obs.to_csv('results/cell_metadata.csv')
adata.var.to_csv('results/gene_metadata.csv')
```
## Common Tasks
### Creating Publication-Quality Plots
```python
# Set high-quality defaults
sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5))
sc.settings.file_format_figs = 'pdf'
# UMAP with custom styling
sc.pl.umap(adata, color='cell_type',
palette='Set2',
legend_loc='on data',
legend_fontsize=12,
legend_fontoutline=2,
frameon=False,
save='_publication.pdf')
# Heatmap of marker genes
sc.pl.heatmap(adata, var_names=genes, groupby='cell_type',
swap_axes=True, show_gene_labels=True,
save='_markers.pdf')
# Dot plot
sc.pl.dotplot(adata, var_names=genes, groupby='cell_type',
save='_dotplot.pdf')
```
Refer to `references/plotting_guide.md` for comprehensive visualization examples.
### Trajectory Inference
```python
# PAGA (Partition-based graph abstraction)
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata, color='leiden')
# Diffusion pseudotime (needs a diffusion map and a root cell)
sc.tl.diffmap(adata)
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
sc.tl.dpt(adata)
sc.pl.umap(adata, color='dpt_pseudotime')
```
### Differential Expression Between Conditions
```python
# Compare treated vs control within cell types
adata_subset = adata[adata.obs['cell_type'] == 'T cells'].copy()
sc.tl.rank_genes_groups(adata_subset, groupby='condition',
groups=['treated'], reference='control')
sc.pl.rank_genes_groups(adata_subset, groups=['treated'])
```
### Gene Set Scoring
```python
# Score cells for gene set expression
gene_set = ['CD3D', 'CD3E', 'CD3G']
sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
sc.pl.umap(adata, color='T_cell_score')
```
### Batch Correction
```python
# ComBat batch correction
sc.pp.combat(adata, key='batch')
# Alternative: use Harmony or scVI (separate packages)
```
## Key Parameters to Adjust
### Quality Control
- `min_genes`: Minimum genes per cell (typically 200-500)
- `min_cells`: Minimum cells per gene (typically 3-10)
- `pct_counts_mt`: Mitochondrial threshold (typically 5-20%)
### Normalization
- `target_sum`: Target counts per cell (default 1e4)
### Feature Selection
- `n_top_genes`: Number of HVGs (typically 2000-3000)
- `min_mean`, `max_mean`, `min_disp`: HVG selection parameters
### Dimensionality Reduction
- `n_pcs`: Number of principal components (check variance ratio plot)
- `n_neighbors`: Number of neighbors (typically 10-30)
### Clustering
- `resolution`: Clustering granularity (0.4-1.2, higher = more clusters)
## Common Pitfalls and Best Practices
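1. **Always keep the full gene set in `.raw`**: Set `adata.raw = adata` after normalization/log1p and before subsetting to highly variable genes (note that `.raw` then holds log-normalized values, not raw counts)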
2. **Check QC plots carefully**: Adjust thresholds based on dataset quality
3. **Use Leiden over Louvain**: More efficient and better results
4. **Try multiple clustering resolutions**: Find optimal granularity
5. **Validate cell type annotations**: Use multiple marker genes
6. **Use `use_raw=True` for gene expression plots**: Shows the log-normalized values stored in `.raw` (all genes, unscaled) instead of the scaled HVG matrix
7. **Check PCA variance ratio**: Determine optimal number of PCs
8. **Save intermediate results**: Long workflows can fail partway through
## Bundled Resources
### scripts/qc_analysis.py
Automated quality control script that calculates metrics, generates plots, and filters data:
```bash
python scripts/qc_analysis.py input.h5ad --output filtered.h5ad \
--mt-threshold 5 --min-genes 200 --min-cells 3
```
### references/standard_workflow.md
Complete step-by-step workflow with detailed explanations and code examples for:
- Data loading and setup
- Quality control with visualization
- Normalization and scaling
- Feature selection
- Dimensionality reduction (PCA, UMAP, t-SNE)
- Clustering (Leiden, Louvain)
- Marker gene identification
- Cell type annotation
- Trajectory inference
- Differential expression
Read this reference when performing a complete analysis from scratch.
### references/api_reference.md
Quick reference guide for scanpy functions organized by module:
- Reading/writing data (`sc.read_*`, `adata.write_*`)
- Preprocessing (`sc.pp.*`)
- Tools (`sc.tl.*`)
- Plotting (`sc.pl.*`)
- AnnData structure and manipulation
- Settings and utilities
Use this for quick lookup of function signatures and common parameters.
### references/plotting_guide.md
Comprehensive visualization guide including:
- Quality control plots
- Dimensionality reduction visualizations
- Clustering visualizations
- Marker gene plots (heatmaps, dot plots, violin plots)
- Trajectory and pseudotime plots
- Publication-quality customization
- Multi-panel figures
- Color palettes and styling
Consult this when creating publication-ready figures.
### assets/analysis_template.py
Complete analysis template providing a full workflow from data loading through cell type annotation. Copy and customize this template for new analyses:
```bash
cp assets/analysis_template.py my_analysis.py
# Edit parameters and run
python my_analysis.py
```
The template includes all standard steps with configurable parameters and helpful comments.
## Additional Resources
- **Official scanpy documentation**: https://scanpy.readthedocs.io/
- **Scanpy tutorials**: https://scanpy-tutorials.readthedocs.io/
- **scverse ecosystem**: https://scverse.org/ (related tools: squidpy, scvi-tools, cellrank)
- **Best practices**: Luecken & Theis (2019) "Current best practices in single-cell RNA-seq analysis: a tutorial", Molecular Systems Biology
## Tips for Effective Analysis
1. **Start with the template**: Use `assets/analysis_template.py` as a starting point
2. **Run QC script first**: Use `scripts/qc_analysis.py` for initial filtering
3. **Consult references as needed**: Load workflow and API references into context
4. **Iterate on clustering**: Try multiple resolutions and visualization methods
5. **Validate biologically**: Check marker genes match expected cell types
6. **Document parameters**: Record QC thresholds and analysis settings
7. **Save checkpoints**: Write intermediate results at key steps

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
"""
Complete Single-Cell Analysis Template
This template provides a complete workflow for single-cell RNA-seq analysis
using scanpy, from data loading through clustering and cell type annotation.
Customize the parameters and sections as needed for your specific dataset.
"""
import scanpy as sc
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# ============================================================================
# CONFIGURATION
# ============================================================================
# --- Input / output locations ---
# AnnData file read in the loading step; change to your input file
INPUT_FILE = "data/raw_counts.h5ad"
# Directory that receives the processed data and exported tables
OUTPUT_DIR = "results/"
# Directory handed to scanpy as its figure directory
FIGURES_DIR = "figures/"

# --- Quality-control cutoffs ---
# Minimum genes per cell
MIN_GENES = 200
# Minimum cells per gene
MIN_CELLS = 3
# Maximum mitochondrial percentage
MT_THRESHOLD = 5

# --- Analysis parameters ---
# Number of highly variable genes
N_TOP_GENES = 2000
# Number of principal components
N_PCS = 40
# Number of neighbors for graph
N_NEIGHBORS = 10
# Clustering resolution
LEIDEN_RESOLUTION = 0.5

# --- Scanpy settings ---
sc.settings.verbosity = 3
sc.settings.set_figure_params(dpi=80, facecolor="white")
sc.settings.figdir = FIGURES_DIR
# ============================================================================
# 1. LOAD DATA
# ============================================================================
print("=" * 80, "LOADING DATA", "=" * 80, sep="\n")

# Read the input file configured in INPUT_FILE. For other formats, swap in one
# of the alternative readers kept below.
# adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/') # For 10X data
# adata = sc.read_csv('data/counts.csv') # For CSV data
adata = sc.read_h5ad(INPUT_FILE)

# Report the dimensions of the loaded object
print(f"Loaded: {adata.n_obs} cells x {adata.n_vars} genes")
# ============================================================================
# 2. QUALITY CONTROL
# ============================================================================
print("\n" + "=" * 80)
print("QUALITY CONTROL")
print("=" * 80)

# Flag mitochondrial genes by name prefix. The 'MT-' prefix matches
# upper-case gene symbols; adjust it if your gene names use another
# convention (NOTE(review): e.g. lower-case 'mt-' -- confirm for your data).
adata.var['mt'] = adata.var_names.str.startswith('MT-')

# Calculate QC metrics; the 'mt' flag is passed as a QC variable so that the
# 'pct_counts_mt' column used below becomes available.
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
                           log1p=False, inplace=True)

# Visualize QC metrics before filtering
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
             jitter=0.4, multi_panel=True, save='_qc_before_filtering')
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes')

# Filter cells and genes
print(f"\nBefore filtering: {adata.n_obs} cells, {adata.n_vars} genes")
sc.pp.filter_cells(adata, min_genes=MIN_GENES)
sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
# Boolean indexing returns a view of the original object. Take an explicit
# copy so the in-place steps that follow (normalization, log1p, ...) work on
# an independent AnnData instead of triggering an implicit copy of the view.
adata = adata[adata.obs.pct_counts_mt < MT_THRESHOLD, :].copy()
print(f"After filtering: {adata.n_obs} cells, {adata.n_vars} genes")
# ============================================================================
# 3. NORMALIZATION
# ============================================================================
print("\n" + "=" * 80, "NORMALIZATION", "=" * 80, sep="\n")

# Bring every cell to the same total of 10,000 counts, then log-transform
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)

# Keep the normalized, log-transformed data in .raw; it is taken before the
# gene subsetting and scaling performed further down
adata.raw = adata
# ============================================================================
# 4. FEATURE SELECTION
# ============================================================================
print("\n" + "=" * 80)
print("FEATURE SELECTION")
print("=" * 80)

# Identify highly variable genes (adds the boolean 'highly_variable' column
# to adata.var that is used below)
sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)

# Visualize
sc.pl.highly_variable_genes(adata, save='_hvg')
# Use the vectorized Series.sum() rather than the builtin sum() to count flags
print(f"Selected {adata.var.highly_variable.sum()} highly variable genes")

# Subset to highly variable genes. Slicing returns a view, and the regression
# and scaling steps that follow modify the data in place, so take an explicit
# copy to work on an independent AnnData object.
adata = adata[:, adata.var.highly_variable].copy()
# ============================================================================
# 5. SCALING AND REGRESSION
# ============================================================================
print("\n" + "=" * 80, "SCALING AND REGRESSION", "=" * 80, sep="\n")

# Remove the contribution of total counts and mitochondrial percentage
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])

# Scale the data, capping values at 10
sc.pp.scale(adata, max_value=10)
# ============================================================================
# 6. DIMENSIONALITY REDUCTION
# ============================================================================
print("\n" + "=" * 80, "DIMENSIONALITY REDUCTION", "=" * 80, sep="\n")

# Principal component analysis, plus the variance-ratio plot used to judge
# how many components to keep
sc.tl.pca(adata, svd_solver='arpack')
sc.pl.pca_variance_ratio(adata, log=True, save='_pca_variance')

# Neighborhood graph built from the configured number of neighbors and PCs
sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, n_pcs=N_PCS)

# UMAP embedding
sc.tl.umap(adata)
# ============================================================================
# 7. CLUSTERING
# ============================================================================
print("\n" + "=" * 80, "CLUSTERING", "=" * 80, sep="\n")

# Run Leiden clustering at the configured resolution
sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION)

# Show the clusters on the UMAP, labels drawn on the data
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_leiden')

# Report how many distinct cluster labels were assigned
cluster_labels = adata.obs['leiden'].unique()
print(f"Identified {len(cluster_labels)} clusters")
# ============================================================================
# 8. MARKER GENE IDENTIFICATION
# ============================================================================
print("\n" + "=" * 80)
print("MARKER GENE IDENTIFICATION")
print("=" * 80)

# Rank genes for each Leiden cluster using the Wilcoxon method
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')

# Visualize top markers
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markers')
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, save='_markers_heatmap')
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, save='_markers_dotplot')

# Print the ten best-ranked genes of every cluster. Iterating over the
# categories (instead of .unique()) visits clusters in category order rather
# than in the order labels happen to appear in the cell table.
# Assumes adata.obs['leiden'] is categorical, as written by sc.tl.leiden.
for cluster in adata.obs['leiden'].cat.categories:
    print(f"\nCluster {cluster} top markers:")
    markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(10)
    print(markers[['names', 'scores', 'pvals_adj']].to_string(index=False))
# ============================================================================
# 9. CELL TYPE ANNOTATION (CUSTOMIZE THIS SECTION)
# ============================================================================
print("\n" + "=" * 80)
print("CELL TYPE ANNOTATION")
print("=" * 80)

# Example marker genes for common cell types (customize for your data)
marker_genes = {
    'T cells': ['CD3D', 'CD3E', 'CD3G'],
    'B cells': ['MS4A1', 'CD79A', 'CD79B'],
    'Monocytes': ['CD14', 'LYZ', 'S100A8'],
    'NK cells': ['NKG7', 'GNLY', 'KLRD1'],
    'Dendritic cells': ['FCER1A', 'CST3'],
}

# Visualize marker genes, one figure per cell type. Genes are looked up in
# .raw because the plots are drawn with use_raw=True.
raw_gene_names = adata.raw.var_names
for cell_type, genes in marker_genes.items():
    # Skip genes missing from the dataset so the plot call does not fail
    available_genes = [g for g in genes if g in raw_gene_names]
    if available_genes:
        sc.pl.umap(adata, color=available_genes, use_raw=True,
                   save=f'_{cell_type.replace(" ", "_")}')

# Manual annotation based on marker expression (customize this mapping)
cluster_to_celltype = {
    '0': 'CD4 T cells',
    '1': 'CD14+ Monocytes',
    '2': 'B cells',
    '3': 'CD8 T cells',
    '4': 'NK cells',
    # Add more mappings based on your marker analysis
}

# Apply annotations. The cluster labels are cast to plain strings first:
# mapping a categorical column can yield a categorical result, and filling
# such a column with a label that is not one of its categories ('Unknown')
# is rejected by pandas. Clusters absent from the mapping become 'Unknown'.
cluster_labels = adata.obs['leiden'].astype(str)
adata.obs['cell_type'] = (
    cluster_labels.map(cluster_to_celltype).fillna('Unknown').astype('category')
)

# Visualize annotated cell types
sc.pl.umap(adata, color='cell_type', legend_loc='on data', save='_celltypes')
# ============================================================================
# 10. ADDITIONAL ANALYSES (OPTIONAL)
# ============================================================================
print("\n" + "=" * 80, "ADDITIONAL ANALYSES", "=" * 80, sep="\n")

# Optional: PAGA computed on the Leiden clusters, followed by its plot
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata, color='leiden', save='_paga')

# Optional: gene set scoring (uncomment to enable)
# example_gene_set = ['CD3D', 'CD3E', 'CD3G']
# sc.tl.score_genes(adata, example_gene_set, score_name='T_cell_score')
# sc.pl.umap(adata, color='T_cell_score', save='_gene_set_score')
# ============================================================================
# 11. SAVE RESULTS
# ============================================================================
print("\n" + "=" * 80)
print("SAVING RESULTS")
print("=" * 80)

import os

# Make sure the output directory exists before writing anything into it
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Save processed AnnData object. Paths are built with os.path.join so that an
# OUTPUT_DIR with or without a trailing slash yields a clean path (plain
# string formatting produced 'results//...').
processed_path = os.path.join(OUTPUT_DIR, 'processed_data.h5ad')
adata.write(processed_path)
print(f"Saved processed data to {processed_path}")

# Export metadata
adata.obs.to_csv(os.path.join(OUTPUT_DIR, 'cell_metadata.csv'))
adata.var.to_csv(os.path.join(OUTPUT_DIR, 'gene_metadata.csv'))
print(f"Saved metadata to {OUTPUT_DIR}")

# Export marker genes, one CSV per cluster, visiting clusters in category
# order. Assumes adata.obs['leiden'] is categorical, as written by
# sc.tl.leiden.
for cluster in adata.obs['leiden'].cat.categories:
    markers = sc.get.rank_genes_groups_df(adata, group=cluster)
    markers.to_csv(os.path.join(OUTPUT_DIR, f'markers_cluster_{cluster}.csv'),
                   index=False)
print(f"Saved marker genes to {OUTPUT_DIR}")
# ============================================================================
# 12. SUMMARY
# ============================================================================
print("\n" + "=" * 80, "ANALYSIS SUMMARY", "=" * 80, sep="\n")

# Size of the final dataset and number of distinct cluster labels
cluster_labels = adata.obs['leiden'].unique()
print("\nFinal dataset:")
print(f" Cells: {adata.n_obs}")
print(f" Genes: {adata.n_vars}")
print(f" Clusters: {len(cluster_labels)}")

# Number of cells assigned to each annotated cell type
print("\nCell type distribution:")
print(adata.obs['cell_type'].value_counts())

print("\n" + "=" * 80, "ANALYSIS COMPLETE", "=" * 80, sep="\n")

View File

@@ -0,0 +1,251 @@
# Scanpy API Quick Reference
Quick reference for commonly used scanpy functions organized by module.
## Import Convention
```python
import scanpy as sc
```
## Reading and Writing Data (sc.read_*)
### Reading Functions
```python
sc.read_10x_h5(filename) # Read 10X HDF5 file
sc.read_10x_mtx(path) # Read 10X mtx directory
sc.read_h5ad(filename) # Read h5ad (AnnData) file
sc.read_csv(filename) # Read CSV file
sc.read_excel(filename) # Read Excel file
sc.read_loom(filename) # Read loom file
sc.read_text(filename) # Read text file
sc.read_visium(path) # Read Visium spatial data
```
### Writing Functions
```python
adata.write_h5ad(filename) # Write to h5ad format
adata.write_csvs(dirname) # Write to CSV files
adata.write_loom(filename) # Write to loom format
adata.write_zarr(filename) # Write to zarr format
```
## Preprocessing (sc.pp.*)
### Quality Control
```python
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
```
### Normalization and Transformation
```python
sc.pp.normalize_total(adata, target_sum=1e4) # Normalize to target sum
sc.pp.log1p(adata) # Log(x + 1) transformation
sc.pp.sqrt(adata) # Square root transformation
```
### Feature Selection
```python
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
```
### Scaling and Regression
```python
sc.pp.scale(adata, max_value=10) # Scale to unit variance
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt']) # Regress out unwanted variation
```
### Dimensionality Reduction (Preprocessing)
```python
sc.pp.pca(adata, n_comps=50) # Principal component analysis
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) # Compute neighborhood graph
```
### Batch Correction
```python
sc.pp.combat(adata, key='batch') # ComBat batch correction
```
## Tools (sc.tl.*)
### Dimensionality Reduction
```python
sc.tl.pca(adata, svd_solver='arpack') # PCA
sc.tl.umap(adata) # UMAP embedding
sc.tl.tsne(adata) # t-SNE embedding
sc.tl.diffmap(adata) # Diffusion map
sc.tl.draw_graph(adata, layout='fa') # Force-directed graph
```
### Clustering
```python
sc.tl.leiden(adata, resolution=0.5) # Leiden clustering (recommended)
sc.tl.louvain(adata, resolution=0.5) # Louvain clustering
# K-means is not provided by scanpy; if needed, run sklearn.cluster.KMeans on adata.obsm['X_pca']
```
### Marker Genes and Differential Expression
```python
sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
sc.tl.rank_genes_groups(adata, groupby='leiden', method='logreg')
# Get results as dataframe
sc.get.rank_genes_groups_df(adata, group='0')
```
### Trajectory Inference
```python
sc.tl.paga(adata, groups='leiden') # PAGA trajectory
sc.tl.dpt(adata) # Diffusion pseudotime
```
### Gene Scoring
```python
sc.tl.score_genes(adata, gene_list, score_name='score')
sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
```
### Embeddings and Projections
```python
sc.tl.ingest(adata, adata_ref) # Map to reference
sc.tl.embedding_density(adata, basis='umap', groupby='leiden')
```
## Plotting (sc.pl.*)
### Basic Embeddings
```python
sc.pl.umap(adata, color='leiden') # UMAP plot
sc.pl.tsne(adata, color='gene_name') # t-SNE plot
sc.pl.pca(adata, color='leiden') # PCA plot
sc.pl.diffmap(adata, color='leiden') # Diffusion map plot
```
### Heatmaps and Dot Plots
```python
sc.pl.heatmap(adata, var_names=genes, groupby='leiden')
sc.pl.dotplot(adata, var_names=genes, groupby='leiden')
sc.pl.matrixplot(adata, var_names=genes, groupby='leiden')
sc.pl.stacked_violin(adata, var_names=genes, groupby='leiden')
```
### Violin and Scatter Plots
```python
sc.pl.violin(adata, keys=['gene1', 'gene2'], groupby='leiden')
sc.pl.scatter(adata, x='gene1', y='gene2', color='leiden')
```
### Marker Gene Visualization
```python
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
sc.pl.rank_genes_groups_violin(adata, groups='0')
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
```
### Trajectory Visualization
```python
sc.pl.paga(adata, color='leiden') # PAGA graph
sc.pl.dpt_timeseries(adata) # DPT timeseries
```
### QC Plots
```python
sc.pl.highest_expr_genes(adata, n_top=20)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'])
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
```
### Advanced Plots
```python
sc.pl.dendrogram(adata, groupby='leiden')
sc.pl.correlation_matrix(adata, groupby='leiden')
sc.pl.tracksplot(adata, var_names=genes, groupby='leiden')
```
## Common Parameters
### Color Parameters
- `color`: Variable(s) to color by (gene name, obs column)
- `use_raw`: Use `.raw` attribute of adata
- `palette`: Color palette to use
- `vmin`, `vmax`: Color scale limits
### Layout Parameters
- `basis`: Embedding basis ('umap', 'tsne', 'pca', etc.)
- `legend_loc`: Legend location ('on data', 'right margin', etc.)
- `size`: Point size
- `alpha`: Point transparency
### Saving Parameters
- `save`: Filename to save plot
- `show`: Whether to show plot
## AnnData Structure
```python
adata.X # Expression matrix (cells × genes)
adata.obs # Cell annotations (DataFrame)
adata.var # Gene annotations (DataFrame)
adata.uns # Unstructured annotations (dict)
adata.obsm # Multi-dimensional cell annotations (e.g., PCA, UMAP)
adata.varm # Multi-dimensional gene annotations
adata.layers # Additional data layers
adata.raw # Raw data backup
# Access
adata.obs_names # Cell barcodes
adata.var_names # Gene names
adata.shape # (n_cells, n_genes)
# Slicing
adata[cell_indices, gene_indices]
adata[:, adata.var_names.isin(gene_list)]
adata[adata.obs['leiden'] == '0', :]
```
## Settings
```python
sc.settings.verbosity = 3 # 0=error, 1=warning, 2=info, 3=hint
sc.settings.set_figure_params(dpi=80, facecolor='white')
sc.settings.autoshow = False # Don't show plots automatically
sc.settings.autosave = True # Autosave figures
sc.settings.figdir = './figures/' # Figure directory
sc.settings.cachedir = './cache/' # Cache directory
sc.settings.n_jobs = 8 # Number of parallel jobs
```
## Useful Utilities
```python
sc.logging.print_versions() # Print version information
sc.logging.print_memory_usage() # Print memory usage
adata.copy() # Create a copy of AnnData object
ad.concat([adata1, adata2]) # Concatenate AnnData objects (import anndata as ad; adata.concatenate is deprecated)
```

View File

@@ -0,0 +1,352 @@
# Scanpy Plotting Guide
Comprehensive guide for creating publication-quality visualizations with scanpy.
## General Plotting Principles
All scanpy plotting functions follow consistent patterns:
- Functions in `sc.pl.*` mirror analysis functions in `sc.tl.*`
- Most accept `color` parameter for gene names or metadata columns
- Results are saved via `save` parameter
- Multiple plots can be generated in a single call
## Essential Quality Control Plots
### Visualize QC Metrics
```python
# Violin plots for QC metrics
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True, save='_qc_violin.pdf')
# Scatter plots to identify outliers
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt.pdf')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes.pdf')
# Highest expressing genes
sc.pl.highest_expr_genes(adata, n_top=20, save='_highest_expr.pdf')
```
### Post-filtering QC
```python
# QC metrics per sample after filtering (assumes a 'sample' column in adata.obs)
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
groupby='sample', save='_post_filter.pdf')
```
## Dimensionality Reduction Visualizations
### PCA Plots
```python
# Basic PCA
sc.pl.pca(adata, color='leiden', save='_pca.pdf')
# PCA colored by gene expression
sc.pl.pca(adata, color=['gene1', 'gene2', 'gene3'], save='_pca_genes.pdf')
# Variance ratio plot (elbow plot)
sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save='_variance.pdf')
# PCA loadings
sc.pl.pca_loadings(adata, components=[1, 2, 3], save='_loadings.pdf')
```
### UMAP Plots
```python
# Basic UMAP with clusters
sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_umap_leiden.pdf')
# UMAP colored by multiple variables
sc.pl.umap(adata, color=['leiden', 'cell_type', 'batch'],
save='_umap_multi.pdf')
# UMAP with gene expression
sc.pl.umap(adata, color=['CD3D', 'CD14', 'MS4A1'],
use_raw=False, save='_umap_genes.pdf')
# Customize appearance
sc.pl.umap(adata, color='leiden',
palette='Set2',
size=50,
alpha=0.8,
frameon=False,
title='Cell Types',
save='_umap_custom.pdf')
```
### t-SNE Plots
```python
# t-SNE with clusters
sc.pl.tsne(adata, color='leiden', legend_loc='right margin', save='_tsne.pdf')
# Multiple t-SNE perplexities (if computed)
sc.pl.tsne(adata, color='leiden', save='_tsne_default.pdf')
```
## Clustering Visualizations
### Basic Cluster Plots
```python
# UMAP with cluster annotations
sc.pl.umap(adata, color='leiden', add_outline=True,
legend_loc='on data', legend_fontsize=12,
legend_fontoutline=2, frameon=False,
save='_clusters.pdf')
# Show cluster proportions
sc.pl.umap(adata, color='leiden', size=50, edges=True,
edges_width=0.1, save='_clusters_edges.pdf')
```
### Cluster Comparison
```python
# Compare clustering results
sc.pl.umap(adata, color=['leiden', 'louvain'],
save='_cluster_comparison.pdf')
# Cluster dendrogram
sc.tl.dendrogram(adata, groupby='leiden')
sc.pl.dendrogram(adata, groupby='leiden', save='_dendrogram.pdf')
```
## Marker Gene Visualizations
### Ranked Marker Genes
```python
# Overview of top markers per cluster
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
save='_marker_overview.pdf')
# Heatmap of top markers
sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, groupby='leiden',
show_gene_labels=True,
save='_marker_heatmap.pdf')
# Dot plot of markers
sc.pl.rank_genes_groups_dotplot(adata, n_genes=5,
save='_marker_dotplot.pdf')
# Stacked violin plots
sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5,
save='_marker_violin.pdf')
# Matrix plot
sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5,
save='_marker_matrix.pdf')
```
### Specific Gene Expression
```python
# Violin plots for specific genes
marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
sc.pl.violin(adata, keys=marker_genes, groupby='leiden',
save='_markers_violin.pdf')
# Dot plot for curated markers
sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden',
save='_markers_dotplot.pdf')
# Heatmap for specific genes
sc.pl.heatmap(adata, var_names=marker_genes, groupby='leiden',
swap_axes=True, save='_markers_heatmap.pdf')
# Stacked violin for gene sets
sc.pl.stacked_violin(adata, var_names=marker_genes, groupby='leiden',
save='_markers_stacked.pdf')
```
### Gene Expression on Embeddings
```python
# Multiple genes on UMAP
genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7']
sc.pl.umap(adata, color=genes, cmap='viridis',
save='_umap_markers.pdf')
# Gene expression with custom colormap
sc.pl.umap(adata, color='CD3D', cmap='Reds',
vmin=0, vmax=3, save='_umap_cd3d.pdf')
```
## Trajectory and Pseudotime Visualizations
### PAGA Plots
```python
# PAGA graph
sc.pl.paga(adata, color='leiden', save='_paga.pdf')
# PAGA with gene expression
sc.pl.paga(adata, color=['leiden', 'dpt_pseudotime'],
save='_paga_pseudotime.pdf')
# UMAP with neighborhood-graph edges drawn (for a PAGA/embedding side-by-side view, see sc.pl.paga_compare)
sc.pl.umap(adata, color='leiden', save='_umap_with_paga.pdf',
edges=True, edges_color='gray')
```
### Pseudotime Plots
```python
# DPT pseudotime on UMAP
sc.pl.umap(adata, color='dpt_pseudotime', save='_umap_dpt.pdf')
# Gene expression along pseudotime
sc.pl.dpt_timeseries(adata, save='_dpt_timeseries.pdf')
# Heatmap of selected genes grouped by cluster (cells are grouped by 'leiden', not sorted by pseudotime)
sc.pl.heatmap(adata, var_names=genes, groupby='leiden',
use_raw=False, show_gene_labels=True,
save='_pseudotime_heatmap.pdf')
```
## Advanced Visualizations
### Tracks Plot (Gene Expression Trends)
```python
# Show gene expression across cell types
sc.pl.tracksplot(adata, var_names=marker_genes, groupby='leiden',
save='_tracks.pdf')
```
### Correlation Matrix
```python
# Correlation between clusters
sc.pl.correlation_matrix(adata, groupby='leiden',
save='_correlation.pdf')
```
### Embedding Density
```python
# Cell density on UMAP
sc.tl.embedding_density(adata, basis='umap', groupby='cell_type')
sc.pl.embedding_density(adata, basis='umap', key='umap_density_cell_type',
save='_density.pdf')
```
## Multi-Panel Figures
### Creating Panel Figures
```python
import matplotlib.pyplot as plt
# Create multi-panel figure
fig, axes = plt.subplots(2, 2, figsize=(12, 12))
# Plot on specific axes
sc.pl.umap(adata, color='leiden', ax=axes[0, 0], show=False)
sc.pl.umap(adata, color='CD3D', ax=axes[0, 1], show=False)
sc.pl.umap(adata, color='CD14', ax=axes[1, 0], show=False)
sc.pl.umap(adata, color='MS4A1', ax=axes[1, 1], show=False)
plt.tight_layout()
plt.savefig('figures/multi_panel.pdf')
plt.show()
```
## Publication-Quality Customization
### High-Quality Settings
```python
# Set publication-quality defaults
sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5),
facecolor='white')
# Vector graphics output
sc.settings.figdir = './figures/'
sc.settings.file_format_figs = 'pdf' # or 'svg'
```
### Custom Color Palettes
```python
# Use custom colors
custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
sc.pl.umap(adata, color='leiden', palette=custom_colors,
save='_custom_colors.pdf')
# Continuous color maps
sc.pl.umap(adata, color='CD3D', cmap='viridis', save='_viridis.pdf')
sc.pl.umap(adata, color='CD3D', cmap='RdBu_r', save='_rdbu.pdf')
```
### Remove Axes and Frames
```python
# Clean plot without axes
sc.pl.umap(adata, color='leiden', frameon=False,
save='_clean.pdf')
# No legend
sc.pl.umap(adata, color='leiden', legend_loc=None,
save='_no_legend.pdf')
```
## Exporting Plots
### Save Individual Plots
```python
# Automatic saving with save parameter
sc.pl.umap(adata, color='leiden', save='_leiden.pdf')
# Saves to: sc.settings.figdir + 'umap_leiden.pdf'
# Manual saving
import matplotlib.pyplot as plt
fig = sc.pl.umap(adata, color='leiden', show=False, return_fig=True)
fig.savefig('figures/my_umap.pdf', dpi=300, bbox_inches='tight')
```
### Batch Export
```python
# Save multiple versions
for gene in ['CD3D', 'CD14', 'MS4A1']:
sc.pl.umap(adata, color=gene, save=f'_{gene}.pdf')
```
## Common Customization Parameters
### Layout Parameters
- `figsize`: Figure size (width, height)
- `frameon`: Show frame around plot
- `title`: Plot title
- `legend_loc`: 'right margin', 'on data', 'best', or None
- `legend_fontsize`: Font size for legend
- `size`: Point size
### Color Parameters
- `color`: Variable(s) to color by
- `palette`: Color palette (e.g., 'Set1', 'viridis')
- `cmap`: Colormap for continuous variables
- `vmin`, `vmax`: Color scale limits
- `use_raw`: Plot gene expression from `adata.raw` (if set) instead of `adata.X`
### Saving Parameters
- `save`: Filename suffix for saving
- `show`: Whether to display plot
- `dpi`: Resolution for raster formats
## Tips for Publication Figures
1. **Use vector formats**: PDF or SVG for scalable graphics
2. **High DPI**: Set dpi=300 or higher for raster images
3. **Consistent styling**: Use the same color palette across figures
4. **Clear labels**: Ensure gene names and cell types are readable
5. **White background**: Use `facecolor='white'` for publications
6. **Remove clutter**: Set `frameon=False` for cleaner appearance
7. **Legend placement**: Use 'on data' for compact figures
8. **Color blind friendly**: Consider matplotlib's 'Set2', or pass a list of colors such as seaborn's `sns.color_palette('colorblind')`

View File

@@ -0,0 +1,206 @@
# Standard Scanpy Workflow for Single-Cell Analysis
This document outlines the standard workflow for analyzing single-cell RNA-seq data using scanpy.
## Complete Analysis Pipeline
### 1. Data Loading and Initial Setup
```python
import scanpy as sc
import pandas as pd
import numpy as np
# Configure scanpy settings
sc.settings.verbosity = 3 # verbosity: errors (0), warnings (1), info (2), hints (3)
sc.settings.set_figure_params(dpi=80, facecolor='white')
# Load data (various formats)
adata = sc.read_10x_mtx('path/to/data/') # For 10X data
# adata = sc.read_h5ad('path/to/data.h5ad') # For h5ad format
# adata = sc.read_csv('path/to/data.csv') # For CSV format
```
### 2. Quality Control (QC)
```python
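# Flag mitochondrial genes (human 'MT-' prefix; use 'mt-' for mouse)
adata.var['mt'] = adata.var_names.str.startswith('MT-')
# Calculate QC metrics
sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)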
# Common filtering thresholds (adjust based on dataset)
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
# Remove cells with high mitochondrial content
adata = adata[adata.obs.pct_counts_mt < 5, :]
# Visualize QC metrics
sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
jitter=0.4, multi_panel=True)
sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
```
### 3. Normalization
```python
# Normalize to 10,000 counts per cell
sc.pp.normalize_total(adata, target_sum=1e4)
# Log-transform the data
sc.pp.log1p(adata)
# Store normalized data in raw for later use
adata.raw = adata
```
### 4. Feature Selection
```python
# Identify highly variable genes
sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
# Visualize highly variable genes
sc.pl.highly_variable_genes(adata)
# Subset to highly variable genes
adata = adata[:, adata.var.highly_variable]
```
### 5. Scaling and Regression
```python
# Regress out effects of total counts per cell and percent mitochondrial genes
sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
# Scale data to unit variance and zero mean
sc.pp.scale(adata, max_value=10)
```
### 6. Dimensionality Reduction
```python
# Principal Component Analysis (PCA)
sc.tl.pca(adata, svd_solver='arpack')
# Visualize PCA results
sc.pl.pca(adata, color='CST3')
sc.pl.pca_variance_ratio(adata, log=True)
# Computing neighborhood graph
sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
# UMAP for visualization
sc.tl.umap(adata)
# t-SNE (alternative to UMAP)
# sc.tl.tsne(adata)
```
### 7. Clustering
```python
# Leiden clustering (recommended)
sc.tl.leiden(adata, resolution=0.5)
# Alternative: Louvain clustering
# sc.tl.louvain(adata, resolution=0.5)
# Visualize clustering results
sc.pl.umap(adata, color=['leiden'], legend_loc='on data')
```
### 8. Marker Gene Identification
```python
# Find marker genes for each cluster
sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
# Visualize top marker genes
sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
# Get marker gene dataframe
marker_genes = sc.get.rank_genes_groups_df(adata, group='0')
# Visualize specific markers
sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
```
### 9. Cell Type Annotation
```python
# Manual annotation based on marker genes
cluster_annotations = {
'0': 'CD4 T cells',
'1': 'CD14+ Monocytes',
'2': 'B cells',
'3': 'CD8 T cells',
# ... add more annotations
}
adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_annotations)
# Visualize annotated cell types
sc.pl.umap(adata, color='cell_type', legend_loc='on data')
```
### 10. Saving Results
```python
# Save the processed AnnData object
adata.write('results/processed_data.h5ad')
# Export results to CSV
adata.obs.to_csv('results/cell_metadata.csv')
adata.var.to_csv('results/gene_metadata.csv')
```
## Additional Analysis Options
### Trajectory Inference
```python
# PAGA (Partition-based graph abstraction)
sc.tl.paga(adata, groups='leiden')
sc.pl.paga(adata, color=['leiden'])
# Diffusion pseudotime (DPT)
adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
sc.tl.dpt(adata)
sc.pl.umap(adata, color=['dpt_pseudotime'])
```
### Differential Expression Between Conditions
```python
# Compare conditions (subset adata to a single cell type first for a within-cell-type comparison)
sc.tl.rank_genes_groups(adata, groupby='condition', groups=['treated'],
reference='control', method='wilcoxon')
sc.pl.rank_genes_groups(adata, groups=['treated'])
```
### Gene Set Scoring
```python
# Score cells for gene set expression
gene_set = ['CD3D', 'CD3E', 'CD3G']
sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
sc.pl.umap(adata, color='T_cell_score')
```
## Common Parameters to Adjust
- **QC thresholds**: `min_genes`, `min_cells`, `pct_counts_mt` - depends on dataset quality
- **Normalization target**: Usually 1e4, but can be adjusted
- **HVG parameters**: Affects feature selection stringency
- **PCA components**: Check variance ratio plot to determine optimal number
- **Clustering resolution**: Higher values give more clusters (typically 0.4-1.2)
- **n_neighbors**: Affects granularity of UMAP and clustering (typically 10-30)
## Best Practices
1. Always visualize QC metrics before filtering
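2. Keep the full normalized, log-transformed matrix before HVG subsetting and scaling (`adata.raw = adata`); if you also need the raw counts, copy them to a layer before normalization (`adata.layers['counts'] = adata.X.copy()`)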
3. Use Leiden instead of Louvain for clustering (more efficient)
4. Try multiple clustering resolutions to find optimal granularity
5. Validate cell type annotations with known marker genes
6. Save intermediate results at key steps

View File

@@ -0,0 +1,200 @@
#!/usr/bin/env python3
"""
Quality Control Analysis Script for Scanpy
Performs comprehensive quality control on single-cell RNA-seq data,
including calculating metrics, generating QC plots, and filtering cells.
Usage:
python qc_analysis.py <input_file> [--output <output_file>]
"""
import argparse
import scanpy as sc
import matplotlib.pyplot as plt
def calculate_qc_metrics(adata, mt_threshold=5, min_genes=200, min_cells=3):
    """
    Flag mitochondrial genes and compute QC metrics; no filtering is done.

    The metrics are written onto ``adata`` in place and a short summary is
    printed. Cells and genes are NOT removed here; use ``filter_data`` for that.

    Parameters:
    -----------
    adata : AnnData
        Annotated data matrix; gains a boolean ``var['mt']`` column and the
        columns added by ``sc.pp.calculate_qc_metrics``
    mt_threshold : float
        Not used by this function; accepted for call compatibility (default: 5)
    min_genes : int
        Not used by this function; accepted for call compatibility (default: 200)
    min_cells : int
        Not used by this function; accepted for call compatibility (default: 3)

    Returns:
    --------
    AnnData
        The same (unfiltered) object that was passed in, now annotated
    """
    # Identify mitochondrial genes by name prefix. Upper-casing first makes the
    # match case-insensitive ('MT-', 'mt-', 'Mt-') and avoids handing a tuple to
    # ``.str.startswith``, which not every pandas release accepts.
    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')

    # Calculate QC metrics (in place)
    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
                               log1p=False, inplace=True)

    print("\n=== QC Metrics Summary ===")
    print(f"Total cells: {adata.n_obs}")
    print(f"Total genes: {adata.n_vars}")
    print(f"Mean genes per cell: {adata.obs['n_genes_by_counts'].mean():.2f}")
    print(f"Mean counts per cell: {adata.obs['total_counts'].mean():.2f}")
    print(f"Mean mitochondrial %: {adata.obs['pct_counts_mt'].mean():.2f}")

    return adata
def generate_qc_plots(adata, output_prefix='qc'):
    """
    Generate QC plots and save them as PDFs without opening plot windows.

    Four figures are written: a violin plot of the QC metrics, two scatter
    plots (counts vs. mitochondrial %, counts vs. genes) and the
    highest-expressed genes.

    Parameters:
    -----------
    adata : AnnData
        Annotated data matrix; must already carry the ``n_genes_by_counts``,
        ``total_counts`` and ``pct_counts_mt`` columns in ``adata.obs``
    output_prefix : str
        Prefix for saved figure files
    """
    import os

    # Create the directory scanpy is configured to save into, rather than a
    # hard-coded 'figures', so the reported location is the real one.
    figdir = sc.settings.figdir
    os.makedirs(figdir, exist_ok=True)

    # show=False on every call: this runs as a batch script, so figures should
    # be written to disk without blocking on an interactive plot window.

    # Violin plots for QC metrics
    sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
                 jitter=0.4, multi_panel=True, show=False,
                 save=f'_{output_prefix}_violin.pdf')

    # Scatter plots
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', show=False,
                  save=f'_{output_prefix}_mt_scatter.pdf')
    sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', show=False,
                  save=f'_{output_prefix}_genes_scatter.pdf')

    # Highest expressing genes
    sc.pl.highest_expr_genes(adata, n_top=20, show=False,
                             save=f'_{output_prefix}_highest_expr.pdf')

    print(f"\nQC plots saved to {figdir}/ directory with prefix '{output_prefix}'")
def _percent_retained(after, before):
    """Return ``after`` as a percentage of ``before``; 0.0 when ``before`` is 0."""
    return after / before * 100 if before else 0.0


def filter_data(adata, mt_threshold=5, min_genes=200, max_genes=None,
                min_counts=None, max_counts=None, min_cells=3):
    """
    Filter cells and genes based on QC thresholds.

    Expects QC metrics to be present in ``adata.obs`` (see
    ``calculate_qc_metrics``). The input object is first filtered in place by
    ``min_genes``; the returned object is an independent copy holding the
    surviving cells and genes.

    Parameters:
    -----------
    adata : AnnData
        Annotated data matrix
    mt_threshold : float
        Maximum percentage of mitochondrial genes (exclusive upper bound)
    min_genes : int
        Minimum number of genes per cell
    max_genes : int, optional
        Maximum number of genes per cell (exclusive upper bound)
    min_counts : int, optional
        Minimum number of counts per cell (inclusive lower bound)
    max_counts : int, optional
        Maximum number of counts per cell (exclusive upper bound)
    min_cells : int
        Minimum number of cells per gene

    Returns:
    --------
    AnnData
        Filtered annotated data matrix

    Raises:
    -------
    KeyError
        If a QC column needed by the requested filters is missing from
        ``adata.obs``
    """
    # Check the needed QC columns up front so a missing one is reported clearly
    # and before ``adata`` has been modified.
    needed = ['pct_counts_mt']
    if max_genes is not None:
        needed.append('n_genes_by_counts')
    if min_counts is not None or max_counts is not None:
        needed.append('total_counts')
    missing = [col for col in needed if col not in adata.obs]
    if missing:
        raise KeyError(
            f"Missing QC column(s) in adata.obs: {missing}. "
            "Run calculate_qc_metrics() before filter_data()."
        )

    n_cells_before = adata.n_obs
    n_genes_before = adata.n_vars

    # Filter cells by minimum gene count (in place)
    sc.pp.filter_cells(adata, min_genes=min_genes)

    # Combine the remaining cell-level thresholds into one boolean mask.
    # ``is not None`` (rather than truthiness) keeps an explicit 0 meaningful.
    obs = adata.obs
    keep = obs['pct_counts_mt'] < mt_threshold
    if max_genes is not None:
        keep = keep & (obs['n_genes_by_counts'] < max_genes)
    if min_counts is not None:
        keep = keep & (obs['total_counts'] >= min_counts)
    if max_counts is not None:
        keep = keep & (obs['total_counts'] < max_counts)

    # Copy so the in-place gene filter below works on real data, not a view
    adata = adata[keep.to_numpy(), :].copy()

    # Filter genes
    sc.pp.filter_genes(adata, min_cells=min_cells)

    cells_pct = _percent_retained(adata.n_obs, n_cells_before)
    genes_pct = _percent_retained(adata.n_vars, n_genes_before)
    print("\n=== Filtering Results ===")
    print(f"Cells: {n_cells_before} -> {adata.n_obs} ({cells_pct:.1f}% retained)")
    print(f"Genes: {n_genes_before} -> {adata.n_vars} ({genes_pct:.1f}% retained)")

    return adata
def _read_input(path):
    """Load an AnnData object, picking the scanpy reader from the path suffix."""
    if path.endswith('.h5ad'):
        return sc.read_h5ad(path)
    if path.endswith('.h5'):
        adata = sc.read_10x_h5(path)
        # 10X HDF5 files are indexed by gene symbol, which can repeat;
        # duplicate names break later name-based lookups and plotting.
        adata.var_names_make_unique()
        return adata
    if path.endswith('.csv'):
        return sc.read_csv(path)
    # Anything else: try reading as a 10X mtx directory
    return sc.read_10x_mtx(path)


def main(argv=None):
    """
    Command-line entry point: load data, compute QC metrics, plot, filter, save.

    Parameters:
    -----------
    argv : list of str, optional
        Argument list to parse; ``None`` (the default) lets argparse read
        ``sys.argv[1:]``
    """
    import os

    parser = argparse.ArgumentParser(description='QC analysis for single-cell data')
    parser.add_argument('input', help='Input file (h5ad, 10X mtx, csv, etc.)')
    parser.add_argument('--output', default='qc_filtered.h5ad',
                        help='Output file name (default: qc_filtered.h5ad)')
    parser.add_argument('--mt-threshold', type=float, default=5,
                        help='Max mitochondrial percentage (default: 5)')
    parser.add_argument('--min-genes', type=int, default=200,
                        help='Min genes per cell (default: 200)')
    parser.add_argument('--max-genes', type=int, default=None,
                        help='Max genes per cell (default: no limit)')
    parser.add_argument('--min-counts', type=int, default=None,
                        help='Min counts per cell (default: no limit)')
    parser.add_argument('--max-counts', type=int, default=None,
                        help='Max counts per cell (default: no limit)')
    parser.add_argument('--min-cells', type=int, default=3,
                        help='Min cells per gene (default: 3)')
    parser.add_argument('--skip-plots', action='store_true',
                        help='Skip generating QC plots')

    args = parser.parse_args(argv)

    # Configure scanpy
    sc.settings.verbosity = 2
    sc.settings.set_figure_params(dpi=300, facecolor='white')
    sc.settings.figdir = './figures/'

    print(f"Loading data from: {args.input}")

    # Load data based on file extension
    adata = _read_input(args.input)

    print(f"Loaded data: {adata.n_obs} cells x {adata.n_vars} genes")

    # Calculate QC metrics
    adata = calculate_qc_metrics(adata, mt_threshold=args.mt_threshold,
                                 min_genes=args.min_genes, min_cells=args.min_cells)

    # Generate QC plots (before filtering)
    if not args.skip_plots:
        print("\nGenerating QC plots (before filtering)...")
        generate_qc_plots(adata, output_prefix='qc_before')

    # Filter data
    adata = filter_data(adata, mt_threshold=args.mt_threshold,
                        min_genes=args.min_genes, max_genes=args.max_genes,
                        min_counts=args.min_counts, max_counts=args.max_counts,
                        min_cells=args.min_cells)

    # Generate QC plots (after filtering)
    if not args.skip_plots:
        print("\nGenerating QC plots (after filtering)...")
        generate_qc_plots(adata, output_prefix='qc_after')

    # Save filtered data; create the target directory first so a long run
    # does not fail at the very last step.
    print(f"\nSaving filtered data to: {args.output}")
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    adata.write_h5ad(args.output)

    print("\n=== QC Analysis Complete ===")


if __name__ == "__main__":
    main()