Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/scanpy/SKILL.md
+++ b/skills/scanpy/SKILL.md
@@ -0,0 +1,380 @@
+---
+name: scanpy
+description: "Single-cell RNA-seq analysis with scanpy. Load .h5ad/10X data, then run QC, normalization, PCA/UMAP/t-SNE, Leiden clustering, marker gene detection, cell type annotation, and trajectory inference. Use for scRNA-seq analysis."
+---
+
+# Scanpy: Single-Cell Analysis
+
+## Overview
+
+Scanpy is a scalable Python toolkit for analyzing single-cell RNA-seq data, built on AnnData. Apply this skill for complete single-cell workflows including quality control, normalization, dimensionality reduction, clustering, marker gene identification, visualization, and trajectory analysis.
+
+## When to Use This Skill
+
+This skill should be used when:
+- Analyzing single-cell RNA-seq data (.h5ad, 10X, CSV formats)
+- Performing quality control on scRNA-seq datasets
+- Creating UMAP, t-SNE, or PCA visualizations
+- Identifying cell clusters and finding marker genes
+- Annotating cell types based on gene expression
+- Conducting trajectory inference or pseudotime analysis
+- Generating publication-quality single-cell plots
+
+## Quick Start
+
+### Basic Import and Setup
+
+```python
+import scanpy as sc
+import pandas as pd
+import numpy as np
+
+# Configure settings
+sc.settings.verbosity = 3
+sc.settings.set_figure_params(dpi=80, facecolor='white')
+sc.settings.figdir = './figures/'
+```
+
+### Loading Data
+
+```python
+# From 10X Genomics
+adata = sc.read_10x_mtx('path/to/data/')
+adata = sc.read_10x_h5('path/to/data.h5')
+
+# From h5ad (AnnData format)
+adata = sc.read_h5ad('path/to/data.h5ad')
+
+# From CSV
+adata = sc.read_csv('path/to/data.csv')
+```
+
+### Understanding AnnData Structure
+
+The AnnData object is the core data structure in scanpy:
+
+```python
+adata.X          # Expression matrix (cells × genes)
+adata.obs        # Cell metadata (DataFrame)
+adata.var        # Gene metadata (DataFrame)
+adata.uns        # Unstructured annotations (dict)
+adata.obsm       # Multi-dimensional cell data (PCA, UMAP)
+adata.raw        # Raw data backup
+
+# Access cell and gene names
+adata.obs_names  # Cell barcodes
+adata.var_names  # Gene names
+```
+
+## Standard Analysis Workflow
+
+### 1. Quality Control
+
+Identify and filter low-quality cells and genes:
+
+```python
+# Identify mitochondrial genes (human symbols start with 'MT-'; mouse symbols use 'mt-')
+adata.var['mt'] = adata.var_names.str.startswith('MT-')
+
+# Calculate QC metrics
+sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
+
+# Visualize QC metrics
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
+             jitter=0.4, multi_panel=True)
+
+# Filter cells and genes
+sc.pp.filter_cells(adata, min_genes=200)
+sc.pp.filter_genes(adata, min_cells=3)
+adata = adata[adata.obs.pct_counts_mt < 5, :].copy()  # Remove high MT% cells (copy so adata is not a view)
+```
+
+**Use the QC script for automated analysis:**
+```bash
+python scripts/qc_analysis.py input_file.h5ad --output filtered.h5ad
+```
+
+### 2. Normalization and Preprocessing
+
+```python
+# Normalize to 10,000 counts per cell
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Log-transform
+sc.pp.log1p(adata)
+
+# Keep the normalized, log-transformed data for all genes in .raw (used later via use_raw=True)
+adata.raw = adata
+
+# Identify highly variable genes
+sc.pp.highly_variable_genes(adata, n_top_genes=2000)
+sc.pl.highly_variable_genes(adata)
+
+# Subset to highly variable genes
+adata = adata[:, adata.var.highly_variable].copy()
+
+# Regress out unwanted variation
+sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
+
+# Scale data
+sc.pp.scale(adata, max_value=10)
+```
+
+### 3. Dimensionality Reduction
+
+```python
+# PCA
+sc.tl.pca(adata, svd_solver='arpack')
+sc.pl.pca_variance_ratio(adata, log=True)  # Check elbow plot
+
+# Compute neighborhood graph
+sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
+
+# UMAP for visualization
+sc.tl.umap(adata)
+sc.pl.umap(adata)  # Uncolored here; use color='leiden' once clustering (step 4) has run
+
+# Alternative: t-SNE
+sc.tl.tsne(adata)
+```
+
+### 4. Clustering
+
+```python
+# Leiden clustering (recommended)
+sc.tl.leiden(adata, resolution=0.5)
+sc.pl.umap(adata, color='leiden', legend_loc='on data')
+
+# Try multiple resolutions to find optimal granularity
+for res in [0.3, 0.5, 0.8, 1.0]:
+    sc.tl.leiden(adata, resolution=res, key_added=f'leiden_{res}')
+```
+
+### 5. Marker Gene Identification
+
+```python
+# Find marker genes for each cluster
+sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
+
+# Visualize results
+sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
+sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
+sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
+
+# Get results as DataFrame
+markers = sc.get.rank_genes_groups_df(adata, group='0')
+```
+
+### 6. Cell Type Annotation
+
+```python
+# Define marker genes for known cell types
+marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
+
+# Visualize markers
+sc.pl.umap(adata, color=marker_genes, use_raw=True)
+sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden')
+
+# Manual annotation
+cluster_to_celltype = {
+    '0': 'CD4 T cells',
+    '1': 'CD14+ Monocytes',
+    '2': 'B cells',
+    '3': 'CD8 T cells',
+}
+adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_to_celltype)
+
+# Visualize annotated types
+sc.pl.umap(adata, color='cell_type', legend_loc='on data')
+```
+
+### 7. Save Results
+
+```python
+# Save processed data
+adata.write('results/processed_data.h5ad')
+
+# Export metadata
+adata.obs.to_csv('results/cell_metadata.csv')
+adata.var.to_csv('results/gene_metadata.csv')
+```
+
+## Common Tasks
+
+### Creating Publication-Quality Plots
+
+```python
+# Set high-quality defaults
+sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5))
+sc.settings.file_format_figs = 'pdf'
+
+# UMAP with custom styling
+sc.pl.umap(adata, color='cell_type',
+           palette='Set2',
+           legend_loc='on data',
+           legend_fontsize=12,
+           legend_fontoutline=2,
+           frameon=False,
+           save='_publication.pdf')
+
+# Heatmap of marker genes
+sc.pl.heatmap(adata, var_names=genes, groupby='cell_type',
+              swap_axes=True, show_gene_labels=True,
+              save='_markers.pdf')
+
+# Dot plot
+sc.pl.dotplot(adata, var_names=genes, groupby='cell_type',
+              save='_dotplot.pdf')
+```
+
+Refer to `references/plotting_guide.md` for comprehensive visualization examples.
+
+### Trajectory Inference
+
+```python
+# PAGA (Partition-based graph abstraction)
+sc.tl.paga(adata, groups='leiden')
+sc.pl.paga(adata, color='leiden')
+
+# Diffusion pseudotime (run sc.tl.diffmap(adata) first; otherwise dpt falls back to a default diffusion map)
+adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
+sc.tl.dpt(adata)
+sc.pl.umap(adata, color='dpt_pseudotime')
+```
+
+### Differential Expression Between Conditions
+
+```python
+# Compare treated vs control within cell types
+adata_subset = adata[adata.obs['cell_type'] == 'T cells'].copy()
+sc.tl.rank_genes_groups(adata_subset, groupby='condition',
+                         groups=['treated'], reference='control')
+sc.pl.rank_genes_groups(adata_subset, groups=['treated'])
+```
+
+### Gene Set Scoring
+
+```python
+# Score cells for gene set expression
+gene_set = ['CD3D', 'CD3E', 'CD3G']
+sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
+sc.pl.umap(adata, color='T_cell_score')
+```
+
+### Batch Correction
+
+```python
+# ComBat batch correction
+sc.pp.combat(adata, key='batch')
+
+# Alternative: use Harmony or scVI (separate packages)
+```
+
+## Key Parameters to Adjust
+
+### Quality Control
+- `min_genes`: Minimum genes per cell (typically 200-500)
+- `min_cells`: Minimum cells per gene (typically 3-10)
+- `pct_counts_mt`: Mitochondrial threshold (typically 5-20%)
+
+### Normalization
+- `target_sum`: Target counts per cell (default 1e4)
+
+### Feature Selection
+- `n_top_genes`: Number of HVGs (typically 2000-3000)
+- `min_mean`, `max_mean`, `min_disp`: HVG selection parameters
+
+### Dimensionality Reduction
+- `n_pcs`: Number of principal components (check variance ratio plot)
+- `n_neighbors`: Number of neighbors (typically 10-30)
+
+### Clustering
+- `resolution`: Clustering granularity (0.4-1.2, higher = more clusters)
+
+## Common Pitfalls and Best Practices
+
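+1. **Preserve the full gene set**: Set `adata.raw = adata` after normalization/log-transform but before subsetting to highly variable genes; keep true raw counts separately (e.g. in `adata.layers['counts']`) if you need them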
+2. **Check QC plots carefully**: Adjust thresholds based on dataset quality
+3. **Use Leiden over Louvain**: More efficient and better results
+4. **Try multiple clustering resolutions**: Find optimal granularity
+5. **Validate cell type annotations**: Use multiple marker genes
+6. **Use `use_raw=True` for gene expression plots**: Shows the log-normalized values stored in `.raw` (all genes) instead of the scaled HVG matrix
+7. **Check PCA variance ratio**: Determine optimal number of PCs
+8. **Save intermediate results**: Long workflows can fail partway through
+
+## Bundled Resources
+
+### scripts/qc_analysis.py
+Automated quality control script that calculates metrics, generates plots, and filters data:
+
+```bash
+python scripts/qc_analysis.py input.h5ad --output filtered.h5ad \
+    --mt-threshold 5 --min-genes 200 --min-cells 3
+```
+
+### references/standard_workflow.md
+Complete step-by-step workflow with detailed explanations and code examples for:
+- Data loading and setup
+- Quality control with visualization
+- Normalization and scaling
+- Feature selection
+- Dimensionality reduction (PCA, UMAP, t-SNE)
+- Clustering (Leiden, Louvain)
+- Marker gene identification
+- Cell type annotation
+- Trajectory inference
+- Differential expression
+
+Read this reference when performing a complete analysis from scratch.
+
+### references/api_reference.md
+Quick reference guide for scanpy functions organized by module:
+- Reading/writing data (`sc.read_*`, `adata.write_*`)
+- Preprocessing (`sc.pp.*`)
+- Tools (`sc.tl.*`)
+- Plotting (`sc.pl.*`)
+- AnnData structure and manipulation
+- Settings and utilities
+
+Use this for quick lookup of function signatures and common parameters.
+
+### references/plotting_guide.md
+Comprehensive visualization guide including:
+- Quality control plots
+- Dimensionality reduction visualizations
+- Clustering visualizations
+- Marker gene plots (heatmaps, dot plots, violin plots)
+- Trajectory and pseudotime plots
+- Publication-quality customization
+- Multi-panel figures
+- Color palettes and styling
+
+Consult this when creating publication-ready figures.
+
+### assets/analysis_template.py
+Complete analysis template providing a full workflow from data loading through cell type annotation. Copy and customize this template for new analyses:
+
+```bash
+cp assets/analysis_template.py my_analysis.py
+# Edit parameters and run
+python my_analysis.py
+```
+
+The template includes all standard steps with configurable parameters and helpful comments.
+
+## Additional Resources
+
+- **Official scanpy documentation**: https://scanpy.readthedocs.io/
+- **Scanpy tutorials**: https://scanpy-tutorials.readthedocs.io/
+- **scverse ecosystem**: https://scverse.org/ (related tools: squidpy, scvi-tools, cellrank)
+- **Best practices**: Luecken & Theis (2019) "Current best practices in single-cell RNA-seq"
+
+## Tips for Effective Analysis
+
+1. **Start with the template**: Use `assets/analysis_template.py` as a starting point
+2. **Run QC script first**: Use `scripts/qc_analysis.py` for initial filtering
+3. **Consult references as needed**: Load workflow and API references into context
+4. **Iterate on clustering**: Try multiple resolutions and visualization methods
+5. **Validate biologically**: Check marker genes match expected cell types
+6. **Document parameters**: Record QC thresholds and analysis settings
+7. **Save checkpoints**: Write intermediate results at key steps
--- a/skills/scanpy/assets/analysis_template.py
+++ b/skills/scanpy/assets/analysis_template.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+"""
+Complete Single-Cell Analysis Template
+
+This template provides a complete workflow for single-cell RNA-seq analysis
+using scanpy, from data loading through clustering and cell type annotation.
+
+Customize the parameters and sections as needed for your specific dataset.
+"""
+
+import scanpy as sc
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+
+# ============================================================================
+# CONFIGURATION
+# ============================================================================
+
+# File paths
+INPUT_FILE = 'data/raw_counts.h5ad'  # Change to your input file
+OUTPUT_DIR = 'results/'
+FIGURES_DIR = 'figures/'
+
+# QC parameters
+MIN_GENES = 200          # Minimum genes per cell
+MIN_CELLS = 3            # Minimum cells per gene
+MT_THRESHOLD = 5         # Maximum mitochondrial percentage
+
+# Analysis parameters
+N_TOP_GENES = 2000       # Number of highly variable genes
+N_PCS = 40               # Number of principal components
+N_NEIGHBORS = 10         # Number of neighbors for graph
+LEIDEN_RESOLUTION = 0.5  # Clustering resolution
+
+# Scanpy settings
+sc.settings.verbosity = 3
+sc.settings.set_figure_params(dpi=80, facecolor='white')
+sc.settings.figdir = FIGURES_DIR
+
+# ============================================================================
+# 1. LOAD DATA
+# ============================================================================
+
+print("=" * 80)
+print("LOADING DATA")
+print("=" * 80)
+
+# Load data (adjust based on your file format)
+adata = sc.read_h5ad(INPUT_FILE)
+# adata = sc.read_10x_mtx('data/filtered_gene_bc_matrices/')  # For 10X data
+# adata = sc.read_csv('data/counts.csv')  # For CSV data
+
+print(f"Loaded: {adata.n_obs} cells x {adata.n_vars} genes")
+
+# ============================================================================
+# 2. QUALITY CONTROL
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("QUALITY CONTROL")
+print("=" * 80)
+
+# Identify mitochondrial genes
+adata.var['mt'] = adata.var_names.str.startswith('MT-')
+
+# Calculate QC metrics
+sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
+                            log1p=False, inplace=True)
+
+# Visualize QC metrics before filtering
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
+             jitter=0.4, multi_panel=True, save='_qc_before_filtering')
+
+sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt')
+sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes')
+
+# Filter cells and genes
+print(f"\nBefore filtering: {adata.n_obs} cells, {adata.n_vars} genes")
+
+sc.pp.filter_cells(adata, min_genes=MIN_GENES)
+sc.pp.filter_genes(adata, min_cells=MIN_CELLS)
+adata = adata[adata.obs.pct_counts_mt < MT_THRESHOLD, :]
+
+print(f"After filtering: {adata.n_obs} cells, {adata.n_vars} genes")
+
+# ============================================================================
+# 3. NORMALIZATION
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("NORMALIZATION")
+print("=" * 80)
+
+# Normalize to 10,000 counts per cell
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Log-transform
+sc.pp.log1p(adata)
+
+# Store normalized data
+adata.raw = adata
+
+# ============================================================================
+# 4. FEATURE SELECTION
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("FEATURE SELECTION")
+print("=" * 80)
+
+# Identify highly variable genes
+sc.pp.highly_variable_genes(adata, n_top_genes=N_TOP_GENES)
+
+# Visualize
+sc.pl.highly_variable_genes(adata, save='_hvg')
+
+print(f"Selected {sum(adata.var.highly_variable)} highly variable genes")
+
+# Subset to highly variable genes
+adata = adata[:, adata.var.highly_variable]
+
+# ============================================================================
+# 5. SCALING AND REGRESSION
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("SCALING AND REGRESSION")
+print("=" * 80)
+
+# Regress out unwanted sources of variation
+sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
+
+# Scale data
+sc.pp.scale(adata, max_value=10)
+
+# ============================================================================
+# 6. DIMENSIONALITY REDUCTION
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("DIMENSIONALITY REDUCTION")
+print("=" * 80)
+
+# PCA
+sc.tl.pca(adata, svd_solver='arpack')
+sc.pl.pca_variance_ratio(adata, log=True, save='_pca_variance')
+
+# Compute neighborhood graph
+sc.pp.neighbors(adata, n_neighbors=N_NEIGHBORS, n_pcs=N_PCS)
+
+# UMAP
+sc.tl.umap(adata)
+
+# ============================================================================
+# 7. CLUSTERING
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("CLUSTERING")
+print("=" * 80)
+
+# Leiden clustering
+sc.tl.leiden(adata, resolution=LEIDEN_RESOLUTION)
+
+# Visualize
+sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_leiden')
+
+print(f"Identified {len(adata.obs['leiden'].unique())} clusters")
+
+# ============================================================================
+# 8. MARKER GENE IDENTIFICATION
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("MARKER GENE IDENTIFICATION")
+print("=" * 80)
+
+# Find marker genes
+sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
+
+# Visualize top markers
+sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False, save='_markers')
+sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, save='_markers_heatmap')
+sc.pl.rank_genes_groups_dotplot(adata, n_genes=5, save='_markers_dotplot')
+
+# Get top markers for each cluster
+for cluster in adata.obs['leiden'].unique():
+    print(f"\nCluster {cluster} top markers:")
+    markers = sc.get.rank_genes_groups_df(adata, group=cluster).head(10)
+    print(markers[['names', 'scores', 'pvals_adj']].to_string(index=False))
+
+# ============================================================================
+# 9. CELL TYPE ANNOTATION (CUSTOMIZE THIS SECTION)
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("CELL TYPE ANNOTATION")
+print("=" * 80)
+
+# Example marker genes for common cell types (customize for your data)
+marker_genes = {
+    'T cells': ['CD3D', 'CD3E', 'CD3G'],
+    'B cells': ['MS4A1', 'CD79A', 'CD79B'],
+    'Monocytes': ['CD14', 'LYZ', 'S100A8'],
+    'NK cells': ['NKG7', 'GNLY', 'KLRD1'],
+    'Dendritic cells': ['FCER1A', 'CST3'],
+}
+
+# Visualize marker genes
+for cell_type, genes in marker_genes.items():
+    available_genes = [g for g in genes if g in adata.raw.var_names]
+    if available_genes:
+        sc.pl.umap(adata, color=available_genes, use_raw=True,
+                   save=f'_{cell_type.replace(" ", "_")}')
+
+# Manual annotation based on marker expression (customize this mapping)
+cluster_to_celltype = {
+    '0': 'CD4 T cells',
+    '1': 'CD14+ Monocytes',
+    '2': 'B cells',
+    '3': 'CD8 T cells',
+    '4': 'NK cells',
+    # Add more mappings based on your marker analysis
+}
+
+# Apply annotations
+adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_to_celltype)
+adata.obs['cell_type'] = adata.obs['cell_type'].fillna('Unknown')
+
+# Visualize annotated cell types
+sc.pl.umap(adata, color='cell_type', legend_loc='on data', save='_celltypes')
+
+# ============================================================================
+# 10. ADDITIONAL ANALYSES (OPTIONAL)
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("ADDITIONAL ANALYSES")
+print("=" * 80)
+
+# PAGA trajectory analysis (optional)
+sc.tl.paga(adata, groups='leiden')
+sc.pl.paga(adata, color='leiden', save='_paga')
+
+# Gene set scoring (optional)
+# example_gene_set = ['CD3D', 'CD3E', 'CD3G']
+# sc.tl.score_genes(adata, example_gene_set, score_name='T_cell_score')
+# sc.pl.umap(adata, color='T_cell_score', save='_gene_set_score')
+
+# ============================================================================
+# 11. SAVE RESULTS
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("SAVING RESULTS")
+print("=" * 80)
+
+import os
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+
+# Save processed AnnData object
+adata.write(f'{OUTPUT_DIR}/processed_data.h5ad')
+print(f"Saved processed data to {OUTPUT_DIR}/processed_data.h5ad")
+
+# Export metadata
+adata.obs.to_csv(f'{OUTPUT_DIR}/cell_metadata.csv')
+adata.var.to_csv(f'{OUTPUT_DIR}/gene_metadata.csv')
+print(f"Saved metadata to {OUTPUT_DIR}/")
+
+# Export marker genes
+for cluster in adata.obs['leiden'].unique():
+    markers = sc.get.rank_genes_groups_df(adata, group=cluster)
+    markers.to_csv(f'{OUTPUT_DIR}/markers_cluster_{cluster}.csv', index=False)
+print(f"Saved marker genes to {OUTPUT_DIR}/")
+
+# ============================================================================
+# 12. SUMMARY
+# ============================================================================
+
+print("\n" + "=" * 80)
+print("ANALYSIS SUMMARY")
+print("=" * 80)
+
+print(f"\nFinal dataset:")
+print(f"  Cells: {adata.n_obs}")
+print(f"  Genes: {adata.n_vars}")
+print(f"  Clusters: {len(adata.obs['leiden'].unique())}")
+
+print(f"\nCell type distribution:")
+print(adata.obs['cell_type'].value_counts())
+
+print("\n" + "=" * 80)
+print("ANALYSIS COMPLETE")
+print("=" * 80)
--- a/skills/scanpy/references/api_reference.md
+++ b/skills/scanpy/references/api_reference.md
@@ -0,0 +1,251 @@
+# Scanpy API Quick Reference
+
+Quick reference for commonly used scanpy functions organized by module.
+
+## Import Convention
+
+```python
+import scanpy as sc
+```
+
+## Reading and Writing Data (sc.read_*)
+
+### Reading Functions
+
+```python
+sc.read_10x_h5(filename)                    # Read 10X HDF5 file
+sc.read_10x_mtx(path)                       # Read 10X mtx directory
+sc.read_h5ad(filename)                      # Read h5ad (AnnData) file
+sc.read_csv(filename)                       # Read CSV file
+sc.read_excel(filename)                     # Read Excel file
+sc.read_loom(filename)                      # Read loom file
+sc.read_text(filename)                      # Read text file
+sc.read_visium(path)                        # Read Visium spatial data
+```
+
+### Writing Functions
+
+```python
+adata.write_h5ad(filename)                  # Write to h5ad format
+adata.write_csvs(dirname)                   # Write to CSV files
+adata.write_loom(filename)                  # Write to loom format
+adata.write_zarr(filename)                  # Write to zarr format
+```
+
+## Preprocessing (sc.pp.*)
+
+### Quality Control
+
+```python
+sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], inplace=True)
+sc.pp.filter_cells(adata, min_genes=200)
+sc.pp.filter_genes(adata, min_cells=3)
+```
+
+### Normalization and Transformation
+
+```python
+sc.pp.normalize_total(adata, target_sum=1e4)    # Normalize to target sum
+sc.pp.log1p(adata)                               # Log(x + 1) transformation
+sc.pp.sqrt(adata)                                # Square root transformation
+```
+
+### Feature Selection
+
+```python
+sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
+sc.pp.highly_variable_genes(adata, flavor='seurat_v3', n_top_genes=2000)
+```
+
+### Scaling and Regression
+
+```python
+sc.pp.scale(adata, max_value=10)                      # Scale to unit variance
+sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])  # Regress out unwanted variation
+```
+
+### Dimensionality Reduction (Preprocessing)
+
+```python
+sc.pp.pca(adata, n_comps=50)                     # Principal component analysis
+sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40) # Compute neighborhood graph
+```
+
+### Batch Correction
+
+```python
+sc.pp.combat(adata, key='batch')                 # ComBat batch correction
+```
+
+## Tools (sc.tl.*)
+
+### Dimensionality Reduction
+
+```python
+sc.tl.pca(adata, svd_solver='arpack')            # PCA
+sc.tl.umap(adata)                                 # UMAP embedding
+sc.tl.tsne(adata)                                 # t-SNE embedding
+sc.tl.diffmap(adata)                              # Diffusion map
+sc.tl.draw_graph(adata, layout='fa')             # Force-directed graph
+```
+
+### Clustering
+
+```python
+sc.tl.leiden(adata, resolution=0.5)              # Leiden clustering (recommended)
+sc.tl.louvain(adata, resolution=0.5)             # Louvain clustering
+# K-means is not part of scanpy; use sklearn.cluster.KMeans on adata.obsm['X_pca'] instead
+```
+
+### Marker Genes and Differential Expression
+
+```python
+sc.tl.rank_genes_groups(adata, groupby='leiden', method='wilcoxon')
+sc.tl.rank_genes_groups(adata, groupby='leiden', method='t-test')
+sc.tl.rank_genes_groups(adata, groupby='leiden', method='logreg')
+
+# Get results as dataframe
+sc.get.rank_genes_groups_df(adata, group='0')
+```
+
+### Trajectory Inference
+
+```python
+sc.tl.paga(adata, groups='leiden')               # PAGA trajectory
+sc.tl.dpt(adata)                                  # Diffusion pseudotime
+```
+
+### Gene Scoring
+
+```python
+sc.tl.score_genes(adata, gene_list, score_name='score')
+sc.tl.score_genes_cell_cycle(adata, s_genes, g2m_genes)
+```
+
+### Embeddings and Projections
+
+```python
+sc.tl.ingest(adata, adata_ref)                   # Map to reference
+sc.tl.embedding_density(adata, basis='umap', groupby='leiden')
+```
+
+## Plotting (sc.pl.*)
+
+### Basic Embeddings
+
+```python
+sc.pl.umap(adata, color='leiden')                # UMAP plot
+sc.pl.tsne(adata, color='gene_name')             # t-SNE plot
+sc.pl.pca(adata, color='leiden')                 # PCA plot
+sc.pl.diffmap(adata, color='leiden')             # Diffusion map plot
+```
+
+### Heatmaps and Dot Plots
+
+```python
+sc.pl.heatmap(adata, var_names=genes, groupby='leiden')
+sc.pl.dotplot(adata, var_names=genes, groupby='leiden')
+sc.pl.matrixplot(adata, var_names=genes, groupby='leiden')
+sc.pl.stacked_violin(adata, var_names=genes, groupby='leiden')
+```
+
+### Violin and Scatter Plots
+
+```python
+sc.pl.violin(adata, keys=['gene1', 'gene2'], groupby='leiden')
+sc.pl.scatter(adata, x='gene1', y='gene2', color='leiden')
+```
+
+### Marker Gene Visualization
+
+```python
+sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
+sc.pl.rank_genes_groups_violin(adata, groups='0')
+sc.pl.rank_genes_groups_heatmap(adata, n_genes=10)
+sc.pl.rank_genes_groups_dotplot(adata, n_genes=5)
+```
+
+### Trajectory Visualization
+
+```python
+sc.pl.paga(adata, color='leiden')                # PAGA graph
+sc.pl.dpt_timeseries(adata)                      # DPT timeseries
+```
+
+### QC Plots
+
+```python
+sc.pl.highest_expr_genes(adata, n_top=20)
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'])
+sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
+```
+
+### Advanced Plots
+
+```python
+sc.pl.dendrogram(adata, groupby='leiden')
+sc.pl.correlation_matrix(adata, groupby='leiden')
+sc.pl.tracksplot(adata, var_names=genes, groupby='leiden')
+```
+
+## Common Parameters
+
+### Color Parameters
+- `color`: Variable(s) to color by (gene name, obs column)
+- `use_raw`: Use `.raw` attribute of adata
+- `palette`: Color palette to use
+- `vmin`, `vmax`: Color scale limits
+
+### Layout Parameters
+- `basis`: Embedding basis ('umap', 'tsne', 'pca', etc.)
+- `legend_loc`: Legend location ('on data', 'right margin', etc.)
+- `size`: Point size
+- `alpha`: Point transparency
+
+### Saving Parameters
+- `save`: Filename to save plot
+- `show`: Whether to show plot
+
+## AnnData Structure
+
+```python
+adata.X                    # Expression matrix (cells × genes)
+adata.obs                  # Cell annotations (DataFrame)
+adata.var                  # Gene annotations (DataFrame)
+adata.uns                  # Unstructured annotations (dict)
+adata.obsm                 # Multi-dimensional cell annotations (e.g., PCA, UMAP)
+adata.varm                 # Multi-dimensional gene annotations
+adata.layers               # Additional data layers
+adata.raw                  # Raw data backup
+
+# Access
+adata.obs_names            # Cell barcodes
+adata.var_names            # Gene names
+adata.shape                # (n_cells, n_genes)
+
+# Slicing
+adata[cell_indices, gene_indices]
+adata[:, adata.var_names.isin(gene_list)]
+adata[adata.obs['leiden'] == '0', :]
+```
+
+## Settings
+
+```python
+sc.settings.verbosity = 3              # 0=error, 1=warning, 2=info, 3=hint
+sc.settings.set_figure_params(dpi=80, facecolor='white')
+sc.settings.autoshow = False           # Don't show plots automatically
+sc.settings.autosave = True            # Autosave figures
+sc.settings.figdir = './figures/'      # Figure directory
+sc.settings.cachedir = './cache/'      # Cache directory
+sc.settings.n_jobs = 8                 # Number of parallel jobs
+```
+
+## Useful Utilities
+
+```python
+sc.logging.print_versions()            # Print version information
+sc.logging.print_memory_usage()        # Print memory usage
+adata.copy()                           # Create a copy of AnnData object
+anndata.concat([adata1, adata2])       # Concatenate AnnData objects (replaces deprecated adata.concatenate)
+```
--- a/skills/scanpy/references/plotting_guide.md
+++ b/skills/scanpy/references/plotting_guide.md
@@ -0,0 +1,352 @@
+# Scanpy Plotting Guide
+
+Comprehensive guide for creating publication-quality visualizations with scanpy.
+
+## General Plotting Principles
+
+All scanpy plotting functions follow consistent patterns:
+- Functions in `sc.pl.*` mirror analysis functions in `sc.tl.*`
+- Most accept `color` parameter for gene names or metadata columns
+- Results are saved via `save` parameter
+- Multiple plots can be generated in a single call
+
+## Essential Quality Control Plots
+
+### Visualize QC Metrics
+
+```python
+# Violin plots for QC metrics
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
+             jitter=0.4, multi_panel=True, save='_qc_violin.pdf')
+
+# Scatter plots to identify outliers
+sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt', save='_qc_mt.pdf')
+sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts', save='_qc_genes.pdf')
+
+# Highest expressing genes
+sc.pl.highest_expr_genes(adata, n_top=20, save='_highest_expr.pdf')
+```
+
+### Post-filtering QC
+
+```python
+# QC metrics per sample after filtering (compare with the pre-filter plots above)
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts'],
+             groupby='sample', save='_post_filter.pdf')
+```
+
+## Dimensionality Reduction Visualizations
+
+### PCA Plots
+
+```python
+# Basic PCA
+sc.pl.pca(adata, color='leiden', save='_pca.pdf')
+
+# PCA colored by gene expression
+sc.pl.pca(adata, color=['gene1', 'gene2', 'gene3'], save='_pca_genes.pdf')
+
+# Variance ratio plot (elbow plot)
+sc.pl.pca_variance_ratio(adata, log=True, n_pcs=50, save='_variance.pdf')
+
+# PCA loadings
+sc.pl.pca_loadings(adata, components=[1, 2, 3], save='_loadings.pdf')
+```
+
+### UMAP Plots
+
+```python
+# Basic UMAP with clusters
+sc.pl.umap(adata, color='leiden', legend_loc='on data', save='_umap_leiden.pdf')
+
+# UMAP colored by multiple variables
+sc.pl.umap(adata, color=['leiden', 'cell_type', 'batch'],
+           save='_umap_multi.pdf')
+
+# UMAP with gene expression
+sc.pl.umap(adata, color=['CD3D', 'CD14', 'MS4A1'],
+           use_raw=False, save='_umap_genes.pdf')
+
+# Customize appearance
+sc.pl.umap(adata, color='leiden',
+           palette='Set2',
+           size=50,
+           alpha=0.8,
+           frameon=False,
+           title='Cell Types',
+           save='_umap_custom.pdf')
+```
+
+### t-SNE Plots
+
+```python
+# t-SNE with clusters
+sc.pl.tsne(adata, color='leiden', legend_loc='right margin', save='_tsne.pdf')
+
+# Multiple t-SNE perplexities (if computed)
+sc.pl.tsne(adata, color='leiden', save='_tsne_default.pdf')
+```
+
+## Clustering Visualizations
+
+### Basic Cluster Plots
+
+```python
+# UMAP with cluster annotations
+sc.pl.umap(adata, color='leiden', add_outline=True,
+           legend_loc='on data', legend_fontsize=12,
+           legend_fontoutline=2, frameon=False,
+           save='_clusters.pdf')
+
+# Overlay neighborhood-graph edges on the UMAP
+sc.pl.umap(adata, color='leiden', size=50, edges=True,
+           edges_width=0.1, save='_clusters_edges.pdf')
+```
+
+### Cluster Comparison
+
+```python
+# Compare clustering results
+sc.pl.umap(adata, color=['leiden', 'louvain'],
+           save='_cluster_comparison.pdf')
+
+# Cluster dendrogram
+sc.tl.dendrogram(adata, groupby='leiden')
+sc.pl.dendrogram(adata, groupby='leiden', save='_dendrogram.pdf')
+```
+
+## Marker Gene Visualizations
+
+### Ranked Marker Genes
+
+```python
+# Overview of top markers per cluster
+sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False,
+                        save='_marker_overview.pdf')
+
+# Heatmap of top markers
+sc.pl.rank_genes_groups_heatmap(adata, n_genes=10, groupby='leiden',
+                                 show_gene_labels=True,
+                                 save='_marker_heatmap.pdf')
+
+# Dot plot of markers
+sc.pl.rank_genes_groups_dotplot(adata, n_genes=5,
+                                 save='_marker_dotplot.pdf')
+
+# Stacked violin plots
+sc.pl.rank_genes_groups_stacked_violin(adata, n_genes=5,
+                                        save='_marker_violin.pdf')
+
+# Matrix plot
+sc.pl.rank_genes_groups_matrixplot(adata, n_genes=5,
+                                    save='_marker_matrix.pdf')
+```
+
+### Specific Gene Expression
+
+```python
+# Violin plots for specific genes
+marker_genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7', 'FCGR3A']
+sc.pl.violin(adata, keys=marker_genes, groupby='leiden',
+             save='_markers_violin.pdf')
+
+# Dot plot for curated markers
+sc.pl.dotplot(adata, var_names=marker_genes, groupby='leiden',
+              save='_markers_dotplot.pdf')
+
+# Heatmap for specific genes
+sc.pl.heatmap(adata, var_names=marker_genes, groupby='leiden',
+              swap_axes=True, save='_markers_heatmap.pdf')
+
+# Stacked violin for gene sets
+sc.pl.stacked_violin(adata, var_names=marker_genes, groupby='leiden',
+                     save='_markers_stacked.pdf')
+```
+
+### Gene Expression on Embeddings
+
+```python
+# Multiple genes on UMAP
+genes = ['CD3D', 'CD14', 'MS4A1', 'NKG7']
+sc.pl.umap(adata, color=genes, cmap='viridis',
+           save='_umap_markers.pdf')
+
+# Gene expression with custom colormap
+sc.pl.umap(adata, color='CD3D', cmap='Reds',
+           vmin=0, vmax=3, save='_umap_cd3d.pdf')
+```
+
+## Trajectory and Pseudotime Visualizations
+
+### PAGA Plots
+
+```python
+# PAGA graph
+sc.pl.paga(adata, color='leiden', save='_paga.pdf')
+
+# PAGA colored by cluster and pseudotime
+sc.pl.paga(adata, color=['leiden', 'dpt_pseudotime'],
+           save='_paga_pseudotime.pdf')
+
+# UMAP with neighborhood-graph edges (for PAGA next to the embedding, see `sc.pl.paga_compare`)
+sc.pl.umap(adata, color='leiden', save='_umap_with_paga.pdf',
+           edges=True, edges_color='gray')
+```
+
+### Pseudotime Plots
+
+```python
+# DPT pseudotime on UMAP
+sc.pl.umap(adata, color='dpt_pseudotime', save='_umap_dpt.pdf')
+
+# Gene expression along pseudotime
+sc.pl.dpt_timeseries(adata, save='_dpt_timeseries.pdf')
+
+# Heatmap of selected genes, grouped by cluster (cells are not sorted by pseudotime)
+sc.pl.heatmap(adata, var_names=genes, groupby='leiden',
+              use_raw=False, show_gene_labels=True,
+              save='_pseudotime_heatmap.pdf')
+```
+
+## Advanced Visualizations
+
+### Tracks Plot (Gene Expression Trends)
+
+```python
+# Show gene expression across cell types
+sc.pl.tracksplot(adata, var_names=marker_genes, groupby='leiden',
+                 save='_tracks.pdf')
+```
+
+### Correlation Matrix
+
+```python
+# Correlation between clusters
+sc.pl.correlation_matrix(adata, groupby='leiden',
+                         save='_correlation.pdf')
+```
+
+### Embedding Density
+
+```python
+# Cell density on UMAP
+sc.tl.embedding_density(adata, basis='umap', groupby='cell_type')
+sc.pl.embedding_density(adata, basis='umap', key='umap_density_cell_type',
+                        save='_density.pdf')
+```
+
+## Multi-Panel Figures
+
+### Creating Panel Figures
+
+```python
+import matplotlib.pyplot as plt
+
+# Create multi-panel figure
+fig, axes = plt.subplots(2, 2, figsize=(12, 12))
+
+# Plot on specific axes
+sc.pl.umap(adata, color='leiden', ax=axes[0, 0], show=False)
+sc.pl.umap(adata, color='CD3D', ax=axes[0, 1], show=False)
+sc.pl.umap(adata, color='CD14', ax=axes[1, 0], show=False)
+sc.pl.umap(adata, color='MS4A1', ax=axes[1, 1], show=False)
+
+plt.tight_layout()
+plt.savefig('figures/multi_panel.pdf')
+plt.show()
+```
+
+## Publication-Quality Customization
+
+### High-Quality Settings
+
+```python
+# Set publication-quality defaults
+sc.settings.set_figure_params(dpi=300, frameon=False, figsize=(5, 5),
+                               facecolor='white')
+
+# Vector graphics output
+sc.settings.figdir = './figures/'
+sc.settings.file_format_figs = 'pdf'  # or 'svg'
+```
+
+### Custom Color Palettes
+
+```python
+# Use custom colors
+custom_colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
+sc.pl.umap(adata, color='leiden', palette=custom_colors,
+           save='_custom_colors.pdf')
+
+# Continuous color maps
+sc.pl.umap(adata, color='CD3D', cmap='viridis', save='_viridis.pdf')
+sc.pl.umap(adata, color='CD3D', cmap='RdBu_r', save='_rdbu.pdf')
+```
+
+### Remove Axes and Frames
+
+```python
+# Clean plot without axes
+sc.pl.umap(adata, color='leiden', frameon=False,
+           save='_clean.pdf')
+
+# No legend
+sc.pl.umap(adata, color='leiden', legend_loc=None,
+           save='_no_legend.pdf')
+```
+
+## Exporting Plots
+
+### Save Individual Plots
+
+```python
+# Automatic saving with save parameter
+sc.pl.umap(adata, color='leiden', save='_leiden.pdf')
+# Saves to: sc.settings.figdir + 'umap_leiden.pdf'
+
+# Manual saving
+import matplotlib.pyplot as plt
+fig = sc.pl.umap(adata, color='leiden', show=False, return_fig=True)
+fig.savefig('figures/my_umap.pdf', dpi=300, bbox_inches='tight')
+```
+
+### Batch Export
+
+```python
+# Save multiple versions
+for gene in ['CD3D', 'CD14', 'MS4A1']:
+    sc.pl.umap(adata, color=gene, save=f'_{gene}.pdf')
+```
+
+## Common Customization Parameters
+
+### Layout Parameters
+- `figsize`: Figure size (width, height)
+- `frameon`: Show frame around plot
+- `title`: Plot title
+- `legend_loc`: 'right margin', 'on data', 'best', or None
+- `legend_fontsize`: Font size for legend
+- `size`: Point size
+
+### Color Parameters
+- `color`: Variable(s) to color by
+- `palette`: Color palette (e.g., 'Set1', 'viridis')
+- `cmap`: Colormap for continuous variables
+- `vmin`, `vmax`: Color scale limits
+- `use_raw`: Plot expression values from `adata.raw` instead of `adata.X`
+
+### Saving Parameters
+- `save`: Filename suffix for saving
+- `show`: Whether to display plot
+- `dpi`: Resolution for raster formats
+
+## Tips for Publication Figures
+
+1. **Use vector formats**: PDF or SVG for scalable graphics
+2. **High DPI**: Set dpi=300 or higher for raster images
+3. **Consistent styling**: Use the same color palette across figures
+4. **Clear labels**: Ensure gene names and cell types are readable
+5. **White background**: Use `facecolor='white'` for publications
+6. **Remove clutter**: Set `frameon=False` for cleaner appearance
+7. **Legend placement**: Use 'on data' for compact figures
+8. **Color blind friendly**: Consider a palette like 'Set2', or pass a colorblind-safe list of colors (e.g., seaborn's `sns.color_palette('colorblind')`)
--- a/skills/scanpy/references/standard_workflow.md
+++ b/skills/scanpy/references/standard_workflow.md
@@ -0,0 +1,206 @@
+# Standard Scanpy Workflow for Single-Cell Analysis
+
+This document outlines the standard workflow for analyzing single-cell RNA-seq data using scanpy.
+
+## Complete Analysis Pipeline
+
+### 1. Data Loading and Initial Setup
+
+```python
+import scanpy as sc
+import pandas as pd
+import numpy as np
+
+# Configure scanpy settings
+sc.settings.verbosity = 3  # verbosity: errors (0), warnings (1), info (2), hints (3)
+sc.settings.set_figure_params(dpi=80, facecolor='white')
+
+# Load data (various formats)
+adata = sc.read_10x_mtx('path/to/data/')  # For 10X data
+# adata = sc.read_h5ad('path/to/data.h5ad')  # For h5ad format
+# adata = sc.read_csv('path/to/data.csv')  # For CSV format
+```
+
+### 2. Quality Control (QC)
+
+```python
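+# Flag mitochondrial genes (human 'MT-' prefix; use 'mt-' for mouse), then calculate QC metrics
+adata.var['mt'] = adata.var_names.str.startswith('MT-')
+sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None, log1p=False, inplace=True)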
+
+# Common filtering thresholds (adjust based on dataset)
+sc.pp.filter_cells(adata, min_genes=200)
+sc.pp.filter_genes(adata, min_cells=3)
+
+# Remove cells with high mitochondrial content
+adata = adata[adata.obs.pct_counts_mt < 5, :]
+
+# Visualize QC metrics
+sc.pl.violin(adata, ['n_genes_by_counts', 'total_counts', 'pct_counts_mt'],
+             jitter=0.4, multi_panel=True)
+sc.pl.scatter(adata, x='total_counts', y='pct_counts_mt')
+sc.pl.scatter(adata, x='total_counts', y='n_genes_by_counts')
+```
+
+### 3. Normalization
+
+```python
+# Normalize to 10,000 counts per cell
+sc.pp.normalize_total(adata, target_sum=1e4)
+
+# Log-transform the data
+sc.pp.log1p(adata)
+
+# Store normalized data in raw for later use
+adata.raw = adata
+```
+
+### 4. Feature Selection
+
+```python
+# Identify highly variable genes
+sc.pp.highly_variable_genes(adata, min_mean=0.0125, max_mean=3, min_disp=0.5)
+
+# Visualize highly variable genes
+sc.pl.highly_variable_genes(adata)
+
+# Subset to highly variable genes
+adata = adata[:, adata.var.highly_variable]
+```
+
+### 5. Scaling and Regression
+
+```python
+# Regress out effects of total counts per cell and percent mitochondrial genes
+sc.pp.regress_out(adata, ['total_counts', 'pct_counts_mt'])
+
+# Scale data to unit variance and zero mean
+sc.pp.scale(adata, max_value=10)
+```
+
+### 6. Dimensionality Reduction
+
+```python
+# Principal Component Analysis (PCA)
+sc.tl.pca(adata, svd_solver='arpack')
+
+# Visualize PCA results
+sc.pl.pca(adata, color='CST3')
+sc.pl.pca_variance_ratio(adata, log=True)
+
+# Computing neighborhood graph
+sc.pp.neighbors(adata, n_neighbors=10, n_pcs=40)
+
+# UMAP for visualization
+sc.tl.umap(adata)
+
+# t-SNE (alternative to UMAP)
+# sc.tl.tsne(adata)
+```
+
+### 7. Clustering
+
+```python
+# Leiden clustering (recommended)
+sc.tl.leiden(adata, resolution=0.5)
+
+# Alternative: Louvain clustering
+# sc.tl.louvain(adata, resolution=0.5)
+
+# Visualize clustering results
+sc.pl.umap(adata, color=['leiden'], legend_loc='on data')
+```
+
+### 8. Marker Gene Identification
+
+```python
+# Find marker genes for each cluster
+sc.tl.rank_genes_groups(adata, 'leiden', method='wilcoxon')
+
+# Visualize top marker genes
+sc.pl.rank_genes_groups(adata, n_genes=25, sharey=False)
+
+# Get marker gene dataframe
+marker_genes = sc.get.rank_genes_groups_df(adata, group='0')
+
+# Visualize specific markers
+sc.pl.umap(adata, color=['leiden', 'CST3', 'NKG7'])
+```
+
+### 9. Cell Type Annotation
+
+```python
+# Manual annotation based on marker genes
+cluster_annotations = {
+    '0': 'CD4 T cells',
+    '1': 'CD14+ Monocytes',
+    '2': 'B cells',
+    '3': 'CD8 T cells',
+    # ... add more annotations
+}
+adata.obs['cell_type'] = adata.obs['leiden'].map(cluster_annotations)
+
+# Visualize annotated cell types
+sc.pl.umap(adata, color='cell_type', legend_loc='on data')
+```
+
+### 10. Saving Results
+
+```python
+# Save the processed AnnData object
+adata.write('results/processed_data.h5ad')
+
+# Export results to CSV
+adata.obs.to_csv('results/cell_metadata.csv')
+adata.var.to_csv('results/gene_metadata.csv')
+```
+
+## Additional Analysis Options
+
+### Trajectory Inference
+
+```python
+# PAGA (Partition-based graph abstraction)
+sc.tl.paga(adata, groups='leiden')
+sc.pl.paga(adata, color=['leiden'])
+
+# Diffusion pseudotime (DPT); needs a diffusion map and a root cell
+sc.tl.diffmap(adata)
+adata.uns['iroot'] = np.flatnonzero(adata.obs['leiden'] == '0')[0]
+sc.tl.dpt(adata)
+sc.pl.umap(adata, color=['dpt_pseudotime'])
+```
+
+### Differential Expression Between Conditions
+
+```python
+# Compare conditions within a cell type
+sc.tl.rank_genes_groups(adata, groupby='condition', groups=['treated'],
+                         reference='control', method='wilcoxon')
+sc.pl.rank_genes_groups(adata, groups=['treated'])
+```
+
+### Gene Set Scoring
+
+```python
+# Score cells for gene set expression
+gene_set = ['CD3D', 'CD3E', 'CD3G']
+sc.tl.score_genes(adata, gene_set, score_name='T_cell_score')
+sc.pl.umap(adata, color='T_cell_score')
+```
+
+## Common Parameters to Adjust
+
+- **QC thresholds**: `min_genes`, `min_cells`, `pct_counts_mt` - depends on dataset quality
+- **Normalization target**: Usually 1e4, but can be adjusted
+- **HVG parameters**: Affects feature selection stringency
+- **PCA components**: Check variance ratio plot to determine optimal number
+- **Clustering resolution**: Higher values give more clusters (typically 0.4-1.2)
+- **n_neighbors**: Affects granularity of UMAP and clustering (typically 10-30)
+
+## Best Practices
+
+1. Always visualize QC metrics before filtering
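+2. Keep the raw counts before normalization (e.g., `adata.layers['counts'] = adata.X.copy()`), and store the normalized, log-transformed data in `adata.raw` before subsetting to highly variable genes (`adata.raw = adata`)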
+3. Use Leiden instead of Louvain for clustering (more efficient)
+4. Try multiple clustering resolutions to find optimal granularity
+5. Validate cell type annotations with known marker genes
+6. Save intermediate results at key steps
--- a/skills/scanpy/scripts/qc_analysis.py
+++ b/skills/scanpy/scripts/qc_analysis.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""
+Quality Control Analysis Script for Scanpy
+
+Performs comprehensive quality control on single-cell RNA-seq data,
+including calculating metrics, generating QC plots, and filtering cells.
+
+Usage:
+    python qc_analysis.py <input_file> [--output <output_file>]
+"""
+
+import argparse
+import scanpy as sc
+import matplotlib.pyplot as plt
+
+
+def calculate_qc_metrics(adata, mt_threshold=5, min_genes=200, min_cells=3):
+    """
+    Flag mitochondrial genes and compute QC metrics (no filtering is done).
+
+    Adds a boolean ``adata.var['mt']`` column, runs
+    ``sc.pp.calculate_qc_metrics`` in place with ``qc_vars=['mt']`` and prints
+    a short summary. Filtering is performed separately by ``filter_data``.
+
+    Parameters:
+    -----------
+    adata : AnnData
+        Annotated data matrix; modified in place
+    mt_threshold : float
+        Not used by this function; kept so existing callers keep working
+        (default: 5)
+    min_genes : int
+        Not used by this function; kept so existing callers keep working
+        (default: 200)
+    min_cells : int
+        Not used by this function; kept so existing callers keep working
+        (default: 3)
+
+    Returns:
+    --------
+    AnnData
+        The same ``adata`` object that was passed in, with QC metrics added
+    """
+    # Identify mitochondrial genes by their 'MT-' prefix, ignoring case so
+    # that 'MT-', 'mt-' and 'Mt-' spellings are all matched with a single
+    # plain-string pattern (assumes gene names follow that convention).
+    adata.var['mt'] = adata.var_names.str.upper().str.startswith('MT-')
+
+    # Calculate QC metrics; the summary below reads the 'n_genes_by_counts',
+    # 'total_counts' and 'pct_counts_mt' columns this call leaves in adata.obs
+    sc.pp.calculate_qc_metrics(adata, qc_vars=['mt'], percent_top=None,
+                                log1p=False, inplace=True)
+
+    print("\n=== QC Metrics Summary ===")
+    print(f"Total cells: {adata.n_obs}")
+    print(f"Total genes: {adata.n_vars}")
+    print(f"Mean genes per cell: {adata.obs['n_genes_by_counts'].mean():.2f}")
+    print(f"Mean counts per cell: {adata.obs['total_counts'].mean():.2f}")
+    print(f"Mean mitochondrial %: {adata.obs['pct_counts_mt'].mean():.2f}")
+
+    return adata
+
+
+def generate_qc_plots(adata, output_prefix='qc'):
+    """
+    Save the standard QC figures (violin, scatter, top-expressed genes).
+
+    Parameters:
+    -----------
+    adata : AnnData
+        Annotated data matrix; must already carry the 'n_genes_by_counts',
+        'total_counts' and 'pct_counts_mt' columns in ``adata.obs``
+    output_prefix : str
+        Tag inserted into every saved figure's file name
+    """
+    import os
+
+    # Make sure the output directory is present before any figure is saved
+    os.makedirs('figures', exist_ok=True)
+
+    # Distribution of each QC metric across cells
+    qc_columns = ['n_genes_by_counts', 'total_counts', 'pct_counts_mt']
+    sc.pl.violin(adata, qc_columns, jitter=0.4, multi_panel=True,
+                 save=f'_{output_prefix}_violin.pdf')
+
+    # total_counts plotted against each of the other two metrics
+    scatter_targets = {
+        'pct_counts_mt': f'_{output_prefix}_mt_scatter.pdf',
+        'n_genes_by_counts': f'_{output_prefix}_genes_scatter.pdf',
+    }
+    for y_column, figure_name in scatter_targets.items():
+        sc.pl.scatter(adata, x='total_counts', y=y_column, save=figure_name)
+
+    # Top 20 genes by expression
+    top_genes_figure = f'_{output_prefix}_highest_expr.pdf'
+    sc.pl.highest_expr_genes(adata, n_top=20, save=top_genes_figure)
+
+    print(f"\nQC plots saved to figures/ directory with prefix '{output_prefix}'")
+
+
+def filter_data(adata, mt_threshold=5, min_genes=200, max_genes=None,
+                min_counts=None, max_counts=None, min_cells=3):
+    """
+    Filter cells and genes based on QC thresholds.
+
+    Expects ``adata.obs`` to already contain 'n_genes_by_counts',
+    'total_counts' and 'pct_counts_mt'. The ``min_genes`` filter is applied
+    to the passed object in place; the remaining cell filters are combined
+    into one mask and the result is returned as an independent copy.
+
+    Parameters:
+    -----------
+    adata : AnnData
+        Annotated data matrix
+    mt_threshold : float
+        Cells are kept only if their mitochondrial percentage is strictly
+        below this value
+    min_genes : int
+        Minimum number of genes per cell
+    max_genes : int, optional
+        Exclusive upper bound on genes per cell (None disables the filter)
+    min_counts : int, optional
+        Inclusive lower bound on counts per cell (None disables the filter)
+    max_counts : int, optional
+        Exclusive upper bound on counts per cell (None disables the filter)
+    min_cells : int
+        Minimum number of cells per gene
+
+    Returns:
+    --------
+    AnnData
+        Filtered annotated data matrix
+    """
+    n_cells_before = adata.n_obs
+    n_genes_before = adata.n_vars
+
+    # Filter cells by minimum gene count first (in place on the input)
+    sc.pp.filter_cells(adata, min_genes=min_genes)
+
+    # Build a single boolean mask for all remaining per-cell thresholds.
+    # Optional bounds are tested with `is not None` so an explicit 0 is
+    # honoured instead of being silently treated as "not set".
+    keep = adata.obs['pct_counts_mt'] < mt_threshold
+    if max_genes is not None:
+        keep &= adata.obs['n_genes_by_counts'] < max_genes
+    if min_counts is not None:
+        keep &= adata.obs['total_counts'] >= min_counts
+    if max_counts is not None:
+        keep &= adata.obs['total_counts'] < max_counts
+
+    # Copy once so the in-place gene filter below works on real data
+    # rather than on a view of the input object
+    adata = adata[keep.to_numpy(), :].copy()
+
+    # Filter genes
+    sc.pp.filter_genes(adata, min_cells=min_cells)
+
+    # Guard the percentages against empty input (no cells or no genes)
+    cells_pct = adata.n_obs / n_cells_before * 100 if n_cells_before else 0.0
+    genes_pct = adata.n_vars / n_genes_before * 100 if n_genes_before else 0.0
+
+    print("\n=== Filtering Results ===")
+    print(f"Cells: {n_cells_before} -> {adata.n_obs} ({cells_pct:.1f}% retained)")
+    print(f"Genes: {n_genes_before} -> {adata.n_vars} ({genes_pct:.1f}% retained)")
+
+    return adata
+
+
+def main():
+    """
+    Command-line entry point.
+
+    Parses arguments, loads the input, computes QC metrics, optionally saves
+    QC plots before and after filtering, filters the data and writes the
+    result to the ``--output`` h5ad file.
+    """
+    parser = argparse.ArgumentParser(description='QC analysis for single-cell data')
+    parser.add_argument('input', help='Input file (h5ad, 10X mtx, csv, etc.)')
+    parser.add_argument('--output', default='qc_filtered.h5ad',
+                        help='Output file name (default: qc_filtered.h5ad)')
+    parser.add_argument('--mt-threshold', type=float, default=5,
+                        help='Max mitochondrial percentage (default: 5)')
+    parser.add_argument('--min-genes', type=int, default=200,
+                        help='Min genes per cell (default: 200)')
+    parser.add_argument('--min-cells', type=int, default=3,
+                        help='Min cells per gene (default: 3)')
+    parser.add_argument('--skip-plots', action='store_true',
+                        help='Skip generating QC plots')
+
+    args = parser.parse_args()
+
+    # Configure scanpy
+    sc.settings.verbosity = 2
+    sc.settings.set_figure_params(dpi=300, facecolor='white')
+    sc.settings.figdir = './figures/'
+
+    print(f"Loading data from: {args.input}")
+
+    # Load data based on file extension, matched case-insensitively so that
+    # e.g. 'DATA.H5AD' is not mistaken for a 10X directory. '.h5ad' must be
+    # tested before '.h5'.
+    input_lower = args.input.lower()
+    if input_lower.endswith('.h5ad'):
+        adata = sc.read_h5ad(args.input)
+    elif input_lower.endswith('.h5'):
+        adata = sc.read_10x_h5(args.input)
+    elif input_lower.endswith('.csv'):
+        adata = sc.read_csv(args.input)
+    else:
+        # Try reading as 10X mtx directory
+        adata = sc.read_10x_mtx(args.input)
+
+    # Not every reader de-duplicates gene names; this is a no-op when the
+    # names are already unique
+    adata.var_names_make_unique()
+
+    print(f"Loaded data: {adata.n_obs} cells x {adata.n_vars} genes")
+
+    # Calculate QC metrics
+    adata = calculate_qc_metrics(adata, mt_threshold=args.mt_threshold,
+                                  min_genes=args.min_genes, min_cells=args.min_cells)
+
+    # Generate QC plots (before filtering)
+    if not args.skip_plots:
+        print("\nGenerating QC plots (before filtering)...")
+        generate_qc_plots(adata, output_prefix='qc_before')
+
+    # Filter data
+    adata = filter_data(adata, mt_threshold=args.mt_threshold,
+                        min_genes=args.min_genes, min_cells=args.min_cells)
+
+    # Generate QC plots (after filtering)
+    if not args.skip_plots:
+        print("\nGenerating QC plots (after filtering)...")
+        generate_qc_plots(adata, output_prefix='qc_after')
+
+    # Save filtered data
+    print(f"\nSaving filtered data to: {args.output}")
+    adata.write_h5ad(args.output)
+
+    print("\n=== QC Analysis Complete ===")
+
+
+# Run the command-line workflow only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()