# Best Practices
Guidelines for efficient and effective use of AnnData.
## Memory Management
### Use sparse matrices for sparse data
```python
import numpy as np
from scipy.sparse import csr_matrix
import anndata as ad
# Simulate sparse data: zero out ~90% of the entries, then measure sparsity
data = np.random.rand(1000, 2000)
data[data < 0.9] = 0
sparsity = 1 - np.count_nonzero(data) / data.size
print(f"Sparsity: {sparsity:.2%}")
# Convert to sparse if >50% zeros
if sparsity > 0.5:
adata = ad.AnnData(X=csr_matrix(data))
else:
adata = ad.AnnData(X=data)
# Benefits: 10-100x memory reduction for sparse genomics data
```
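The savings come from storing only the non-zero values plus their indices; a quick comparison on the ~90%-sparse `data` array from the sketch above:

```python
# Compare the dense footprint with the CSR components (values, indices, indptr)
sparse = csr_matrix(data)
dense_bytes = data.nbytes
sparse_bytes = sparse.data.nbytes + sparse.indices.nbytes + sparse.indptr.nbytes
print(f"Dense:  {dense_bytes / 1e6:.1f} MB")
print(f"Sparse: {sparse_bytes / 1e6:.1f} MB ({dense_bytes / sparse_bytes:.1f}x smaller)")
```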
### Convert strings to categoricals
```python
# Inefficient: string columns use lots of memory
adata.obs['cell_type'] = ['Type_A', 'Type_B', 'Type_C'] * 333 + ['Type_A']
# Efficient: convert to categorical
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')
# Convert all string columns
adata.strings_to_categoricals()
# Benefits: 10-50x memory reduction for repeated strings
```
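A quick way to quantify the difference is pandas' `memory_usage(deep=True)`; a minimal sketch on the `cell_type` column above:

```python
# Compare the column stored as plain strings vs. as a categorical
as_str = adata.obs['cell_type'].astype(str)
as_cat = adata.obs['cell_type'].astype('category')
print(f"strings:     {as_str.memory_usage(deep=True) / 1e3:.1f} KB")
print(f"categorical: {as_cat.memory_usage(deep=True) / 1e3:.1f} KB")
```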
### Use backed mode for large datasets
```python
# Don't load entire dataset into memory
adata = ad.read_h5ad('large_dataset.h5ad', backed='r')
# Work with metadata
filtered = adata[adata.obs['quality'] > 0.8]
# Load only filtered subset
adata_subset = filtered.to_memory()
# Benefits: Work with datasets larger than RAM
```
## Views vs Copies
### Understanding views
```python
# Subsetting creates a view by default
subset = adata[0:100, :]
print(subset.is_view) # True
# Views don't copy data (memory efficient)
# Modifying a view triggers an implicit copy (copy-on-write) rather than editing the original
# Check if object is a view
if adata.is_view:
adata = adata.copy() # Make independent
```
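Recent anndata versions use copy-on-write for views: the first write converts the view into an actual copy and emits an `ImplicitModificationWarning`. A quick check, assuming a recent anndata release:

```python
import warnings
view = adata[:100, :]
print(view.is_view)                 # True
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    view.obs['flagged'] = True      # first write: view is materialized as a copy
print(view.is_view)                 # False
print([str(w.message) for w in caught])  # implicit-modification warning(s)
```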
### When to use views
```python
# Good: Read-only operations on subsets
mean_expr = adata[adata.obs['cell_type'] == 'T cell'].X.mean()
# Good: Temporary analysis
temp_subset = adata[:100, :]
result = analyze(temp_subset.X)
```
### When to use copies
```python
# Create independent copy for modifications
adata_filtered = adata[keep_cells, :].copy()
# Safe to modify without affecting original
adata_filtered.obs['new_column'] = values
# Always copy when:
# - Storing subset for later use
# - Modifying subset data
# - Passing to function that modifies data
```
## Data Storage Best Practices
### Choose the right format
**H5AD (HDF5) - Default choice**
```python
adata.write_h5ad('data.h5ad', compression='gzip')
```
- Fast random access
- Supports backed mode
- Good compression
- Best for: Most use cases
**Zarr - Cloud and parallel access**
```python
adata.write_zarr('data.zarr', chunks=(100, 100))
```
- Excellent for cloud storage (S3, GCS)
- Supports parallel I/O
- Good compression
- Best for: Large datasets, cloud workflows, parallel processing
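Reading a store back is symmetric to H5AD; a minimal sketch using the `data.zarr` path written above:

```python
# Load the Zarr store written above back into memory
adata_back = ad.read_zarr('data.zarr')
print(adata_back)
```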
**CSV - Interoperability**
```python
adata.write_csvs('output_dir/')
```
- Human readable
- Compatible with all tools
- Large file sizes, slow
- Best for: Sharing with non-Python tools, small datasets
### Optimize file size
```python
# Before saving, optimize:
# 1. Convert to sparse if appropriate
from scipy.sparse import csr_matrix, issparse
if not issparse(adata.X):
density = np.count_nonzero(adata.X) / adata.X.size
if density < 0.5:
adata.X = csr_matrix(adata.X)
# 2. Convert strings to categoricals
adata.strings_to_categoricals()
# 3. Use compression
adata.write_h5ad('data.h5ad', compression='gzip', compression_opts=9)
# Typical results: 5-20x file size reduction
```
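To see the effect on your own data, compare sizes on disk; a rough sketch measuring compression alone (file names are placeholders):

```python
import os
# Write once without compression and once with gzip, then compare on-disk size
adata.write_h5ad('uncompressed.h5ad')
adata.write_h5ad('compressed.h5ad', compression='gzip', compression_opts=9)
raw = os.path.getsize('uncompressed.h5ad')
opt = os.path.getsize('compressed.h5ad')
print(f"{raw / 1e6:.1f} MB -> {opt / 1e6:.1f} MB ({raw / opt:.1f}x smaller)")
```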
## Backed Mode Strategies
### Read-only analysis
```python
# Open in read-only backed mode
adata = ad.read_h5ad('data.h5ad', backed='r')
# Perform filtering without loading data
high_quality = adata[adata.obs['quality_score'] > 0.8]
# Load only filtered data
adata_filtered = high_quality.to_memory()
```
### Read-write modifications
```python
# Open in read-write backed mode
adata = ad.read_h5ad('data.h5ad', backed='r+')
# Metadata (obs, var, uns) is loaded in memory; edit it as usual
adata.obs['new_annotation'] = values
# X stays on disk; call write() to persist metadata changes to the backing file
adata.write()
```
### Chunked processing
```python
# Process large dataset in chunks
adata = ad.read_h5ad('huge_dataset.h5ad', backed='r')
results = []
chunk_size = 1000
for i in range(0, adata.n_obs, chunk_size):
chunk = adata[i:i+chunk_size, :].to_memory()
result = process(chunk)
results.append(result)
final_result = combine(results)
```
## Performance Optimization
### Subsetting performance
```python
# Fast: Boolean indexing with arrays
mask = np.array(adata.obs['quality'] > 0.5)
subset = adata[mask, :]
# Slower: Boolean indexing with a pandas Series (converted internally, extra overhead)
subset = adata[adata.obs['quality'] > 0.5, :]
# Fastest: Integer indices
indices = np.where(adata.obs['quality'] > 0.5)[0]
subset = adata[indices, :]
```
### Avoid repeated subsetting
```python
# Inefficient: Multiple subset operations
for cell_type in ['A', 'B', 'C']:
subset = adata[adata.obs['cell_type'] == cell_type]
process(subset)
# Efficient: Group and process
groups = adata.obs.groupby('cell_type').groups
for cell_type, indices in groups.items():
subset = adata[indices, :]
process(subset)
```
### Use chunked operations for large matrices
```python
# Process X in row chunks; chunked_X yields (chunk, start, end) tuples
for chunk, start, end in adata.chunked_X(chunk_size=1000):
    result = compute(chunk)
# More memory efficient than loading full X
```
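For example, per-gene totals can be accumulated one chunk at a time without holding the full matrix in memory; a minimal sketch that handles both dense and sparse chunks:

```python
import numpy as np
# Accumulate column (per-gene) sums chunk by chunk
gene_totals = np.zeros(adata.n_vars)
for chunk, start, end in adata.chunked_X(chunk_size=1000):
    # chunk.sum(axis=0) is a 1 x n_vars matrix for sparse X; flatten either way
    gene_totals += np.asarray(chunk.sum(axis=0)).ravel()
```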
## Working with Raw Data
### Store raw before filtering
```python
# Original data with all genes
adata = ad.AnnData(X=counts)
# Store raw before filtering
adata.raw = adata.copy()
# Filter to highly variable genes
adata = adata[:, adata.var['highly_variable']].copy()
# Later: access original data
original_expression = adata.raw.X
all_genes = adata.raw.var_names
```
### When to use raw
```python
# Use raw for:
# - Differential expression on filtered genes
# - Visualization of specific genes not in filtered set
# - Accessing original counts after normalization
# Access raw data
if adata.raw is not None:
gene_expr = adata.raw[:, 'GENE_NAME'].X
else:
gene_expr = adata[:, 'GENE_NAME'].X
```
## Metadata Management
### Naming conventions
```python
# Consistent naming improves usability
# Observation metadata (obs):
# - cell_id, sample_id
# - cell_type, tissue, condition
# - n_genes, n_counts, percent_mito
# - cluster, leiden, louvain
# Variable metadata (var):
# - gene_id, gene_name
# - highly_variable, n_cells
# - mean_expression, dispersion
# Embeddings (obsm):
# - X_pca, X_umap, X_tsne
# - X_diffmap, X_draw_graph_fr
# Follow conventions from scanpy/scverse ecosystem
```
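Applying the conventions is mostly a matter of renaming existing fields; a small sketch (the old column names and the `pca_coordinates` array are hypothetical):

```python
# Rename non-standard obs columns to the conventional names
adata.obs = adata.obs.rename(columns={'CellType': 'cell_type', 'nGenes': 'n_genes'})
# Store embeddings under conventional X_* keys in obsm
adata.obsm['X_pca'] = pca_coordinates  # hypothetical (n_obs, n_comps) array
```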
### Document metadata
```python
# Store metadata descriptions in uns
adata.uns['metadata_descriptions'] = {
'cell_type': 'Cell type annotation from automated clustering',
'quality_score': 'QC score from scrublet (0-1, higher is better)',
'batch': 'Experimental batch identifier'
}
# Store processing history
adata.uns['processing_steps'] = [
'Raw counts loaded from 10X',
'Filtered: n_genes > 200, n_counts < 50000',
'Normalized to 10000 counts per cell',
'Log transformed'
]
```
## Reproducibility
### Set random seeds
```python
import numpy as np
# Set seed for reproducible results
np.random.seed(42)
# Document in uns
adata.uns['random_seed'] = 42
```
### Store parameters
```python
# Store analysis parameters in uns
adata.uns['pca'] = {
'n_comps': 50,
'svd_solver': 'arpack',
'random_state': 42
}
adata.uns['neighbors'] = {
'n_neighbors': 15,
'n_pcs': 50,
'metric': 'euclidean',
'method': 'umap'
}
```
### Version tracking
```python
import anndata
import scanpy
import numpy
import sys
# Store versions
adata.uns['versions'] = {
'anndata': anndata.__version__,
'scanpy': scanpy.__version__,
'numpy': numpy.__version__,
'python': sys.version
}
```
## Error Handling
### Check data validity
```python
from scipy.sparse import issparse
# Verify dimensions
assert adata.n_obs == len(adata.obs)
assert adata.n_vars == len(adata.var)
assert adata.X.shape == (adata.n_obs, adata.n_vars)
# Check for NaN values
has_nan = np.isnan(adata.X.data).any() if issparse(adata.X) else np.isnan(adata.X).any()
if has_nan:
print("Warning: Data contains NaN values")
# Check for negative values (if counts expected)
has_negative = (adata.X.data < 0).any() if issparse(adata.X) else (adata.X < 0).any()
if has_negative:
print("Warning: Data contains negative values")
```
### Validate metadata
```python
# Check for missing values
missing_obs = adata.obs.isnull().sum()
if missing_obs.any():
print("Missing values in obs:")
print(missing_obs[missing_obs > 0])
# Verify indices are unique
assert adata.obs_names.is_unique, "Observation names not unique"
assert adata.var_names.is_unique, "Variable names not unique"
# Check metadata alignment
assert len(adata.obs) == adata.n_obs
assert len(adata.var) == adata.n_vars
```
## Integration with Other Tools
### Scanpy integration
```python
import scanpy as sc
# AnnData is native format for scanpy
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
```
### Pandas integration
```python
import pandas as pd
# Convert to DataFrame
df = adata.to_df()
# Create from DataFrame
adata = ad.AnnData(df)
# Work with metadata as DataFrames (left join on index keeps obs aligned)
adata.obs = adata.obs.join(external_metadata)
```
### PyTorch integration
```python
from anndata.experimental import AnnLoader
# Create PyTorch DataLoader
dataloader = AnnLoader(adata, batch_size=128, shuffle=True)
# Iterate in training loop
for batch in dataloader:
X = batch.X
# Train model on batch
```
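A slightly fuller sketch, assuming `batch.X` arrives as a dense float tensor under AnnLoader's default converters; the toy reconstruction objective avoids needing labels:

```python
import torch
from anndata.experimental import AnnLoader

dataloader = AnnLoader(adata, batch_size=128, shuffle=True)
model = torch.nn.Sequential(
    torch.nn.Linear(adata.n_vars, 64),
    torch.nn.ReLU(),
    torch.nn.Linear(64, adata.n_vars),
)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
for batch in dataloader:
    X = batch.X.float()                            # dense float tensor for the toy model
    optimizer.zero_grad()
    loss = torch.nn.functional.mse_loss(model(X), X)  # reconstruct the input
    loss.backward()
    optimizer.step()
```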
## Common Pitfalls
### Pitfall 1: Modifying views
```python
# Wrong: Relying on a view for modifications; writing to it triggers an implicit copy
subset = adata[:100, :]
subset.X = new_data  # Raises ImplicitModificationWarning and silently materializes a copy
# Correct: Copy explicitly before modifying
subset = adata[:100, :].copy()
subset.X = new_data  # Clearly an independent copy
```
### Pitfall 2: Index misalignment
```python
# Wrong: Assuming order matches
external_data = pd.read_csv('data.csv')
adata.obs['new_col'] = external_data['values'] # May misalign!
# Correct: Align on index
adata.obs['new_col'] = external_data.set_index('cell_id').loc[adata.obs_names, 'values']
```
### Pitfall 3: Mixing sparse and dense
```python
# Wrong: Operations that break sparsity produce a dense result and use huge memory
centered = adata.X - adata.X.mean(axis=0)  # Densifies a sparse X!
# Correct: Use sparse-aware operations
from scipy.sparse import issparse
if issparse(adata.X):
    result = adata.X.copy()
    result.data *= 2.0  # Scales only the stored non-zero values, stays sparse
### Pitfall 4: Not handling views
```python
# Wrong: Assuming subset is independent
subset = adata[mask, :]
del adata  # subset still references the original object's data
# Correct: Copy when independence is needed
subset = adata[mask, :].copy()
del adata  # subset is fully independent
```
### Pitfall 5: Ignoring memory constraints
```python
# Wrong: Loading huge dataset into memory
adata = ad.read_h5ad('100GB_file.h5ad') # OOM error!
# Correct: Use backed mode
adata = ad.read_h5ad('100GB_file.h5ad', backed='r')
subset = adata[adata.obs['keep']].to_memory()
```
## Workflow Example
Complete best-practices workflow:
```python
import anndata as ad
import numpy as np
from scipy.sparse import csr_matrix, issparse
# 1. Load with backed mode if large
adata = ad.read_h5ad('data.h5ad', backed='r')
# 2. Quick metadata check without loading data
print(f"Dataset: {adata.n_obs} cells × {adata.n_vars} genes")
# 3. Filter based on metadata
high_quality = adata[adata.obs['quality_score'] > 0.8]
# 4. Load filtered subset to memory
adata = high_quality.to_memory()
# 5. Convert to optimal storage types
adata.strings_to_categoricals()
if not issparse(adata.X):
density = np.count_nonzero(adata.X) / adata.X.size
if density < 0.5:
adata.X = csr_matrix(adata.X)
# 6. Store raw before filtering genes
adata.raw = adata.copy()
# 7. Filter to highly variable genes
adata = adata[:, adata.var['highly_variable']].copy()
# 8. Document processing
adata.uns['processing'] = {
'filtered': 'quality_score > 0.8',
'n_hvg': adata.n_vars,
'date': '2025-11-03'
}
# 9. Save optimized
adata.write_h5ad('processed.h5ad', compression='gzip')
```