# Best Practices

Guidelines for efficient and effective use of AnnData.

## Memory Management

### Use sparse matrices for sparse data

```python
import numpy as np
from scipy.sparse import csr_matrix
import anndata as ad

# Check data sparsity
data = np.random.rand(1000, 2000)
data[data < 0.9] = 0  # zero out ~90% of entries to mimic sparse genomics data
sparsity = 1 - np.count_nonzero(data) / data.size
print(f"Sparsity: {sparsity:.2%}")

# Convert to sparse if >50% zeros
if sparsity > 0.5:
    adata = ad.AnnData(X=csr_matrix(data))
else:
    adata = ad.AnnData(X=data)

# Benefits: 10-100x memory reduction for sparse genomics data
```

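To see the effect directly, you can compare the dense and CSR footprints of the array above; a minimal sketch (CSR stores three arrays: `data`, `indices`, and `indptr`, so the exact savings depend on your sparsity):

```python
sparse = csr_matrix(data)
dense_mb = data.nbytes / 1e6
sparse_mb = (sparse.data.nbytes + sparse.indices.nbytes + sparse.indptr.nbytes) / 1e6
print(f"dense: {dense_mb:.1f} MB, sparse: {sparse_mb:.1f} MB")
```
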
### Convert strings to categoricals

```python
# Inefficient: string columns use lots of memory
adata.obs['cell_type'] = ['Type_A', 'Type_B', 'Type_C'] * 333 + ['Type_A']

# Efficient: convert to categorical
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

# Convert all string columns at once
adata.strings_to_categoricals()

# Benefits: 10-50x memory reduction for repeated strings
```

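To verify the savings on a column, pandas can report memory usage per dtype; a quick sketch using the example column above (exact numbers vary by platform):

```python
import pandas as pd

col = pd.Series(['Type_A', 'Type_B', 'Type_C'] * 333 + ['Type_A'])
print(col.memory_usage(deep=True))                     # object dtype: tens of KB
print(col.astype('category').memory_usage(deep=True))  # categorical: ~1-2 KB
```
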
### Use backed mode for large datasets

```python
# Don't load the entire dataset into memory
adata = ad.read_h5ad('large_dataset.h5ad', backed='r')

# Work with metadata
filtered = adata[adata.obs['quality'] > 0.8]

# Load only the filtered subset
adata_subset = filtered.to_memory()

# Benefits: Work with datasets larger than RAM
```

## Views vs Copies

### Understanding views

```python
# Subsetting creates a view by default
subset = adata[0:100, :]
print(subset.is_view)  # True

# Views don't copy data (memory efficient),
# but in-place modifications can affect the original

# Check if an object is a view
if adata.is_view:
    adata = adata.copy()  # Make independent
```

### When to use views

```python
# Good: Read-only operations on subsets
mean_expr = adata[adata.obs['cell_type'] == 'T cell'].X.mean()

# Good: Temporary analysis
temp_subset = adata[:100, :]
result = analyze(temp_subset.X)
```

### When to use copies

```python
# Create an independent copy for modifications
adata_filtered = adata[keep_cells, :].copy()

# Safe to modify without affecting the original
adata_filtered.obs['new_column'] = values

# Always copy when:
# - Storing a subset for later use
# - Modifying subset data
# - Passing to a function that modifies data
```

## Data Storage Best Practices

### Choose the right format

**H5AD (HDF5) - Default choice**
```python
adata.write_h5ad('data.h5ad', compression='gzip')
```
- Fast random access
- Supports backed mode
- Good compression
- Best for: Most use cases

**Zarr - Cloud and parallel access**
```python
adata.write_zarr('data.zarr', chunks=(100, 100))
```
- Excellent for cloud storage (S3, GCS)
- Supports parallel I/O
- Good compression
- Best for: Large datasets, cloud workflows, parallel processing

**CSV - Interoperability**
```python
adata.write_csvs('output_dir/')
```
- Human readable
- Compatible with all tools
- Large file sizes, slow
- Best for: Sharing with non-Python tools, small datasets

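Each binary format has a matching reader (note that `write_csvs()` splits the object across several `.csv` files, and anndata does not ship a single-call reader that reassembles them):

```python
adata = ad.read_h5ad('data.h5ad')
adata = ad.read_zarr('data.zarr')
```
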
### Optimize file size

```python
import numpy as np
from scipy.sparse import csr_matrix, issparse

# Before saving, optimize:

# 1. Convert to sparse if appropriate
if not issparse(adata.X):
    density = np.count_nonzero(adata.X) / adata.X.size
    if density < 0.5:
        adata.X = csr_matrix(adata.X)

# 2. Convert strings to categoricals
adata.strings_to_categoricals()

# 3. Use compression (compression_opts=9 is the highest gzip level)
adata.write_h5ad('data.h5ad', compression='gzip', compression_opts=9)

# Typical results: 5-20x file size reduction
```

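To confirm the effect, compare the file size on disk before and after optimizing; a minimal sketch:

```python
import os

print(f"{os.path.getsize('data.h5ad') / 1e6:.1f} MB")
```
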
## Backed Mode Strategies

### Read-only analysis

```python
# Open in read-only backed mode
adata = ad.read_h5ad('data.h5ad', backed='r')

# Perform filtering without loading data
high_quality = adata[adata.obs['quality_score'] > 0.8]

# Load only the filtered data
adata_filtered = high_quality.to_memory()
```

### Read-write modifications

```python
# Open in read-write backed mode
adata = ad.read_h5ad('data.h5ad', backed='r+')

# Modify metadata (kept in memory; X stays on disk)
adata.obs['new_annotation'] = values

# Persist metadata changes back to the backing file
adata.write()
```

### Chunked processing

```python
# Process a large dataset in chunks
adata = ad.read_h5ad('huge_dataset.h5ad', backed='r')

results = []
chunk_size = 1000

for i in range(0, adata.n_obs, chunk_size):
    chunk = adata[i:i+chunk_size, :].to_memory()
    result = process(chunk)
    results.append(result)

final_result = combine(results)
```

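As a concrete instance of the process/combine pattern above, per-gene means can be accumulated chunk by chunk; a sketch, assuming `X` holds numeric expression values:

```python
import numpy as np
from scipy.sparse import issparse

totals = np.zeros(adata.n_vars)
for i in range(0, adata.n_obs, chunk_size):
    chunk = adata[i:i + chunk_size, :].to_memory()
    X = chunk.X.toarray() if issparse(chunk.X) else chunk.X
    totals += X.sum(axis=0)           # per-chunk gene sums
gene_means = totals / adata.n_obs     # combined result
```
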
## Performance Optimization

### Subsetting performance

```python
# Fast: Boolean indexing with NumPy arrays
mask = np.array(adata.obs['quality'] > 0.5)
subset = adata[mask, :]

# Slower: Boolean indexing with a pandas Series
# (adds pandas index-alignment overhead)
subset = adata[adata.obs['quality'] > 0.5, :]

# Fastest: Integer indices
indices = np.where(adata.obs['quality'] > 0.5)[0]
subset = adata[indices, :]
```

### Avoid repeated subsetting

```python
# Inefficient: Scans the cell_type column once per group
for cell_type in ['A', 'B', 'C']:
    subset = adata[adata.obs['cell_type'] == cell_type]
    process(subset)

# Efficient: Group once, then subset by precomputed indices
groups = adata.obs.groupby('cell_type').groups
for cell_type, indices in groups.items():
    subset = adata[indices, :]
    process(subset)
```

### Use chunked operations for large matrices

```python
# Process X in chunks; chunked_X yields (chunk, start, end) tuples
for chunk, start, end in adata.chunked_X(chunk_size=1000):
    result = compute(chunk)

# More memory efficient than loading the full X
```

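The start/end offsets make it easy to scatter per-chunk results into a preallocated array; a sketch computing per-cell totals:

```python
import numpy as np
from scipy.sparse import issparse

counts_per_cell = np.zeros(adata.n_obs)
for chunk, start, end in adata.chunked_X(chunk_size=1000):
    X = chunk.toarray() if issparse(chunk) else chunk
    counts_per_cell[start:end] = X.sum(axis=1)
```
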
## Working with Raw Data

### Store raw before filtering

```python
# Original data with all genes
adata = ad.AnnData(X=counts)

# Store raw before filtering
adata.raw = adata.copy()

# Filter to highly variable genes
adata = adata[:, adata.var['highly_variable']]

# Later: access the original data
original_expression = adata.raw.X
all_genes = adata.raw.var_names
```

### When to use raw

```python
# Use raw for:
# - Differential expression on filtered genes
# - Visualization of specific genes not in the filtered set
# - Accessing original counts after normalization

# Access raw data
if adata.raw is not None:
    gene_expr = adata.raw[:, 'GENE_NAME'].X
else:
    gene_expr = adata[:, 'GENE_NAME'].X
```

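If you need a full AnnData over all genes again, the stored raw can be converted back:

```python
adata_full = adata.raw.to_adata()  # AnnData with the unfiltered genes and raw X
```
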
## Metadata Management

### Naming conventions

```python
# Consistent naming improves usability

# Observation metadata (obs):
# - cell_id, sample_id
# - cell_type, tissue, condition
# - n_genes, n_counts, percent_mito
# - cluster, leiden, louvain

# Variable metadata (var):
# - gene_id, gene_name
# - highly_variable, n_cells
# - mean_expression, dispersion

# Embeddings (obsm):
# - X_pca, X_umap, X_tsne
# - X_diffmap, X_draw_graph_fr

# Follow conventions from the scanpy/scverse ecosystem
```

### Document metadata

```python
# Store metadata descriptions in uns
adata.uns['metadata_descriptions'] = {
    'cell_type': 'Cell type annotation from automated clustering',
    'quality_score': 'Per-cell QC score (0-1, higher is better)',
    'batch': 'Experimental batch identifier'
}

# Store processing history
adata.uns['processing_steps'] = [
    'Raw counts loaded from 10X',
    'Filtered: n_genes > 200, n_counts < 50000',
    'Normalized to 10000 counts per cell',
    'Log transformed'
]
```

## Reproducibility

### Set random seeds

```python
import numpy as np

# Set seed for reproducible results
np.random.seed(42)

# Document in uns
adata.uns['random_seed'] = 42
```

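NumPy's newer Generator API is seeded the same way; a minimal sketch:

```python
rng = np.random.default_rng(42)
noise = rng.normal(size=(adata.n_obs, 10))  # reproducible draws
```
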
### Store parameters

```python
# Store analysis parameters in uns
adata.uns['pca'] = {
    'n_comps': 50,
    'svd_solver': 'arpack',
    'random_state': 42
}

adata.uns['neighbors'] = {
    'n_neighbors': 15,
    'n_pcs': 50,
    'metric': 'euclidean',
    'method': 'umap'
}
```

### Version tracking

```python
import sys

import anndata
import scanpy
import numpy

# Store versions
adata.uns['versions'] = {
    'anndata': anndata.__version__,
    'scanpy': scanpy.__version__,
    'numpy': numpy.__version__,
    'python': sys.version
}
```

## Error Handling

### Check data validity

```python
import numpy as np
from scipy.sparse import issparse

# Verify dimensions
assert adata.n_obs == len(adata.obs)
assert adata.n_vars == len(adata.var)
assert adata.X.shape == (adata.n_obs, adata.n_vars)

# Check for NaN values
has_nan = np.isnan(adata.X.data).any() if issparse(adata.X) else np.isnan(adata.X).any()
if has_nan:
    print("Warning: Data contains NaN values")

# Check for negative values (if counts expected)
has_negative = (adata.X.data < 0).any() if issparse(adata.X) else (adata.X < 0).any()
if has_negative:
    print("Warning: Data contains negative values")
```

### Validate metadata

```python
# Check for missing values
missing_obs = adata.obs.isnull().sum()
if missing_obs.any():
    print("Missing values in obs:")
    print(missing_obs[missing_obs > 0])

# Verify indices are unique
assert adata.obs_names.is_unique, "Observation names not unique"
assert adata.var_names.is_unique, "Variable names not unique"

# Check metadata alignment
assert len(adata.obs) == adata.n_obs
assert len(adata.var) == adata.n_vars
```

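The checks above are worth bundling into a single helper you run after every processing step; a minimal sketch (`validate_adata` is a hypothetical name, not part of anndata):

```python
import numpy as np
from scipy.sparse import issparse


def validate_adata(adata, expect_counts=False):
    """Raise on structural problems; warn on suspicious values."""
    assert adata.X.shape == (adata.n_obs, adata.n_vars), "X shape mismatch"
    assert adata.obs_names.is_unique, "Observation names not unique"
    assert adata.var_names.is_unique, "Variable names not unique"

    values = adata.X.data if issparse(adata.X) else adata.X
    if np.isnan(values).any():
        print("Warning: Data contains NaN values")
    if expect_counts and (values < 0).any():
        print("Warning: negative values in count data")

validate_adata(adata, expect_counts=True)
```
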
## Integration with Other Tools

### Scanpy integration

```python
import scanpy as sc

# AnnData is the native format for scanpy
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
```

### Pandas integration

```python
import pandas as pd

# Convert to DataFrame
df = adata.to_df()

# Create from DataFrame
adata = ad.AnnData(df)

# Work with metadata as DataFrames (left join keeps every observation)
adata.obs = adata.obs.merge(external_metadata, left_index=True, right_index=True, how='left')
```

### PyTorch integration

```python
from anndata.experimental import AnnLoader

# Create a PyTorch DataLoader
dataloader = AnnLoader(adata, batch_size=128, shuffle=True)

# Iterate in a training loop
for batch in dataloader:
    X = batch.X
    # Train model on batch
```

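Batches behave like small AnnData views, so per-cell metadata rides along with the expression values; a sketch, assuming a `cell_type` column in `obs` to use as labels:

```python
for batch in dataloader:
    X = batch.X                      # expression values for the batch
    labels = batch.obs['cell_type']  # matching per-cell metadata
    # forward/backward pass here
```
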
## Common Pitfalls

### Pitfall 1: Modifying views

```python
# Risky: assigning to an attribute of a view
subset = adata[:100, :]
subset.X = new_data  # anndata warns and silently turns the view into a copy

# Correct: Copy explicitly before modifying
subset = adata[:100, :].copy()
subset.X = new_data  # independent copy, no surprises
```

### Pitfall 2: Index misalignment

```python
# Wrong: Assuming row order matches
external_data = pd.read_csv('data.csv')
adata.obs['new_col'] = external_data['values']  # may misalign!

# Correct: Align on the index
adata.obs['new_col'] = external_data.set_index('cell_id').loc[adata.obs_names, 'values']
```

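If some observations may be missing from the external table, pandas' `reindex` fills NaN instead of raising; a sketch:

```python
aligned = external_data.set_index('cell_id').reindex(adata.obs_names)
adata.obs['new_col'] = aligned['values'].to_numpy()
```
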
### Pitfall 3: Mixing sparse and dense

```python
# Wrong: Densifying a sparse matrix uses huge memory
result = adata.X.toarray() + 1  # materializes the full dense matrix!

# Correct: For operations that keep zeros zero (scaling, log1p, ...),
# operate on the stored values only
import numpy as np
from scipy.sparse import issparse
if issparse(adata.X):
    result = adata.X.copy()
    result.data = np.log1p(result.data)  # zero-preserving, stays sparse
```

### Pitfall 4: Not handling views

```python
# Wrong: Assuming a view frees the original
subset = adata[mask, :]
del adata  # the view still references adata, so its memory is not freed!

# Correct: Copy when only the subset is needed
subset = adata[mask, :].copy()
del adata  # the original can now be garbage collected
```

### Pitfall 5: Ignoring memory constraints

```python
# Wrong: Loading a huge dataset into memory
adata = ad.read_h5ad('100GB_file.h5ad')  # OOM error!

# Correct: Use backed mode
adata = ad.read_h5ad('100GB_file.h5ad', backed='r')
subset = adata[adata.obs['keep']].to_memory()
```

## Workflow Example

Complete best-practices workflow:

```python
import anndata as ad
import numpy as np
from scipy.sparse import csr_matrix, issparse

# 1. Load with backed mode if large
adata = ad.read_h5ad('data.h5ad', backed='r')

# 2. Quick metadata check without loading data
print(f"Dataset: {adata.n_obs} cells × {adata.n_vars} genes")

# 3. Filter based on metadata
high_quality = adata[adata.obs['quality_score'] > 0.8]

# 4. Load filtered subset to memory
adata = high_quality.to_memory()

# 5. Convert to optimal storage types
adata.strings_to_categoricals()
if not issparse(adata.X):
    density = np.count_nonzero(adata.X) / adata.X.size
    if density < 0.5:
        adata.X = csr_matrix(adata.X)

# 6. Store raw before filtering genes
adata.raw = adata.copy()

# 7. Filter to highly variable genes
adata = adata[:, adata.var['highly_variable']].copy()

# 8. Document processing
adata.uns['processing'] = {
    'filtered': 'quality_score > 0.8',
    'n_hvg': adata.n_vars,
    'date': '2025-11-03'
}

# 9. Save optimized
adata.write_h5ad('processed.h5ad', compression='gzip')
```