# Best Practices

Guidelines for efficient and effective use of AnnData.

## Memory Management

### Use sparse matrices for sparse data

```python
import numpy as np
from scipy.sparse import csr_matrix
import anndata as ad

# Check data sparsity
data = np.random.rand(1000, 2000)
sparsity = 1 - np.count_nonzero(data) / data.size
print(f"Sparsity: {sparsity:.2%}")

# Convert to sparse if >50% zeros
if sparsity > 0.5:
    adata = ad.AnnData(X=csr_matrix(data))
else:
    adata = ad.AnnData(X=data)

# Benefits: 10-100x memory reduction for sparse genomics data
```
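To see the savings concretely, compare the in-memory footprint of the dense array with its sparse counterpart. A minimal check using the `data` array from above (note that for a nearly dense matrix like the `np.random.rand` output, the sparse form will actually be larger):

```python
# Dense footprint in bytes
dense_bytes = data.nbytes

# Sparse footprint: stored values plus the two CSR index arrays
sparse = csr_matrix(data)
sparse_bytes = sparse.data.nbytes + sparse.indices.nbytes + sparse.indptr.nbytes

print(f"Dense:  {dense_bytes / 1e6:.1f} MB")
print(f"Sparse: {sparse_bytes / 1e6:.1f} MB")
```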
### Convert strings to categoricals

```python
# Inefficient: string columns use lots of memory
adata.obs['cell_type'] = ['Type_A', 'Type_B', 'Type_C'] * 333 + ['Type_A']

# Efficient: convert to categorical
adata.obs['cell_type'] = adata.obs['cell_type'].astype('category')

# Convert all string columns
adata.strings_to_categoricals()

# Benefits: 10-50x memory reduction for repeated strings
```
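To verify the effect, pandas can report per-column memory directly; a quick check on the `cell_type` column created above:

```python
# Memory as plain Python strings vs. categorical codes
as_strings = adata.obs['cell_type'].astype(str)
as_category = as_strings.astype('category')

print(as_strings.memory_usage(deep=True))   # bytes as strings
print(as_category.memory_usage(deep=True))  # bytes as categorical
```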
### Use backed mode for large datasets

```python
# Don't load the entire dataset into memory
adata = ad.read_h5ad('large_dataset.h5ad', backed='r')

# Work with metadata
filtered = adata[adata.obs['quality'] > 0.8]

# Load only the filtered subset
adata_subset = filtered.to_memory()

# Benefits: Work with datasets larger than RAM
```
## Views vs Copies

### Understanding views

```python
# Subsetting creates a view by default
subset = adata[0:100, :]
print(subset.is_view)  # True

# Views don't copy data (memory efficient)
# But modifications can affect the original

# Check if an object is a view
if adata.is_view:
    adata = adata.copy()  # Make independent
```

### When to use views

```python
# Good: Read-only operations on subsets
mean_expr = adata[adata.obs['cell_type'] == 'T cell'].X.mean()

# Good: Temporary analysis
temp_subset = adata[:100, :]
result = analyze(temp_subset.X)
```

### When to use copies

```python
# Create an independent copy for modifications
adata_filtered = adata[keep_cells, :].copy()

# Safe to modify without affecting the original
adata_filtered.obs['new_column'] = values

# Always copy when:
# - Storing a subset for later use
# - Modifying subset data
# - Passing to a function that modifies data
```
## Data Storage Best Practices

### Choose the right format

**H5AD (HDF5) - Default choice**

```python
adata.write_h5ad('data.h5ad', compression='gzip')
```

- Fast random access
- Supports backed mode
- Good compression
- Best for: Most use cases

**Zarr - Cloud and parallel access**

```python
adata.write_zarr('data.zarr', chunks=(100, 100))
```

- Excellent for cloud storage (S3, GCS)
- Supports parallel I/O
- Good compression
- Best for: Large datasets, cloud workflows, parallel processing

**CSV - Interoperability**

```python
adata.write_csvs('output_dir/', skip_data=False)  # skip_data=False also exports X, not just annotations
```

- Human readable
- Compatible with all tools
- Large file sizes, slow
- Best for: Sharing with non-Python tools, small datasets
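Each format is read back with the matching reader; a short sketch using the file names written above:

```python
import anndata as ad

# H5AD file
adata = ad.read_h5ad('data.h5ad')

# Zarr store (a directory on local disk or in object storage)
adata = ad.read_zarr('data.zarr')
```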
### Optimize file size

```python
# Before saving, optimize:
import numpy as np
from scipy.sparse import csr_matrix, issparse

# 1. Convert to sparse if appropriate
if not issparse(adata.X):
    density = np.count_nonzero(adata.X) / adata.X.size
    if density < 0.5:
        adata.X = csr_matrix(adata.X)

# 2. Convert strings to categoricals
adata.strings_to_categoricals()

# 3. Use compression
adata.write_h5ad('data.h5ad', compression='gzip', compression_opts=9)

# Typical results: 5-20x file size reduction
```
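A quick way to confirm the optimization worked is to compare sizes on disk. A minimal sketch; `unoptimized.h5ad` here stands in for a hypothetical earlier export of the same object:

```python
import os

before = os.path.getsize('unoptimized.h5ad')
after = os.path.getsize('data.h5ad')
print(f"{before / 1e6:.1f} MB -> {after / 1e6:.1f} MB")
```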
## Backed Mode Strategies

### Read-only analysis

```python
# Open in read-only backed mode
adata = ad.read_h5ad('data.h5ad', backed='r')

# Perform filtering without loading data
high_quality = adata[adata.obs['quality_score'] > 0.8]

# Load only the filtered data
adata_filtered = high_quality.to_memory()
```

### Read-write modifications

```python
# Open in read-write backed mode
adata = ad.read_h5ad('data.h5ad', backed='r+')

# Modify metadata in memory
adata.obs['new_annotation'] = values

# X stays on disk; write back to the backing file to persist metadata changes
adata.write()
```
### Chunked processing

```python
# Process a large dataset in chunks
adata = ad.read_h5ad('huge_dataset.h5ad', backed='r')

results = []
chunk_size = 1000

for i in range(0, adata.n_obs, chunk_size):
    chunk = adata[i:i+chunk_size, :].to_memory()
    result = process(chunk)
    results.append(result)

final_result = combine(results)
```
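As a concrete instance of this pattern, the sketch below computes per-cell total counts chunk by chunk; `process` becomes a row sum and `combine` a concatenation (both were placeholders above):

```python
import numpy as np

totals = []
for i in range(0, adata.n_obs, chunk_size):
    chunk = adata[i:i + chunk_size, :].to_memory()
    # Row sums work for dense and sparse X alike
    totals.append(np.asarray(chunk.X.sum(axis=1)).ravel())

per_cell_counts = np.concatenate(totals)
```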
## Performance Optimization

### Subsetting performance

```python
# Fast: Boolean indexing with NumPy arrays
mask = np.array(adata.obs['quality'] > 0.5)
subset = adata[mask, :]

# Slower: Boolean indexing with a pandas Series
subset = adata[adata.obs['quality'] > 0.5, :]

# Fastest: Integer indices
indices = np.where(adata.obs['quality'] > 0.5)[0]
subset = adata[indices, :]
```
### Avoid repeated subsetting

```python
# Inefficient: Multiple subset operations
for cell_type in ['A', 'B', 'C']:
    subset = adata[adata.obs['cell_type'] == cell_type]
    process(subset)

# Efficient: Group once, then process
groups = adata.obs.groupby('cell_type').groups
for cell_type, indices in groups.items():
    subset = adata[indices, :]
    process(subset)
```
### Use chunked operations for large matrices

```python
# Process X in chunks; chunked_X yields (chunk, start, end) tuples
for chunk, start, end in adata.chunked_X(chunk_size=1000):
    result = compute(chunk)

# More memory efficient than loading the full X
```
## Working with Raw Data

### Store raw before filtering

```python
# Original data with all genes
adata = ad.AnnData(X=counts)

# Store raw before filtering
adata.raw = adata.copy()

# Filter to highly variable genes (copy so the result is independent)
adata = adata[:, adata.var['highly_variable']].copy()

# Later: access the original data
original_expression = adata.raw.X
all_genes = adata.raw.var_names
```

### When to use raw

```python
# Use raw for:
# - Differential expression on the full gene set after filtering
# - Visualization of specific genes not in the filtered set
# - Accessing original values after normalization

# Access raw data
if adata.raw is not None:
    gene_expr = adata.raw[:, 'GENE_NAME'].X
else:
    gene_expr = adata[:, 'GENE_NAME'].X
```
## Metadata Management

### Naming conventions

```python
# Consistent naming improves usability

# Observation metadata (obs):
# - cell_id, sample_id
# - cell_type, tissue, condition
# - n_genes, n_counts, percent_mito
# - cluster, leiden, louvain

# Variable metadata (var):
# - gene_id, gene_name
# - highly_variable, n_cells
# - mean_expression, dispersion

# Embeddings (obsm):
# - X_pca, X_umap, X_tsne
# - X_diffmap, X_draw_graph_fr

# Follow conventions from the scanpy/scverse ecosystem
```
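Bringing an existing object in line with these conventions is mostly a matter of renaming; a small sketch, where the incoming names `CellType` and `umap` are hypothetical examples:

```python
# Rename obs columns to the conventional lowercase names
adata.obs = adata.obs.rename(columns={'CellType': 'cell_type'})

# Embedding keys are expected to carry the X_ prefix
if 'umap' in adata.obsm:
    adata.obsm['X_umap'] = adata.obsm['umap']
    del adata.obsm['umap']
```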
### Document metadata

```python
# Store metadata descriptions in uns
adata.uns['metadata_descriptions'] = {
    'cell_type': 'Cell type annotation from automated clustering',
    'quality_score': 'QC score from scrublet (0-1, higher is better)',
    'batch': 'Experimental batch identifier'
}

# Store processing history
adata.uns['processing_steps'] = [
    'Raw counts loaded from 10X',
    'Filtered: n_genes > 200, n_counts < 50000',
    'Normalized to 10000 counts per cell',
    'Log transformed'
]
```
## Reproducibility

### Set random seeds

```python
import numpy as np

# Set seed for reproducible results
np.random.seed(42)

# Document in uns
adata.uns['random_seed'] = 42
```

### Store parameters

```python
# Store analysis parameters in uns
adata.uns['pca'] = {
    'n_comps': 50,
    'svd_solver': 'arpack',
    'random_state': 42
}

adata.uns['neighbors'] = {
    'n_neighbors': 15,
    'n_pcs': 50,
    'metric': 'euclidean',
    'method': 'umap'
}
```

### Version tracking

```python
import sys

import anndata
import numpy
import scanpy

# Store versions
adata.uns['versions'] = {
    'anndata': anndata.__version__,
    'scanpy': scanpy.__version__,
    'numpy': numpy.__version__,
    'python': sys.version
}
```
## Error Handling

### Check data validity

```python
import numpy as np
from scipy.sparse import issparse

# Verify dimensions
assert adata.n_obs == len(adata.obs)
assert adata.n_vars == len(adata.var)
assert adata.X.shape == (adata.n_obs, adata.n_vars)

# Check for NaN values
has_nan = np.isnan(adata.X.data).any() if issparse(adata.X) else np.isnan(adata.X).any()
if has_nan:
    print("Warning: Data contains NaN values")

# Check for negative values (if counts are expected)
has_negative = (adata.X.data < 0).any() if issparse(adata.X) else (adata.X < 0).any()
if has_negative:
    print("Warning: Data contains negative values")
```

### Validate metadata

```python
# Check for missing values
missing_obs = adata.obs.isnull().sum()
if missing_obs.any():
    print("Missing values in obs:")
    print(missing_obs[missing_obs > 0])

# Verify indices are unique
assert adata.obs_names.is_unique, "Observation names not unique"
assert adata.var_names.is_unique, "Variable names not unique"

# Check metadata alignment
assert len(adata.obs) == adata.n_obs
assert len(adata.var) == adata.n_vars
```
## Integration with Other Tools

### Scanpy integration

```python
import scanpy as sc

# AnnData is the native format for scanpy
sc.pp.filter_cells(adata, min_genes=200)
sc.pp.filter_genes(adata, min_cells=3)
sc.pp.normalize_total(adata, target_sum=1e4)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
```

### Pandas integration

```python
import pandas as pd

# Convert to DataFrame
df = adata.to_df()

# Create from DataFrame
adata = ad.AnnData(df)

# Work with metadata as DataFrames (left join on the index keeps every obs row aligned)
adata.obs = adata.obs.join(external_metadata)
```

### PyTorch integration

```python
from anndata.experimental import AnnLoader

# Create a PyTorch DataLoader
dataloader = AnnLoader(adata, batch_size=128, shuffle=True)

# Iterate in a training loop
for batch in dataloader:
    X = batch.X
    # Train model on batch
```
## Common Pitfalls

### Pitfall 1: Modifying views

```python
# Wrong: Modifying view can affect original
subset = adata[:100, :]
subset.X = new_data  # May modify adata.X!

# Correct: Copy before modifying
subset = adata[:100, :].copy()
subset.X = new_data  # Independent copy
```

### Pitfall 2: Index misalignment

```python
# Wrong: Assuming order matches
external_data = pd.read_csv('data.csv')
adata.obs['new_col'] = external_data['values']  # May misalign!

# Correct: Align on index
adata.obs['new_col'] = external_data.set_index('cell_id').loc[adata.obs_names, 'values']
```

### Pitfall 3: Mixing sparse and dense

```python
# Wrong: Densifying a sparse matrix uses huge memory
result = adata.X.toarray() + 1  # Materializes the full dense array!

# Correct: Use sparse operations
from scipy.sparse import issparse
if issparse(adata.X):
    result = adata.X.copy()
    result.data += 1  # Only touches the explicitly stored values
```

### Pitfall 4: Not handling views

```python
# Wrong: Assuming a view frees the original
subset = adata[mask, :]
del adata  # The view still references the full object, so memory is not released

# Correct: Copy when the subset should stand alone
subset = adata[mask, :].copy()
del adata  # The original can now be garbage collected
```

### Pitfall 5: Ignoring memory constraints

```python
# Wrong: Loading a huge dataset into memory
adata = ad.read_h5ad('100GB_file.h5ad')  # OOM error!

# Correct: Use backed mode
adata = ad.read_h5ad('100GB_file.h5ad', backed='r')
subset = adata[adata.obs['keep']].to_memory()
```
## Workflow Example

Complete best-practices workflow:

```python
import anndata as ad
import numpy as np
from scipy.sparse import csr_matrix, issparse

# 1. Load with backed mode if large
adata = ad.read_h5ad('data.h5ad', backed='r')

# 2. Quick metadata check without loading data
print(f"Dataset: {adata.n_obs} cells × {adata.n_vars} genes")

# 3. Filter based on metadata
high_quality = adata[adata.obs['quality_score'] > 0.8]

# 4. Load the filtered subset to memory
adata = high_quality.to_memory()

# 5. Convert to optimal storage types
adata.strings_to_categoricals()
if not issparse(adata.X):
    density = np.count_nonzero(adata.X) / adata.X.size
    if density < 0.5:
        adata.X = csr_matrix(adata.X)

# 6. Store raw before filtering genes
adata.raw = adata.copy()

# 7. Filter to highly variable genes
adata = adata[:, adata.var['highly_variable']].copy()

# 8. Document processing
adata.uns['processing'] = {
    'filtered': 'quality_score > 0.8',
    'n_hvg': adata.n_vars,
    'date': '2025-11-03'
}

# 9. Save optimized
adata.write_h5ad('processed.h5ad', compression='gzip')
```