Initial commit
This commit is contained in:
396
skills/anndata/references/concatenation.md
Normal file
396
skills/anndata/references/concatenation.md
Normal file
@@ -0,0 +1,396 @@
|
||||
# Concatenating AnnData Objects
|
||||
|
||||
Combine multiple AnnData objects along either observations or variables axis.
|
||||
|
||||
## Basic Concatenation
|
||||
|
||||
### Concatenate along observations (stack cells/samples)
|
||||
```python
|
||||
import anndata as ad
|
||||
import numpy as np
|
||||
|
||||
# Create multiple AnnData objects
|
||||
adata1 = ad.AnnData(X=np.random.rand(100, 50))
|
||||
adata2 = ad.AnnData(X=np.random.rand(150, 50))
|
||||
adata3 = ad.AnnData(X=np.random.rand(200, 50))
|
||||
|
||||
# Concatenate along observations (axis=0, default)
|
||||
adata_combined = ad.concat([adata1, adata2, adata3], axis=0)
|
||||
|
||||
print(adata_combined.shape) # (450, 50)
|
||||
```
|
||||
|
||||
### Concatenate along variables (stack genes/features)
|
||||
```python
|
||||
# Create objects with same observations, different variables
|
||||
adata1 = ad.AnnData(X=np.random.rand(100, 50))
|
||||
adata2 = ad.AnnData(X=np.random.rand(100, 30))
|
||||
adata3 = ad.AnnData(X=np.random.rand(100, 70))
|
||||
|
||||
# Concatenate along variables (axis=1)
|
||||
adata_combined = ad.concat([adata1, adata2, adata3], axis=1)
|
||||
|
||||
print(adata_combined.shape) # (100, 150)
|
||||
```
|
||||
|
||||
## Join Types
|
||||
|
||||
### Inner join (intersection)
|
||||
Keep only variables/observations present in all objects.
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
# Create objects with different variables
|
||||
adata1 = ad.AnnData(
|
||||
X=np.random.rand(100, 50),
|
||||
var=pd.DataFrame(index=[f'Gene_{i}' for i in range(50)])
|
||||
)
|
||||
adata2 = ad.AnnData(
|
||||
X=np.random.rand(150, 60),
|
||||
var=pd.DataFrame(index=[f'Gene_{i}' for i in range(10, 70)])
|
||||
)
|
||||
|
||||
# Inner join: only genes 10-49 are kept (overlap)
|
||||
adata_inner = ad.concat([adata1, adata2], join='inner')
|
||||
print(adata_inner.n_vars) # 40 genes (overlap)
|
||||
```
|
||||
|
||||
### Outer join (union)
|
||||
Keep all variables/observations, filling missing values.
|
||||
|
||||
```python
|
||||
# Outer join: all genes are kept
|
||||
adata_outer = ad.concat([adata1, adata2], join='outer')
|
||||
print(adata_outer.n_vars) # 70 genes (union)
|
||||
|
||||
# Missing values are filled with appropriate defaults:
|
||||
# - 0 for sparse matrices
|
||||
# - NaN for dense matrices
|
||||
```
|
||||
|
||||
### Fill values for outer joins
|
||||
```python
|
||||
# Specify fill value for missing data
|
||||
adata_filled = ad.concat([adata1, adata2], join='outer', fill_value=0)
|
||||
```
|
||||
|
||||
## Tracking Data Sources
|
||||
|
||||
### Add batch labels
|
||||
```python
|
||||
# Label which object each observation came from
|
||||
adata_combined = ad.concat(
|
||||
[adata1, adata2, adata3],
|
||||
label='batch', # Column name for labels
|
||||
keys=['batch1', 'batch2', 'batch3'] # Labels for each object
|
||||
)
|
||||
|
||||
print(adata_combined.obs['batch'].value_counts())
|
||||
# batch1 100
|
||||
# batch2 150
|
||||
# batch3 200
|
||||
```
|
||||
|
||||
### Automatic batch labels
|
||||
```python
|
||||
# If keys not provided, uses integer indices
|
||||
adata_combined = ad.concat(
|
||||
[adata1, adata2, adata3],
|
||||
label='dataset'
|
||||
)
|
||||
# dataset column contains: 0, 1, 2
|
||||
```
|
||||
|
||||
## Merge Strategies
|
||||
|
||||
Control how metadata from different objects is combined using the `merge` parameter.
|
||||
|
||||
### merge=None (default for observations)
|
||||
Exclude metadata on non-concatenation axis.
|
||||
|
||||
```python
|
||||
# When concatenating observations with merge=None, annotations on the
|
||||
adata1.var['gene_type'] = 'protein_coding'
|
||||
adata2.var['gene_type'] = 'protein_coding'
|
||||
|
||||
# non-concatenation axis (var) are excluded from the result; use merge='same' to keep identical columns
|
||||
adata_combined = ad.concat([adata1, adata2], merge=None)
|
||||
```
|
||||
|
||||
### merge='same'
|
||||
Keep metadata that is identical across all objects.
|
||||
|
||||
```python
|
||||
adata1.var['chromosome'] = ['chr1'] * 25 + ['chr2'] * 25
|
||||
adata2.var['chromosome'] = ['chr1'] * 25 + ['chr2'] * 25
|
||||
adata1.var['type'] = 'protein_coding'
|
||||
adata2.var['type'] = 'lncRNA' # Different
|
||||
|
||||
# 'chromosome' is kept (same), 'type' is excluded (different)
|
||||
adata_combined = ad.concat([adata1, adata2], merge='same')
|
||||
```
|
||||
|
||||
### merge='unique'
|
||||
Keep metadata columns where each key has exactly one value.
|
||||
|
||||
```python
|
||||
adata1.var['gene_id'] = [f'ENSG{i:05d}' for i in range(50)]
|
||||
adata2.var['gene_id'] = [f'ENSG{i:05d}' for i in range(50)]
|
||||
|
||||
# gene_id is kept (unique values for each key)
|
||||
adata_combined = ad.concat([adata1, adata2], merge='unique')
|
||||
```
|
||||
|
||||
### merge='first'
|
||||
Take values from the first object containing each key.
|
||||
|
||||
```python
|
||||
adata1.var['description'] = ['Desc1'] * 50
|
||||
adata2.var['description'] = ['Desc2'] * 50
|
||||
|
||||
# Uses descriptions from adata1
|
||||
adata_combined = ad.concat([adata1, adata2], merge='first')
|
||||
```
|
||||
|
||||
### merge='only'
|
||||
Keep metadata that appears in only one object.
|
||||
|
||||
```python
|
||||
adata1.var['adata1_specific'] = [1] * 50
|
||||
adata2.var['adata2_specific'] = [2] * 50
|
||||
|
||||
# Both metadata columns are kept
|
||||
adata_combined = ad.concat([adata1, adata2], merge='only')
|
||||
```
|
||||
|
||||
## Handling Index Conflicts
|
||||
|
||||
### Make indices unique
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
# Create objects with overlapping observation names
|
||||
adata1 = ad.AnnData(
|
||||
X=np.random.rand(3, 10),
|
||||
obs=pd.DataFrame(index=['cell_1', 'cell_2', 'cell_3'])
|
||||
)
|
||||
adata2 = ad.AnnData(
|
||||
X=np.random.rand(3, 10),
|
||||
obs=pd.DataFrame(index=['cell_1', 'cell_2', 'cell_3'])
|
||||
)
|
||||
|
||||
# Make indices unique by appending batch keys
|
||||
adata_combined = ad.concat(
|
||||
[adata1, adata2],
|
||||
label='batch',
|
||||
keys=['batch1', 'batch2'],
|
||||
index_unique='_' # Separator for making indices unique
|
||||
)
|
||||
|
||||
print(adata_combined.obs_names)
|
||||
# ['cell_1_batch1', 'cell_2_batch1', 'cell_3_batch1',
|
||||
# 'cell_1_batch2', 'cell_2_batch2', 'cell_3_batch2']
|
||||
```
|
||||
|
||||
## Concatenating Layers
|
||||
|
||||
```python
|
||||
# Objects with layers
|
||||
adata1 = ad.AnnData(X=np.random.rand(100, 50))
|
||||
adata1.layers['normalized'] = np.random.rand(100, 50)
|
||||
adata1.layers['scaled'] = np.random.rand(100, 50)
|
||||
|
||||
adata2 = ad.AnnData(X=np.random.rand(150, 50))
|
||||
adata2.layers['normalized'] = np.random.rand(150, 50)
|
||||
adata2.layers['scaled'] = np.random.rand(150, 50)
|
||||
|
||||
# Layers are concatenated automatically if present in all objects
|
||||
adata_combined = ad.concat([adata1, adata2])
|
||||
|
||||
print(adata_combined.layers.keys())
|
||||
# dict_keys(['normalized', 'scaled'])
|
||||
```
|
||||
|
||||
## Concatenating Multi-dimensional Annotations
|
||||
|
||||
### obsm/varm
|
||||
```python
|
||||
# Objects with embeddings
|
||||
adata1.obsm['X_pca'] = np.random.rand(100, 50)
|
||||
adata2.obsm['X_pca'] = np.random.rand(150, 50)
|
||||
|
||||
# obsm is concatenated along observation axis
|
||||
adata_combined = ad.concat([adata1, adata2])
|
||||
print(adata_combined.obsm['X_pca'].shape) # (250, 50)
|
||||
```
|
||||
|
||||
### obsp/varp (pairwise annotations)
|
||||
```python
|
||||
from scipy.sparse import csr_matrix
|
||||
|
||||
# Pairwise matrices
|
||||
adata1.obsp['connectivities'] = csr_matrix((100, 100))
|
||||
adata2.obsp['connectivities'] = csr_matrix((150, 150))
|
||||
|
||||
# By default, obsp is NOT concatenated (set pairwise=True to include)
|
||||
adata_combined = ad.concat([adata1, adata2])
|
||||
# adata_combined.obsp is empty
|
||||
|
||||
# Include pairwise data (creates block diagonal matrix)
|
||||
adata_combined = ad.concat([adata1, adata2], pairwise=True)
|
||||
print(adata_combined.obsp['connectivities'].shape) # (250, 250)
|
||||
```
|
||||
|
||||
## Concatenating uns (unstructured)
|
||||
|
||||
Unstructured metadata is merged recursively:
|
||||
|
||||
```python
|
||||
adata1.uns['experiment'] = {'date': '2025-01-01', 'batch': 'A'}
|
||||
adata2.uns['experiment'] = {'date': '2025-01-01', 'batch': 'B'}
|
||||
|
||||
# Using merge='unique' for uns
|
||||
adata_combined = ad.concat([adata1, adata2], uns_merge='unique')
|
||||
# 'date' is kept (identical across objects); 'batch' is dropped (values conflict)
|
||||
```
|
||||
|
||||
## Lazy Concatenation (AnnCollection)
|
||||
|
||||
For very large datasets, use lazy concatenation that doesn't load all data:
|
||||
|
||||
```python
|
||||
from anndata.experimental import AnnCollection
|
||||
|
||||
# Open files in backed mode first (metadata is read; X stays on disk)
|
||||
adatas = [ad.read_h5ad(f, backed='r') for f in ['data1.h5ad', 'data2.h5ad', 'data3.h5ad']]
|
||||
collection = AnnCollection(
|
||||
    adatas,
|
||||
join_obs='outer',
|
||||
join_vars='inner',
|
||||
label='dataset',
|
||||
keys=['dataset1', 'dataset2', 'dataset3']
|
||||
)
|
||||
|
||||
# Access data lazily
|
||||
print(collection.n_obs) # Total observations
|
||||
print(collection.obs.head()) # Metadata loaded, not X
|
||||
|
||||
# Convert to regular AnnData when needed (loads all data)
|
||||
adata = collection.to_adata()
|
||||
```
|
||||
|
||||
### Working with AnnCollection
|
||||
```python
|
||||
# Subset without loading data
|
||||
subset = collection[collection.obs['cell_type'] == 'T cell']
|
||||
|
||||
# Iterate through the underlying datasets
|
||||
for adata in collection.adatas:
|
||||
    print(adata.shape)
|
||||
|
||||
# Access a specific underlying dataset
|
||||
# (note: collection[0] subsets the first observation, not the first dataset)
|
||||
first_dataset = collection.adatas[0]
|
||||
```
|
||||
|
||||
## Concatenation on Disk
|
||||
|
||||
For datasets too large for memory, concatenate directly on disk:
|
||||
|
||||
```python
|
||||
from anndata.experimental import concat_on_disk
|
||||
|
||||
# Concatenate without loading into memory
|
||||
concat_on_disk(
|
||||
['data1.h5ad', 'data2.h5ad', 'data3.h5ad'],
|
||||
'combined.h5ad',
|
||||
join='outer'
|
||||
)
|
||||
|
||||
# Load result in backed mode
|
||||
adata = ad.read_h5ad('combined.h5ad', backed='r')
|
||||
```
|
||||
|
||||
## Common Concatenation Patterns
|
||||
|
||||
### Combine technical replicates
|
||||
```python
|
||||
# Multiple runs of the same samples
|
||||
replicates = [adata_run1, adata_run2, adata_run3]
|
||||
adata_combined = ad.concat(
|
||||
replicates,
|
||||
label='technical_replicate',
|
||||
keys=['rep1', 'rep2', 'rep3'],
|
||||
join='inner' # Keep only genes measured in all runs
|
||||
)
|
||||
```
|
||||
|
||||
### Combine batches from experiment
|
||||
```python
|
||||
# Different experimental batches
|
||||
batches = [adata_batch1, adata_batch2, adata_batch3]
|
||||
adata_combined = ad.concat(
|
||||
batches,
|
||||
label='batch',
|
||||
keys=['batch1', 'batch2', 'batch3'],
|
||||
join='outer' # Keep all genes
|
||||
)
|
||||
|
||||
# Later: apply batch correction
|
||||
```
|
||||
|
||||
### Merge multi-modal data
|
||||
```python
|
||||
# Different measurement modalities (e.g., RNA + protein)
|
||||
adata_rna = ad.AnnData(X=np.random.rand(100, 2000))
|
||||
adata_protein = ad.AnnData(X=np.random.rand(100, 50))
|
||||
|
||||
# Concatenate along variables to combine modalities
|
||||
adata_multimodal = ad.concat([adata_rna, adata_protein], axis=1)
|
||||
|
||||
# Add labels to distinguish modalities
|
||||
adata_multimodal.var['modality'] = ['RNA'] * 2000 + ['protein'] * 50
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Check compatibility before concatenating**
|
||||
```python
|
||||
# Verify shapes are compatible
|
||||
print([adata.n_vars for adata in [adata1, adata2, adata3]])
|
||||
|
||||
# Check variable names match
|
||||
print([set(adata.var_names) for adata in [adata1, adata2, adata3]])
|
||||
```
|
||||
|
||||
2. **Use appropriate join type**
|
||||
- `inner`: When you need the same features across all samples (most stringent)
|
||||
- `outer`: When you want to preserve all features (most inclusive)
|
||||
|
||||
3. **Track data sources**
|
||||
Always use `label` and `keys` to track which observations came from which dataset.
|
||||
|
||||
4. **Consider memory usage**
|
||||
- For large datasets, use `AnnCollection` or `concat_on_disk`
|
||||
- Consider backed mode for the result
|
||||
|
||||
5. **Handle batch effects**
|
||||
Concatenation combines data but doesn't correct for batch effects. Apply batch correction after concatenation:
|
||||
```python
|
||||
# After concatenation, apply batch correction
|
||||
import scanpy as sc
|
||||
sc.pp.combat(adata_combined, key='batch')
|
||||
```
|
||||
|
||||
6. **Validate results**
|
||||
```python
|
||||
# Check dimensions
|
||||
print(adata_combined.shape)
|
||||
|
||||
# Check batch distribution
|
||||
print(adata_combined.obs['batch'].value_counts())
|
||||
|
||||
# Verify metadata integrity
|
||||
print(adata_combined.var.head())
|
||||
print(adata_combined.obs.head())
|
||||
```
|
||||
Reference in New Issue
Block a user