# Data Manipulation

Operations for transforming, subsetting, and manipulating AnnData objects.

## Subsetting

### By indices
```python
import anndata as ad
import numpy as np

adata = ad.AnnData(X=np.random.rand(1000, 2000))

# Integer indices
subset = adata[0:100, 0:500]  # First 100 obs, first 500 vars

# List of indices
obs_indices = [0, 10, 20, 30, 40]
var_indices = [0, 1, 2, 3, 4]
subset = adata[obs_indices, var_indices]

# Single observation or variable
single_obs = adata[0, :]
single_var = adata[:, 0]
```

### By names
```python
import pandas as pd

# Create with named indices
obs_names = [f'cell_{i}' for i in range(1000)]
var_names = [f'gene_{i}' for i in range(2000)]
adata = ad.AnnData(
    X=np.random.rand(1000, 2000),
    obs=pd.DataFrame(index=obs_names),
    var=pd.DataFrame(index=var_names)
)

# Subset by observation names
subset = adata[['cell_0', 'cell_1', 'cell_2'], :]

# Subset by variable names
subset = adata[:, ['gene_0', 'gene_10', 'gene_20']]

# Both axes
subset = adata[['cell_0', 'cell_1'], ['gene_0', 'gene_1']]
```

### By boolean masks
```python
# Create boolean masks
high_count_obs = np.random.rand(1000) > 0.5
high_var_genes = np.random.rand(2000) > 0.7

# Subset using masks
subset = adata[high_count_obs, :]
subset = adata[:, high_var_genes]
subset = adata[high_count_obs, high_var_genes]
```

### By metadata conditions
```python
# Add metadata
adata.obs['cell_type'] = np.random.choice(['A', 'B', 'C'], 1000)
adata.obs['quality_score'] = np.random.rand(1000)
adata.var['highly_variable'] = np.random.rand(2000) > 0.8

# Filter by cell type
type_a_cells = adata[adata.obs['cell_type'] == 'A']

# Filter by multiple conditions
high_quality_a_cells = adata[
    (adata.obs['cell_type'] == 'A') &
    (adata.obs['quality_score'] > 0.7)
]

# Filter by variable metadata
hv_genes = adata[:, adata.var['highly_variable']]

# Complex conditions
filtered = adata[
    (adata.obs['quality_score'] > 0.5) &
    (adata.obs['cell_type'].isin(['A', 'B'])),
    adata.var['highly_variable']
]
```

## Transposition

```python
# Transpose AnnData object (swap observations and variables)
adata_T = adata.T

# Shape changes
print(adata.shape)    # (1000, 2000)
print(adata_T.shape)  # (2000, 1000)

# obs and var are swapped
print(adata.obs.head())    # Observation metadata
print(adata_T.var.head())  # Same data, now as variable metadata

# Useful when data is in the opposite orientation,
# common with file formats where genes are rows
```
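
As a quick sketch of that case (the file name and layout below are hypothetical), data read with genes as rows can be wrapped and transposed:

```python
import anndata as ad
import pandas as pd

# Hypothetical CSV with genes as rows and cells as columns
expr = pd.read_csv('expression_genes_by_cells.csv', index_col=0)

# AnnData(df) treats rows as obs and columns as vars, so here obs would be genes;
# transpose so that cells become observations and genes become variables
adata_from_csv = ad.AnnData(expr).T

print(adata_from_csv.n_obs, adata_from_csv.n_vars)  # cells, genes
```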

## Copying

### Full copy
```python
# Create independent copy
adata_copy = adata.copy()

# Modifications to copy don't affect original
adata_copy.obs['new_column'] = 1
print('new_column' in adata.obs.columns)  # False
```

### Shallow copy
```python
# Subsetting returns a view: no data is copied up front, and assigning to a
# view's attributes triggers an implicit copy (copy-on-write) rather than
# modifying the original object
adata_view = adata[0:100, :]

# Check if object is a view
print(adata_view.is_view)  # True

# Convert view to independent copy
adata_independent = adata_view.copy()
print(adata_independent.is_view)  # False
```

## Renaming

### Rename observations and variables
```python
# Rename all observations
adata.obs_names = [f'new_cell_{i}' for i in range(adata.n_obs)]

# Rename all variables
adata.var_names = [f'new_gene_{i}' for i in range(adata.n_vars)]

# Make names unique (add suffix to duplicates)
adata.obs_names_make_unique()
adata.var_names_make_unique()
```

### Rename categories
```python
# Create categorical column
adata.obs['cell_type'] = pd.Categorical(['A', 'B', 'C'] * 333 + ['A'])

# Rename categories (list of new names, in the same order as the old categories)
adata.rename_categories('cell_type', ['Type_A', 'Type_B', 'Type_C'])

# AnnData's rename_categories expects list-like categories;
# to rename with a mapping, use pandas directly
adata.obs['cell_type'] = adata.obs['cell_type'].cat.rename_categories({
    'Type_A': 'T_cell',
    'Type_B': 'B_cell',
    'Type_C': 'Monocyte'
})
```

## Type Conversions

### Strings to categoricals
```python
# Convert string columns to categorical (more memory efficient)
adata.obs['cell_type'] = ['TypeA', 'TypeB'] * 500
adata.obs['tissue'] = ['brain', 'liver'] * 500

# Convert all string columns to categorical
adata.strings_to_categoricals()

print(adata.obs['cell_type'].dtype)  # category
print(adata.obs['tissue'].dtype)     # category
```

### Sparse to dense and vice versa
```python
from scipy.sparse import csr_matrix, issparse

# Dense to sparse
if not issparse(adata.X):
    adata.X = csr_matrix(adata.X)

# Sparse to dense
if issparse(adata.X):
    adata.X = adata.X.toarray()

# Convert layer
adata.layers['normalized'] = csr_matrix(adata.layers['normalized'])
```
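
Whether sparse storage pays off depends on how many zeros the matrix holds; a minimal sketch of checking the density first (the 50% threshold is an arbitrary illustration):

```python
import numpy as np
from scipy.sparse import csr_matrix, issparse

# Fraction of nonzero entries (works for dense or sparse X)
nnz = adata.X.nnz if issparse(adata.X) else np.count_nonzero(adata.X)
density = nnz / (adata.n_obs * adata.n_vars)

# Convert only if the matrix is mostly zeros (illustrative cutoff)
if not issparse(adata.X) and density < 0.5:
    adata.X = csr_matrix(adata.X)
```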

## Chunked Operations

Process large datasets in chunks:

```python
# Iterate through X in chunks; chunked_X yields (chunk, start, end) tuples
# covering rows [start:end)
chunk_size = 100
for chunk, start, end in adata.chunked_X(chunk_size):
    # Process chunk (process_chunk is a placeholder for your own function)
    result = process_chunk(chunk)
```
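
For example, per-gene totals can be accumulated chunk by chunk rather than materializing all of `X` at once (a minimal sketch; the chunk size and the `'total_counts'` column name are illustrative):

```python
import numpy as np

# Accumulate per-gene totals without loading all of X at once
gene_totals = np.zeros(adata.n_vars)
for chunk, start, end in adata.chunked_X(1000):
    chunk_sum = chunk.sum(axis=0)
    gene_totals += np.asarray(chunk_sum).ravel()  # handles dense and sparse chunks

adata.var['total_counts'] = gene_totals
```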

## Extracting Vectors

### Get observation vectors
```python
# Get observation metadata as array
cell_types = adata.obs_vector('cell_type')

# Get gene expression across observations
actb_expression = adata.obs_vector('ACTB')  # If ACTB in var_names
```

### Get variable vectors
```python
# Get variable metadata as array
gene_names = adata.var_vector('gene_name')
```

## Adding/Modifying Data

### Add observations
```python
# Create new observations
new_obs = ad.AnnData(X=np.random.rand(100, adata.n_vars))
new_obs.var_names = adata.var_names

# Concatenate with existing
adata_extended = ad.concat([adata, new_obs], axis=0)
```
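
By default `ad.concat` keeps only the variables shared by all inputs (`join='inner'`). A sketch of keeping the union instead and recording where each observation came from (the `'batch'` column and key names are illustrative):

```python
# Keep the union of variables and track the source of each observation
adata_extended = ad.concat(
    [adata, new_obs],
    axis=0,
    join='outer',       # union of variables; missing entries are filled
    label='batch',      # new .obs column recording the source object
    keys=['original', 'new'],
    fill_value=0,
)
print(adata_extended.obs['batch'].value_counts())
```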

### Add variables
```python
# Create new variables
new_vars = ad.AnnData(X=np.random.rand(adata.n_obs, 100))
new_vars.obs_names = adata.obs_names

# Concatenate with existing
adata_extended = ad.concat([adata, new_vars], axis=1)
```

### Add metadata columns
```python
# Add observation annotation
adata.obs['new_score'] = np.random.rand(adata.n_obs)

# Add variable annotation
adata.var['new_label'] = ['label'] * adata.n_vars

# Add from external data
external_data = pd.read_csv('metadata.csv', index_col=0)
adata.obs['external_info'] = external_data.loc[adata.obs_names, 'column']
```

### Add layers
```python
# Add new layers (each layer must match adata.shape)
adata.layers['raw_counts'] = np.random.randint(0, 100, adata.shape)
adata.layers['log_transformed'] = np.log1p(adata.X)

# Replace a layer (new_normalized_data is a placeholder array of the same shape)
adata.layers['normalized'] = new_normalized_data
```

### Add embeddings
```python
# Add PCA
adata.obsm['X_pca'] = np.random.rand(adata.n_obs, 50)

# Add UMAP
adata.obsm['X_umap'] = np.random.rand(adata.n_obs, 2)

# Add multiple embeddings
adata.obsm['X_tsne'] = np.random.rand(adata.n_obs, 2)
adata.obsm['X_diffmap'] = np.random.rand(adata.n_obs, 10)
```

### Add pairwise relationships
```python
from scipy.sparse import csr_matrix

# Add nearest neighbor graph
n_obs = adata.n_obs
knn_graph = csr_matrix(np.random.rand(n_obs, n_obs) > 0.95)
adata.obsp['connectivities'] = knn_graph

# Add distance matrix
adata.obsp['distances'] = csr_matrix(np.random.rand(n_obs, n_obs))
```
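
Neighbor graphs are usually stored symmetrically; one way to symmetrize a graph like the random one above is the element-wise maximum with its transpose (a sketch):

```python
# Symmetrize the connectivity graph: keep an edge if it exists in either direction
conn = adata.obsp['connectivities'].astype(float)
adata.obsp['connectivities'] = conn.maximum(conn.T)

# A symmetric matrix has no entries that differ from its transpose
assert (adata.obsp['connectivities'] != adata.obsp['connectivities'].T).nnz == 0
```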

### Add unstructured data
```python
# Add analysis parameters
adata.uns['pca'] = {
    'variance': [0.2, 0.15, 0.1],
    'variance_ratio': [0.4, 0.3, 0.2],
    'params': {'n_comps': 50}
}

# Add color schemes
adata.uns['cell_type_colors'] = ['#FF0000', '#00FF00', '#0000FF']
```

## Removing Data

### Remove observations or variables
```python
# Keep only specific observations
keep_obs = adata.obs['quality_score'] > 0.5
adata = adata[keep_obs, :]

# Remove variables flagged by a boolean column (here 'low_count' marks genes to drop)
remove_vars = adata.var['low_count']
adata = adata[:, ~remove_vars]
```

### Remove metadata columns
```python
# Remove observation column
adata.obs.drop('unwanted_column', axis=1, inplace=True)

# Remove variable column
adata.var.drop('unwanted_column', axis=1, inplace=True)
```

### Remove layers
```python
# Remove specific layer
del adata.layers['unwanted_layer']

# Remove all layers
adata.layers = {}
```

### Remove embeddings
```python
# Remove specific embedding
del adata.obsm['X_tsne']

# Remove all embeddings
adata.obsm = {}
```

### Remove unstructured data
```python
# Remove specific key
del adata.uns['unwanted_key']

# Remove all unstructured data
adata.uns = {}
```

## Reordering

### Sort observations
```python
# Sort by observation metadata
adata = adata[adata.obs.sort_values('quality_score').index, :]

# Sort by observation names
adata = adata[sorted(adata.obs_names), :]
```

### Sort variables
```python
# Sort by variable metadata
adata = adata[:, adata.var.sort_values('gene_name').index]

# Sort by variable names
adata = adata[:, sorted(adata.var_names)]
```

### Reorder to match external list
```python
# Reorder observations to match external list
desired_order = ['cell_10', 'cell_5', 'cell_20', ...]
adata = adata[desired_order, :]

# Reorder variables
desired_genes = ['TP53', 'ACTB', 'GAPDH', ...]
adata = adata[:, desired_genes]
```
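
Indexing with a name that is not in `obs_names`/`var_names` raises an error, so it can help to restrict an external list to the names that are actually present first (a sketch using `desired_genes` from above):

```python
# Keep only the requested genes that exist in this AnnData, preserving the requested order
present = [g for g in desired_genes if g in adata.var_names]
missing = set(desired_genes) - set(present)
if missing:
    print(f"Skipping {len(missing)} genes not found in var_names")

adata = adata[:, present]
```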

## Data Transformations

### Normalize
```python
# Total count normalization (CPM/TPM-like); assumes dense adata.X
total_counts = adata.X.sum(axis=1)
adata.layers['normalized'] = adata.X / total_counts[:, np.newaxis] * 1e6

# Log transformation
adata.layers['log1p'] = np.log1p(adata.X)

# Z-score normalization
mean = adata.X.mean(axis=0)
std = adata.X.std(axis=0)
adata.layers['scaled'] = (adata.X - mean) / std
```
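
These transformations assume a dense `adata.X`; for a sparse matrix the row sums come back 2-D and the broadcasting above fails. A sketch of a sparse-aware CPM normalization (assumes a CSR matrix and that every row has a nonzero total):

```python
import numpy as np
from scipy.sparse import csr_matrix, issparse

if issparse(adata.X):
    # Row totals as a flat 1-D array
    total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
    # Scale each row to counts per million; multiply() broadcasts the column vector
    scaling = 1e6 / total_counts
    adata.layers['normalized'] = csr_matrix(adata.X.multiply(scaling[:, np.newaxis]))
    # Sparse-safe log transform
    adata.layers['log1p'] = adata.layers['normalized'].log1p()
```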

### Filter
```python
# Filter cells by total counts
total_counts = np.array(adata.X.sum(axis=1)).flatten()
adata.obs['total_counts'] = total_counts
adata = adata[adata.obs['total_counts'] > 1000, :]

# Filter genes by detection rate
detection_rate = (adata.X > 0).sum(axis=0) / adata.n_obs
adata.var['detection_rate'] = np.array(detection_rate).flatten()
adata = adata[:, adata.var['detection_rate'] > 0.01]
```

## Working with Views

Views are lightweight references to subsets of data that don't copy the underlying matrix:

```python
# Create view
view = adata[0:100, 0:500]
print(view.is_view)  # True

# Views allow read access
data = view.X

# Views are copy-on-write: assigning to a view's attributes turns it into an
# independent copy (with an ImplicitModificationWarning) rather than
# modifying the original object

# Convert view to independent copy explicitly
independent = view.copy()

# Force AnnData to be a copy, not a view
adata = adata.copy()
```
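
A small sketch of that copy-on-write behavior (the `'flagged'` column is illustrative):

```python
view = adata[0:100, :]
print(view.is_view)  # True

# Assigning to the view's .obs triggers an implicit copy;
# the view becomes a real AnnData and the original is untouched
view.obs['flagged'] = True
print(view.is_view)                    # False
print('flagged' in adata.obs.columns)  # False
```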

## Merging Metadata

```python
# Merge external metadata
external_metadata = pd.read_csv('additional_metadata.csv', index_col=0)

# Join metadata on the index (DataFrame.join is a left join by default,
# so all adata observations are kept)
adata.obs = adata.obs.join(external_metadata)

# Equivalent explicit left join with merge
adata.obs = adata.obs.merge(
    external_metadata,
    left_index=True,
    right_index=True,
    how='left'
)
```
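
An alternative sketch that makes the alignment explicit by reindexing the external table to `obs_names` before assigning (the `'condition'` column is illustrative):

```python
# Align the external table to the current observations;
# rows missing from the external table become NaN
aligned = external_metadata.reindex(adata.obs_names)
adata.obs['condition'] = aligned['condition'].values
```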

## Common Manipulation Patterns

### Quality control filtering
```python
# Calculate QC metrics
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
adata.obs['total_counts'] = adata.X.sum(axis=1)
adata.var['n_cells'] = (adata.X > 0).sum(axis=0)

# Filter low-quality cells
adata = adata[adata.obs['n_genes'] > 200, :]
adata = adata[adata.obs['total_counts'] < 50000, :]

# Filter rarely detected genes
adata = adata[:, adata.var['n_cells'] >= 3]
```

### Select highly variable genes
```python
# Mark highly variable genes
gene_variance = np.var(adata.X, axis=0)
adata.var['variance'] = np.array(gene_variance).flatten()
adata.var['highly_variable'] = adata.var['variance'] > np.percentile(gene_variance, 90)

# Subset to highly variable genes
adata_hvg = adata[:, adata.var['highly_variable']].copy()
```

### Downsample
```python
# Random sampling of observations
np.random.seed(42)
n_sample = 500
sample_indices = np.random.choice(adata.n_obs, n_sample, replace=False)
adata_downsampled = adata[sample_indices, :].copy()

# Stratified sampling by cell type
from sklearn.model_selection import train_test_split
train_idx, test_idx = train_test_split(
    np.arange(adata.n_obs),
    test_size=0.2,
    stratify=adata.obs['cell_type']
)
adata_train = adata[train_idx, :].copy()
adata_test = adata[test_idx, :].copy()
```

### Split train/test
```python
# Random train/test split
np.random.seed(42)
n_obs = adata.n_obs
train_size = int(0.8 * n_obs)
indices = np.random.permutation(n_obs)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

adata_train = adata[train_indices, :].copy()
adata_test = adata[test_indices, :].copy()
```