# Data Manipulation

Operations for transforming, subsetting, and manipulating AnnData objects.

## Subsetting

### By indices
```python
import anndata as ad
import numpy as np

adata = ad.AnnData(X=np.random.rand(1000, 2000))

# Integer indices
subset = adata[0:100, 0:500]  # First 100 obs, first 500 vars

# List of indices
obs_indices = [0, 10, 20, 30, 40]
var_indices = [0, 1, 2, 3, 4]
subset = adata[obs_indices, var_indices]

# Single observation or variable
single_obs = adata[0, :]
single_var = adata[:, 0]
```

### By names
```python
import pandas as pd

# Create with named indices
obs_names = [f'cell_{i}' for i in range(1000)]
var_names = [f'gene_{i}' for i in range(2000)]
adata = ad.AnnData(
    X=np.random.rand(1000, 2000),
    obs=pd.DataFrame(index=obs_names),
    var=pd.DataFrame(index=var_names)
)

# Subset by observation names
subset = adata[['cell_0', 'cell_1', 'cell_2'], :]

# Subset by variable names
subset = adata[:, ['gene_0', 'gene_10', 'gene_20']]

# Both axes
subset = adata[['cell_0', 'cell_1'], ['gene_0', 'gene_1']]
```

### By boolean masks
```python
# Create boolean masks
high_count_obs = np.random.rand(1000) > 0.5
high_var_genes = np.random.rand(2000) > 0.7

# Subset using masks
subset = adata[high_count_obs, :]
subset = adata[:, high_var_genes]
subset = adata[high_count_obs, high_var_genes]
```

### By metadata conditions
```python
# Add metadata
adata.obs['cell_type'] = np.random.choice(['A', 'B', 'C'], 1000)
adata.obs['quality_score'] = np.random.rand(1000)
adata.var['highly_variable'] = np.random.rand(2000) > 0.8

# Filter by cell type
type_a_cells = adata[adata.obs['cell_type'] == 'A']

# Filter by multiple conditions
high_quality_a_cells = adata[
    (adata.obs['cell_type'] == 'A') &
    (adata.obs['quality_score'] > 0.7)
]

# Filter by variable metadata
hv_genes = adata[:, adata.var['highly_variable']]

# Complex conditions
filtered = adata[
    (adata.obs['quality_score'] > 0.5) &
    (adata.obs['cell_type'].isin(['A', 'B'])),
    adata.var['highly_variable']
]
```

## Transposition

```python
# Transpose AnnData object (swap observations and variables)
adata_T = adata.T

# Shape changes
print(adata.shape)    # (1000, 2000)
print(adata_T.shape)  # (2000, 1000)

# obs and var are swapped
print(adata.obs.head())    # Observation metadata
print(adata_T.var.head())  # Same data, now as variable metadata

# Useful when data is in the opposite orientation,
# common with file formats where genes are rows
```
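
As a quick sketch of that case (the file name and layout below are hypothetical), data read with genes as rows can be wrapped and transposed:

```python
import anndata as ad
import pandas as pd

# Hypothetical CSV with genes as rows and cells as columns
expr = pd.read_csv('expression_genes_by_cells.csv', index_col=0)

# AnnData(df) treats rows as obs and columns as vars, so here obs would be genes;
# transpose so that cells become observations and genes become variables
adata_from_csv = ad.AnnData(expr).T

print(adata_from_csv.n_obs, adata_from_csv.n_vars)  # cells, genes
```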

## Copying

### Full copy
```python
# Create independent copy
adata_copy = adata.copy()

# Modifications to copy don't affect original
adata_copy.obs['new_column'] = 1
print('new_column' in adata.obs.columns)  # False
```

### Shallow copy
```python
# Subsetting returns a view: no data is copied up front, and assigning to a
# view's attributes triggers an implicit copy (copy-on-write) rather than
# modifying the original object
adata_view = adata[0:100, :]

# Check if object is a view
print(adata_view.is_view)  # True

# Convert view to independent copy
adata_independent = adata_view.copy()
print(adata_independent.is_view)  # False
```

## Renaming

### Rename observations and variables
```python
# Rename all observations
adata.obs_names = [f'new_cell_{i}' for i in range(adata.n_obs)]

# Rename all variables
adata.var_names = [f'new_gene_{i}' for i in range(adata.n_vars)]

# Make names unique (add suffix to duplicates)
adata.obs_names_make_unique()
adata.var_names_make_unique()
```

### Rename categories
```python
# Create categorical column
adata.obs['cell_type'] = pd.Categorical(['A', 'B', 'C'] * 333 + ['A'])

# Rename categories (list of new names, in the same order as the old categories)
adata.rename_categories('cell_type', ['Type_A', 'Type_B', 'Type_C'])

# AnnData's rename_categories expects list-like categories;
# to rename with a mapping, use pandas directly
adata.obs['cell_type'] = adata.obs['cell_type'].cat.rename_categories({
    'Type_A': 'T_cell',
    'Type_B': 'B_cell',
    'Type_C': 'Monocyte'
})
```

## Type Conversions

### Strings to categoricals
```python
# Convert string columns to categorical (more memory efficient)
adata.obs['cell_type'] = ['TypeA', 'TypeB'] * 500
adata.obs['tissue'] = ['brain', 'liver'] * 500

# Convert all string columns to categorical
adata.strings_to_categoricals()

print(adata.obs['cell_type'].dtype)  # category
print(adata.obs['tissue'].dtype)     # category
```

### Sparse to dense and vice versa
```python
from scipy.sparse import csr_matrix, issparse

# Dense to sparse
if not issparse(adata.X):
    adata.X = csr_matrix(adata.X)

# Sparse to dense
if issparse(adata.X):
    adata.X = adata.X.toarray()

# Convert layer
adata.layers['normalized'] = csr_matrix(adata.layers['normalized'])
```
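
Whether sparse storage pays off depends on how many zeros the matrix holds; a minimal sketch of checking the density first (the 50% threshold is an arbitrary illustration):

```python
import numpy as np
from scipy.sparse import csr_matrix, issparse

# Fraction of nonzero entries (works for dense or sparse X)
nnz = adata.X.nnz if issparse(adata.X) else np.count_nonzero(adata.X)
density = nnz / (adata.n_obs * adata.n_vars)

# Convert only if the matrix is mostly zeros (illustrative cutoff)
if not issparse(adata.X) and density < 0.5:
    adata.X = csr_matrix(adata.X)
```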

## Chunked Operations

Process large datasets in chunks:

```python
# Iterate through X in chunks; chunked_X yields (chunk, start, end) tuples
# covering rows [start:end)
chunk_size = 100
for chunk, start, end in adata.chunked_X(chunk_size):
    # Process chunk (process_chunk is a placeholder for your own function)
    result = process_chunk(chunk)
```
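
For example, per-gene totals can be accumulated chunk by chunk rather than materializing all of `X` at once (a minimal sketch; the chunk size and the `'total_counts'` column name are illustrative):

```python
import numpy as np

# Accumulate per-gene totals without loading all of X at once
gene_totals = np.zeros(adata.n_vars)
for chunk, start, end in adata.chunked_X(1000):
    chunk_sum = chunk.sum(axis=0)
    gene_totals += np.asarray(chunk_sum).ravel()  # handles dense and sparse chunks

adata.var['total_counts'] = gene_totals
```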

## Extracting Vectors

### Get observation vectors
```python
# Get observation metadata as array
cell_types = adata.obs_vector('cell_type')

# Get gene expression across observations
actb_expression = adata.obs_vector('ACTB')  # If ACTB in var_names
```

### Get variable vectors
```python
# Get variable metadata as array
gene_names = adata.var_vector('gene_name')
```

## Adding/Modifying Data

### Add observations
```python
# Create new observations
new_obs = ad.AnnData(X=np.random.rand(100, adata.n_vars))
new_obs.var_names = adata.var_names

# Concatenate with existing
adata_extended = ad.concat([adata, new_obs], axis=0)
```
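
By default `ad.concat` keeps only the variables shared by all inputs (`join='inner'`). A sketch of keeping the union instead and recording where each observation came from (the `'batch'` column and key names are illustrative):

```python
# Keep the union of variables and track the source of each observation
adata_extended = ad.concat(
    [adata, new_obs],
    axis=0,
    join='outer',       # union of variables; missing entries are filled
    label='batch',      # new .obs column recording the source object
    keys=['original', 'new'],
    fill_value=0,
)
print(adata_extended.obs['batch'].value_counts())
```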

### Add variables
```python
# Create new variables
new_vars = ad.AnnData(X=np.random.rand(adata.n_obs, 100))
new_vars.obs_names = adata.obs_names

# Concatenate with existing
adata_extended = ad.concat([adata, new_vars], axis=1)
```

### Add metadata columns
```python
# Add observation annotation
adata.obs['new_score'] = np.random.rand(adata.n_obs)

# Add variable annotation
adata.var['new_label'] = ['label'] * adata.n_vars

# Add from external data
external_data = pd.read_csv('metadata.csv', index_col=0)
adata.obs['external_info'] = external_data.loc[adata.obs_names, 'column']
```

### Add layers
```python
# Add new layers (each layer must match adata.shape)
adata.layers['raw_counts'] = np.random.randint(0, 100, adata.shape)
adata.layers['log_transformed'] = np.log1p(adata.X)

# Replace a layer (new_normalized_data is a placeholder array of the same shape)
adata.layers['normalized'] = new_normalized_data
```

### Add embeddings
```python
# Add PCA
adata.obsm['X_pca'] = np.random.rand(adata.n_obs, 50)

# Add UMAP
adata.obsm['X_umap'] = np.random.rand(adata.n_obs, 2)

# Add multiple embeddings
adata.obsm['X_tsne'] = np.random.rand(adata.n_obs, 2)
adata.obsm['X_diffmap'] = np.random.rand(adata.n_obs, 10)
```

### Add pairwise relationships
```python
from scipy.sparse import csr_matrix

# Add nearest neighbor graph
n_obs = adata.n_obs
knn_graph = csr_matrix(np.random.rand(n_obs, n_obs) > 0.95)
adata.obsp['connectivities'] = knn_graph

# Add distance matrix
adata.obsp['distances'] = csr_matrix(np.random.rand(n_obs, n_obs))
```
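
Neighbor graphs are usually stored symmetrically; one way to symmetrize a graph like the random one above is the element-wise maximum with its transpose (a sketch):

```python
# Symmetrize the connectivity graph: keep an edge if it exists in either direction
conn = adata.obsp['connectivities'].astype(float)
adata.obsp['connectivities'] = conn.maximum(conn.T)

# A symmetric matrix has no entries that differ from its transpose
assert (adata.obsp['connectivities'] != adata.obsp['connectivities'].T).nnz == 0
```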

### Add unstructured data
```python
# Add analysis parameters
adata.uns['pca'] = {
    'variance': [0.2, 0.15, 0.1],
    'variance_ratio': [0.4, 0.3, 0.2],
    'params': {'n_comps': 50}
}

# Add color schemes
adata.uns['cell_type_colors'] = ['#FF0000', '#00FF00', '#0000FF']
```

## Removing Data

### Remove observations or variables
```python
# Keep only specific observations
keep_obs = adata.obs['quality_score'] > 0.5
adata = adata[keep_obs, :]

# Remove variables flagged by a boolean column (here 'low_count' marks genes to drop)
remove_vars = adata.var['low_count']
adata = adata[:, ~remove_vars]
```

### Remove metadata columns
```python
# Remove observation column
adata.obs.drop('unwanted_column', axis=1, inplace=True)

# Remove variable column
adata.var.drop('unwanted_column', axis=1, inplace=True)
```

### Remove layers
```python
# Remove specific layer
del adata.layers['unwanted_layer']

# Remove all layers
adata.layers = {}
```

### Remove embeddings
```python
# Remove specific embedding
del adata.obsm['X_tsne']

# Remove all embeddings
adata.obsm = {}
```

### Remove unstructured data
```python
# Remove specific key
del adata.uns['unwanted_key']

# Remove all unstructured data
adata.uns = {}
```

## Reordering

### Sort observations
```python
# Sort by observation metadata
adata = adata[adata.obs.sort_values('quality_score').index, :]

# Sort by observation names
adata = adata[sorted(adata.obs_names), :]
```

### Sort variables
```python
# Sort by variable metadata
adata = adata[:, adata.var.sort_values('gene_name').index]

# Sort by variable names
adata = adata[:, sorted(adata.var_names)]
```

### Reorder to match external list
```python
# Reorder observations to match external list
desired_order = ['cell_10', 'cell_5', 'cell_20', ...]
adata = adata[desired_order, :]

# Reorder variables
desired_genes = ['TP53', 'ACTB', 'GAPDH', ...]
adata = adata[:, desired_genes]
```
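
Indexing with a name that is not in `obs_names`/`var_names` raises an error, so it can help to restrict an external list to the names that are actually present first (a sketch using `desired_genes` from above):

```python
# Keep only the requested genes that exist in this AnnData, preserving the requested order
present = [g for g in desired_genes if g in adata.var_names]
missing = set(desired_genes) - set(present)
if missing:
    print(f"Skipping {len(missing)} genes not found in var_names")

adata = adata[:, present]
```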

## Data Transformations

### Normalize
```python
# Total count normalization (CPM/TPM-like); assumes dense adata.X
total_counts = adata.X.sum(axis=1)
adata.layers['normalized'] = adata.X / total_counts[:, np.newaxis] * 1e6

# Log transformation
adata.layers['log1p'] = np.log1p(adata.X)

# Z-score normalization
mean = adata.X.mean(axis=0)
std = adata.X.std(axis=0)
adata.layers['scaled'] = (adata.X - mean) / std
```
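
These transformations assume a dense `adata.X`; for a sparse matrix the row sums come back 2-D and the broadcasting above fails. A sketch of a sparse-aware CPM normalization (assumes a CSR matrix and that every row has a nonzero total):

```python
import numpy as np
from scipy.sparse import csr_matrix, issparse

if issparse(adata.X):
    # Row totals as a flat 1-D array
    total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
    # Scale each row to counts per million; multiply() broadcasts the column vector
    scaling = 1e6 / total_counts
    adata.layers['normalized'] = csr_matrix(adata.X.multiply(scaling[:, np.newaxis]))
    # Sparse-safe log transform
    adata.layers['log1p'] = adata.layers['normalized'].log1p()
```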

### Filter
```python
# Filter cells by total counts
total_counts = np.array(adata.X.sum(axis=1)).flatten()
adata.obs['total_counts'] = total_counts
adata = adata[adata.obs['total_counts'] > 1000, :]

# Filter genes by detection rate
detection_rate = (adata.X > 0).sum(axis=0) / adata.n_obs
adata.var['detection_rate'] = np.array(detection_rate).flatten()
adata = adata[:, adata.var['detection_rate'] > 0.01]
```

## Working with Views

Views are lightweight references to subsets of data that don't copy the underlying matrix:

```python
# Create view
view = adata[0:100, 0:500]
print(view.is_view)  # True

# Views allow read access
data = view.X

# Views are copy-on-write: assigning to a view's attributes turns it into an
# independent copy (with an ImplicitModificationWarning) rather than
# modifying the original object

# Convert view to independent copy explicitly
independent = view.copy()

# Force AnnData to be a copy, not a view
adata = adata.copy()
```
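
A small sketch of that copy-on-write behavior (the `'flagged'` column is illustrative):

```python
view = adata[0:100, :]
print(view.is_view)  # True

# Assigning to the view's .obs triggers an implicit copy;
# the view becomes a real AnnData and the original is untouched
view.obs['flagged'] = True
print(view.is_view)                    # False
print('flagged' in adata.obs.columns)  # False
```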

## Merging Metadata

```python
# Merge external metadata
external_metadata = pd.read_csv('additional_metadata.csv', index_col=0)

# Join metadata on the index (DataFrame.join is a left join by default,
# so all adata observations are kept)
adata.obs = adata.obs.join(external_metadata)

# Equivalent explicit left join with merge
adata.obs = adata.obs.merge(
    external_metadata,
    left_index=True,
    right_index=True,
    how='left'
)
```
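
An alternative sketch that makes the alignment explicit by reindexing the external table to `obs_names` before assigning (the `'condition'` column is illustrative):

```python
# Align the external table to the current observations;
# rows missing from the external table become NaN
aligned = external_metadata.reindex(adata.obs_names)
adata.obs['condition'] = aligned['condition'].values
```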

## Common Manipulation Patterns

### Quality control filtering
```python
# Calculate QC metrics
adata.obs['n_genes'] = (adata.X > 0).sum(axis=1)
adata.obs['total_counts'] = adata.X.sum(axis=1)
adata.var['n_cells'] = (adata.X > 0).sum(axis=0)

# Filter low-quality cells
adata = adata[adata.obs['n_genes'] > 200, :]
adata = adata[adata.obs['total_counts'] < 50000, :]

# Filter rarely detected genes
adata = adata[:, adata.var['n_cells'] >= 3]
```

### Select highly variable genes
```python
# Mark highly variable genes
gene_variance = np.var(adata.X, axis=0)
adata.var['variance'] = np.array(gene_variance).flatten()
adata.var['highly_variable'] = adata.var['variance'] > np.percentile(gene_variance, 90)

# Subset to highly variable genes
adata_hvg = adata[:, adata.var['highly_variable']].copy()
```

### Downsample
```python
# Random sampling of observations
np.random.seed(42)
n_sample = 500
sample_indices = np.random.choice(adata.n_obs, n_sample, replace=False)
adata_downsampled = adata[sample_indices, :].copy()

# Stratified sampling by cell type
from sklearn.model_selection import train_test_split
train_idx, test_idx = train_test_split(
    np.arange(adata.n_obs),
    test_size=0.2,
    stratify=adata.obs['cell_type']
)
adata_train = adata[train_idx, :].copy()
adata_test = adata[test_idx, :].copy()
```

### Split train/test
```python
# Random train/test split
np.random.seed(42)
n_obs = adata.n_obs
train_size = int(0.8 * n_obs)
indices = np.random.permutation(n_obs)
train_indices = indices[:train_size]
test_indices = indices[train_size:]

adata_train = adata[train_indices, :].copy()
adata_test = adata[test_indices, :].copy()
```