Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/scikit-bio/references/api_reference.md
+++ b/skills/scikit-bio/references/api_reference.md
@@ -0,0 +1,749 @@
+# scikit-bio API Reference
+
+This document provides detailed API information, advanced examples, and troubleshooting guidance for working with scikit-bio.
+
+## Table of Contents
+1. [Sequence Classes](#sequence-classes)
+2. [Alignment Methods](#alignment-methods)
+3. [Phylogenetic Trees](#phylogenetic-trees)
+4. [Diversity Metrics](#diversity-metrics)
+5. [Ordination](#ordination)
+6. [Statistical Tests](#statistical-tests)
+7. [Distance Matrices](#distance-matrices)
+8. [File I/O](#file-io)
+9. [Troubleshooting](#troubleshooting)
+
+## Sequence Classes
+
+### DNA, RNA, and Protein Classes
+
+```python
+from skbio import DNA, RNA, Protein, Sequence
+
+# Creating sequences
+dna = DNA('ATCGATCG', metadata={'id': 'seq1', 'description': 'Example'})
+rna = RNA('AUCGAUCG')
+protein = Protein('ACDEFGHIKLMNPQRSTVWY')
+
+# Sequence operations
+dna_rc = dna.reverse_complement()  # Reverse complement
+rna = dna.transcribe()  # DNA -> RNA
+protein = rna.translate()  # RNA -> Protein
+
+# Using genetic code tables
+protein = rna.translate(genetic_code=11)  # Bacterial code
+```
+
+### Sequence Searching and Pattern Matching
+
+```python
+# Find motifs using regex
+dna = DNA('ATGCGATCGATGCATCG')
+motif_locs = dna.find_with_regex('ATG.{3}')  # Start codons
+
+# Find all positions
+import re
+for match in re.finditer('ATG', str(dna)):
+    print(f"ATG found at position {match.start()}")
+
+# k-mer counting
+from skbio.sequence import _motifs
+kmers = dna.kmer_frequencies(k=3)
+```
+
+### Handling Sequence Metadata
+
+```python
+# Sequence-level metadata
+dna = DNA('ATCG', metadata={'id': 'seq1', 'source': 'E. coli'})
+print(dna.metadata['id'])
+
+# Positional metadata (per-base quality scores from FASTQ)
+from skbio import DNA
+seqs = DNA.read('reads.fastq', format='fastq', phred_offset=33)
+quality_scores = seqs.positional_metadata['quality']
+
+# Interval metadata (features/annotations)
+dna.interval_metadata.add([(5, 15)], metadata={'type': 'gene', 'name': 'geneA'})
+```
+
+### Distance Calculations
+
+```python
+from skbio import DNA
+
+seq1 = DNA('ATCGATCG')
+seq2 = DNA('ATCG--CG')
+
+# Hamming distance (default)
+dist = seq1.distance(seq2)
+
+# Custom distance function
+from skbio.sequence.distance import kmer_distance
+dist = seq1.distance(seq2, metric=kmer_distance)
+```
+
+## Alignment Methods
+
+### Pairwise Alignment
+
+```python
+from skbio.alignment import local_pairwise_align_ssw, global_pairwise_align
+from skbio import DNA, Protein
+
+# Local alignment (Smith-Waterman via SSW)
+seq1 = DNA('ATCGATCGATCG')
+seq2 = DNA('ATCGGGGATCG')
+alignment = local_pairwise_align_ssw(seq1, seq2)
+
+# Access alignment details
+print(f"Score: {alignment.score}")
+print(f"Start position: {alignment.target_begin}")
+aligned_seqs = alignment.aligned_sequences
+
+# Global alignment with custom scoring
+from skbio.alignment import AlignScorer
+
+scorer = AlignScorer(
+    match_score=2,
+    mismatch_score=-3,
+    gap_open_penalty=5,
+    gap_extend_penalty=2
+)
+
+alignment = global_pairwise_align(seq1, seq2, scorer=scorer)
+
+# Protein alignment with substitution matrix
+from skbio.alignment import StripedSmithWaterman
+
+protein_query = Protein('ACDEFGHIKLMNPQRSTVWY')
+protein_target = Protein('ACDEFMNPQRSTVWY')
+
+aligner = StripedSmithWaterman(
+    str(protein_query),
+    gap_open_penalty=11,
+    gap_extend_penalty=1,
+    substitution_matrix='blosum62'
+)
+alignment = aligner(str(protein_target))
+```
+
+### Multiple Sequence Alignment
+
+```python
+from skbio.alignment import TabularMSA
+from skbio import DNA
+
+# Read MSA from file
+msa = TabularMSA.read('alignment.fasta', constructor=DNA)
+
+# Create MSA manually
+seqs = [
+    DNA('ATCG--'),
+    DNA('ATGG--'),
+    DNA('ATCGAT')
+]
+msa = TabularMSA(seqs)
+
+# MSA operations
+consensus = msa.consensus()
+majority_consensus = msa.majority_consensus()
+
+# Calculate conservation
+conservation = msa.conservation()
+
+# Access sequences
+first_seq = msa[0]
+column = msa[:, 2]  # Third column
+
+# Filter gaps
+degapped_msa = msa.omit_gap_positions(maximum_gap_frequency=0.5)
+
+# Calculate position-specific scores
+position_entropies = msa.position_entropies()
+```
+
+### CIGAR String Handling
+
+```python
+from skbio.alignment import AlignPath
+
+# Parse CIGAR string
+cigar = "10M2I5M3D10M"
+align_path = AlignPath.from_cigar(cigar, target_length=100, query_length=50)
+
+# Convert alignment to CIGAR
+alignment = local_pairwise_align_ssw(seq1, seq2)
+cigar_string = alignment.to_cigar()
+```
+
+## Phylogenetic Trees
+
+### Tree Construction
+
+```python
+from skbio import TreeNode, DistanceMatrix
+from skbio.tree import nj, upgma
+
+# Distance matrix
+dm = DistanceMatrix([[0, 5, 9, 9],
+                     [5, 0, 10, 10],
+                     [9, 10, 0, 8],
+                     [9, 10, 8, 0]],
+                    ids=['A', 'B', 'C', 'D'])
+
+# Neighbor joining
+nj_tree = nj(dm)
+
+# UPGMA (assumes molecular clock)
+upgma_tree = upgma(dm)
+
+# Balanced Minimum Evolution (scalable for large trees)
+from skbio.tree import bme
+bme_tree = bme(dm)
+```
+
+### Tree Manipulation
+
+```python
+from skbio import TreeNode
+
+# Read tree
+tree = TreeNode.read('tree.nwk', format='newick')
+
+# Traversal
+for node in tree.traverse():
+    print(node.name)
+
+# Preorder, postorder, levelorder
+for node in tree.preorder():
+    print(node.name)
+
+# Get tips only
+tips = list(tree.tips())
+
+# Find specific node
+node = tree.find('taxon_name')
+
+# Root tree at midpoint
+rooted_tree = tree.root_at_midpoint()
+
+# Prune tree to specific taxa
+pruned = tree.shear(['taxon1', 'taxon2', 'taxon3'])
+
+# Get subtree
+lca = tree.lowest_common_ancestor(['taxon1', 'taxon2'])
+subtree = lca.copy()
+
+# Add/remove nodes
+parent = tree.find('parent_name')
+child = TreeNode(name='new_child', length=0.5)
+parent.append(child)
+
+# Remove node
+node_to_remove = tree.find('taxon_to_remove')
+node_to_remove.parent.remove(node_to_remove)
+```
+
+### Tree Distances and Comparisons
+
+```python
+# Patristic distance (branch-length distance)
+node1 = tree.find('taxon1')
+node2 = tree.find('taxon2')
+patristic = node1.distance(node2)
+
+# Cophenetic matrix (all pairwise distances)
+cophenetic_dm = tree.cophenetic_matrix()
+
+# Robinson-Foulds distance (topology comparison)
+rf_dist = tree.robinson_foulds(other_tree)
+
+# Compare with unweighted RF
+rf_dist, max_rf = tree.robinson_foulds(other_tree, proportion=False)
+
+# Tip-to-tip distances
+tip_distances = tree.tip_tip_distances()
+```
+
+### Tree Visualization
+
+```python
+# ASCII art visualization
+print(tree.ascii_art())
+
+# For advanced visualization, export to external tools
+tree.write('tree.nwk', format='newick')
+
+# Then use ete3, toytree, or ggtree for publication-quality figures
+```
+
+## Diversity Metrics
+
+### Alpha Diversity
+
+```python
+from skbio.diversity import alpha_diversity, get_alpha_diversity_metrics
+import numpy as np
+
+# Sample count data (samples x features)
+counts = np.array([
+    [10, 5, 0, 3],
+    [2, 0, 8, 4],
+    [5, 5, 5, 5]
+])
+sample_ids = ['Sample1', 'Sample2', 'Sample3']
+
+# List available metrics
+print(get_alpha_diversity_metrics())
+
+# Calculate various alpha diversity metrics
+shannon = alpha_diversity('shannon', counts, ids=sample_ids)
+simpson = alpha_diversity('simpson', counts, ids=sample_ids)
+observed_otus = alpha_diversity('observed_otus', counts, ids=sample_ids)
+chao1 = alpha_diversity('chao1', counts, ids=sample_ids)
+
+# Phylogenetic alpha diversity (requires tree)
+from skbio import TreeNode
+
+tree = TreeNode.read('tree.nwk')
+feature_ids = ['OTU1', 'OTU2', 'OTU3', 'OTU4']
+
+faith_pd = alpha_diversity('faith_pd', counts, ids=sample_ids,
+                          tree=tree, otu_ids=feature_ids)
+```
+
+### Beta Diversity
+
+```python
+from skbio.diversity import beta_diversity, partial_beta_diversity
+
+# Beta diversity (all pairwise comparisons)
+bc_dm = beta_diversity('braycurtis', counts, ids=sample_ids)
+
+# Jaccard (presence/absence)
+jaccard_dm = beta_diversity('jaccard', counts, ids=sample_ids)
+
+# Phylogenetic beta diversity
+unifrac_dm = beta_diversity('unweighted_unifrac', counts,
+                           ids=sample_ids,
+                           tree=tree,
+                           otu_ids=feature_ids)
+
+weighted_unifrac_dm = beta_diversity('weighted_unifrac', counts,
+                                    ids=sample_ids,
+                                    tree=tree,
+                                    otu_ids=feature_ids)
+
+# Compute only specific pairs (more efficient)
+pairs = [('Sample1', 'Sample2'), ('Sample1', 'Sample3')]
+partial_dm = partial_beta_diversity('braycurtis', counts,
+                                   ids=sample_ids,
+                                   id_pairs=pairs)
+```
+
+### Rarefaction and Subsampling
+
+```python
+from skbio.diversity import subsample_counts
+
+# Rarefy to minimum depth
+min_depth = counts.min(axis=1).max()
+rarefied = [subsample_counts(row, n=min_depth) for row in counts]
+
+# Multiple rarefactions for confidence intervals
+import numpy as np
+rarefactions = []
+for i in range(100):
+    rarefied_counts = np.array([subsample_counts(row, n=1000) for row in counts])
+    shannon_rare = alpha_diversity('shannon', rarefied_counts)
+    rarefactions.append(shannon_rare)
+
+# Calculate mean and std
+mean_shannon = np.mean(rarefactions, axis=0)
+std_shannon = np.std(rarefactions, axis=0)
+```
+
+## Ordination
+
+### Principal Coordinate Analysis (PCoA)
+
+```python
+from skbio.stats.ordination import pcoa
+from skbio import DistanceMatrix
+import numpy as np
+
+# PCoA from distance matrix
+dm = DistanceMatrix(...)
+pcoa_results = pcoa(dm)
+
+# Access coordinates
+pc1 = pcoa_results.samples['PC1']
+pc2 = pcoa_results.samples['PC2']
+
+# Proportion explained
+prop_explained = pcoa_results.proportion_explained
+
+# Eigenvalues
+eigenvalues = pcoa_results.eigvals
+
+# Save results
+pcoa_results.write('pcoa_results.txt')
+
+# Plot with matplotlib
+import matplotlib.pyplot as plt
+plt.scatter(pc1, pc2)
+plt.xlabel(f'PC1 ({prop_explained[0]*100:.1f}%)')
+plt.ylabel(f'PC2 ({prop_explained[1]*100:.1f}%)')
+```
+
+### Canonical Correspondence Analysis (CCA)
+
+```python
+from skbio.stats.ordination import cca
+import pandas as pd
+import numpy as np
+
+# Species abundance matrix (samples x species)
+species = np.array([
+    [10, 5, 3],
+    [2, 8, 4],
+    [5, 5, 5]
+])
+
+# Environmental variables (samples x variables)
+env = pd.DataFrame({
+    'pH': [6.5, 7.0, 6.8],
+    'temperature': [20, 25, 22],
+    'depth': [10, 15, 12]
+})
+
+# CCA
+cca_results = cca(species, env,
+                 sample_ids=['Site1', 'Site2', 'Site3'],
+                 species_ids=['SpeciesA', 'SpeciesB', 'SpeciesC'])
+
+# Access constrained axes
+cca1 = cca_results.samples['CCA1']
+cca2 = cca_results.samples['CCA2']
+
+# Biplot scores for environmental variables
+env_scores = cca_results.biplot_scores
+```
+
+### Redundancy Analysis (RDA)
+
+```python
+from skbio.stats.ordination import rda
+
+# Similar to CCA but for linear relationships
+rda_results = rda(species, env,
+                 sample_ids=['Site1', 'Site2', 'Site3'],
+                 species_ids=['SpeciesA', 'SpeciesB', 'SpeciesC'])
+```
+
+## Statistical Tests
+
+### PERMANOVA
+
+```python
+from skbio.stats.distance import permanova
+from skbio import DistanceMatrix
+import numpy as np
+
+# Distance matrix
+dm = DistanceMatrix(...)
+
+# Grouping variable
+grouping = ['Group1', 'Group1', 'Group2', 'Group2', 'Group3', 'Group3']
+
+# Run PERMANOVA
+results = permanova(dm, grouping, permutations=999)
+
+print(f"Test statistic: {results['test statistic']}")
+print(f"p-value: {results['p-value']}")
+print(f"Sample size: {results['sample size']}")
+print(f"Number of groups: {results['number of groups']}")
+```
+
+### ANOSIM
+
+```python
+from skbio.stats.distance import anosim
+
+# ANOSIM test
+results = anosim(dm, grouping, permutations=999)
+
+print(f"R statistic: {results['test statistic']}")
+print(f"p-value: {results['p-value']}")
+```
+
+### PERMDISP
+
+```python
+from skbio.stats.distance import permdisp
+
+# Test homogeneity of dispersions
+results = permdisp(dm, grouping, permutations=999)
+
+print(f"F statistic: {results['test statistic']}")
+print(f"p-value: {results['p-value']}")
+```
+
+### Mantel Test
+
+```python
+from skbio.stats.distance import mantel
+from skbio import DistanceMatrix
+
+# Two distance matrices to compare
+dm1 = DistanceMatrix(...)  # e.g., genetic distance
+dm2 = DistanceMatrix(...)  # e.g., geographic distance
+
+# Mantel test
+r, p_value, n = mantel(dm1, dm2, method='pearson', permutations=999)
+
+print(f"Correlation: {r}")
+print(f"p-value: {p_value}")
+print(f"Sample size: {n}")
+
+# Spearman correlation
+r_spearman, p, n = mantel(dm1, dm2, method='spearman', permutations=999)
+```
+
+### Partial Mantel Test
+
+```python
+from skbio.stats.distance import mantel
+
+# Control for a third matrix
+dm3 = DistanceMatrix(...)  # controlling variable
+
+r_partial, p_value, n = mantel(dm1, dm2, method='pearson',
+                               permutations=999, alternative='two-sided')
+```
+
+## Distance Matrices
+
+### Creating and Manipulating Distance Matrices
+
+```python
+from skbio import DistanceMatrix, DissimilarityMatrix
+import numpy as np
+
+# Create from array
+data = np.array([[0, 1, 2],
+                 [1, 0, 3],
+                 [2, 3, 0]])
+dm = DistanceMatrix(data, ids=['A', 'B', 'C'])
+
+# Access elements
+dist_ab = dm['A', 'B']
+row_a = dm['A']
+
+# Slicing
+subset_dm = dm.filter(['A', 'C'])
+
+# Asymmetric dissimilarity matrix
+asym_data = np.array([[0, 1, 2],
+                      [3, 0, 4],
+                      [5, 6, 0]])
+dissim = DissimilarityMatrix(asym_data, ids=['X', 'Y', 'Z'])
+
+# Read/write
+dm.write('distances.txt')
+dm2 = DistanceMatrix.read('distances.txt')
+
+# Convert to condensed form (for scipy)
+condensed = dm.condensed_form()
+
+# Convert to dataframe
+df = dm.to_data_frame()
+```
+
+## File I/O
+
+### Reading Sequences
+
+```python
+import skbio
+
+# Read single sequence
+dna = skbio.DNA.read('sequence.fasta', format='fasta')
+
+# Read multiple sequences (generator)
+for seq in skbio.io.read('sequences.fasta', format='fasta', constructor=skbio.DNA):
+    print(seq.metadata['id'], len(seq))
+
+# Read into list
+sequences = list(skbio.io.read('sequences.fasta', format='fasta',
+                               constructor=skbio.DNA))
+
+# Read FASTQ with quality scores
+for seq in skbio.io.read('reads.fastq', format='fastq', constructor=skbio.DNA):
+    quality = seq.positional_metadata['quality']
+    print(f"Mean quality: {quality.mean()}")
+```
+
+### Writing Sequences
+
+```python
+# Write single sequence
+dna.write('output.fasta', format='fasta')
+
+# Write multiple sequences
+sequences = [dna1, dna2, dna3]
+skbio.io.write(sequences, format='fasta', into='output.fasta')
+
+# Write with custom line wrapping
+dna.write('output.fasta', format='fasta', max_width=60)
+```
+
+### BIOM Tables
+
+```python
+from skbio import Table
+
+# Read BIOM table
+table = Table.read('table.biom', format='hdf5')
+
+# Access data
+sample_ids = table.ids(axis='sample')
+feature_ids = table.ids(axis='observation')
+matrix = table.matrix_data.toarray()  # if sparse
+
+# Filter samples
+abundant_samples = table.filter(lambda row, id_, md: row.sum() > 1000, axis='sample')
+
+# Filter features (OTUs/ASVs)
+prevalent_features = table.filter(lambda col, id_, md: (col > 0).sum() >= 3,
+                                 axis='observation')
+
+# Normalize
+relative_abundance = table.norm(axis='sample', inplace=False)
+
+# Write
+table.write('filtered_table.biom', format='hdf5')
+```
+
+### Format Conversion
+
+```python
+# FASTQ to FASTA
+seqs = skbio.io.read('input.fastq', format='fastq', constructor=skbio.DNA)
+skbio.io.write(seqs, format='fasta', into='output.fasta')
+
+# GenBank to FASTA
+seqs = skbio.io.read('genes.gb', format='genbank', constructor=skbio.DNA)
+skbio.io.write(seqs, format='fasta', into='genes.fasta')
+```
+
+## Troubleshooting
+
+### Common Issues and Solutions
+
+#### Issue: "ValueError: Ids must be unique"
+```python
+# Problem: Duplicate sequence IDs
+# Solution: Make IDs unique or filter duplicates
+seen = set()
+unique_seqs = []
+for seq in sequences:
+    if seq.metadata['id'] not in seen:
+        unique_seqs.append(seq)
+        seen.add(seq.metadata['id'])
+```
+
+#### Issue: "ValueError: Counts must be integers"
+```python
+# Problem: Relative abundances instead of counts
+# Solution: Convert to integer counts or use appropriate metrics
+counts_int = (abundance_table * 1000).astype(int)
+```
+
+#### Issue: Memory error with large files
+```python
+# Problem: Loading entire file into memory
+# Solution: Use generators
+for seq in skbio.io.read('huge.fasta', format='fasta', constructor=skbio.DNA):
+    # Process one at a time
+    process(seq)
+```
+
+#### Issue: Tree tips don't match OTU IDs
+```python
+# Problem: Mismatch between tree tip names and feature IDs
+# Solution: Verify and align IDs
+tree_tips = {tip.name for tip in tree.tips()}
+feature_ids = set(feature_ids)
+missing_in_tree = feature_ids - tree_tips
+missing_in_table = tree_tips - feature_ids
+
+# Prune tree to match table
+tree_pruned = tree.shear(feature_ids)
+```
+
+#### Issue: Alignment fails with sequences of different lengths
+```python
+# Problem: Trying to align pre-aligned sequences
+# Solution: Degap sequences first or ensure sequences are unaligned
+seq1_degapped = seq1.degap()
+seq2_degapped = seq2.degap()
+alignment = local_pairwise_align_ssw(seq1_degapped, seq2_degapped)
+```
+
+### Performance Tips
+
+1. **Use appropriate data structures**: BIOM HDF5 for large tables, generators for large sequence files
+2. **Parallel processing**: Use `partial_beta_diversity()` for subset calculations that can be parallelized
+3. **Subsample large datasets**: For exploratory analysis, work with subsampled data first
+4. **Cache results**: Save distance matrices and ordination results to avoid recomputation
+
+### Integration Examples
+
+#### With pandas
+```python
+import pandas as pd
+from skbio import DistanceMatrix
+
+# Distance matrix to DataFrame
+dm = DistanceMatrix(...)
+df = dm.to_data_frame()
+
+# Alpha diversity to DataFrame
+alpha = alpha_diversity('shannon', counts, ids=sample_ids)
+alpha_df = pd.DataFrame({'shannon': alpha})
+```
+
+#### With matplotlib/seaborn
+```python
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+# PCoA plot
+fig, ax = plt.subplots()
+scatter = ax.scatter(pc1, pc2, c=grouping, cmap='viridis')
+ax.set_xlabel(f'PC1 ({prop_explained[0]*100:.1f}%)')
+ax.set_ylabel(f'PC2 ({prop_explained[1]*100:.1f}%)')
+plt.colorbar(scatter)
+
+# Heatmap of distance matrix
+sns.heatmap(dm.to_data_frame(), cmap='viridis')
+```
+
+#### With QIIME 2
+```python
+# scikit-bio objects are compatible with QIIME 2
+# Export from QIIME 2
+# qiime tools export --input-path table.qza --output-path exported/
+
+# Read in scikit-bio
+table = Table.read('exported/feature-table.biom')
+
+# Process with scikit-bio
+# ...
+
+# Import back to QIIME 2 if needed
+table.write('processed-table.biom')
+# qiime tools import --input-path processed-table.biom --output-path processed.qza
+```