Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/biopython/references/advanced.md
+++ b/skills/biopython/references/advanced.md
@@ -0,0 +1,577 @@
+# Advanced Biopython Features
+
+## Sequence Motifs with Bio.motifs
+
+### Creating Motifs
+
+```python
+from Bio import motifs
+from Bio.Seq import Seq
+
+# Create motif from instances
+instances = [
+    Seq("TACAA"),
+    Seq("TACGC"),
+    Seq("TACAC"),
+    Seq("TACCC"),
+    Seq("AACCC"),
+    Seq("AATGC"),
+    Seq("AATGC"),
+]
+
+motif = motifs.create(instances)
+```
+
+### Motif Consensus and Degenerate Sequences
+
+```python
+# Get consensus sequence
+print(motif.counts.consensus)
+
+# Get degenerate consensus (IUPAC ambiguity codes)
+print(motif.counts.degenerate_consensus)
+
+# Access counts matrix
+print(motif.counts)
+```
+
+### Position Weight Matrix (PWM)
+
+```python
+# Create position weight matrix
+pwm = motif.counts.normalize(pseudocounts=0.5)
+print(pwm)
+
+# Calculate information content
+ic = motif.counts.information_content()
+print(f"Information content: {ic:.2f} bits")
+```
+
+### Searching for Motifs
+
+```python
+from Bio.Seq import Seq
+
+# Search sequence for motif
+test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")
+
+# Get Position Specific Scoring Matrix (PSSM)
+pssm = pwm.log_odds()
+
+# Search sequence
+for position, score in pssm.search(test_seq, threshold=5.0):
+    print(f"Position {position}: score = {score:.2f}")
+```
+
+### Reading Motifs from Files
+
+```python
+# Read motif from JASPAR format
+with open("motif.jaspar") as handle:
+    motif = motifs.read(handle, "jaspar")
+
+# Read multiple motifs
+with open("motifs.jaspar") as handle:
+    for m in motifs.parse(handle, "jaspar"):
+        print(m.name)
+
+# Supported formats: jaspar, meme, transfac, pfm
+```
+
+### Writing Motifs
+
+```python
+# Write motif in JASPAR format
+with open("output.jaspar", "w") as handle:
+    handle.write(motif.format("jaspar"))
+```
+
+## Population Genetics with Bio.PopGen
+
+### Working with GenePop Files
+
+```python
+from Bio.PopGen import GenePop
+
+# Read GenePop file
+with open("data.gen") as handle:
+    record = GenePop.read(handle)
+
+# Access populations
+print(f"Number of populations: {len(record.populations)}")
+print(f"Loci: {record.loci_list}")
+
+# Iterate through populations
+for pop_idx, pop in enumerate(record.populations):
+    print(f"\nPopulation {pop_idx + 1}:")
+    for individual in pop:
+        print(f"  {individual[0]}: {individual[1]}")
+```
+
+### Calculating Population Statistics
+
+```python
+from Bio.PopGen.GenePop.Controller import GenePopController
+
+# Create controller
+ctrl = GenePopController()
+
+# Calculate basic statistics
+result = ctrl.calc_allele_genotype_freqs("data.gen")
+
+# Calculate Fst
+fst_result = ctrl.calc_fst_all("data.gen")
+print(f"Fst: {fst_result}")
+
+# Test Hardy-Weinberg equilibrium
+hw_result = ctrl.test_hw_pop("data.gen", "probability")
+```
+
+## Sequence Utilities with Bio.SeqUtils
+
+### GC Content
+
+```python
+from Bio.SeqUtils import gc_fraction
+from Bio.Seq import Seq
+
+seq = Seq("ATCGATCGATCG")
+gc = gc_fraction(seq)
+print(f"GC content: {gc:.2%}")
+```
+
+### Molecular Weight
+
+```python
+from Bio.SeqUtils import molecular_weight
+
+# DNA molecular weight
+dna_seq = Seq("ATCG")
+mw = molecular_weight(dna_seq, seq_type="DNA")
+print(f"DNA MW: {mw:.2f} g/mol")
+
+# Protein molecular weight
+protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
+mw = molecular_weight(protein_seq, seq_type="protein")
+print(f"Protein MW: {mw:.2f} Da")
+```
+
+### Melting Temperature
+
+```python
+from Bio.SeqUtils import MeltingTemp as mt
+
+# Calculate Tm using nearest-neighbor method
+seq = Seq("ATCGATCGATCG")
+tm = mt.Tm_NN(seq)
+print(f"Tm: {tm:.1f}°C")
+
+# Use different salt concentration
+tm = mt.Tm_NN(seq, Na=50, Mg=1.5)  # 50 mM Na+, 1.5 mM Mg2+
+
+# Wallace rule (for primers)
+tm_wallace = mt.Tm_Wallace(seq)
+```
+
+### GC Skew
+
+```python
+from Bio.SeqUtils import gc_skew
+
+# Calculate GC skew
+seq = Seq("ATCGATCGGGCCCAAATTT")
+skew = gc_skew(seq, window=100)
+print(f"GC skew: {skew}")
+```
+
+### ProtParam - Protein Analysis
+
+```python
+from Bio.SeqUtils.ProtParam import ProteinAnalysis
+
+protein_seq = "ACDEFGHIKLMNPQRSTVWY"
+analyzed_seq = ProteinAnalysis(protein_seq)
+
+# Molecular weight
+print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")
+
+# Isoelectric point
+print(f"pI: {analyzed_seq.isoelectric_point():.2f}")
+
+# Amino acid composition
+print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")
+
+# Instability index
+print(f"Instability: {analyzed_seq.instability_index():.2f}")
+
+# Aromaticity
+print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")
+
+# Secondary structure fraction
+ss = analyzed_seq.secondary_structure_fraction()
+print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")
+
+# Extinction coefficient (assumes Cys reduced, no disulfide bonds)
+print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")
+
+# Gravy (grand average of hydropathy)
+print(f"GRAVY: {analyzed_seq.gravy():.3f}")
+```
+
+## Restriction Analysis with Bio.Restriction
+
+```python
+from Bio import Restriction
+from Bio.Seq import Seq
+
+# Analyze sequence for restriction sites
+seq = Seq("GAATTCATCGATCGATGAATTC")
+
+# Use specific enzyme
+ecori = Restriction.EcoRI
+sites = ecori.search(seq)
+print(f"EcoRI sites at: {sites}")
+
+# Use multiple enzymes
+rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
+results = rb.search(seq)
+for enzyme, sites in results.items():
+    if sites:
+        print(f"{enzyme}: {sites}")
+
+# Get all enzymes that cut sequence
+all_enzymes = Restriction.Analysis(rb, seq)
+print(f"Cutting enzymes: {all_enzymes.with_sites()}")
+```
+
+## Sequence Translation Tables
+
+```python
+from Bio.Data import CodonTable
+
+# Standard genetic code
+standard_table = CodonTable.unambiguous_dna_by_id[1]
+print(standard_table)
+
+# Mitochondrial code
+mito_table = CodonTable.unambiguous_dna_by_id[2]
+
+# Get specific codon
+print(f"ATG codes for: {standard_table.forward_table['ATG']}")
+
+# Get stop codons
+print(f"Stop codons: {standard_table.stop_codons}")
+
+# Get start codons
+print(f"Start codons: {standard_table.start_codons}")
+```
+
+## Cluster Analysis with Bio.Cluster
+
+```python
+from Bio.Cluster import kcluster
+import numpy as np
+
+# Sample data matrix (genes x conditions)
+data = np.array([
+    [1.2, 0.8, 0.5, 1.5],
+    [0.9, 1.1, 0.7, 1.3],
+    [0.2, 0.3, 2.1, 2.5],
+    [0.1, 0.4, 2.3, 2.2],
+])
+
+# Perform k-means clustering
+clusterid, error, nfound = kcluster(data, nclusters=2)
+print(f"Cluster assignments: {clusterid}")
+print(f"Error: {error}")
+```
+
+## Genome Diagrams with GenomeDiagram
+
+```python
+from Bio.Graphics import GenomeDiagram
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio import SeqIO
+from reportlab.lib import colors
+
+# Read GenBank file
+record = SeqIO.read("sequence.gb", "genbank")
+
+# Create diagram
+gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
+gd_track = gd_diagram.new_track(1, greytrack=True)
+gd_feature_set = gd_track.new_set()
+
+# Add features
+for feature in record.features:
+    if feature.type == "CDS":
+        color = colors.blue
+    elif feature.type == "gene":
+        color = colors.lightblue
+    else:
+        color = colors.grey
+
+    gd_feature_set.add_feature(
+        feature,
+        color=color,
+        label=True,
+        label_size=6,
+        label_angle=45
+    )
+
+# Draw and save
+gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
+gd_diagram.write("genome_diagram.pdf", "PDF")
+```
+
+## Sequence Comparison with Bio.pairwise2
+
+**Note**: Bio.pairwise2 is deprecated. Use Bio.Align.PairwiseAligner instead (see alignment.md).
+
+However, for legacy code:
+
+```python
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+
+# Global alignment
+alignments = pairwise2.align.globalxx("ACCGT", "ACGT")
+
+# Print top alignments
+for alignment in alignments[:3]:
+    print(format_alignment(*alignment))
+```
+
+## Working with PubChem
+
+```python
+from Bio import Entrez
+
+Entrez.email = "your.email@example.com"
+
+# Search PubChem
+handle = Entrez.esearch(db="pccompound", term="aspirin")
+result = Entrez.read(handle)
+handle.close()
+
+compound_id = result["IdList"][0]
+
+# Get compound information
+handle = Entrez.efetch(db="pccompound", id=compound_id, retmode="xml")
+compound_data = handle.read()
+handle.close()
+```
+
+## Sequence Features with Bio.SeqFeature
+
+```python
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+from Bio.Seq import Seq
+from Bio.SeqRecord import SeqRecord
+
+# Create a feature
+feature = SeqFeature(
+    location=FeatureLocation(start=10, end=50),
+    type="CDS",
+    strand=1,
+    qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]}
+)
+
+# Add feature to record
+record = SeqRecord(Seq("ATCG" * 20), id="seq1")
+record.features.append(feature)
+
+# Extract feature sequence
+feature_seq = feature.extract(record.seq)
+print(feature_seq)
+```
+
+## Sequence Ambiguity
+
+```python
+from Bio.Data import IUPACData
+
+# DNA ambiguity codes
+print(IUPACData.ambiguous_dna_letters)
+
+# Protein ambiguity codes
+print(IUPACData.ambiguous_protein_letters)
+
+# Resolve ambiguous bases
+print(IUPACData.ambiguous_dna_values["N"])  # Any base
+print(IUPACData.ambiguous_dna_values["R"])  # A or G
+```
+
+## Quality Scores (FASTQ)
+
+```python
+from Bio import SeqIO
+
+# Read FASTQ with quality scores
+for record in SeqIO.parse("reads.fastq", "fastq"):
+    print(f"ID: {record.id}")
+    print(f"Sequence: {record.seq}")
+    print(f"Quality: {record.letter_annotations['phred_quality']}")
+
+    # Calculate average quality
+    avg_quality = sum(record.letter_annotations['phred_quality']) / len(record)
+    print(f"Average quality: {avg_quality:.2f}")
+
+    # Filter by quality
+    min_quality = min(record.letter_annotations['phred_quality'])
+    if min_quality >= 20:
+        print("High quality read")
+```
+
+## Best Practices
+
+1. **Use appropriate modules** - Choose the right tool for your analysis
+2. **Handle pseudocounts** - Important for motif analysis
+3. **Validate input data** - Check file formats and data quality
+4. **Consider performance** - Some operations can be computationally intensive
+5. **Cache results** - Store intermediate results for large analyses
+6. **Use proper genetic codes** - Select appropriate translation tables
+7. **Document parameters** - Record thresholds and settings used
+8. **Validate statistical results** - Understand limitations of tests
+9. **Handle edge cases** - Check for empty results or invalid input
+10. **Combine modules** - Leverage multiple Biopython tools together
+
+## Common Use Cases
+
+### Find ORFs
+
+```python
+from Bio import SeqIO
+from Bio.SeqUtils import gc_fraction
+
+def find_orfs(seq, min_length=100):
+    """Find all ORFs in sequence."""
+    orfs = []
+
+    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
+        for frame in range(3):
+            trans = nuc[frame:].translate()
+            trans_len = len(trans)
+
+            aa_start = 0
+            while aa_start < trans_len:
+                aa_end = trans.find("*", aa_start)
+                if aa_end == -1:
+                    aa_end = trans_len
+
+                if aa_end - aa_start >= min_length // 3:
+                    start = frame + aa_start * 3
+                    end = frame + aa_end * 3
+                    orfs.append({
+                        'start': start,
+                        'end': end,
+                        'strand': strand,
+                        'frame': frame,
+                        'length': end - start,
+                        'sequence': nuc[start:end]
+                    })
+
+                aa_start = aa_end + 1
+
+    return orfs
+
+# Use it
+record = SeqIO.read("sequence.fasta", "fasta")
+orfs = find_orfs(record.seq, min_length=300)
+for orf in orfs:
+    print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
+```
+
+### Analyze Codon Usage
+
+```python
+from Bio import SeqIO
+from Bio.SeqUtils import CodonUsage
+
+def analyze_codon_usage(fasta_file):
+    """Analyze codon usage in coding sequences."""
+    codon_counts = {}
+
+    for record in SeqIO.parse(fasta_file, "fasta"):
+        # Ensure sequence is multiple of 3
+        seq = record.seq[:len(record.seq) - len(record.seq) % 3]
+
+        # Count codons
+        for i in range(0, len(seq), 3):
+            codon = str(seq[i:i+3])
+            codon_counts[codon] = codon_counts.get(codon, 0) + 1
+
+    # Calculate frequencies
+    total = sum(codon_counts.values())
+    codon_freq = {k: v/total for k, v in codon_counts.items()}
+
+    return codon_freq
+```
+
+### Calculate Sequence Complexity
+
+```python
+def sequence_complexity(seq, k=2):
+    """Calculate k-mer complexity (Shannon entropy)."""
+    import math
+    from collections import Counter
+
+    # Generate k-mers
+    kmers = [str(seq[i:i+k]) for i in range(len(seq) - k + 1)]
+
+    # Count k-mers
+    counts = Counter(kmers)
+    total = len(kmers)
+
+    # Calculate entropy
+    entropy = 0
+    for count in counts.values():
+        freq = count / total
+        entropy -= freq * math.log2(freq)
+
+    # Normalize by maximum possible entropy
+    max_entropy = math.log2(4 ** k)  # For DNA
+
+    return entropy / max_entropy if max_entropy > 0 else 0
+
+# Use it
+from Bio.Seq import Seq
+seq = Seq("ATCGATCGATCGATCG")
+complexity = sequence_complexity(seq, k=2)
+print(f"Sequence complexity: {complexity:.3f}")
+```
+
+### Extract Promoter Regions
+
+```python
+def extract_promoters(genbank_file, upstream=500):
+    """Extract promoter regions upstream of genes."""
+    from Bio import SeqIO
+
+    record = SeqIO.read(genbank_file, "genbank")
+    promoters = []
+
+    for feature in record.features:
+        if feature.type == "gene":
+            if feature.strand == 1:
+                # Forward strand
+                start = max(0, feature.location.start - upstream)
+                end = feature.location.start
+            else:
+                # Reverse strand
+                start = feature.location.end
+                end = min(len(record.seq), feature.location.end + upstream)
+
+            promoter_seq = record.seq[start:end]
+            if feature.strand == -1:
+                promoter_seq = promoter_seq.reverse_complement()
+
+            promoters.append({
+                'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
+                'sequence': promoter_seq,
+                'start': start,
+                'end': end
+            })
+
+    return promoters
+```