gh-k-dense-ai-claude-scient…/skills/biopython/references/advanced.md

# Advanced Biopython Features

## Sequence Motifs with Bio.motifs

### Creating Motifs

```python
from Bio import motifs
from Bio.Seq import Seq

# Create motif from instances
# Each instance is one observed occurrence of the site; all seven listed
# here have the same length (5 nt).
# NOTE(review): motifs.create is expected to require equal-length instances - confirm.
instances = [
    Seq("TACAA"),
    Seq("TACGC"),
    Seq("TACAC"),
    Seq("TACCC"),
    Seq("AACCC"),
    Seq("AATGC"),
    Seq("AATGC"),
]

# `motif` is reused by the examples in the following sections
motif = motifs.create(instances)
```

### Motif Consensus and Degenerate Sequences

```python
# Uses the `motif` object built in the "Creating Motifs" example.

# Get consensus sequence
print(motif.counts.consensus)

# Get degenerate consensus (IUPAC ambiguity codes)
print(motif.counts.degenerate_consensus)

# Access counts matrix
# (printed as a table of per-position counts for each letter)
print(motif.counts)
```

### Position Weight Matrix (PWM)

```python
# Create position weight matrix; the pseudocounts keep letters that were
# never observed at a position from getting probability zero (which would
# turn into -inf log-odds scores later on)
pwm = motif.counts.normalize(pseudocounts=0.5)
print(pwm)

# Calculate information content: the mean log-odds score of the PSSM is the
# relative entropy of the motif versus the background, measured in bits
# (the counts matrix itself has no information_content() method)
ic = pwm.log_odds().mean()
print(f"Information content: {ic:.2f} bits")
```

### Searching for Motifs

```python
from Bio.Seq import Seq

# Search sequence for motif
test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")

# Get Position Specific Scoring Matrix (PSSM)
# `pwm` is the position weight matrix from the "Position Weight Matrix" example
pssm = pwm.log_odds()

# Search sequence
# Only hits scoring at least `threshold` (log-odds) are reported.
# NOTE(review): negative positions are expected to denote hits on the
# reverse strand - confirm against the Bio.motifs documentation.
for position, score in pssm.search(test_seq, threshold=5.0):
    print(f"Position {position}: score = {score:.2f}")
```

### Reading Motifs from Files

```python
# Read motif from JASPAR format
# motifs.read is for a file holding a single motif
with open("motif.jaspar") as handle:
    motif = motifs.read(handle, "jaspar")

# Read multiple motifs
# motifs.parse handles files holding several motifs; each is visited in turn
with open("motifs.jaspar") as handle:
    for m in motifs.parse(handle, "jaspar"):
        print(m.name)

# Supported formats: jaspar, meme, transfac, pfm
# (not exhaustive - see the Bio.motifs documentation for the full list)
```

### Writing Motifs

```python
# Write motif in JASPAR format
# motif.format() returns the motif rendered as text; it is then written out
with open("output.jaspar", "w") as handle:
    handle.write(motif.format("jaspar"))
```

## Population Genetics with Bio.PopGen

### Working with GenePop Files

```python
from Bio.PopGen import GenePop

# Read GenePop file
with open("data.gen") as handle:
    record = GenePop.read(handle)

# Access populations
print(f"Number of populations: {len(record.populations)}")
print(f"Loci: {record.loci_list}")

# Iterate through populations
# Each population is a sequence of individuals, indexed here as
# individual[0] and individual[1].
# NOTE(review): presumably (name, per-locus allele list) - confirm.
for pop_idx, pop in enumerate(record.populations):
    print(f"\nPopulation {pop_idx + 1}:")
    for individual in pop:
        print(f"  {individual[0]}: {individual[1]}")
```

### Calculating Population Statistics

```python
from Bio.PopGen.GenePop.Controller import GenePopController
from Bio.PopGen.GenePop.EasyController import EasyController

# Create controller (a wrapper around the external GenePop program,
# which must be installed and on the PATH)
ctrl = GenePopController()

# Calculate basic statistics
result = ctrl.calc_allele_genotype_freqs("data.gen")

# Calculate F-statistics: calc_fst_all returns the multilocus
# (Fis, Fst, Fit) estimates together with an iterator over per-locus values
fst_result = ctrl.calc_fst_all("data.gen")
(fis, fst, fit), per_locus_stats = fst_result
print(f"Fst: {fst}")

# Test Hardy-Weinberg equilibrium for the first population.
# test_hw_pop is provided by EasyController (bound to one input file),
# not by GenePopController.
easy = EasyController("data.gen")
hw_result = easy.test_hw_pop(0, "probability")
```

## Sequence Utilities with Bio.SeqUtils

### GC Content

```python
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq

seq = Seq("ATCGATCGATCG")
# gc_fraction returns a fraction in [0, 1]; the ".2%" format renders it as a percentage
gc = gc_fraction(seq)
print(f"GC content: {gc:.2%}")
```

### Molecular Weight

```python
from Bio.SeqUtils import molecular_weight

# `Seq` must already be imported (from Bio.Seq import Seq), as in the GC content example.

# DNA molecular weight
dna_seq = Seq("ATCG")
mw = molecular_weight(dna_seq, seq_type="DNA")
print(f"DNA MW: {mw:.2f} g/mol")

# Protein molecular weight
# seq_type tells molecular_weight which residue weights to use
protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
mw = molecular_weight(protein_seq, seq_type="protein")
print(f"Protein MW: {mw:.2f} Da")
```

### Melting Temperature

```python
from Bio.SeqUtils import MeltingTemp as mt

# `Seq` must already be imported (from Bio.Seq import Seq).

# Calculate Tm using nearest-neighbor method
seq = Seq("ATCGATCGATCG")
tm = mt.Tm_NN(seq)
print(f"Tm: {tm:.1f}°C")

# Use different salt concentration
# Concentrations are passed in millimolar
tm = mt.Tm_NN(seq, Na=50, Mg=1.5)  # 50 mM Na+, 1.5 mM Mg2+

# Wallace rule (for primers)
# Rule of thumb: 4°C per G/C plus 2°C per A/T; only meaningful for short oligos
tm_wallace = mt.Tm_Wallace(seq)
```

### GC Skew

```python
# Note the capitalisation: the function is GC_skew (there is no gc_skew)
from Bio.SeqUtils import GC_skew

# Calculate GC skew, (G - C) / (G + C), for each window along the sequence
seq = Seq("ATCGATCGGGCCCAAATTT")
# Returns a list with one value per window; this 19-nt sequence fits in a
# single 100-nt window, so the list has one element
skew = GC_skew(seq, window=100)
print(f"GC skew: {skew}")
```

### ProtParam - Protein Analysis

```python
from Bio.SeqUtils.ProtParam import ProteinAnalysis

# ProteinAnalysis takes the protein sequence as a plain string
protein_seq = "ACDEFGHIKLMNPQRSTVWY"
analyzed_seq = ProteinAnalysis(protein_seq)

# Molecular weight
print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")

# Isoelectric point
print(f"pI: {analyzed_seq.isoelectric_point():.2f}")

# Amino acid composition
print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")

# Instability index
print(f"Instability: {analyzed_seq.instability_index():.2f}")

# Aromaticity
print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")

# Secondary structure fraction
# Returned as a sequence indexed below as helix, turn, sheet
ss = analyzed_seq.secondary_structure_fraction()
print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")

# Molar extinction coefficient: a pair of values, the first assuming all
# cysteines are reduced, the second assuming they form disulfide bridges
print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")

# Gravy (grand average of hydropathy)
print(f"GRAVY: {analyzed_seq.gravy():.3f}")
```

## Restriction Analysis with Bio.Restriction

```python
from Bio import Restriction
from Bio.Seq import Seq

# Analyze sequence for restriction sites
seq = Seq("GAATTCATCGATCGATGAATTC")

# Use specific enzyme
ecori = Restriction.EcoRI
sites = ecori.search(seq)
print(f"EcoRI sites at: {sites}")

# Use multiple enzymes
# search() on a batch returns a mapping of enzyme -> list of cut positions
rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
results = rb.search(seq)
for enzyme, sites in results.items():
    if sites:
        print(f"{enzyme}: {sites}")

# Among the enzymes in the batch `rb`, report those that cut the sequence
# (the analysis only covers the enzymes it is given, not every known enzyme)
all_enzymes = Restriction.Analysis(rb, seq)
print(f"Cutting enzymes: {all_enzymes.with_sites()}")
```

## Sequence Translation Tables

```python
from Bio.Data import CodonTable

# Tables are keyed by NCBI translation-table ID.

# Standard genetic code
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table)

# Mitochondrial code
# (ID 2 is the vertebrate mitochondrial code)
mito_table = CodonTable.unambiguous_dna_by_id[2]

# Get specific codon
# forward_table maps codon -> one-letter amino acid
print(f"ATG codes for: {standard_table.forward_table['ATG']}")

# Get stop codons
print(f"Stop codons: {standard_table.stop_codons}")

# Get start codons
print(f"Start codons: {standard_table.start_codons}")
```

## Cluster Analysis with Bio.Cluster

```python
from Bio.Cluster import kcluster
import numpy as np

# Sample data matrix (genes x conditions)
# 4 rows (genes) by 4 columns (conditions)
data = np.array([
    [1.2, 0.8, 0.5, 1.5],
    [0.9, 1.1, 0.7, 1.3],
    [0.2, 0.3, 2.1, 2.5],
    [0.1, 0.4, 2.3, 2.2],
])

# Perform k-means clustering
# kcluster returns three values: the cluster index assigned to each row,
# an error value for the solution, and `nfound`.
# NOTE(review): `nfound` is presumably how often the best solution was found - confirm.
clusterid, error, nfound = kcluster(data, nclusters=2)
print(f"Cluster assignments: {clusterid}")
print(f"Error: {error}")
```

## Genome Diagrams with GenomeDiagram

```python
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from reportlab.lib import colors

# Read GenBank file
# SeqIO.read expects the file to hold exactly one record
record = SeqIO.read("sequence.gb", "genbank")

# Create diagram
# Hierarchy: diagram -> track -> feature set -> features
gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
gd_track = gd_diagram.new_track(1, greytrack=True)
gd_feature_set = gd_track.new_set()

# Add features
# Colour by feature type: CDS blue, gene light blue, everything else grey
for feature in record.features:
    if feature.type == "CDS":
        color = colors.blue
    elif feature.type == "gene":
        color = colors.lightblue
    else:
        color = colors.grey

    gd_feature_set.add_feature(
        feature,
        color=color,
        label=True,
        label_size=6,
        label_angle=45
    )

# Draw and save
# draw() lays the diagram out; write() renders it to the output file
gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
gd_diagram.write("genome_diagram.pdf", "PDF")
```

## Sequence Comparison with Bio.pairwise2

**Note**: `Bio.pairwise2` is deprecated; use `Bio.Align.PairwiseAligner` for new code (see alignment.md).

If you need to maintain legacy code that still uses `Bio.pairwise2`:

```python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Global alignment
# "xx" scoring: a match scores 1, with no mismatch or gap penalties
alignments = pairwise2.align.globalxx("ACCGT", "ACGT")

# Print top alignments
# Only the first three alignments are shown; each is unpacked into format_alignment
for alignment in alignments[:3]:
    print(format_alignment(*alignment))
```

## Working with PubChem

```python
from Bio import Entrez

Entrez.email = "your.email@example.com"

# Search PubChem Compound for records matching the term
handle = Entrez.esearch(db="pccompound", term="aspirin")
result = Entrez.read(handle)
handle.close()

# An empty IdList means the search matched nothing; fail with a clear
# message instead of an IndexError
if not result["IdList"]:
    raise ValueError("No PubChem compound matched the search term")
compound_id = result["IdList"][0]

# Get compound information. The PubChem Entrez databases are served through
# ESummary (document summaries) rather than EFetch; for full records use
# the PubChem PUG REST service instead.
handle = Entrez.esummary(db="pccompound", id=compound_id)
compound_data = Entrez.read(handle)
handle.close()
```

## Sequence Features with Bio.SeqFeature

```python
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Create a feature. The strand is a property of the location: the SeqFeature
# constructor no longer accepts a `strand` argument in current Biopython.
feature = SeqFeature(
    location=FeatureLocation(start=10, end=50, strand=1),
    type="CDS",
    qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]},
)

# Add feature to record (80-nt sequence, so the 10..50 location fits)
record = SeqRecord(Seq("ATCG" * 20), id="seq1")
record.features.append(feature)

# Extract feature sequence (reverse-complemented automatically when the
# location is on the -1 strand)
feature_seq = feature.extract(record.seq)
print(feature_seq)
```

## Sequence Ambiguity

```python
from Bio.Data import IUPACData

# DNA ambiguity codes
print(IUPACData.ambiguous_dna_letters)

# Protein letters including the ambiguous/extended codes (B, X, Z, J, U, O);
# IUPACData has no `ambiguous_protein_letters` attribute
print(IUPACData.extended_protein_letters)

# Resolve ambiguous bases: maps each code to the bases it stands for
print(IUPACData.ambiguous_dna_values["N"])  # Any base
print(IUPACData.ambiguous_dna_values["R"])  # A or G
```

## Quality Scores (FASTQ)

```python
from Bio import SeqIO

# Read FASTQ with quality scores ("fastq" is the Sanger / PHRED+33 variant)
for record in SeqIO.parse("reads.fastq", "fastq"):
    # One PHRED score per base; look it up once and reuse it
    qualities = record.letter_annotations["phred_quality"]

    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Quality: {qualities}")

    # Zero-length reads (e.g. left over after trimming) have no scores, so
    # an average or minimum is undefined - skip them instead of crashing
    if not qualities:
        continue

    # Calculate average quality
    avg_quality = sum(qualities) / len(qualities)
    print(f"Average quality: {avg_quality:.2f}")

    # Filter by quality: every base must reach Q20
    min_quality = min(qualities)
    if min_quality >= 20:
        print("High quality read")
```

## Best Practices

1. **Use appropriate modules** - Choose the right tool for your analysis
2. **Handle pseudocounts** - Important for motif analysis
3. **Validate input data** - Check file formats and data quality
4. **Consider performance** - Some operations can be computationally intensive
5. **Cache results** - Store intermediate results for large analyses
6. **Use proper genetic codes** - Select appropriate translation tables
7. **Document parameters** - Record thresholds and settings used
8. **Validate statistical results** - Understand limitations of tests
9. **Handle edge cases** - Check for empty results or invalid input
10. **Combine modules** - Leverage multiple Biopython tools together

## Common Use Cases

### Find ORFs

```python
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction

def find_orfs(seq, min_length=100):
    """Find open reading frames in all six frames of a nucleotide sequence.

    An ORF here is a stretch between stop codons (or a sequence end); a
    start codon is not required. Translation uses the default (standard)
    genetic code.

    Args:
        seq: Nucleotide sequence (a Bio.Seq.Seq) to scan.
        min_length: Minimum ORF length in nucleotides; compared in whole
            codons (``min_length // 3``).

    Returns:
        A list of dicts, one per ORF, with keys:
            'start', 'end': 0-based, end-exclusive coordinates on the
                forward strand of ``seq`` (for both strands), so that
                ``seq[start:end]`` always covers the ORF.
            'strand': +1 or -1.
            'frame': 0, 1 or 2, the offset on the scanned strand.
            'length': ORF length in nucleotides (stop codon excluded).
            'sequence': The ORF's coding sequence, read 5'->3' on its own
                strand.
    """
    orfs = []
    seq_len = len(seq)
    min_codons = min_length // 3

    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            # Translate whole codons only: a trailing partial codon adds no
            # residue and would only trigger a Biopython warning.
            usable = max(0, seq_len - frame) // 3 * 3
            trans = nuc[frame:frame + usable].translate()
            trans_len = len(trans)

            aa_start = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # No further stop codon: the ORF runs to the end
                    aa_end = trans_len

                if aa_end - aa_start >= min_codons:
                    # Coordinates on the strand being scanned
                    nuc_start = frame + aa_start * 3
                    nuc_end = frame + aa_end * 3

                    if strand == 1:
                        start, end = nuc_start, nuc_end
                    else:
                        # Map reverse-complement coordinates back onto the
                        # forward strand of the input sequence
                        start, end = seq_len - nuc_end, seq_len - nuc_start

                    orfs.append({
                        'start': start,
                        'end': end,
                        'strand': strand,
                        'frame': frame,
                        'length': end - start,
                        'sequence': nuc[nuc_start:nuc_end]
                    })

                # Resume scanning just past the stop codon
                aa_start = aa_end + 1

    return orfs

# Use it
# SeqIO.read expects a FASTA file holding exactly one record
record = SeqIO.read("sequence.fasta", "fasta")
# Keep only ORFs of at least 300 nt
orfs = find_orfs(record.seq, min_length=300)
for orf in orfs:
    print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
```

### Analyze Codon Usage

```python
from Bio import SeqIO
from Bio.SeqUtils import CodonUsage

def analyze_codon_usage(fasta_file):
    """Return relative codon frequencies pooled over every record in a FASTA file.

    Each record is read in frame 0; trailing bases that do not fill a whole
    codon are dropped. The result maps codon string -> fraction of all
    codons counted (empty dict when no codon was seen).
    """
    tally = {}

    for entry in SeqIO.parse(fasta_file, "fasta"):
        # Length of the longest prefix made of complete codons
        usable = len(entry.seq) - len(entry.seq) % 3
        coding = entry.seq[:usable]

        for offset in range(0, usable, 3):
            triplet = str(coding[offset:offset + 3])
            if triplet in tally:
                tally[triplet] += 1
            else:
                tally[triplet] = 1

    # Convert raw counts to fractions of the grand total
    grand_total = sum(tally.values())
    return {triplet: n / grand_total for triplet, n in tally.items()}
```

### Calculate Sequence Complexity

```python
def sequence_complexity(seq, k=2):
    """Return the normalized Shannon entropy of the k-mers in ``seq``.

    All overlapping k-mers are counted, their entropy (in bits) is computed,
    and the result is divided by ``log2(4 ** k)``, the entropy of a uniform
    distribution over all DNA k-mers. Returns 0 when that maximum is not
    positive.
    """
    import math
    from collections import Counter

    # Tally every overlapping window of width k
    window_count = len(seq) - k + 1
    tally = Counter(str(seq[pos:pos + k]) for pos in range(window_count))
    n_windows = sum(tally.values())

    # Shannon entropy: sum of -p * log2(p) over the observed k-mers
    probabilities = [occurrences / n_windows for occurrences in tally.values()]
    entropy = sum(-p * math.log2(p) for p in probabilities)

    # Upper bound for DNA: every one of the 4**k k-mers equally likely
    max_entropy = math.log2(4 ** k)
    if max_entropy > 0:
        return entropy / max_entropy
    return 0

# Use it
from Bio.Seq import Seq
seq = Seq("ATCGATCGATCGATCG")
# k=2 scores the diversity of overlapping dinucleotides
complexity = sequence_complexity(seq, k=2)
print(f"Sequence complexity: {complexity:.3f}")
```

### Extract Promoter Regions

```python
def extract_promoters(genbank_file, upstream=500):
    """Extract the region upstream of every ``gene`` feature in a GenBank file.

    Args:
        genbank_file: Path (or handle) of a GenBank file holding exactly
            one record.
        upstream: Number of bases to take upstream of each gene; the region
            is truncated at the sequence ends.

    Returns:
        A list of dicts, one per gene feature, with keys:
            'gene': First value of the ``gene`` qualifier, or 'Unknown'.
            'sequence': The promoter region read 5'->3' on the gene's
                strand (reverse-complemented for reverse-strand genes).
            'start', 'end': 0-based, end-exclusive forward-strand
                coordinates of the region.
    """
    from Bio import SeqIO

    record = SeqIO.read(genbank_file, "genbank")
    seq_len = len(record.seq)
    promoters = []

    for feature in record.features:
        if feature.type != "gene":
            continue

        # Read the strand from the location; SeqFeature.strand is deprecated
        location = feature.location

        if location.strand == -1:
            # Reverse strand: "upstream" lies after the feature's end on the
            # forward strand, and must be read on the opposite strand
            start = int(location.end)
            end = min(seq_len, start + upstream)
            promoter_seq = record.seq[start:end].reverse_complement()
        else:
            # Forward strand. Unstranded features are handled the same way,
            # rather than being given a downstream region that is never
            # reverse-complemented.
            end = int(location.start)
            start = max(0, end - upstream)
            promoter_seq = record.seq[start:end]

        promoters.append({
            'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
            'sequence': promoter_seq,
            'start': start,
            'end': end
        })

    return promoters
```