Files
2025-11-30 08:30:10 +08:00

578 lines
14 KiB
Markdown

# Advanced Biopython Features
## Sequence Motifs with Bio.motifs
### Creating Motifs
```python
from Bio import motifs
from Bio.Seq import Seq
# Create motif from instances
instances = [
Seq("TACAA"),
Seq("TACGC"),
Seq("TACAC"),
Seq("TACCC"),
Seq("AACCC"),
Seq("AATGC"),
Seq("AATGC"),
]
motif = motifs.create(instances)
```
### Motif Consensus and Degenerate Sequences
```python
# Get consensus sequence
print(motif.counts.consensus)
# Get degenerate consensus (IUPAC ambiguity codes)
print(motif.counts.degenerate_consensus)
# Access counts matrix
print(motif.counts)
```
### Position Weight Matrix (PWM)
```python
# Create position weight matrix
pwm = motif.counts.normalize(pseudocounts=0.5)
print(pwm)
# Calculate information content
ic = motif.counts.information_content()
print(f"Information content: {ic:.2f} bits")
```
### Searching for Motifs
```python
from Bio.Seq import Seq
# Search sequence for motif
test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")
# Get Position Specific Scoring Matrix (PSSM)
pssm = pwm.log_odds()
# Search sequence
for position, score in pssm.search(test_seq, threshold=5.0):
print(f"Position {position}: score = {score:.2f}")
```
### Reading Motifs from Files
```python
# Read motif from JASPAR format
with open("motif.jaspar") as handle:
motif = motifs.read(handle, "jaspar")
# Read multiple motifs
with open("motifs.jaspar") as handle:
for m in motifs.parse(handle, "jaspar"):
print(m.name)
# Supported formats: jaspar, meme, transfac, pfm
```
### Writing Motifs
```python
# Write motif in JASPAR format
with open("output.jaspar", "w") as handle:
handle.write(motif.format("jaspar"))
```
## Population Genetics with Bio.PopGen
### Working with GenePop Files
```python
from Bio.PopGen import GenePop
# Read GenePop file
with open("data.gen") as handle:
record = GenePop.read(handle)
# Access populations
print(f"Number of populations: {len(record.populations)}")
print(f"Loci: {record.loci_list}")
# Iterate through populations
for pop_idx, pop in enumerate(record.populations):
print(f"\nPopulation {pop_idx + 1}:")
for individual in pop:
print(f" {individual[0]}: {individual[1]}")
```
### Calculating Population Statistics
```python
from Bio.PopGen.GenePop.Controller import GenePopController
# Create controller
ctrl = GenePopController()
# Calculate basic statistics
result = ctrl.calc_allele_genotype_freqs("data.gen")
# Calculate Fst
fst_result = ctrl.calc_fst_all("data.gen")
print(f"Fst: {fst_result}")
# Test Hardy-Weinberg equilibrium
hw_result = ctrl.test_hw_pop("data.gen", "probability")
```
## Sequence Utilities with Bio.SeqUtils
### GC Content
```python
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq
seq = Seq("ATCGATCGATCG")
gc = gc_fraction(seq)
print(f"GC content: {gc:.2%}")
```
### Molecular Weight
```python
from Bio.SeqUtils import molecular_weight
# DNA molecular weight
dna_seq = Seq("ATCG")
mw = molecular_weight(dna_seq, seq_type="DNA")
print(f"DNA MW: {mw:.2f} g/mol")
# Protein molecular weight
protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
mw = molecular_weight(protein_seq, seq_type="protein")
print(f"Protein MW: {mw:.2f} Da")
```
### Melting Temperature
```python
from Bio.SeqUtils import MeltingTemp as mt
# Calculate Tm using nearest-neighbor method
seq = Seq("ATCGATCGATCG")
tm = mt.Tm_NN(seq)
print(f"Tm: {tm:.1f}°C")
# Use different salt concentration
tm = mt.Tm_NN(seq, Na=50, Mg=1.5) # 50 mM Na+, 1.5 mM Mg2+
# Wallace rule (for primers)
tm_wallace = mt.Tm_Wallace(seq)
```
### GC Skew
```python
from Bio.SeqUtils import gc_skew
# Calculate GC skew
seq = Seq("ATCGATCGGGCCCAAATTT")
skew = gc_skew(seq, window=100)
print(f"GC skew: {skew}")
```
### ProtParam - Protein Analysis
```python
from Bio.SeqUtils.ProtParam import ProteinAnalysis
protein_seq = "ACDEFGHIKLMNPQRSTVWY"
analyzed_seq = ProteinAnalysis(protein_seq)
# Molecular weight
print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")
# Isoelectric point
print(f"pI: {analyzed_seq.isoelectric_point():.2f}")
# Amino acid composition
print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")
# Instability index
print(f"Instability: {analyzed_seq.instability_index():.2f}")
# Aromaticity
print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")
# Secondary structure fraction
ss = analyzed_seq.secondary_structure_fraction()
print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")
# Extinction coefficient (assumes Cys reduced, no disulfide bonds)
print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")
# Gravy (grand average of hydropathy)
print(f"GRAVY: {analyzed_seq.gravy():.3f}")
```
## Restriction Analysis with Bio.Restriction
```python
from Bio import Restriction
from Bio.Seq import Seq
# Analyze sequence for restriction sites
seq = Seq("GAATTCATCGATCGATGAATTC")
# Use specific enzyme
ecori = Restriction.EcoRI
sites = ecori.search(seq)
print(f"EcoRI sites at: {sites}")
# Use multiple enzymes
rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
results = rb.search(seq)
for enzyme, sites in results.items():
if sites:
print(f"{enzyme}: {sites}")
# Get all enzymes that cut sequence
all_enzymes = Restriction.Analysis(rb, seq)
print(f"Cutting enzymes: {all_enzymes.with_sites()}")
```
## Sequence Translation Tables
```python
from Bio.Data import CodonTable
# Standard genetic code
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table)
# Mitochondrial code
mito_table = CodonTable.unambiguous_dna_by_id[2]
# Get specific codon
print(f"ATG codes for: {standard_table.forward_table['ATG']}")
# Get stop codons
print(f"Stop codons: {standard_table.stop_codons}")
# Get start codons
print(f"Start codons: {standard_table.start_codons}")
```
## Cluster Analysis with Bio.Cluster
```python
from Bio.Cluster import kcluster
import numpy as np
# Sample data matrix (genes x conditions)
data = np.array([
[1.2, 0.8, 0.5, 1.5],
[0.9, 1.1, 0.7, 1.3],
[0.2, 0.3, 2.1, 2.5],
[0.1, 0.4, 2.3, 2.2],
])
# Perform k-means clustering
clusterid, error, nfound = kcluster(data, nclusters=2)
print(f"Cluster assignments: {clusterid}")
print(f"Error: {error}")
```
## Genome Diagrams with GenomeDiagram
```python
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from reportlab.lib import colors
# Read GenBank file
record = SeqIO.read("sequence.gb", "genbank")
# Create diagram
gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
gd_track = gd_diagram.new_track(1, greytrack=True)
gd_feature_set = gd_track.new_set()
# Add features
for feature in record.features:
if feature.type == "CDS":
color = colors.blue
elif feature.type == "gene":
color = colors.lightblue
else:
color = colors.grey
gd_feature_set.add_feature(
feature,
color=color,
label=True,
label_size=6,
label_angle=45
)
# Draw and save
gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
gd_diagram.write("genome_diagram.pdf", "PDF")
```
## Sequence Comparison with Bio.pairwise2
**Note**: Bio.pairwise2 is deprecated. Use Bio.Align.PairwiseAligner instead (see alignment.md).
However, for legacy code:
```python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment
# Global alignment
alignments = pairwise2.align.globalxx("ACCGT", "ACGT")
# Print top alignments
for alignment in alignments[:3]:
print(format_alignment(*alignment))
```
## Working with PubChem
```python
from Bio import Entrez
Entrez.email = "your.email@example.com"
# Search PubChem
handle = Entrez.esearch(db="pccompound", term="aspirin")
result = Entrez.read(handle)
handle.close()
compound_id = result["IdList"][0]
# Get compound information
handle = Entrez.efetch(db="pccompound", id=compound_id, retmode="xml")
compound_data = handle.read()
handle.close()
```
## Sequence Features with Bio.SeqFeature
```python
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
# Create a feature
feature = SeqFeature(
location=FeatureLocation(start=10, end=50),
type="CDS",
strand=1,
qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]}
)
# Add feature to record
record = SeqRecord(Seq("ATCG" * 20), id="seq1")
record.features.append(feature)
# Extract feature sequence
feature_seq = feature.extract(record.seq)
print(feature_seq)
```
## Sequence Ambiguity
```python
from Bio.Data import IUPACData
# DNA ambiguity codes
print(IUPACData.ambiguous_dna_letters)
# Protein ambiguity codes
print(IUPACData.ambiguous_protein_letters)
# Resolve ambiguous bases
print(IUPACData.ambiguous_dna_values["N"]) # Any base
print(IUPACData.ambiguous_dna_values["R"]) # A or G
```
## Quality Scores (FASTQ)
```python
from Bio import SeqIO
# Read FASTQ with quality scores
for record in SeqIO.parse("reads.fastq", "fastq"):
print(f"ID: {record.id}")
print(f"Sequence: {record.seq}")
print(f"Quality: {record.letter_annotations['phred_quality']}")
# Calculate average quality
avg_quality = sum(record.letter_annotations['phred_quality']) / len(record)
print(f"Average quality: {avg_quality:.2f}")
# Filter by quality
min_quality = min(record.letter_annotations['phred_quality'])
if min_quality >= 20:
print("High quality read")
```
## Best Practices
1. **Use appropriate modules** - Choose the right tool for your analysis
2. **Handle pseudocounts** - Important for motif analysis
3. **Validate input data** - Check file formats and data quality
4. **Consider performance** - Some operations can be computationally intensive
5. **Cache results** - Store intermediate results for large analyses
6. **Use proper genetic codes** - Select appropriate translation tables
7. **Document parameters** - Record thresholds and settings used
8. **Validate statistical results** - Understand limitations of tests
9. **Handle edge cases** - Check for empty results or invalid input
10. **Combine modules** - Leverage multiple Biopython tools together
## Common Use Cases
### Find ORFs
```python
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction
def find_orfs(seq, min_length=100):
"""Find all ORFs in sequence."""
orfs = []
for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
for frame in range(3):
trans = nuc[frame:].translate()
trans_len = len(trans)
aa_start = 0
while aa_start < trans_len:
aa_end = trans.find("*", aa_start)
if aa_end == -1:
aa_end = trans_len
if aa_end - aa_start >= min_length // 3:
start = frame + aa_start * 3
end = frame + aa_end * 3
orfs.append({
'start': start,
'end': end,
'strand': strand,
'frame': frame,
'length': end - start,
'sequence': nuc[start:end]
})
aa_start = aa_end + 1
return orfs
# Use it
record = SeqIO.read("sequence.fasta", "fasta")
orfs = find_orfs(record.seq, min_length=300)
for orf in orfs:
print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
```
### Analyze Codon Usage
```python
from Bio import SeqIO
from Bio.SeqUtils import CodonUsage
def analyze_codon_usage(fasta_file):
"""Analyze codon usage in coding sequences."""
codon_counts = {}
for record in SeqIO.parse(fasta_file, "fasta"):
# Ensure sequence is multiple of 3
seq = record.seq[:len(record.seq) - len(record.seq) % 3]
# Count codons
for i in range(0, len(seq), 3):
codon = str(seq[i:i+3])
codon_counts[codon] = codon_counts.get(codon, 0) + 1
# Calculate frequencies
total = sum(codon_counts.values())
codon_freq = {k: v/total for k, v in codon_counts.items()}
return codon_freq
```
### Calculate Sequence Complexity
```python
def sequence_complexity(seq, k=2):
"""Calculate k-mer complexity (Shannon entropy)."""
import math
from collections import Counter
# Generate k-mers
kmers = [str(seq[i:i+k]) for i in range(len(seq) - k + 1)]
# Count k-mers
counts = Counter(kmers)
total = len(kmers)
# Calculate entropy
entropy = 0
for count in counts.values():
freq = count / total
entropy -= freq * math.log2(freq)
# Normalize by maximum possible entropy
max_entropy = math.log2(4 ** k) # For DNA
return entropy / max_entropy if max_entropy > 0 else 0
# Use it
from Bio.Seq import Seq
seq = Seq("ATCGATCGATCGATCG")
complexity = sequence_complexity(seq, k=2)
print(f"Sequence complexity: {complexity:.3f}")
```
### Extract Promoter Regions
```python
def extract_promoters(genbank_file, upstream=500):
"""Extract promoter regions upstream of genes."""
from Bio import SeqIO
record = SeqIO.read(genbank_file, "genbank")
promoters = []
for feature in record.features:
if feature.type == "gene":
if feature.strand == 1:
# Forward strand
start = max(0, feature.location.start - upstream)
end = feature.location.start
else:
# Reverse strand
start = feature.location.end
end = min(len(record.seq), feature.location.end + upstream)
promoter_seq = record.seq[start:end]
if feature.strand == -1:
promoter_seq = promoter_seq.reverse_complement()
promoters.append({
'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
'sequence': promoter_seq,
'start': start,
'end': end
})
return promoters
```