Initial commit
This commit is contained in:
577
skills/biopython/references/advanced.md
Normal file
577
skills/biopython/references/advanced.md
Normal file
@@ -0,0 +1,577 @@
|
||||
# Advanced Biopython Features
|
||||
|
||||
## Sequence Motifs with Bio.motifs
|
||||
|
||||
### Creating Motifs
|
||||
|
||||
```python
|
||||
from Bio import motifs
|
||||
from Bio.Seq import Seq
|
||||
|
||||
# Create motif from instances
|
||||
instances = [
|
||||
Seq("TACAA"),
|
||||
Seq("TACGC"),
|
||||
Seq("TACAC"),
|
||||
Seq("TACCC"),
|
||||
Seq("AACCC"),
|
||||
Seq("AATGC"),
|
||||
Seq("AATGC"),
|
||||
]
|
||||
|
||||
motif = motifs.create(instances)
|
||||
```
|
||||
|
||||
### Motif Consensus and Degenerate Sequences
|
||||
|
||||
```python
|
||||
# Get consensus sequence
|
||||
print(motif.counts.consensus)
|
||||
|
||||
# Get degenerate consensus (IUPAC ambiguity codes)
|
||||
print(motif.counts.degenerate_consensus)
|
||||
|
||||
# Access counts matrix
|
||||
print(motif.counts)
|
||||
```
|
||||
|
||||
### Position Weight Matrix (PWM)
|
||||
|
||||
```python
|
||||
import math

# Create position weight matrix (column-normalized counts plus pseudocounts)
pwm = motif.counts.normalize(pseudocounts=0.5)
print(pwm)

# Calculate information content: sum over all positions of the relative
# entropy against a uniform background. It is computed from the
# pseudocount-smoothed PWM, so every probability is positive here; the
# `> 0` guard only matters if pseudocounts are set to 0.
background = 1 / len(pwm.alphabet)
ic = sum(
    pwm[letter][pos] * math.log2(pwm[letter][pos] / background)
    for pos in range(pwm.length)
    for letter in pwm.alphabet
    if pwm[letter][pos] > 0
)
print(f"Information content: {ic:.2f} bits")
|
||||
```
|
||||
|
||||
### Searching for Motifs
|
||||
|
||||
```python
|
||||
from Bio.Seq import Seq
|
||||
|
||||
# Search sequence for motif
|
||||
test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")
|
||||
|
||||
# Get Position Specific Scoring Matrix (PSSM)
|
||||
pssm = pwm.log_odds()
|
||||
|
||||
# Search sequence
|
||||
for position, score in pssm.search(test_seq, threshold=5.0):
|
||||
print(f"Position {position}: score = {score:.2f}")
|
||||
```
|
||||
|
||||
### Reading Motifs from Files
|
||||
|
||||
```python
|
||||
# Read motif from JASPAR format
|
||||
with open("motif.jaspar") as handle:
|
||||
motif = motifs.read(handle, "jaspar")
|
||||
|
||||
# Read multiple motifs
|
||||
with open("motifs.jaspar") as handle:
|
||||
for m in motifs.parse(handle, "jaspar"):
|
||||
print(m.name)
|
||||
|
||||
# Supported formats: jaspar, meme, transfac, pfm
|
||||
```
|
||||
|
||||
### Writing Motifs
|
||||
|
||||
```python
|
||||
# Write motif in JASPAR format
|
||||
with open("output.jaspar", "w") as handle:
|
||||
handle.write(motif.format("jaspar"))
|
||||
```
|
||||
|
||||
## Population Genetics with Bio.PopGen
|
||||
|
||||
### Working with GenePop Files
|
||||
|
||||
```python
|
||||
from Bio.PopGen import GenePop
|
||||
|
||||
# Read GenePop file
|
||||
with open("data.gen") as handle:
|
||||
record = GenePop.read(handle)
|
||||
|
||||
# Access populations
|
||||
print(f"Number of populations: {len(record.populations)}")
|
||||
print(f"Loci: {record.loci_list}")
|
||||
|
||||
# Iterate through populations
|
||||
for pop_idx, pop in enumerate(record.populations):
|
||||
print(f"\nPopulation {pop_idx + 1}:")
|
||||
for individual in pop:
|
||||
print(f" {individual[0]}: {individual[1]}")
|
||||
```
|
||||
|
||||
### Calculating Population Statistics
|
||||
|
||||
```python
|
||||
from Bio.PopGen.GenePop.Controller import GenePopController
|
||||
|
||||
# Create controller
|
||||
ctrl = GenePopController()
|
||||
|
||||
# Calculate basic statistics
|
||||
result = ctrl.calc_allele_genotype_freqs("data.gen")
|
||||
|
||||
# Calculate Fst
|
||||
fst_result = ctrl.calc_fst_all("data.gen")
|
||||
print(f"Fst: {fst_result}")
|
||||
|
||||
# Test Hardy-Weinberg equilibrium
|
||||
hw_result = ctrl.test_hw_pop("data.gen", "probability")
|
||||
```
|
||||
|
||||
## Sequence Utilities with Bio.SeqUtils
|
||||
|
||||
### GC Content
|
||||
|
||||
```python
|
||||
from Bio.SeqUtils import gc_fraction
|
||||
from Bio.Seq import Seq
|
||||
|
||||
seq = Seq("ATCGATCGATCG")
|
||||
gc = gc_fraction(seq)
|
||||
print(f"GC content: {gc:.2%}")
|
||||
```
|
||||
|
||||
### Molecular Weight
|
||||
|
||||
```python
|
||||
from Bio.SeqUtils import molecular_weight
|
||||
|
||||
# DNA molecular weight
|
||||
dna_seq = Seq("ATCG")
|
||||
mw = molecular_weight(dna_seq, seq_type="DNA")
|
||||
print(f"DNA MW: {mw:.2f} g/mol")
|
||||
|
||||
# Protein molecular weight
|
||||
protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
|
||||
mw = molecular_weight(protein_seq, seq_type="protein")
|
||||
print(f"Protein MW: {mw:.2f} Da")
|
||||
```
|
||||
|
||||
### Melting Temperature
|
||||
|
||||
```python
|
||||
from Bio.SeqUtils import MeltingTemp as mt
|
||||
|
||||
# Calculate Tm using nearest-neighbor method
|
||||
seq = Seq("ATCGATCGATCG")
|
||||
tm = mt.Tm_NN(seq)
|
||||
print(f"Tm: {tm:.1f}°C")
|
||||
|
||||
# Use different salt concentration
|
||||
tm = mt.Tm_NN(seq, Na=50, Mg=1.5) # 50 mM Na+, 1.5 mM Mg2+
|
||||
|
||||
# Wallace rule (for primers)
|
||||
tm_wallace = mt.Tm_Wallace(seq)
|
||||
```
|
||||
|
||||
### GC Skew
|
||||
|
||||
```python
|
||||
from Bio.Seq import Seq
from Bio.SeqUtils import GC_skew

# Calculate GC skew, (G - C) / (G + C), for consecutive windows along the
# sequence. The function is spelled GC_skew (there is no lowercase gc_skew).
seq = Seq("ATCGATCGGGCCCAAATTT")
skew = GC_skew(seq, window=100)  # list of floats, one per window
print(f"GC skew: {skew}")
|
||||
```
|
||||
|
||||
### ProtParam - Protein Analysis
|
||||
|
||||
```python
|
||||
from Bio.SeqUtils.ProtParam import ProteinAnalysis
|
||||
|
||||
protein_seq = "ACDEFGHIKLMNPQRSTVWY"
|
||||
analyzed_seq = ProteinAnalysis(protein_seq)
|
||||
|
||||
# Molecular weight
|
||||
print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")
|
||||
|
||||
# Isoelectric point
|
||||
print(f"pI: {analyzed_seq.isoelectric_point():.2f}")
|
||||
|
||||
# Amino acid composition
|
||||
print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")
|
||||
|
||||
# Instability index
|
||||
print(f"Instability: {analyzed_seq.instability_index():.2f}")
|
||||
|
||||
# Aromaticity
|
||||
print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")
|
||||
|
||||
# Secondary structure fraction
|
||||
ss = analyzed_seq.secondary_structure_fraction()
|
||||
print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")
|
||||
|
||||
# Extinction coefficient: returns a tuple (all Cys reduced, all Cys paired as cystines)
|
||||
print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")
|
||||
|
||||
# Gravy (grand average of hydropathy)
|
||||
print(f"GRAVY: {analyzed_seq.gravy():.3f}")
|
||||
```
|
||||
|
||||
## Restriction Analysis with Bio.Restriction
|
||||
|
||||
```python
|
||||
from Bio import Restriction
|
||||
from Bio.Seq import Seq
|
||||
|
||||
# Analyze sequence for restriction sites
|
||||
seq = Seq("GAATTCATCGATCGATGAATTC")
|
||||
|
||||
# Use specific enzyme
|
||||
ecori = Restriction.EcoRI
|
||||
sites = ecori.search(seq)
|
||||
print(f"EcoRI sites at: {sites}")
|
||||
|
||||
# Use multiple enzymes
|
||||
rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
|
||||
results = rb.search(seq)
|
||||
for enzyme, sites in results.items():
|
||||
if sites:
|
||||
print(f"{enzyme}: {sites}")
|
||||
|
||||
# Get all enzymes that cut sequence
|
||||
all_enzymes = Restriction.Analysis(rb, seq)
|
||||
print(f"Cutting enzymes: {all_enzymes.with_sites()}")
|
||||
```
|
||||
|
||||
## Sequence Translation Tables
|
||||
|
||||
```python
|
||||
from Bio.Data import CodonTable
|
||||
|
||||
# Standard genetic code
|
||||
standard_table = CodonTable.unambiguous_dna_by_id[1]
|
||||
print(standard_table)
|
||||
|
||||
# Mitochondrial code
|
||||
mito_table = CodonTable.unambiguous_dna_by_id[2]
|
||||
|
||||
# Get specific codon
|
||||
print(f"ATG codes for: {standard_table.forward_table['ATG']}")
|
||||
|
||||
# Get stop codons
|
||||
print(f"Stop codons: {standard_table.stop_codons}")
|
||||
|
||||
# Get start codons
|
||||
print(f"Start codons: {standard_table.start_codons}")
|
||||
```
|
||||
|
||||
## Cluster Analysis with Bio.Cluster
|
||||
|
||||
```python
|
||||
from Bio.Cluster import kcluster
|
||||
import numpy as np
|
||||
|
||||
# Sample data matrix (genes x conditions)
|
||||
data = np.array([
|
||||
[1.2, 0.8, 0.5, 1.5],
|
||||
[0.9, 1.1, 0.7, 1.3],
|
||||
[0.2, 0.3, 2.1, 2.5],
|
||||
[0.1, 0.4, 2.3, 2.2],
|
||||
])
|
||||
|
||||
# Perform k-means clustering
|
||||
clusterid, error, nfound = kcluster(data, nclusters=2)
|
||||
print(f"Cluster assignments: {clusterid}")
|
||||
print(f"Error: {error}")
|
||||
```
|
||||
|
||||
## Genome Diagrams with GenomeDiagram
|
||||
|
||||
```python
|
||||
from Bio.Graphics import GenomeDiagram
|
||||
from Bio.SeqFeature import SeqFeature, FeatureLocation
|
||||
from Bio import SeqIO
|
||||
from reportlab.lib import colors
|
||||
|
||||
# Read GenBank file
|
||||
record = SeqIO.read("sequence.gb", "genbank")
|
||||
|
||||
# Create diagram
|
||||
gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
|
||||
gd_track = gd_diagram.new_track(1, greytrack=True)
|
||||
gd_feature_set = gd_track.new_set()
|
||||
|
||||
# Add features
|
||||
for feature in record.features:
|
||||
if feature.type == "CDS":
|
||||
color = colors.blue
|
||||
elif feature.type == "gene":
|
||||
color = colors.lightblue
|
||||
else:
|
||||
color = colors.grey
|
||||
|
||||
gd_feature_set.add_feature(
|
||||
feature,
|
||||
color=color,
|
||||
label=True,
|
||||
label_size=6,
|
||||
label_angle=45
|
||||
)
|
||||
|
||||
# Draw and save
|
||||
gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
|
||||
gd_diagram.write("genome_diagram.pdf", "PDF")
|
||||
```
|
||||
|
||||
## Sequence Comparison with Bio.pairwise2
|
||||
|
||||
**Note**: Bio.pairwise2 is deprecated. Use Bio.Align.PairwiseAligner instead (see alignment.md).
|
||||
|
||||
However, for legacy code:
|
||||
|
||||
```python
|
||||
from Bio import pairwise2
|
||||
from Bio.pairwise2 import format_alignment
|
||||
|
||||
# Global alignment
|
||||
alignments = pairwise2.align.globalxx("ACCGT", "ACGT")
|
||||
|
||||
# Print top alignments
|
||||
for alignment in alignments[:3]:
|
||||
print(format_alignment(*alignment))
|
||||
```
|
||||
|
||||
## Working with PubChem
|
||||
|
||||
```python
|
||||
from Bio import Entrez
|
||||
|
||||
Entrez.email = "your.email@example.com"
|
||||
|
||||
# Search PubChem
|
||||
handle = Entrez.esearch(db="pccompound", term="aspirin")
|
||||
result = Entrez.read(handle)
|
||||
handle.close()
|
||||
|
||||
compound_id = result["IdList"][0]
|
||||
|
||||
# Get compound information
|
||||
handle = Entrez.efetch(db="pccompound", id=compound_id, retmode="xml")
|
||||
compound_data = handle.read()
|
||||
handle.close()
|
||||
```
|
||||
|
||||
## Sequence Features with Bio.SeqFeature
|
||||
|
||||
```python
|
||||
from Bio.Seq import Seq
from Bio.SeqFeature import FeatureLocation, SeqFeature
from Bio.SeqRecord import SeqRecord

# Create a feature. The strand is a property of the location; passing
# strand= to SeqFeature itself is deprecated/removed in current Biopython.
feature = SeqFeature(
    location=FeatureLocation(start=10, end=50, strand=1),
    type="CDS",
    qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]},
)

# Add feature to record
record = SeqRecord(Seq("ATCG" * 20), id="seq1")
record.features.append(feature)

# Extract feature sequence (bases 10..49 of the 80-base record)
feature_seq = feature.extract(record.seq)
print(feature_seq)
|
||||
```
|
||||
|
||||
## Sequence Ambiguity
|
||||
|
||||
```python
|
||||
from Bio.Data import IUPACData
|
||||
|
||||
# DNA ambiguity codes
|
||||
print(IUPACData.ambiguous_dna_letters)
|
||||
|
||||
# Protein ambiguity codes
|
||||
print(IUPACData.ambiguous_protein_letters)
|
||||
|
||||
# Resolve ambiguous bases
|
||||
print(IUPACData.ambiguous_dna_values["N"]) # Any base
|
||||
print(IUPACData.ambiguous_dna_values["R"]) # A or G
|
||||
```
|
||||
|
||||
## Quality Scores (FASTQ)
|
||||
|
||||
```python
|
||||
from Bio import SeqIO
|
||||
|
||||
# Read FASTQ with quality scores
|
||||
for record in SeqIO.parse("reads.fastq", "fastq"):
|
||||
print(f"ID: {record.id}")
|
||||
print(f"Sequence: {record.seq}")
|
||||
print(f"Quality: {record.letter_annotations['phred_quality']}")
|
||||
|
||||
# Calculate average quality
|
||||
avg_quality = sum(record.letter_annotations['phred_quality']) / len(record)
|
||||
print(f"Average quality: {avg_quality:.2f}")
|
||||
|
||||
# Filter by quality
|
||||
min_quality = min(record.letter_annotations['phred_quality'])
|
||||
if min_quality >= 20:
|
||||
print("High quality read")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use appropriate modules** - Choose the right tool for your analysis
|
||||
2. **Handle pseudocounts** - Important for motif analysis
|
||||
3. **Validate input data** - Check file formats and data quality
|
||||
4. **Consider performance** - Some operations can be computationally intensive
|
||||
5. **Cache results** - Store intermediate results for large analyses
|
||||
6. **Use proper genetic codes** - Select appropriate translation tables
|
||||
7. **Document parameters** - Record thresholds and settings used
|
||||
8. **Validate statistical results** - Understand limitations of tests
|
||||
9. **Handle edge cases** - Check for empty results or invalid input
|
||||
10. **Combine modules** - Leverage multiple Biopython tools together
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Find ORFs
|
||||
|
||||
```python
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqUtils import gc_fraction
|
||||
|
||||
def find_orfs(seq, min_length=100):
    """Find stop-to-stop open reading frames in all six reading frames.

    Each strand is translated in its three frames and the protein is split
    at stop codons (``*``). Every stretch of at least ``min_length // 3``
    residues is reported. Stretches are not required to begin with a start
    codon.

    Args:
        seq: Nucleotide sequence supporting ``reverse_complement()``,
            slicing and ``translate()`` (e.g. ``Bio.Seq.Seq``).
        min_length: Minimum ORF length in nucleotides.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``strand``, ``frame``,
        ``length`` and ``sequence``. ``start``/``end`` are 0-based,
        end-exclusive offsets into the strand that was translated: for
        ``strand == -1`` they index ``seq.reverse_complement()``, not
        ``seq`` itself.
    """
    min_codons = min_length // 3
    orfs = []

    for strand, nuc in ((+1, seq), (-1, seq.reverse_complement())):
        for frame in range(3):
            # Translate whole codons only: a trailing partial codon makes
            # translate() emit a BiopythonWarning and adds nothing to the
            # result.
            usable = max(0, len(nuc) - frame) // 3 * 3
            trans = nuc[frame:frame + usable].translate()
            trans_len = len(trans)

            aa_start = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # No further stop codon: the ORF runs to the end.
                    aa_end = trans_len

                if aa_end - aa_start >= min_codons:
                    start = frame + aa_start * 3
                    end = frame + aa_end * 3
                    orfs.append({
                        'start': start,
                        'end': end,
                        'strand': strand,
                        'frame': frame,
                        'length': end - start,
                        'sequence': nuc[start:end],
                    })

                # Resume scanning just past the stop codon.
                aa_start = aa_end + 1

    return orfs
|
||||
|
||||
# Use it
|
||||
record = SeqIO.read("sequence.fasta", "fasta")
|
||||
orfs = find_orfs(record.seq, min_length=300)
|
||||
for orf in orfs:
|
||||
print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
|
||||
```
|
||||
|
||||
### Analyze Codon Usage
|
||||
|
||||
```python
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqUtils import CodonUsage
|
||||
|
||||
def analyze_codon_usage(fasta_file):
    """Compute relative codon frequencies over all records in a FASTA file.

    Every record is read in frame from its first base; a trailing partial
    codon is ignored. Sequences are upper-cased first so that soft-masked
    (lower-case) residues are counted as the same codon.

    Args:
        fasta_file: Path or handle accepted by ``SeqIO.parse``.

    Returns:
        Dict mapping each observed codon to its share of all codons
        counted (values sum to 1). Empty if the file contains no complete
        codon.
    """
    from collections import Counter

    codon_counts = Counter()

    for record in SeqIO.parse(fasta_file, "fasta"):
        seq = str(record.seq).upper()
        usable = len(seq) - len(seq) % 3  # drop the incomplete last codon
        codon_counts.update(seq[i:i + 3] for i in range(0, usable, 3))

    total = sum(codon_counts.values())
    return {codon: count / total for codon, count in codon_counts.items()}
|
||||
```
|
||||
|
||||
### Calculate Sequence Complexity
|
||||
|
||||
```python
|
||||
def sequence_complexity(seq, k=2):
    """Return the normalized Shannon entropy of the k-mers in a DNA sequence.

    The entropy of the overlapping k-mer distribution is divided by the
    maximum possible for a 4-letter alphabet (``log2(4 ** k) == 2 * k``
    bits). The sequence is upper-cased first; otherwise mixed-case input
    would be treated as a larger alphabet and the result could exceed 1.

    Args:
        seq: DNA sequence (``str`` or any object whose ``str()`` is the
            sequence, e.g. ``Bio.Seq.Seq``).
        k: k-mer length.

    Returns:
        A float: 0.0 for a single repeated k-mer, 1.0 when all ``4 ** k``
        k-mers are equally frequent. Returns 0.0 when the sequence is
        shorter than ``k`` or ``k`` is not positive.
    """
    import math
    from collections import Counter

    text = str(seq).upper()
    counts = Counter(text[i:i + k] for i in range(len(text) - k + 1))
    total = sum(counts.values())

    # log2(4 ** k) without building a potentially huge integer.
    max_entropy = 2 * k
    if total == 0 or max_entropy <= 0:
        return 0.0

    probabilities = [count / total for count in counts.values()]
    entropy = sum(-p * math.log2(p) for p in probabilities)
    return entropy / max_entropy
|
||||
|
||||
# Use it
|
||||
from Bio.Seq import Seq
|
||||
seq = Seq("ATCGATCGATCGATCG")
|
||||
complexity = sequence_complexity(seq, k=2)
|
||||
print(f"Sequence complexity: {complexity:.3f}")
|
||||
```
|
||||
|
||||
### Extract Promoter Regions
|
||||
|
||||
```python
|
||||
def extract_promoters(genbank_file, upstream=500):
    """Extract the region immediately upstream of every ``gene`` feature.

    For reverse-strand genes (strand ``-1``) the window is taken after the
    feature's end on the record and reverse-complemented, so the returned
    sequence always reads 5'->3' toward the gene. Features with any other
    strand value (``1``, ``0`` or ``None``) are treated as forward-strand.
    Windows are clipped at the record boundaries; no wrap-around is
    applied for circular sequences.

    Args:
        genbank_file: Path or handle of a GenBank file containing exactly
            one record (it is read with ``SeqIO.read``).
        upstream: Window size in bases.

    Returns:
        A list of dicts with keys ``gene`` (first ``gene`` qualifier, or
        ``'Unknown'``), ``sequence``, ``start`` and ``end``. ``start`` and
        ``end`` are 0-based, end-exclusive coordinates on the record's
        forward strand.
    """
    from Bio import SeqIO

    record = SeqIO.read(genbank_file, "genbank")
    seq_len = len(record.seq)
    promoters = []

    for feature in record.features:
        if feature.type != "gene":
            continue

        location = feature.location
        # Read the strand from the location; SeqFeature.strand is deprecated.
        if location.strand == -1:
            # Reverse strand: upstream lies after the feature's end.
            start = int(location.end)
            end = min(seq_len, start + upstream)
            promoter_seq = record.seq[start:end].reverse_complement()
        else:
            # Forward (or unspecified) strand: upstream lies before the start.
            end = int(location.start)
            start = max(0, end - upstream)
            promoter_seq = record.seq[start:end]

        promoters.append({
            'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
            'sequence': promoter_seq,
            'start': start,
            'end': end,
        })

    return promoters
|
||||
```
|
||||
Reference in New Issue
Block a user