# Advanced Biopython Features

## Sequence Motifs with Bio.motifs

### Creating Motifs

```python
from Bio import motifs
from Bio.Seq import Seq

# Create motif from instances
instances = [
    Seq("TACAA"),
    Seq("TACGC"),
    Seq("TACAC"),
    Seq("TACCC"),
    Seq("AACCC"),
    Seq("AATGC"),
    Seq("AATGC"),
]
motif = motifs.create(instances)
```

### Motif Consensus and Degenerate Sequences

```python
# Get consensus sequence
print(motif.counts.consensus)

# Get degenerate consensus (IUPAC ambiguity codes)
print(motif.counts.degenerate_consensus)

# Access counts matrix
print(motif.counts)
```

### Position Weight Matrix (PWM)

```python
# Create position weight matrix
pwm = motif.counts.normalize(pseudocounts=0.5)
print(pwm)

# Calculate information content
# The mean score of the log-odds matrix is the relative entropy of the motif
# against the background distribution, expressed in bits.
ic = pwm.log_odds().mean()
print(f"Information content: {ic:.2f} bits")
```

### Searching for Motifs

```python
from Bio.Seq import Seq

# Search sequence for motif
test_seq = Seq("ATACAGGACAGACATACGCATACAACATTACAC")

# Get Position Specific Scoring Matrix (PSSM)
pssm = pwm.log_odds()

# Search sequence
# Negative positions are hits on the reverse-complement strand.
for position, score in pssm.search(test_seq, threshold=5.0):
    print(f"Position {position}: score = {score:.2f}")
```

### Reading Motifs from Files

```python
# Read motif from JASPAR format
with open("motif.jaspar") as handle:
    motif = motifs.read(handle, "jaspar")

# Read multiple motifs
with open("motifs.jaspar") as handle:
    for m in motifs.parse(handle, "jaspar"):
        print(m.name)

# Supported formats: jaspar, meme, transfac, pfm
```

### Writing Motifs

```python
# Write motif in JASPAR format
# Use the built-in format(); the Motif.format() method is deprecated.
with open("output.jaspar", "w") as handle:
    handle.write(format(motif, "jaspar"))
```

## Population Genetics with Bio.PopGen

### Working with GenePop Files

```python
from Bio.PopGen import GenePop

# Read GenePop file
with open("data.gen") as handle:
    record = GenePop.read(handle)

# Access populations
print(f"Number of populations: {len(record.populations)}")
print(f"Loci: {record.loci_list}")

# Iterate through populations
for pop_idx, pop in enumerate(record.populations):
    print(f"\nPopulation {pop_idx + 1}:")
    for individual in pop:
        print(f" {individual[0]}: {individual[1]}")
```

### Calculating Population Statistics

```python
from Bio.PopGen.GenePop.Controller import GenePopController

# Create controller
# (wraps the external Genepop program, which must be installed separately)
ctrl = GenePopController()

# Calculate basic statistics
result = ctrl.calc_allele_genotype_freqs("data.gen")

# Calculate Fst
fst_result = ctrl.calc_fst_all("data.gen")
print(f"Fst: {fst_result}")

# Test Hardy-Weinberg equilibrium
hw_result = ctrl.test_hw_pop("data.gen", "probability")
```

## Sequence Utilities with Bio.SeqUtils

### GC Content

```python
from Bio.SeqUtils import gc_fraction
from Bio.Seq import Seq

seq = Seq("ATCGATCGATCG")
gc = gc_fraction(seq)
print(f"GC content: {gc:.2%}")
```

### Molecular Weight

```python
from Bio.SeqUtils import molecular_weight

# DNA molecular weight
dna_seq = Seq("ATCG")
mw = molecular_weight(dna_seq, seq_type="DNA")
print(f"DNA MW: {mw:.2f} g/mol")

# Protein molecular weight
protein_seq = Seq("ACDEFGHIKLMNPQRSTVWY")
mw = molecular_weight(protein_seq, seq_type="protein")
print(f"Protein MW: {mw:.2f} Da")
```

### Melting Temperature

```python
from Bio.SeqUtils import MeltingTemp as mt

# Calculate Tm using nearest-neighbor method
seq = Seq("ATCGATCGATCG")
tm = mt.Tm_NN(seq)
print(f"Tm: {tm:.1f}°C")

# Use different salt concentration
tm = mt.Tm_NN(seq, Na=50, Mg=1.5)  # 50 mM Na+, 1.5 mM Mg2+

# Wallace rule (for primers)
tm_wallace = mt.Tm_Wallace(seq)
```

### GC Skew

```python
# The function is spelled GC_skew (upper-case "GC"); there is no gc_skew.
from Bio.SeqUtils import GC_skew

# Calculate GC skew (one value per window along the sequence)
seq = Seq("ATCGATCGGGCCCAAATTT")
skew = GC_skew(seq, window=100)
print(f"GC skew: {skew}")
```

### ProtParam - Protein Analysis

```python
from Bio.SeqUtils.ProtParam import ProteinAnalysis

protein_seq = "ACDEFGHIKLMNPQRSTVWY"
analyzed_seq = ProteinAnalysis(protein_seq)

# Molecular weight
print(f"MW: {analyzed_seq.molecular_weight():.2f} Da")

# Isoelectric point
print(f"pI: {analyzed_seq.isoelectric_point():.2f}")

# Amino acid composition
print(f"Composition: {analyzed_seq.get_amino_acids_percent()}")

# Instability index
print(f"Instability: {analyzed_seq.instability_index():.2f}")

# Aromaticity
print(f"Aromaticity: {analyzed_seq.aromaticity():.2f}")

# Secondary structure fraction
ss = analyzed_seq.secondary_structure_fraction()
print(f"Helix: {ss[0]:.2%}, Turn: {ss[1]:.2%}, Sheet: {ss[2]:.2%}")

# Extinction coefficient
# NOTE(review): per the Biopython docs this returns two values, not one --
# (all Cys reduced, all Cys paired as disulfide-bonded cystines); confirm
# against the installed version before indexing the result.
print(f"Extinction coefficient: {analyzed_seq.molar_extinction_coefficient()}")

# Gravy (grand average of hydropathy)
print(f"GRAVY: {analyzed_seq.gravy():.3f}")
```

## Restriction Analysis with Bio.Restriction

```python
from Bio import Restriction
from Bio.Seq import Seq

# Analyze sequence for restriction sites
seq = Seq("GAATTCATCGATCGATGAATTC")

# Use specific enzyme
ecori = Restriction.EcoRI
sites = ecori.search(seq)
print(f"EcoRI sites at: {sites}")

# Use multiple enzymes
rb = Restriction.RestrictionBatch(["EcoRI", "BamHI", "PstI"])
results = rb.search(seq)
for enzyme, sites in results.items():
    if sites:
        print(f"{enzyme}: {sites}")

# Get all enzymes that cut sequence
all_enzymes = Restriction.Analysis(rb, seq)
print(f"Cutting enzymes: {all_enzymes.with_sites()}")
```

## Sequence Translation Tables

```python
from Bio.Data import CodonTable

# Standard genetic code
standard_table = CodonTable.unambiguous_dna_by_id[1]
print(standard_table)

# Mitochondrial code
mito_table = CodonTable.unambiguous_dna_by_id[2]

# Get specific codon
print(f"ATG codes for: {standard_table.forward_table['ATG']}")

# Get stop codons
print(f"Stop codons: {standard_table.stop_codons}")

# Get start codons
print(f"Start codons: {standard_table.start_codons}")
```

## Cluster Analysis with Bio.Cluster

```python
from Bio.Cluster import kcluster
import numpy as np

# Sample data matrix (genes x conditions)
data = np.array([
    [1.2, 0.8, 0.5, 1.5],
    [0.9, 1.1, 0.7, 1.3],
    [0.2, 0.3, 2.1, 2.5],
    [0.1, 0.4, 2.3, 2.2],
])

# Perform k-means clustering
clusterid, error, nfound = kcluster(data, nclusters=2)
print(f"Cluster assignments: {clusterid}")
print(f"Error: {error}")
```

## Genome Diagrams with GenomeDiagram

```python
from Bio.Graphics import GenomeDiagram
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio import SeqIO
from reportlab.lib import colors

# Read GenBank file
record = SeqIO.read("sequence.gb", "genbank")

# Create diagram
gd_diagram = GenomeDiagram.Diagram("Genome Diagram")
gd_track = gd_diagram.new_track(1, greytrack=True)
gd_feature_set = gd_track.new_set()

# Add features
for feature in record.features:
    if feature.type == "CDS":
        color = colors.blue
    elif feature.type == "gene":
        color = colors.lightblue
    else:
        color = colors.grey

    gd_feature_set.add_feature(
        feature,
        color=color,
        label=True,
        label_size=6,
        label_angle=45
    )

# Draw and save
gd_diagram.draw(format="linear", pagesize="A4", fragments=1)
gd_diagram.write("genome_diagram.pdf", "PDF")
```

## Sequence Comparison with Bio.pairwise2

**Note**: Bio.pairwise2 is deprecated. Use Bio.Align.PairwiseAligner instead (see alignment.md).
However, for legacy code:

```python
from Bio import pairwise2
from Bio.pairwise2 import format_alignment

# Global alignment
alignments = pairwise2.align.globalxx("ACCGT", "ACGT")

# Print top alignments
for alignment in alignments[:3]:
    print(format_alignment(*alignment))
```

## Working with PubChem

```python
from Bio import Entrez

Entrez.email = "your.email@example.com"

# Search PubChem
handle = Entrez.esearch(db="pccompound", term="aspirin")
result = Entrez.read(handle)
handle.close()

compound_id = result["IdList"][0]

# Get compound information
handle = Entrez.efetch(db="pccompound", id=compound_id, retmode="xml")
compound_data = handle.read()
handle.close()
```

## Sequence Features with Bio.SeqFeature

```python
from Bio.SeqFeature import SeqFeature, FeatureLocation
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Create a feature
# The strand belongs to the location; SeqFeature() no longer accepts a
# separate strand argument in current Biopython releases.
feature = SeqFeature(
    location=FeatureLocation(start=10, end=50, strand=1),
    type="CDS",
    qualifiers={"gene": ["ABC1"], "product": ["ABC protein"]}
)

# Add feature to record
record = SeqRecord(Seq("ATCG" * 20), id="seq1")
record.features.append(feature)

# Extract feature sequence
feature_seq = feature.extract(record.seq)
print(feature_seq)
```

## Sequence Ambiguity

```python
from Bio.Data import IUPACData

# DNA ambiguity codes
print(IUPACData.ambiguous_dna_letters)

# Protein ambiguity codes
print(IUPACData.ambiguous_protein_letters)

# Resolve ambiguous bases
print(IUPACData.ambiguous_dna_values["N"])  # Any base
print(IUPACData.ambiguous_dna_values["R"])  # A or G
```

## Quality Scores (FASTQ)

```python
from Bio import SeqIO

# Read FASTQ with quality scores
for record in SeqIO.parse("reads.fastq", "fastq"):
    quals = record.letter_annotations["phred_quality"]

    print(f"ID: {record.id}")
    print(f"Sequence: {record.seq}")
    print(f"Quality: {quals}")

    # Zero-length reads have no average or minimum quality; skip them
    # instead of failing with ZeroDivisionError / ValueError.
    if not quals:
        continue

    # Calculate average quality
    avg_quality = sum(quals) / len(quals)
    print(f"Average quality: {avg_quality:.2f}")

    # Filter by quality
    min_quality = min(quals)
    if min_quality >= 20:
        print("High quality read")
```

## Best Practices

1. **Use appropriate modules** - Choose the right tool for your analysis
2. **Handle pseudocounts** - Important for motif analysis
3. **Validate input data** - Check file formats and data quality
4. **Consider performance** - Some operations can be computationally intensive
5. **Cache results** - Store intermediate results for large analyses
6. **Use proper genetic codes** - Select appropriate translation tables
7. **Document parameters** - Record thresholds and settings used
8. **Validate statistical results** - Understand limitations of tests
9. **Handle edge cases** - Check for empty results or invalid input
10. **Combine modules** - Leverage multiple Biopython tools together

## Common Use Cases

### Find ORFs

```python
from Bio import SeqIO
from Bio.SeqUtils import gc_fraction


def find_orfs(seq, min_length=100):
    """Find all stop-to-stop open reading frames on both strands.

    An ORF here is any stretch between stop codons (or the sequence ends);
    a start codon is not required.

    Args:
        seq: Nucleotide ``Seq`` to scan.
        min_length: Minimum ORF length in nucleotides.

    Returns:
        A list of dicts with keys ``start``, ``end``, ``strand``, ``frame``,
        ``length`` and ``sequence``. For ``strand == -1`` the coordinates
        refer to the reverse-complemented sequence, not to ``seq`` itself.
    """
    orfs = []
    min_codons = min_length // 3
    for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
        for frame in range(3):
            # Translate whole codons only; a trailing partial codon makes
            # translate() emit a warning.
            coding_len = max(0, len(nuc) - frame) // 3 * 3
            trans = nuc[frame:frame + coding_len].translate()
            trans_len = len(trans)
            aa_start = 0
            while aa_start < trans_len:
                aa_end = trans.find("*", aa_start)
                if aa_end == -1:
                    # No further stop codon: the ORF runs to the end
                    aa_end = trans_len
                if aa_end - aa_start >= min_codons:
                    start = frame + aa_start * 3
                    end = frame + aa_end * 3
                    orfs.append({
                        'start': start,
                        'end': end,
                        'strand': strand,
                        'frame': frame,
                        'length': end - start,
                        'sequence': nuc[start:end]
                    })
                # Resume scanning just past the stop codon
                aa_start = aa_end + 1
    return orfs


# Use it
record = SeqIO.read("sequence.fasta", "fasta")
orfs = find_orfs(record.seq, min_length=300)
for orf in orfs:
    print(f"ORF: {orf['start']}-{orf['end']}, strand={orf['strand']}, length={orf['length']}")
```

### Analyze Codon Usage

```python
from collections import Counter

from Bio import SeqIO


def analyze_codon_usage(fasta_file):
    """Compute relative codon frequencies over all records in a FASTA file.

    Each record is read in frame 0; trailing bases that do not form a
    complete codon are ignored. Codons are upper-cased so that soft-masked
    (lower-case) sequence is counted together with upper-case sequence.

    Args:
        fasta_file: Path or handle of a FASTA file of coding sequences.

    Returns:
        A dict mapping each observed codon to its fraction of all codons
        (empty if the file contains no complete codon).
    """
    codon_counts = Counter()

    for record in SeqIO.parse(fasta_file, "fasta"):
        seq = str(record.seq).upper()
        # Ensure sequence is multiple of 3
        usable = len(seq) - len(seq) % 3
        # Count codons
        codon_counts.update(seq[i:i + 3] for i in range(0, usable, 3))

    # Calculate frequencies
    total = sum(codon_counts.values())
    return {codon: count / total for codon, count in codon_counts.items()}
```

### Calculate Sequence Complexity

```python
def sequence_complexity(seq, k=2):
    """Calculate k-mer complexity (Shannon entropy).

    Args:
        seq: Sequence (``Seq`` or ``str``) to analyse.
        k: k-mer size.

    Returns:
        The Shannon entropy of the overlapping k-mer distribution divided by
        ``log2(4 ** k)``, the maximum for a DNA alphabet; 0 when the sequence
        is shorter than ``k`` or when ``k`` is 0.
    """
    import math
    from collections import Counter

    # Generate k-mers
    kmers = [str(seq[i:i + k]) for i in range(len(seq) - k + 1)]

    # Count k-mers
    counts = Counter(kmers)
    total = len(kmers)

    # Calculate entropy
    entropy = 0
    for count in counts.values():
        freq = count / total
        entropy -= freq * math.log2(freq)

    # Normalize by maximum possible entropy
    max_entropy = math.log2(4 ** k)  # For DNA

    return entropy / max_entropy if max_entropy > 0 else 0


# Use it
from Bio.Seq import Seq

seq = Seq("ATCGATCGATCGATCG")
complexity = sequence_complexity(seq, k=2)
print(f"Sequence complexity: {complexity:.3f}")
```

### Extract Promoter Regions

```python
def extract_promoters(genbank_file, upstream=500):
    """Extract promoter regions upstream of genes.

    Args:
        genbank_file: Path or handle of a GenBank file holding one record.
        upstream: Number of bases to take upstream of each gene; the region
            is clipped at the ends of the record.

    Returns:
        A list of dicts with keys ``gene``, ``sequence``, ``start`` and
        ``end``. Coordinates are on the forward strand of the record; the
        sequence of a reverse-strand gene is reverse-complemented so that it
        reads 5'->3' towards the gene.
    """
    from Bio import SeqIO

    record = SeqIO.read(genbank_file, "genbank")
    promoters = []

    for feature in record.features:
        if feature.type != "gene":
            continue

        # Read the strand from the location; SeqFeature.strand is deprecated.
        location = feature.location
        if location.strand == -1:
            # Reverse strand: upstream lies beyond the feature's end
            start = int(location.end)
            end = min(len(record.seq), start + upstream)
            promoter_seq = record.seq[start:end].reverse_complement()
        else:
            # Forward strand (unstranded features are treated as forward)
            end = int(location.start)
            start = max(0, end - upstream)
            promoter_seq = record.seq[start:end]

        promoters.append({
            'gene': feature.qualifiers.get('gene', ['Unknown'])[0],
            'sequence': promoter_seq,
            'start': start,
            'end': end
        })

    return promoters
```