Files
gh-k-dense-ai-claude-scient…/skills/biopython/references/phylogenetics.md
2025-11-30 08:30:10 +08:00

14 KiB

Phylogenetics with Bio.Phylo

Overview

Bio.Phylo provides a unified toolkit for reading, writing, analyzing, and visualizing phylogenetic trees. It supports multiple file formats including Newick, NEXUS, phyloXML, NeXML, and CDAO.

Supported File Formats

  • Newick - Simple tree representation (most common)
  • NEXUS - Extended format with additional data
  • phyloXML - XML-based format with rich annotations
  • NeXML - Modern XML format
  • CDAO - Comparative Data Analysis Ontology

Reading and Writing Trees

Reading Trees

from Bio import Phylo

# Read a tree from file
tree = Phylo.read("tree.nwk", "newick")

# Parse multiple trees from a file
trees = list(Phylo.parse("trees.nwk", "newick"))
print(f"Found {len(trees)} trees")

Writing Trees

# Write tree to file
Phylo.write(tree, "output.nwk", "newick")

# Write multiple trees
Phylo.write(trees, "output.nex", "nexus")

Format Conversion

# Convert between formats
count = Phylo.convert("input.nwk", "newick", "output.xml", "phyloxml")
print(f"Converted {count} trees")

Tree Structure and Navigation

Basic Tree Components

Trees consist of:

  • Clade - A node (internal or terminal) in the tree
  • Terminal clades - Leaves/tips (taxa)
  • Internal clades - Internal nodes
  • Branch length - Evolutionary distance

Accessing Tree Properties

# Tree root
root = tree.root

# Terminal nodes (leaves)
terminals = tree.get_terminals()
print(f"Number of taxa: {len(terminals)}")

# Non-terminal nodes
nonterminals = tree.get_nonterminals()
print(f"Number of internal nodes: {len(nonterminals)}")

# All clades
all_clades = list(tree.find_clades())
print(f"Total clades: {len(all_clades)}")

Traversing Trees

# Iterate through all clades
for clade in tree.find_clades():
    if clade.name:
        print(f"Clade: {clade.name}, Branch length: {clade.branch_length}")

# Iterate through terminals only
for terminal in tree.get_terminals():
    print(f"Taxon: {terminal.name}")

# Depth-first traversal
for clade in tree.find_clades(order="preorder"):
    print(clade.name)

# Level-order (breadth-first) traversal
for clade in tree.find_clades(order="level"):
    print(clade.name)

Finding Specific Clades

# Find clade by name
clade = tree.find_any(name="Species_A")

# Find all clades matching criteria
def is_long_branch(clade):
    """Tell whether *clade* hangs on a branch longer than 0.5.

    A missing (None) or zero branch length is handed back unchanged, so the
    result is falsy for such clades; otherwise the comparison result (a bool)
    is returned.
    """
    length = clade.branch_length
    if not length:
        # None / 0 short-circuit: return the falsy value itself
        return length
    return length > 0.5

long_branches = tree.find_clades(is_long_branch)

Tree Analysis

Tree Statistics

# Total branch length
total_length = tree.total_branch_length()
print(f"Total tree length: {total_length:.3f}")

# Tree depth (root to furthest leaf)
depths = tree.depths()
max_depth = max(depths.values())
print(f"Maximum depth: {max_depth:.3f}")

# Terminal count
terminal_count = tree.count_terminals()
print(f"Number of taxa: {terminal_count}")

Distance Calculations

# Distance between two taxa
distance = tree.distance("Species_A", "Species_B")
print(f"Distance: {distance:.3f}")

# Create distance matrix
from Bio import Phylo

terminals = tree.get_terminals()
taxa_names = [t.name for t in terminals]

print("Distance Matrix:")
for taxon1 in taxa_names:
    row = []
    for taxon2 in taxa_names:
        if taxon1 == taxon2:
            row.append(0)
        else:
            dist = tree.distance(taxon1, taxon2)
            row.append(dist)
    print(f"{taxon1}: {row}")

Common Ancestors

# Find common ancestor of two clades
clade1 = tree.find_any(name="Species_A")
clade2 = tree.find_any(name="Species_B")
ancestor = tree.common_ancestor(clade1, clade2)
print(f"Common ancestor: {ancestor.name}")

# Find common ancestor of multiple clades
clades = [tree.find_any(name=n) for n in ["Species_A", "Species_B", "Species_C"]]
ancestor = tree.common_ancestor(*clades)

Tree Comparison

# Compare two trees by their pairwise leaf-to-leaf distances
def compare_trees(tree1, tree2, tolerance=0.01):
    """Compare two trees through the distances between every pair of taxa.

    Note that this checks branch-length distances, not topology alone: two
    trees with the same shape but different branch lengths are reported as
    different.

    Args:
        tree1: First tree (needs ``get_terminals()`` and ``distance()``).
        tree2: Second tree, same requirements.
        tolerance: Largest absolute distance discrepancy that still counts
            as equal. Defaults to 0.01.

    Returns:
        ``(False, "Different taxa")`` when the leaf name sets differ;
        otherwise ``(is_same, differences)`` where ``differences`` is a list
        of ``(taxon1, taxon2, dist1, dist2)`` tuples, ordered by taxon name.
    """
    taxa1 = {t.name for t in tree1.get_terminals()}
    taxa2 = {t.name for t in tree2.get_terminals()}

    # Distances are only comparable when both trees hold the same taxa
    if taxa1 != taxa2:
        return False, "Different taxa"

    # Walk the pairs in sorted order so the report is reproducible; iterating
    # the raw set would make the order depend on string hashing
    names = sorted(taxa1)
    differences = []
    for index, taxon1 in enumerate(names):
        for taxon2 in names[index + 1:]:
            dist1 = tree1.distance(taxon1, taxon2)
            dist2 = tree2.distance(taxon1, taxon2)
            if abs(dist1 - dist2) > tolerance:
                differences.append((taxon1, taxon2, dist1, dist2))

    return not differences, differences

Tree Manipulation

Pruning Trees

import copy

# Bio.Phylo trees have no .copy() method; deep-copy so the original tree
# is left untouched by the pruning below
tree_copy = copy.deepcopy(tree)

# Prune (remove) specific taxa
tree_copy.prune("Species_A")

# Keep only specific taxa
taxa_to_keep = {"Species_B", "Species_C", "Species_D"}
# get_terminals() returns a list, so pruning inside the loop does not
# disturb the iteration
terminals = tree_copy.get_terminals()
for terminal in terminals:
    if terminal.name not in taxa_to_keep:
        tree_copy.prune(terminal)

Collapsing Short Branches

# Collapse branches shorter than threshold
def collapse_short_branches(tree, threshold=0.01):
    """Zero out every branch whose length is below *threshold*.

    The tree is modified in place and returned. Only branch lengths are
    touched: no clade is removed, so the topology stays as it was. Clades
    with a missing (None) or already-zero length are skipped.
    """
    for node in tree.find_clades():
        length = node.branch_length
        if not length:
            continue
        if length < threshold:
            node.branch_length = 0
    return tree

Ladderizing Trees

# Ladderize tree (sort branches by size)
tree.ladderize()  # ascending order
tree.ladderize(reverse=True)  # descending order

Rerooting Trees

# Reroot at midpoint
tree.root_at_midpoint()

# Reroot with outgroup
outgroup = tree.find_any(name="Outgroup_Species")
tree.root_with_outgroup(outgroup)

# Reroot at internal node
# get_nonterminals() lists the current root first, and rooting on the root
# itself changes nothing, so pick an internal node below it
# (assumes the tree has at least two internal nodes)
internal = tree.get_nonterminals()[1]
tree.root_with_outgroup(internal)

Tree Visualization

Basic ASCII Drawing

# Draw tree to console
Phylo.draw_ascii(tree)

# Draw with custom format
Phylo.draw_ascii(tree, column_width=80)

Matplotlib Visualization

import matplotlib.pyplot as plt
from Bio import Phylo

# Simple plot
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes)
plt.show()

# Customize plot
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes, do_show=False)
axes.set_title("Phylogenetic Tree")
plt.tight_layout()
plt.savefig("tree.png", dpi=300)

Advanced Visualization Options

# Label each branch with its length (Phylo.draw has no radial/circular layout)
Phylo.draw(tree, branch_labels=lambda c: c.branch_length)

# Show branch support values
Phylo.draw(tree, label_func=lambda n: str(n.confidence) if n.confidence else "")

# Color branches
def color_by_length(clade):
    """Pick a colour name for *clade* from its branch length.

    Returns "red" above 0.5, "orange" above 0.2, and "black" otherwise
    (including when the branch length is missing or zero).
    """
    length = clade.branch_length
    if not length:
        return "black"
    if length > 0.5:
        return "red"
    return "orange" if length > 0.2 else "black"

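# Note: Phylo.draw honours each clade's `color` attribute, so apply the function
# before drawing, e.g. `for c in tree.find_clades(): c.color = color_by_length(c)`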

Building Trees

From Distance Matrix

from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

# Create distance matrix
# Rows are lower-triangular and each one ends with its diagonal entry (0),
# so row i holds i + 1 values
dm = DistanceMatrix(
    names=["Alpha", "Beta", "Gamma", "Delta"],
    matrix=[
        [0],
        [0.23, 0],
        [0.45, 0.34, 0],
        [0.67, 0.58, 0.29, 0],
    ],
)

# Build tree using UPGMA
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.draw_ascii(tree)

# Build tree using Neighbor-Joining
tree = constructor.nj(dm)

From Multiple Sequence Alignment

from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

# Read alignment
alignment = AlignIO.read("alignment.fasta", "fasta")

# Calculate distance matrix
calculator = DistanceCalculator("identity")
distance_matrix = calculator.get_distance(alignment)

# Build tree
constructor = DistanceTreeConstructor()
tree = constructor.upgma(distance_matrix)

# Write tree
Phylo.write(tree, "output_tree.nwk", "newick")

Distance Models

Available distance calculation models:

  • identity - Simple identity
  • blastn - BLASTN nucleotide scoring matrix
  • trans - Transition/transversion nucleotide scoring matrix
  • blosum62 - BLOSUM62 matrix
  • pam250 - PAM250 matrix
# Use different model
calculator = DistanceCalculator("blosum62")
dm = calculator.get_distance(alignment)

Consensus Trees

from Bio.Phylo.Consensus import majority_consensus, strict_consensus

# Read multiple trees
trees = list(Phylo.parse("bootstrap_trees.nwk", "newick"))

# Majority-rule consensus
consensus = majority_consensus(trees, cutoff=0.5)

# Strict consensus
strict_cons = strict_consensus(trees)

# Write consensus tree
Phylo.write(consensus, "consensus.nwk", "newick")

PhyloXML Features

PhyloXML format supports rich annotations:

from Bio.Phylo.PhyloXML import Phylogeny, Clade

# Create PhyloXML tree
tree = Phylogeny(rooted=True)
tree.name = "Example Tree"
tree.description = "A sample phylogenetic tree"

# Add clades with rich annotations
clade = Clade(branch_length=0.5)
clade.name = "Species_A"
clade.color = "red"
clade.width = 2.0

# Add taxonomy information
from Bio.Phylo.PhyloXML import Taxonomy
# Common names are passed as a list through the `common_names` argument
taxonomy = Taxonomy(scientific_name="Homo sapiens", common_names=["Human"])
clade.taxonomies.append(taxonomy)

Bootstrap Support

# Add bootstrap support values to tree
def add_bootstrap_support(tree, support_values):
    """Store support values as the confidence of the tree's internal nodes.

    Internal nodes are taken in the order ``tree.get_nonterminals()`` returns
    them and paired one-to-one with *support_values*; whichever sequence is
    longer has its surplus ignored. The tree is modified in place and
    returned.
    """
    pairs = zip(tree.get_nonterminals(), support_values)
    for internal, value in pairs:
        internal.confidence = value
    return tree

# Example
support_values = [95, 87, 76, 92]
tree_with_support = add_bootstrap_support(tree, support_values)

Best Practices

  1. Choose appropriate file format - Newick for simple trees, phyloXML for annotations
  2. Validate tree topology - Check for polytomies and negative branch lengths
  3. Root trees appropriately - Use midpoint or outgroup rooting
  4. Handle bootstrap values - Store as clade confidence
  5. Consider tree size - Large trees may need special handling
  6. Use tree copies - Call copy.deepcopy(tree) before modifications (Bio.Phylo trees have no .copy() method)
  7. Export publication-ready figures - Use matplotlib for high-quality output
  8. Document tree construction - Record alignment and parameters used
  9. Compare multiple trees - Use consensus methods for bootstrap trees
  10. Validate taxon names - Ensure consistent naming across files

Common Use Cases

Build Tree from Sequences

from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

# Read aligned sequences
alignment = AlignIO.read("sequences.aln", "clustal")

# Calculate distances
calculator = DistanceCalculator("identity")
dm = calculator.get_distance(alignment)

# Build neighbor-joining tree
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)

# Root at midpoint
tree.root_at_midpoint()

# Save tree
Phylo.write(tree, "tree.nwk", "newick")

# Visualize
import matplotlib.pyplot as plt
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
# Hand the axes to Phylo.draw; without them it opens its own figure and the
# one sized above stays empty
Phylo.draw(tree, axes=axes)
plt.show()

Extract Subtree

def extract_subtree(tree, taxa_list):
    """Return a copy of *tree* reduced to the taxa named in *taxa_list*.

    Args:
        tree: Source tree; it is not modified.
        taxa_list: Iterable of terminal names to keep.

    Returns:
        A deep copy of *tree* from which every terminal whose name is not in
        *taxa_list* has been pruned.
    """
    import copy

    # Build the lookup once so each membership test is O(1)
    keep = set(taxa_list)

    # Bio.Phylo trees have no .copy() method; deep-copy to protect the input
    subtree = copy.deepcopy(tree)

    # get_terminals() is evaluated once up front, so pruning inside the loop
    # does not disturb the iteration
    for terminal in subtree.get_terminals():
        if terminal.name not in keep:
            subtree.prune(terminal)

    return subtree

# Use it
subtree = extract_subtree(tree, ["Species_A", "Species_B", "Species_C"])
Phylo.write(subtree, "subtree.nwk", "newick")

Calculate Phylogenetic Diversity

def phylogenetic_diversity(tree, taxa_subset=None):
    """Return the sum of all branch lengths in the tree.

    When *taxa_subset* is given (and non-empty) the sum is taken over the
    tree returned by ``extract_subtree(tree, taxa_subset)`` instead of the
    full tree. Clades with a missing or zero branch length add nothing.
    """
    subject = extract_subtree(tree, taxa_subset) if taxa_subset else tree

    diversity = 0
    for node in subject.find_clades():
        length = node.branch_length
        if length:
            diversity += length

    return diversity

# Calculate PD for all taxa
pd_all = phylogenetic_diversity(tree)
print(f"Total phylogenetic diversity: {pd_all:.3f}")

# Calculate PD for subset
pd_subset = phylogenetic_diversity(tree, ["Species_A", "Species_B"])
print(f"Subset phylogenetic diversity: {pd_subset:.3f}")

Annotate Tree with External Data

def annotate_tree_from_csv(tree, csv_file):
    """Annotate tree leaves with data from a CSV file.

    The CSV needs a header row with a ``species`` column. For each terminal
    whose name equals a ``species`` value, every column of that row
    (including ``species`` itself) is set as an attribute on the terminal,
    with the column name as attribute name and the cell text as value. If a
    species occurs in several rows, the last row is used.

    Args:
        tree: Tree whose terminals are annotated in place.
        csv_file: Path of the CSV file to read.

    Returns:
        The same *tree* object.
    """
    import csv

    # newline="" leaves line-ending handling to the csv module, so quoted
    # fields that contain line breaks are read back unchanged
    with open(csv_file, newline="") as f:
        annotations = {row["species"]: row for row in csv.DictReader(f)}

    for terminal in tree.get_terminals():
        row = annotations.get(terminal.name)
        if row is None:
            # No annotation row for this leaf
            continue
        for key, value in row.items():
            setattr(terminal, key, value)

    return tree

Compare Tree Topologies

def robinson_foulds_distance(tree1, tree2):
    """Count the leaf groupings found in exactly one of the two trees.

    Each internal node is represented by the frozenset of the names of the
    terminals below it; the result is the size of the symmetric difference
    between the two trees' collections of such sets.
    """
    def leaf_sets(tree):
        # One frozenset of terminal names per internal node
        return {
            frozenset(leaf.name for leaf in node.get_terminals())
            for node in tree.get_nonterminals()
        }

    return len(leaf_sets(tree1) ^ leaf_sets(tree2))

# Use it
tree1 = Phylo.read("tree1.nwk", "newick")
tree2 = Phylo.read("tree2.nwk", "newick")
rf_dist = robinson_foulds_distance(tree1, tree2)
print(f"Robinson-Foulds distance: {rf_dist}")