Files
2025-11-30 08:30:10 +08:00

18 KiB

ETE Toolkit Common Workflows

This document provides complete workflows for common tasks using the ETE Toolkit.

Table of Contents

  1. Basic Tree Operations
  2. Phylogenetic Analysis
  3. Tree Comparison
  4. Taxonomy Integration
  5. Clustering Analysis
  6. Tree Visualization
  7. Advanced Workflows

Basic Tree Operations

Loading and Exploring a Tree

from ete3 import Tree

# Parse the Newick file into a tree object
tree = Tree("my_tree.nw", format=1)

# Text drawing of the topology, internal nodes included
ascii_art = tree.get_ascii(show_internal=True)
print(ascii_art)

# Summary numbers: leaf count, total node count, and the distance to the
# farthest leaf (second element of the get_farthest_leaf() result)
leaf_count = len(tree)
node_count = len(list(tree.traverse()))
farthest_dist = tree.get_farthest_leaf()[1]
print(f"Number of leaves: {leaf_count}")
print(f"Total nodes: {node_count}")
print(f"Tree depth: {farthest_dist}")

# Iterating a tree yields its leaves; print each name on its own line
for tip in tree:
    print(tip.name)

Extracting and Saving Subtrees

from ete3 import Tree

tree = Tree("full_tree.nw")

# Copy the first node named "MyNode" (raises IndexError if nothing matches)
matches = tree.search_nodes(name="MyNode")
subtree = matches[0].copy()

# Write the copied subtree to its own file
subtree.write(outfile="subtree.nw", format=1)

# Copy the clade rooted at the common ancestor of the listed species
species_of_interest = ["species1", "species2", "species3"]
clade = tree.get_common_ancestor(species_of_interest).copy()
clade.write(outfile="clade.nw")

Pruning Trees to Specific Taxa

from ete3 import Tree

tree = Tree("large_tree.nw")

# Leaves that must remain after pruning
kept_taxa = ["taxon1", "taxon2", "taxon3", "taxon4"]

# Prune in place, asking ETE to preserve branch lengths
tree.prune(kept_taxa, preserve_branch_length=True)

# Write the reduced tree
tree.write(outfile="pruned_tree.nw")

Rerooting Trees

from ete3 import Tree

tree = Tree("unrooted_tree.nw")

# Method 1: root on a named outgroup node
outgroup_node = tree & "Outgroup_species"
tree.set_outgroup(outgroup_node)

# Method 2: midpoint rooting
# (run after Method 1, this replaces the outgroup chosen above)
tree.set_outgroup(tree.get_midpoint_outgroup())

# Write the rooted tree
tree.write(outfile="rooted_tree.nw")

Annotating Nodes with Custom Data

from ete3 import Tree

tree = Tree("tree.nw")

# Per-leaf annotations keyed by leaf name
metadata = {
    "species1": {"habitat": "marine", "temperature": 20},
    "species2": {"habitat": "freshwater", "temperature": 15},
}

# Attach every key/value pair as a node feature on the matching leaf
for leaf in tree:
    annotations = metadata.get(leaf.name)
    if annotations is not None:
        leaf.add_features(**annotations)

# Report only the leaves that actually received annotations
for leaf in tree:
    if not hasattr(leaf, "habitat"):
        continue
    print(f"{leaf.name}: {leaf.habitat}, {leaf.temperature}°C")

# Write the tree with the custom features included in the output
exported_features = ["habitat", "temperature"]
tree.write(outfile="annotated_tree.nhx", features=exported_features)

Modifying Tree Topology

from ete3 import Tree

tree = Tree("tree.nw")

# Remove a clade: detach the named node from the tree
(tree & "unwanted_clade").detach()

# Collapse a node: delete() removes the node itself but keeps its children
(tree & "low_support_node").delete()

# Add a new leaf under an existing node with branch length 0.5
parent_clade = tree & "target_node"
new_leaf = parent_clade.add_child(name="new_species", dist=0.5)

# Resolve polytomies throughout the tree
tree.resolve_polytomy(recursive=True)

# Write the edited tree
tree.write(outfile="modified_tree.nw")

Phylogenetic Analysis

Complete Gene Tree Analysis with Alignment

from ete3 import PhyloTree

# Load the gene tree, then attach the FASTA alignment to it
tree = PhyloTree("gene_tree.nw", format=1)
tree.link_to_alignment("alignment.fasta", alg_format="fasta")


def extract_species(node_name):
    """Return the part of *node_name* before the first underscore."""
    species_code, _, _ = node_name.partition("_")
    return species_code


# Tell the tree how to derive a species code from each leaf name
tree.set_species_naming_function(extract_species)

# Show each leaf with its species and the first 50 sequence characters
for leaf in tree:
    sequence_preview = leaf.sequence[:50]
    print(f"{leaf.name} ({leaf.species})")
    print(f"Sequence: {sequence_preview}...")

Detecting Duplication and Speciation Events

from ete3 import PhyloTree, Tree

# Load gene tree
gene_tree = PhyloTree("gene_tree.nw")

# Set species naming: species code = text before the first underscore
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])

# Option 1: Species Overlap algorithm (no species tree needed).
# Besides returning the event list, this annotates the internal nodes of
# gene_tree with an "evoltype" feature ("D" or "S").
events = gene_tree.get_descendant_evol_events()

# Option 2: Tree reconciliation (requires species tree).
# get_descendant_evol_events() does not accept a species tree; reconciliation
# is a separate method that returns the reconciled tree and its events.
# The species tree is loaded as a PhyloTree so its nodes carry species
# information, and its leaf names are used directly as species codes
# (they must match the codes produced by the gene-tree naming function).
species_tree = PhyloTree("species_tree.nw")
species_tree.set_species_naming_function(lambda name: name)
recon_tree, recon_events = gene_tree.reconcile(species_tree)

# Analyze events annotated on the gene tree
# (traverse recon_tree instead to inspect the reconciled topology)
duplications = 0
speciations = 0

for node in gene_tree.traverse():
    evoltype = getattr(node, "evoltype", None)
    if evoltype == "D":
        duplications += 1
        print(f"Duplication at node {node.name}")
    elif evoltype == "S":
        speciations += 1

print(f"\nTotal duplications: {duplications}")
print(f"Total speciations: {speciations}")

Extracting Orthologs and Paralogs

from ete3 import PhyloTree

gene_tree = PhyloTree("gene_tree.nw")
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])

# Detect evolutionary events
events = gene_tree.get_descendant_evol_events()

# Find all orthologs to a query gene.
# Looking the node up first makes a misspelled name fail early.
query_gene = gene_tree & "species1_gene1"
query_name = query_gene.name

# Each event splits sequences into two sides (in_seqs / out_seqs), and both
# sides hold leaf *names* (strings), so membership is tested with the name.
orthologs = set()
paralogs = set()

for event in events:
    if query_name in event.in_seqs:
        partners = event.out_seqs
    elif query_name in event.out_seqs:
        partners = event.in_seqs
    else:
        # The query gene is not involved in this event
        continue

    if event.etype == "S":  # Speciation
        orthologs.update(partners)
    elif event.etype == "D":  # Duplication
        paralogs.update(partners)

print(f"Orthologs of {query_name}:")
for ortholog_name in sorted(orthologs):
    print(f"  {ortholog_name}")

print(f"\nParalogs of {query_name}:")
for paralog_name in sorted(paralogs):
    print(f"  {paralog_name}")

Splitting Gene Families by Duplication Events

from ete3 import PhyloTree

gene_tree = PhyloTree("gene_family.nw")
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
gene_tree.get_descendant_evol_events()

# Cut the family tree at its duplication nodes
subfamilies = gene_tree.split_by_dups()
subfamily_count = len(subfamilies)

print(f"Gene family split into {subfamily_count} subfamilies")

# Write each piece to its own file and report its size
for index, family in enumerate(subfamilies):
    family.write(outfile=f"subfamily_{index}.nw")
    species_seen = {member.species for member in family}
    print(f"Subfamily {index}: {len(family)} genes from {len(species_seen)} species")

Collapsing Lineage-Specific Expansions

from ete3 import PhyloTree

gene_tree = PhyloTree("expanded_tree.nw")
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])

# Collapse lineage-specific duplications.
# By default the method works on a copy and returns it, leaving gene_tree
# untouched, so the result must be captured and used from here on.
collapsed_tree = gene_tree.collapse_lineage_specific_expansions()

print("After collapsing expansions:")
print(collapsed_tree.get_ascii())

collapsed_tree.write(outfile="collapsed_tree.nw")

Testing Monophyly

from ete3 import Tree

tree = Tree("tree.nw")

# Test if a group is monophyletic.
# check_monophyly returns (is_monophyletic, clade_type, breaking_leaves);
# the third item is the set of leaves that break monophyly, not an MRCA node.
target_species = ["species1", "species2", "species3"]
is_mono, clade_type, breaking_leaves = tree.check_monophyly(
    values=target_species,
    target_attr="name"
)

if is_mono:
    # The MRCA has to be looked up separately
    mrca = tree.get_common_ancestor(target_species)
    print("Group is monophyletic")
    print(f"MRCA: {mrca.name}")
elif clade_type == "paraphyletic":
    print("Group is paraphyletic")
elif clade_type == "polyphyletic":
    print("Group is polyphyletic")

if not is_mono:
    intruders = sorted(leaf.name for leaf in breaking_leaves)
    print(f"Leaves breaking monophyly: {', '.join(intruders)}")

# Get all monophyletic clades of a specific type
# Annotate leaves first
for leaf in tree:
    leaf_type = "typeA" if leaf.name.startswith("species") else "typeB"
    leaf.add_feature("type", leaf_type)

# get_monophyletic yields nodes lazily; materialize it before calling len()
mono_clades = list(tree.get_monophyletic(values=["typeA"], target_attr="type"))
print(f"Found {len(mono_clades)} monophyletic clades of typeA")

Tree Comparison

Computing Robinson-Foulds Distance

from ete3 import Tree

tree1 = Tree("tree1.nw")
tree2 = Tree("tree2.nw")

# Compute RF distance.
# ETE3 returns extra trailing values (discarded-edge sets) after the first
# five, so slice the result instead of unpacking it directly.
rf, max_rf, common_leaves, parts_t1, parts_t2 = tree1.robinson_foulds(tree2)[:5]

# Guard the normalization: max_rf is 0 when there are no comparable partitions
norm_rf = rf / max_rf if max_rf > 0 else 0

print(f"Robinson-Foulds distance: {rf}")
print(f"Maximum RF distance: {max_rf}")
print(f"Normalized RF: {norm_rf:.3f}")
print(f"Common leaves: {len(common_leaves)}")

# Find unique partitions
unique_in_t1 = parts_t1 - parts_t2
unique_in_t2 = parts_t2 - parts_t1

print(f"\nPartitions unique to tree1: {len(unique_in_t1)}")
print(f"Partitions unique to tree2: {len(unique_in_t2)}")

Comparing Multiple Trees

from ete3 import Tree
import numpy as np

# Load multiple trees
tree_files = ["tree1.nw", "tree2.nw", "tree3.nw", "tree4.nw"]
trees = [Tree(f) for f in tree_files]

# Create a symmetric matrix of normalized RF distances
n = len(trees)
dist_matrix = np.zeros((n, n))

for i in range(n):
    for j in range(i+1, n):
        # Only the first two return values are needed; ETE3 returns more
        # than five items, so index the result rather than unpacking it.
        rf, max_rf = trees[i].robinson_foulds(trees[j])[:2]
        norm_rf = rf / max_rf if max_rf > 0 else 0
        dist_matrix[i, j] = norm_rf
        dist_matrix[j, i] = norm_rf

print("Normalized RF distance matrix:")
print(dist_matrix)

# Find most similar pair (upper triangle only, so i < j)
min_dist = float('inf')
best_pair = None

for i in range(n):
    for j in range(i+1, n):
        if dist_matrix[i, j] < min_dist:
            min_dist = dist_matrix[i, j]
            best_pair = (i, j)

# best_pair stays None when fewer than two trees were loaded
if best_pair is not None:
    print(f"\nMost similar trees: {tree_files[best_pair[0]]} and {tree_files[best_pair[1]]}")
    print(f"Distance: {min_dist:.3f}")

Finding Consensus Topology

from ete3 import Tree

# Load multiple bootstrap trees
bootstrap_trees = [Tree(f"bootstrap_{i}.nw") for i in range(100)]

# Get reference tree (first tree); it is only the comparison anchor
ref_tree = bootstrap_trees[0].copy()

# Count how many bootstrap trees contain each bipartition
bipartition_counts = {}

for tree in bootstrap_trees:
    # ETE3 returns more than five values; the partitions of the compared
    # tree are the fifth item (index 4).
    parts_tree = ref_tree.robinson_foulds(tree)[4]
    for partition in parts_tree:
        bipartition_counts[partition] = bipartition_counts.get(partition, 0) + 1

# Filter by support threshold
threshold = 70  # 70% support
supported_bipartitions = {
    k: v for k, v in bipartition_counts.items()
    if (v / len(bootstrap_trees)) * 100 >= threshold
}

# The filter is inclusive, so the message says ">=" rather than ">"
print(f"Bipartitions with >={threshold}% support: {len(supported_bipartitions)}")

Taxonomy Integration

Building Species Trees from NCBI Taxonomy

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# Scientific names of the species to include
species = ["Homo sapiens", "Pan troglodytes", "Gorilla gorilla",
           "Mus musculus", "Rattus norvegicus"]

# Translate names to taxids, taking the first taxid listed for each name
name2taxid = ncbi.get_name_translator(species)
taxids = []
for sp in species:
    taxids.append(name2taxid[sp][0])

# Ask the taxonomy database for the topology connecting those taxids
tree = ncbi.get_topology(taxids)

# Print the taxonomy info of every node that carries a scientific name
for node in tree.traverse():
    if not hasattr(node, "sci_name"):
        continue
    print(f"{node.sci_name} - Rank: {node.rank} - TaxID: {node.taxid}")

# Write the species tree
tree.write(outfile="species_tree.nw")

Annotating Existing Tree with NCBI Taxonomy

from ete3 import Tree, NCBITaxa

tree = Tree("species_tree.nw")
ncbi = NCBITaxa()

# Leaf name -> scientific name (edit to match your own tree)
leaf_to_species = {
    "Hsap_gene1": "Homo sapiens",
    "Ptro_gene1": "Pan troglodytes",
    "Mmur_gene1": "Microcebus murinus",
}

# Resolve every distinct species name to its taxids in a single query
all_species = list(set(leaf_to_species.values()))
name2taxid = ncbi.get_name_translator(all_species)

# Attach species, taxid and named lineage to each mapped leaf
for leaf in tree:
    if leaf.name not in leaf_to_species:
        continue

    species_name = leaf_to_species[leaf.name]
    taxid = name2taxid[species_name][0]

    leaf.add_feature("species", species_name)
    leaf.add_feature("taxid", taxid)

    # Lineage taxids, translated to names in lineage order
    lineage = ncbi.get_lineage(taxid)
    names = ncbi.get_taxid_translator(lineage)
    lineage_names = [names[t] for t in lineage]
    leaf.add_feature("lineage", lineage_names)

    print(f"{leaf.name}: {species_name} (taxid: {taxid})")

Querying NCBI Taxonomy

from ete3 import NCBITaxa

ncbi = NCBITaxa()

# Look up the taxid of "Primates", then list its descendant taxa
primates_hits = ncbi.get_name_translator(["Primates"])
primates_taxid = primates_hits["Primates"][0]
all_primates = ncbi.get_descendant_taxa(primates_taxid, collapse_subspecies=True)

print(f"Total primate species: {len(all_primates)}")

# Name and rank for the first ten descendants
first_ten = all_primates[:10]
taxid2name = ncbi.get_taxid_translator(first_ten)
for taxid, name in taxid2name.items():
    rank_lookup = ncbi.get_rank([taxid])
    rank = rank_lookup[taxid]
    print(f"{name} ({rank})")

# Full lineage of a single species (9606 is labelled "human" here)
human_taxid = 9606
lineage = ncbi.get_lineage(human_taxid)
ranks = ncbi.get_rank(lineage)
names = ncbi.get_taxid_translator(lineage)

print("\nHuman lineage:")
for taxid in lineage:
    rank_label = ranks[taxid]
    print(f"{rank_label:15s} {names[taxid]}")

Clustering Analysis

Analyzing Hierarchical Clustering Results

from ete3 import ClusterTree

# Load clustering tree with data matrix (tab-separated, one row per leaf)
matrix = """#Names\tSample1\tSample2\tSample3\tSample4
Gene1\t1.5\t2.3\t0.8\t1.2
Gene2\t0.9\t1.1\t1.8\t2.1
Gene3\t2.1\t2.5\t0.5\t0.9
Gene4\t0.7\t0.9\t2.2\t2.4"""

tree = ClusterTree("((Gene1,Gene2),(Gene3,Gene4));", text_array=matrix)

# Calculate cluster quality metrics for every internal node
for node in tree.traverse():
    if not node.is_leaf():
        # get_silhouette() returns three values at once:
        # (silhouette, intracluster distance, intercluster distance)
        silhouette, intra, inter = node.get_silhouette()

        # get_dunn() needs the clusters to compare; here the node's own
        # children are used as the competing clusters
        dunn = node.get_dunn(node.get_children())

        print(f"Node: {node.name}")
        print(f"  Silhouette: {silhouette:.3f}")
        print(f"  Dunn index: {dunn:.3f}")
        print(f"  Intercluster distance: {inter:.3f}")
        print(f"  Intracluster distance: {intra:.3f}")

Validating Clusters

from ete3 import ClusterTree
from ete3.clustering import clustvalidation

matrix = """#Names\tCol1\tCol2\tCol3
ItemA\t1.2\t0.5\t0.8
ItemB\t1.3\t0.6\t0.9
ItemC\t0.1\t2.5\t2.3
ItemD\t0.2\t2.6\t2.4"""

tree = ClusterTree("((ItemA,ItemB),(ItemC,ItemD));", text_array=matrix)

# Test different distance metrics.
# get_silhouette() expects a distance *function* (fdist), not a metric name,
# so each label is mapped to the matching function from clustvalidation.
metrics = {
    "euclidean": clustvalidation.euclidean_dist,
    "pearson": clustvalidation.pearson_dist,
    "spearman": clustvalidation.spearman_dist,
}

for metric, distance_fn in metrics.items():
    print(f"\nUsing {metric} distance:")

    for node in tree.traverse():
        if not node.is_leaf():
            # Only the first of the three returned values is the silhouette
            silhouette = node.get_silhouette(fdist=distance_fn)[0]

            # Positive silhouette = good clustering
            # Negative silhouette = poor clustering
            quality = "good" if silhouette > 0 else "poor"

            print(f"  Cluster {node.name}: {silhouette:.3f} ({quality})")

Tree Visualization

Basic Tree Rendering

from ete3 import Tree, TreeStyle

tree = Tree("tree.nw")

# Rendering options: leaf names, branch lengths and support values shown
ts = TreeStyle()
ts.show_leaf_name = True
ts.show_branch_length = True
ts.show_branch_support = True
ts.scale = 50  # pixels per branch length unit

# Render the same styled tree to three output formats;
# the PNG additionally gets an explicit pixel size
tree.render("tree_output.pdf", tree_style=ts)
png_size = {"w": 800, "h": 600, "units": "px"}
tree.render("tree_output.png", tree_style=ts, **png_size)
tree.render("tree_output.svg", tree_style=ts)

Customizing Node Appearance

from ete3 import Tree, TreeStyle, NodeStyle

tree = Tree("tree.nw")

# Give every node its own style object
for node in tree.traverse():
    nstyle = NodeStyle()

    # Leaves: larger blue markers; internal nodes: smaller red markers
    color, marker_size = ("blue", 10) if node.is_leaf() else ("red", 5)
    nstyle["fgcolor"] = color
    nstyle["size"] = marker_size

    # Nodes with support above 0.9 are drawn as spheres, the rest as circles
    nstyle["shape"] = "sphere" if node.support > 0.9 else "circle"

    node.set_style(nstyle)

# Render with an otherwise default tree style
ts = TreeStyle()
tree.render("styled_tree.pdf", tree_style=ts)

Adding Faces to Nodes

from ete3 import Tree, TreeStyle, TextFace, CircleFace, AttrFace, faces

tree = Tree("tree.nw")

# Add features to nodes
for leaf in tree:
    leaf.add_feature("habitat", "marine" if "fish" in leaf.name else "terrestrial")
    leaf.add_feature("temp", 20)


def layout(node):
    """Attach name, habitat marker and temperature faces to leaf nodes.

    Called by the renderer for each node. Faces are attached with
    faces.add_face_to_node so they exist only for the current drawing;
    node.add_face would attach them permanently and they would pile up
    every time the tree is rendered again.
    """
    if not node.is_leaf():
        return

    # Column 0: leaf name as text
    name_face = TextFace(node.name, fsize=10)
    faces.add_face_to_node(name_face, node, column=0, position="branch-right")

    # Column 1: colored circle based on habitat
    color = "blue" if node.habitat == "marine" else "green"
    circle_face = CircleFace(radius=5, color=color)
    faces.add_face_to_node(circle_face, node, column=1, position="branch-right")

    # Column 2: value of the "temp" feature
    temp_face = AttrFace("temp", fsize=8)
    faces.add_face_to_node(temp_face, node, column=2, position="branch-right")


ts = TreeStyle()
ts.layout_fn = layout
ts.show_leaf_name = False  # We're adding custom names

tree.render("tree_with_faces.pdf", tree_style=ts)

Circular Tree Layout

from ete3 import Tree, TreeStyle

tree = Tree("tree.nw")

# Circular drawing covering the full 360 degrees, starting at angle 0
circular_style = TreeStyle()
circular_style.mode = "c"  # Circular mode
circular_style.arc_start = 0  # Degrees
circular_style.arc_span = 360  # Full circle
circular_style.show_leaf_name = True

tree.render("circular_tree.pdf", tree_style=circular_style)

Interactive Exploration

from ete3 import Tree

tree = Tree("tree.nw")

# Open the interactive viewer (zoom, search, edit);
# edits made there remain on the tree object after the window is closed
tree.show()

# Write the tree, including any changes made in the viewer
tree.write(outfile="modified_tree.nw")

Advanced Workflows

Complete Phylogenomic Pipeline

from ete3 import PhyloTree, NCBITaxa, TreeStyle, TextFace, faces

# 1. Load gene tree together with its alignment
gene_tree = PhyloTree("gene_tree.nw", alignment="alignment.fasta")

# 2. Set species naming: species code = text before the first underscore
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])

# 3. Detect evolutionary events (annotates nodes with "evoltype")
gene_tree.get_descendant_evol_events()

# 4. Annotate with NCBI taxonomy
ncbi = NCBITaxa()
species_set = {leaf.species for leaf in gene_tree}
name2taxid = ncbi.get_name_translator(list(species_set))

for leaf in gene_tree:
    # Species codes the taxonomy database does not know are skipped
    if leaf.species in name2taxid:
        taxid = name2taxid[leaf.species][0]
        lineage = ncbi.get_lineage(taxid)
        names = ncbi.get_taxid_translator(lineage)
        leaf.add_feature("lineage", [names[t] for t in lineage])

# 5. Identify and save ortholog groups.
# get_speciation_trees() returns (number_of_trees, number_of_duplications,
# tree_iterator); only the third item yields the trees themselves, so it is
# consumed lazily and the groups are counted while they are written.
_, _, ortho_tree_iter = gene_tree.get_speciation_trees()

ortho_group_count = 0
for i, ortho_tree in enumerate(ortho_tree_iter):
    ortho_tree.write(outfile=f"ortholog_group_{i}.nw")
    ortho_group_count = i + 1


# 6. Visualize with evolutionary events marked
def layout(node):
    """Label duplication nodes with a red "DUPLICATION" tag above the branch.

    Uses faces.add_face_to_node so the label belongs to the current drawing
    only and does not accumulate on the node across repeated renders.
    """
    if getattr(node, "evoltype", None) == "D":
        dup_face = TextFace("DUPLICATION", fsize=8, fgcolor="red")
        faces.add_face_to_node(dup_face, node, column=0, position="branch-top")


ts = TreeStyle()
ts.layout_fn = layout
ts.show_leaf_name = True
gene_tree.render("annotated_gene_tree.pdf", tree_style=ts)

print(f"Pipeline complete. Found {ortho_group_count} ortholog groups.")

Batch Processing Multiple Trees

from ete3 import Tree
import os

input_dir = "input_trees"
output_dir = "processed_trees"
os.makedirs(output_dir, exist_ok=True)

for filename in os.listdir(input_dir):
    # Only Newick files are processed
    if not filename.endswith(".nw"):
        continue

    tree = Tree(os.path.join(input_dir, filename))

    # Root the tree at its midpoint
    tree.set_outgroup(tree.get_midpoint_outgroup())

    # Collect every non-root node whose branch is shorter than 0.001 first,
    # then delete them, so the tree is not modified while it is traversed.
    # Note that leaves are not excluded from this filter.
    short_branch_nodes = [
        node for node in tree.traverse()
        if node.dist < 0.001 and not node.is_root()
    ]
    for node in short_branch_nodes:
        node.delete()

    # Write the result next to the other processed trees
    tree.write(outfile=os.path.join(output_dir, f"processed_{filename}"))

    print(f"Processed {filename}")