# ETE Toolkit Common Workflows This document provides complete workflows for common tasks using the ETE Toolkit. ## Table of Contents 1. [Basic Tree Operations](#basic-tree-operations) 2. [Phylogenetic Analysis](#phylogenetic-analysis) 3. [Tree Comparison](#tree-comparison) 4. [Taxonomy Integration](#taxonomy-integration) 5. [Clustering Analysis](#clustering-analysis) 6. [Tree Visualization](#tree-visualization) --- ## Basic Tree Operations ### Loading and Exploring a Tree ```python from ete3 import Tree # Load tree from file tree = Tree("my_tree.nw", format=1) # Display ASCII representation print(tree.get_ascii(show_internal=True)) # Get basic statistics print(f"Number of leaves: {len(tree)}") print(f"Total nodes: {len(list(tree.traverse()))}") print(f"Tree depth: {tree.get_farthest_leaf()[1]}") # List all leaf names for leaf in tree: print(leaf.name) ``` ### Extracting and Saving Subtrees ```python from ete3 import Tree tree = Tree("full_tree.nw") # Get subtree rooted at specific node node = tree.search_nodes(name="MyNode")[0] subtree = node.copy() # Save subtree to file subtree.write(outfile="subtree.nw", format=1) # Extract monophyletic clade species_of_interest = ["species1", "species2", "species3"] ancestor = tree.get_common_ancestor(species_of_interest) clade = ancestor.copy() clade.write(outfile="clade.nw") ``` ### Pruning Trees to Specific Taxa ```python from ete3 import Tree tree = Tree("large_tree.nw") # Keep only taxa of interest taxa_to_keep = ["taxon1", "taxon2", "taxon3", "taxon4"] tree.prune(taxa_to_keep, preserve_branch_length=True) # Save pruned tree tree.write(outfile="pruned_tree.nw") ``` ### Rerooting Trees ```python from ete3 import Tree tree = Tree("unrooted_tree.nw") # Method 1: Root by outgroup outgroup = tree & "Outgroup_species" tree.set_outgroup(outgroup) # Method 2: Midpoint rooting midpoint = tree.get_midpoint_outgroup() tree.set_outgroup(midpoint) # Save rooted tree tree.write(outfile="rooted_tree.nw") ``` ### Annotating Nodes with Custom Data ```python from ete3 import Tree tree = Tree("tree.nw") # Add features to nodes based on metadata metadata = { "species1": {"habitat": "marine", "temperature": 20}, "species2": {"habitat": "freshwater", "temperature": 15}, } for leaf in tree: if leaf.name in metadata: leaf.add_features(**metadata[leaf.name]) # Query annotated features for leaf in tree: if hasattr(leaf, "habitat"): print(f"{leaf.name}: {leaf.habitat}, {leaf.temperature}°C") # Save with custom features (NHX format) tree.write(outfile="annotated_tree.nhx", features=["habitat", "temperature"]) ``` ### Modifying Tree Topology ```python from ete3 import Tree tree = Tree("tree.nw") # Remove a clade node_to_remove = tree & "unwanted_clade" node_to_remove.detach() # Collapse a node (delete but keep children) node_to_collapse = tree & "low_support_node" node_to_collapse.delete() # Add a new species to existing clade target_clade = tree & "target_node" new_leaf = target_clade.add_child(name="new_species", dist=0.5) # Resolve polytomies tree.resolve_polytomy(recursive=True) # Save modified tree tree.write(outfile="modified_tree.nw") ``` --- ## Phylogenetic Analysis ### Complete Gene Tree Analysis with Alignment ```python from ete3 import PhyloTree # Load gene tree and link alignment tree = PhyloTree("gene_tree.nw", format=1) tree.link_to_alignment("alignment.fasta", alg_format="fasta") # Set species naming function (e.g., gene_species format) def extract_species(node_name): return node_name.split("_")[0] tree.set_species_naming_function(extract_species) # Access sequences for leaf in tree: print(f"{leaf.name} ({leaf.species})") print(f"Sequence: {leaf.sequence[:50]}...") ``` ### Detecting Duplication and Speciation Events ```python from ete3 import PhyloTree, Tree # Load gene tree gene_tree = PhyloTree("gene_tree.nw") # Set species naming gene_tree.set_species_naming_function(lambda x: x.split("_")[0]) # Option 1: Species Overlap algorithm (no species tree needed) events = gene_tree.get_descendant_evol_events() # Option 2: Tree reconciliation (requires species tree) species_tree = Tree("species_tree.nw") events = gene_tree.get_descendant_evol_events(species_tree=species_tree) # Analyze events duplications = 0 speciations = 0 for node in gene_tree.traverse(): if hasattr(node, "evoltype"): if node.evoltype == "D": duplications += 1 print(f"Duplication at node {node.name}") elif node.evoltype == "S": speciations += 1 print(f"\nTotal duplications: {duplications}") print(f"Total speciations: {speciations}") ``` ### Extracting Orthologs and Paralogs ```python from ete3 import PhyloTree gene_tree = PhyloTree("gene_tree.nw") gene_tree.set_species_naming_function(lambda x: x.split("_")[0]) # Detect evolutionary events events = gene_tree.get_descendant_evol_events() # Find all orthologs to a query gene query_gene = gene_tree & "species1_gene1" orthologs = [] paralogs = [] for event in events: if query_gene in event.in_seqs: if event.etype == "S": # Speciation orthologs.extend([s for s in event.out_seqs if s != query_gene]) elif event.etype == "D": # Duplication paralogs.extend([s for s in event.out_seqs if s != query_gene]) print(f"Orthologs of {query_gene.name}:") for ortholog in set(orthologs): print(f" {ortholog.name}") print(f"\nParalogs of {query_gene.name}:") for paralog in set(paralogs): print(f" {paralog.name}") ``` ### Splitting Gene Families by Duplication Events ```python from ete3 import PhyloTree gene_tree = PhyloTree("gene_family.nw") gene_tree.set_species_naming_function(lambda x: x.split("_")[0]) gene_tree.get_descendant_evol_events() # Split into individual gene families subfamilies = gene_tree.split_by_dups() print(f"Gene family split into {len(subfamilies)} subfamilies") for i, subtree in enumerate(subfamilies): subtree.write(outfile=f"subfamily_{i}.nw") species = set([leaf.species for leaf in subtree]) print(f"Subfamily {i}: {len(subtree)} genes from {len(species)} species") ``` ### Collapsing Lineage-Specific Expansions ```python from ete3 import PhyloTree gene_tree = PhyloTree("expanded_tree.nw") gene_tree.set_species_naming_function(lambda x: x.split("_")[0]) # Collapse lineage-specific duplications gene_tree.collapse_lineage_specific_expansions() print("After collapsing expansions:") print(gene_tree.get_ascii()) gene_tree.write(outfile="collapsed_tree.nw") ``` ### Testing Monophyly ```python from ete3 import Tree tree = Tree("tree.nw") # Test if a group is monophyletic target_species = ["species1", "species2", "species3"] is_mono, clade_type, base_node = tree.check_monophyly( values=target_species, target_attr="name" ) if is_mono: print(f"Group is monophyletic") print(f"MRCA: {base_node.name}") elif clade_type == "paraphyletic": print(f"Group is paraphyletic") elif clade_type == "polyphyletic": print(f"Group is polyphyletic") # Get all monophyletic clades of a specific type # Annotate leaves first for leaf in tree: if leaf.name.startswith("species"): leaf.add_feature("type", "typeA") else: leaf.add_feature("type", "typeB") mono_clades = tree.get_monophyletic(values=["typeA"], target_attr="type") print(f"Found {len(mono_clades)} monophyletic clades of typeA") ``` --- ## Tree Comparison ### Computing Robinson-Foulds Distance ```python from ete3 import Tree tree1 = Tree("tree1.nw") tree2 = Tree("tree2.nw") # Compute RF distance rf, max_rf, common_leaves, parts_t1, parts_t2 = tree1.robinson_foulds(tree2) print(f"Robinson-Foulds distance: {rf}") print(f"Maximum RF distance: {max_rf}") print(f"Normalized RF: {rf/max_rf:.3f}") print(f"Common leaves: {len(common_leaves)}") # Find unique partitions unique_in_t1 = parts_t1 - parts_t2 unique_in_t2 = parts_t2 - parts_t1 print(f"\nPartitions unique to tree1: {len(unique_in_t1)}") print(f"Partitions unique to tree2: {len(unique_in_t2)}") ``` ### Comparing Multiple Trees ```python from ete3 import Tree import numpy as np # Load multiple trees tree_files = ["tree1.nw", "tree2.nw", "tree3.nw", "tree4.nw"] trees = [Tree(f) for f in tree_files] # Create distance matrix n = len(trees) dist_matrix = np.zeros((n, n)) for i in range(n): for j in range(i+1, n): rf, max_rf, _, _, _ = trees[i].robinson_foulds(trees[j]) norm_rf = rf / max_rf if max_rf > 0 else 0 dist_matrix[i, j] = norm_rf dist_matrix[j, i] = norm_rf print("Normalized RF distance matrix:") print(dist_matrix) # Find most similar pair min_dist = float('inf') best_pair = None for i in range(n): for j in range(i+1, n): if dist_matrix[i, j] < min_dist: min_dist = dist_matrix[i, j] best_pair = (i, j) print(f"\nMost similar trees: {tree_files[best_pair[0]]} and {tree_files[best_pair[1]]}") print(f"Distance: {min_dist:.3f}") ``` ### Finding Consensus Topology ```python from ete3 import Tree # Load multiple bootstrap trees bootstrap_trees = [Tree(f"bootstrap_{i}.nw") for i in range(100)] # Get reference tree (first tree) ref_tree = bootstrap_trees[0].copy() # Count bipartitions bipartition_counts = {} for tree in bootstrap_trees: rf, max_rf, common, parts_ref, parts_tree = ref_tree.robinson_foulds(tree) for partition in parts_tree: bipartition_counts[partition] = bipartition_counts.get(partition, 0) + 1 # Filter by support threshold threshold = 70 # 70% support supported_bipartitions = { k: v for k, v in bipartition_counts.items() if (v / len(bootstrap_trees)) * 100 >= threshold } print(f"Bipartitions with >{threshold}% support: {len(supported_bipartitions)}") ``` --- ## Taxonomy Integration ### Building Species Trees from NCBI Taxonomy ```python from ete3 import NCBITaxa ncbi = NCBITaxa() # Define species of interest species = ["Homo sapiens", "Pan troglodytes", "Gorilla gorilla", "Mus musculus", "Rattus norvegicus"] # Get taxids name2taxid = ncbi.get_name_translator(species) taxids = [name2taxid[sp][0] for sp in species] # Build tree tree = ncbi.get_topology(taxids) # Annotate with taxonomy info for node in tree.traverse(): if hasattr(node, "sci_name"): print(f"{node.sci_name} - Rank: {node.rank} - TaxID: {node.taxid}") # Save tree tree.write(outfile="species_tree.nw") ``` ### Annotating Existing Tree with NCBI Taxonomy ```python from ete3 import Tree, NCBITaxa tree = Tree("species_tree.nw") ncbi = NCBITaxa() # Map leaf names to species names (adjust as needed) leaf_to_species = { "Hsap_gene1": "Homo sapiens", "Ptro_gene1": "Pan troglodytes", "Mmur_gene1": "Microcebus murinus", } # Get taxids all_species = list(set(leaf_to_species.values())) name2taxid = ncbi.get_name_translator(all_species) # Annotate leaves for leaf in tree: if leaf.name in leaf_to_species: species_name = leaf_to_species[leaf.name] taxid = name2taxid[species_name][0] # Add taxonomy info leaf.add_feature("species", species_name) leaf.add_feature("taxid", taxid) # Get full lineage lineage = ncbi.get_lineage(taxid) names = ncbi.get_taxid_translator(lineage) leaf.add_feature("lineage", [names[t] for t in lineage]) print(f"{leaf.name}: {species_name} (taxid: {taxid})") ``` ### Querying NCBI Taxonomy ```python from ete3 import NCBITaxa ncbi = NCBITaxa() # Get all primates primates_taxid = ncbi.get_name_translator(["Primates"])["Primates"][0] all_primates = ncbi.get_descendant_taxa(primates_taxid, collapse_subspecies=True) print(f"Total primate species: {len(all_primates)}") # Get names for subset taxid2name = ncbi.get_taxid_translator(all_primates[:10]) for taxid, name in taxid2name.items(): rank = ncbi.get_rank([taxid])[taxid] print(f"{name} ({rank})") # Get lineage for specific species human_taxid = 9606 lineage = ncbi.get_lineage(human_taxid) ranks = ncbi.get_rank(lineage) names = ncbi.get_taxid_translator(lineage) print("\nHuman lineage:") for taxid in lineage: print(f"{ranks[taxid]:15s} {names[taxid]}") ``` --- ## Clustering Analysis ### Analyzing Hierarchical Clustering Results ```python from ete3 import ClusterTree # Load clustering tree with data matrix matrix = """#Names\tSample1\tSample2\tSample3\tSample4 Gene1\t1.5\t2.3\t0.8\t1.2 Gene2\t0.9\t1.1\t1.8\t2.1 Gene3\t2.1\t2.5\t0.5\t0.9 Gene4\t0.7\t0.9\t2.2\t2.4""" tree = ClusterTree("((Gene1,Gene2),(Gene3,Gene4));", text_array=matrix) # Calculate cluster quality metrics for node in tree.traverse(): if not node.is_leaf(): # Silhouette coefficient silhouette = node.get_silhouette() # Dunn index dunn = node.get_dunn() # Distances inter = node.intercluster_dist intra = node.intracluster_dist print(f"Node: {node.name}") print(f" Silhouette: {silhouette:.3f}") print(f" Dunn index: {dunn:.3f}") print(f" Intercluster distance: {inter:.3f}") print(f" Intracluster distance: {intra:.3f}") ``` ### Validating Clusters ```python from ete3 import ClusterTree matrix = """#Names\tCol1\tCol2\tCol3 ItemA\t1.2\t0.5\t0.8 ItemB\t1.3\t0.6\t0.9 ItemC\t0.1\t2.5\t2.3 ItemD\t0.2\t2.6\t2.4""" tree = ClusterTree("((ItemA,ItemB),(ItemC,ItemD));", text_array=matrix) # Test different distance metrics metrics = ["euclidean", "pearson", "spearman"] for metric in metrics: print(f"\nUsing {metric} distance:") for node in tree.traverse(): if not node.is_leaf(): silhouette = node.get_silhouette(distance=metric) # Positive silhouette = good clustering # Negative silhouette = poor clustering quality = "good" if silhouette > 0 else "poor" print(f" Cluster {node.name}: {silhouette:.3f} ({quality})") ``` --- ## Tree Visualization ### Basic Tree Rendering ```python from ete3 import Tree, TreeStyle tree = Tree("tree.nw") # Create tree style ts = TreeStyle() ts.show_leaf_name = True ts.show_branch_length = True ts.show_branch_support = True ts.scale = 50 # pixels per branch length unit # Render to file tree.render("tree_output.pdf", tree_style=ts) tree.render("tree_output.png", tree_style=ts, w=800, h=600, units="px") tree.render("tree_output.svg", tree_style=ts) ``` ### Customizing Node Appearance ```python from ete3 import Tree, TreeStyle, NodeStyle tree = Tree("tree.nw") # Define node styles for node in tree.traverse(): nstyle = NodeStyle() if node.is_leaf(): nstyle["fgcolor"] = "blue" nstyle["size"] = 10 else: nstyle["fgcolor"] = "red" nstyle["size"] = 5 if node.support > 0.9: nstyle["shape"] = "sphere" else: nstyle["shape"] = "circle" node.set_style(nstyle) # Render ts = TreeStyle() tree.render("styled_tree.pdf", tree_style=ts) ``` ### Adding Faces to Nodes ```python from ete3 import Tree, TreeStyle, TextFace, CircleFace, AttrFace tree = Tree("tree.nw") # Add features to nodes for leaf in tree: leaf.add_feature("habitat", "marine" if "fish" in leaf.name else "terrestrial") leaf.add_feature("temp", 20) # Layout function to add faces def layout(node): if node.is_leaf(): # Add text face name_face = TextFace(node.name, fsize=10) node.add_face(name_face, column=0, position="branch-right") # Add colored circle based on habitat color = "blue" if node.habitat == "marine" else "green" circle_face = CircleFace(radius=5, color=color) node.add_face(circle_face, column=1, position="branch-right") # Add attribute face temp_face = AttrFace("temp", fsize=8) node.add_face(temp_face, column=2, position="branch-right") ts = TreeStyle() ts.layout_fn = layout ts.show_leaf_name = False # We're adding custom names tree.render("tree_with_faces.pdf", tree_style=ts) ``` ### Circular Tree Layout ```python from ete3 import Tree, TreeStyle tree = Tree("tree.nw") ts = TreeStyle() ts.mode = "c" # Circular mode ts.arc_start = 0 # Degrees ts.arc_span = 360 # Full circle ts.show_leaf_name = True tree.render("circular_tree.pdf", tree_style=ts) ``` ### Interactive Exploration ```python from ete3 import Tree tree = Tree("tree.nw") # Launch GUI (allows zooming, searching, modifying) # Changes persist after closing tree.show() # Can save changes made in GUI tree.write(outfile="modified_tree.nw") ``` --- ## Advanced Workflows ### Complete Phylogenomic Pipeline ```python from ete3 import PhyloTree, NCBITaxa, TreeStyle # 1. Load gene tree gene_tree = PhyloTree("gene_tree.nw", alignment="alignment.fasta") # 2. Set species naming gene_tree.set_species_naming_function(lambda x: x.split("_")[0]) # 3. Detect evolutionary events gene_tree.get_descendant_evol_events() # 4. Annotate with NCBI taxonomy ncbi = NCBITaxa() species_set = set([leaf.species for leaf in gene_tree]) name2taxid = ncbi.get_name_translator(list(species_set)) for leaf in gene_tree: if leaf.species in name2taxid: taxid = name2taxid[leaf.species][0] lineage = ncbi.get_lineage(taxid) names = ncbi.get_taxid_translator(lineage) leaf.add_feature("lineage", [names[t] for t in lineage]) # 5. Identify and save ortholog groups ortho_groups = gene_tree.get_speciation_trees() for i, ortho_tree in enumerate(ortho_groups): ortho_tree.write(outfile=f"ortholog_group_{i}.nw") # 6. Visualize with evolutionary events marked def layout(node): from ete3 import TextFace if hasattr(node, "evoltype"): if node.evoltype == "D": dup_face = TextFace("DUPLICATION", fsize=8, fgcolor="red") node.add_face(dup_face, column=0, position="branch-top") ts = TreeStyle() ts.layout_fn = layout ts.show_leaf_name = True gene_tree.render("annotated_gene_tree.pdf", tree_style=ts) print(f"Pipeline complete. Found {len(ortho_groups)} ortholog groups.") ``` ### Batch Processing Multiple Trees ```python from ete3 import Tree import os input_dir = "input_trees" output_dir = "processed_trees" os.makedirs(output_dir, exist_ok=True) for filename in os.listdir(input_dir): if filename.endswith(".nw"): # Load tree tree = Tree(os.path.join(input_dir, filename)) # Process: root, prune, annotate midpoint = tree.get_midpoint_outgroup() tree.set_outgroup(midpoint) # Filter by branch length to_remove = [] for node in tree.traverse(): if node.dist < 0.001 and not node.is_root(): to_remove.append(node) for node in to_remove: node.delete() # Save processed tree output_file = os.path.join(output_dir, f"processed_{filename}") tree.write(outfile=output_file) print(f"Processed {filename}") ```