775 lines
18 KiB
Markdown
775 lines
18 KiB
Markdown
# ETE Toolkit Common Workflows
|
|
|
|
This document provides complete workflows for common tasks using the ETE Toolkit.
|
|
|
|
## Table of Contents
|
|
1. [Basic Tree Operations](#basic-tree-operations)
|
|
2. [Phylogenetic Analysis](#phylogenetic-analysis)
|
|
3. [Tree Comparison](#tree-comparison)
|
|
4. [Taxonomy Integration](#taxonomy-integration)
|
|
5. [Clustering Analysis](#clustering-analysis)
|
|
6. [Tree Visualization](#tree-visualization)
|
|
|
|
---
|
|
|
|
## Basic Tree Operations
|
|
|
|
### Loading and Exploring a Tree
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
# Load tree from file
|
|
tree = Tree("my_tree.nw", format=1)
|
|
|
|
# Display ASCII representation
|
|
print(tree.get_ascii(show_internal=True))
|
|
|
|
# Get basic statistics
|
|
print(f"Number of leaves: {len(tree)}")
|
|
print(f"Total nodes: {len(list(tree.traverse()))}")
|
|
print(f"Tree depth: {tree.get_farthest_leaf()[1]}")
|
|
|
|
# List all leaf names
|
|
for leaf in tree:
|
|
print(leaf.name)
|
|
```
|
|
|
|
### Extracting and Saving Subtrees
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("full_tree.nw")
|
|
|
|
# Get subtree rooted at specific node
|
|
node = tree.search_nodes(name="MyNode")[0]
|
|
subtree = node.copy()
|
|
|
|
# Save subtree to file
|
|
subtree.write(outfile="subtree.nw", format=1)
|
|
|
|
# Extract monophyletic clade
|
|
species_of_interest = ["species1", "species2", "species3"]
|
|
ancestor = tree.get_common_ancestor(species_of_interest)
|
|
clade = ancestor.copy()
|
|
clade.write(outfile="clade.nw")
|
|
```
|
|
|
|
### Pruning Trees to Specific Taxa
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("large_tree.nw")
|
|
|
|
# Keep only taxa of interest
|
|
taxa_to_keep = ["taxon1", "taxon2", "taxon3", "taxon4"]
|
|
tree.prune(taxa_to_keep, preserve_branch_length=True)
|
|
|
|
# Save pruned tree
|
|
tree.write(outfile="pruned_tree.nw")
|
|
```
|
|
|
|
### Rerooting Trees
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("unrooted_tree.nw")
|
|
|
|
# Method 1: Root by outgroup
|
|
outgroup = tree & "Outgroup_species"
|
|
tree.set_outgroup(outgroup)
|
|
|
|
# Method 2: Midpoint rooting
|
|
midpoint = tree.get_midpoint_outgroup()
|
|
tree.set_outgroup(midpoint)
|
|
|
|
# Save rooted tree
|
|
tree.write(outfile="rooted_tree.nw")
|
|
```
|
|
|
|
### Annotating Nodes with Custom Data
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Add features to nodes based on metadata
|
|
metadata = {
|
|
"species1": {"habitat": "marine", "temperature": 20},
|
|
"species2": {"habitat": "freshwater", "temperature": 15},
|
|
}
|
|
|
|
for leaf in tree:
|
|
if leaf.name in metadata:
|
|
leaf.add_features(**metadata[leaf.name])
|
|
|
|
# Query annotated features
|
|
for leaf in tree:
|
|
if hasattr(leaf, "habitat"):
|
|
print(f"{leaf.name}: {leaf.habitat}, {leaf.temperature}°C")
|
|
|
|
# Save with custom features (NHX format)
|
|
tree.write(outfile="annotated_tree.nhx", features=["habitat", "temperature"])
|
|
```
|
|
|
|
### Modifying Tree Topology
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Remove a clade
|
|
node_to_remove = tree & "unwanted_clade"
|
|
node_to_remove.detach()
|
|
|
|
# Collapse a node (delete but keep children)
|
|
node_to_collapse = tree & "low_support_node"
|
|
node_to_collapse.delete()
|
|
|
|
# Add a new species to existing clade
|
|
target_clade = tree & "target_node"
|
|
new_leaf = target_clade.add_child(name="new_species", dist=0.5)
|
|
|
|
# Resolve polytomies
|
|
tree.resolve_polytomy(recursive=True)
|
|
|
|
# Save modified tree
|
|
tree.write(outfile="modified_tree.nw")
|
|
```
|
|
|
|
---
|
|
|
|
## Phylogenetic Analysis
|
|
|
|
### Complete Gene Tree Analysis with Alignment
|
|
|
|
```python
|
|
from ete3 import PhyloTree
|
|
|
|
# Load gene tree and link alignment
|
|
tree = PhyloTree("gene_tree.nw", format=1)
|
|
tree.link_to_alignment("alignment.fasta", alg_format="fasta")
|
|
|
|
# Set species naming function (e.g., gene_species format)
|
|
def extract_species(node_name):
|
|
return node_name.split("_")[0]
|
|
|
|
tree.set_species_naming_function(extract_species)
|
|
|
|
# Access sequences
|
|
for leaf in tree:
|
|
print(f"{leaf.name} ({leaf.species})")
|
|
print(f"Sequence: {leaf.sequence[:50]}...")
|
|
```
|
|
|
|
### Detecting Duplication and Speciation Events
|
|
|
|
```python
|
|
from ete3 import PhyloTree, Tree
|
|
|
|
# Load gene tree
|
|
gene_tree = PhyloTree("gene_tree.nw")
|
|
|
|
# Set species naming
|
|
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
|
|
|
|
# Option 1: Species Overlap algorithm (no species tree needed)
|
|
events = gene_tree.get_descendant_evol_events()
|
|
|
|
# Option 2: Tree reconciliation (requires species tree)
|
|
species_tree = Tree("species_tree.nw")
|
|
events = gene_tree.get_descendant_evol_events(species_tree=species_tree)
|
|
|
|
# Analyze events
|
|
duplications = 0
|
|
speciations = 0
|
|
|
|
for node in gene_tree.traverse():
|
|
if hasattr(node, "evoltype"):
|
|
if node.evoltype == "D":
|
|
duplications += 1
|
|
print(f"Duplication at node {node.name}")
|
|
elif node.evoltype == "S":
|
|
speciations += 1
|
|
|
|
print(f"\nTotal duplications: {duplications}")
|
|
print(f"Total speciations: {speciations}")
|
|
```
|
|
|
|
### Extracting Orthologs and Paralogs
|
|
|
|
```python
|
|
from ete3 import PhyloTree
|
|
|
|
gene_tree = PhyloTree("gene_tree.nw")
|
|
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
|
|
|
|
# Detect evolutionary events
|
|
events = gene_tree.get_descendant_evol_events()
|
|
|
|
# Find all orthologs to a query gene
|
|
query_gene = gene_tree & "species1_gene1"
|
|
|
|
orthologs = []
|
|
paralogs = []
|
|
|
|
for event in events:
|
|
if query_gene in event.in_seqs:
|
|
if event.etype == "S": # Speciation
|
|
orthologs.extend([s for s in event.out_seqs if s != query_gene])
|
|
elif event.etype == "D": # Duplication
|
|
paralogs.extend([s for s in event.out_seqs if s != query_gene])
|
|
|
|
print(f"Orthologs of {query_gene.name}:")
|
|
for ortholog in set(orthologs):
|
|
print(f" {ortholog.name}")
|
|
|
|
print(f"\nParalogs of {query_gene.name}:")
|
|
for paralog in set(paralogs):
|
|
print(f" {paralog.name}")
|
|
```
|
|
|
|
### Splitting Gene Families by Duplication Events
|
|
|
|
```python
|
|
from ete3 import PhyloTree
|
|
|
|
gene_tree = PhyloTree("gene_family.nw")
|
|
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
|
|
gene_tree.get_descendant_evol_events()
|
|
|
|
# Split into individual gene families
|
|
subfamilies = gene_tree.split_by_dups()
|
|
|
|
print(f"Gene family split into {len(subfamilies)} subfamilies")
|
|
|
|
for i, subtree in enumerate(subfamilies):
|
|
subtree.write(outfile=f"subfamily_{i}.nw")
|
|
species = set([leaf.species for leaf in subtree])
|
|
print(f"Subfamily {i}: {len(subtree)} genes from {len(species)} species")
|
|
```
|
|
|
|
### Collapsing Lineage-Specific Expansions
|
|
|
|
```python
|
|
from ete3 import PhyloTree
|
|
|
|
gene_tree = PhyloTree("expanded_tree.nw")
|
|
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
|
|
|
|
# Collapse lineage-specific duplications
|
|
gene_tree.collapse_lineage_specific_expansions()
|
|
|
|
print("After collapsing expansions:")
|
|
print(gene_tree.get_ascii())
|
|
|
|
gene_tree.write(outfile="collapsed_tree.nw")
|
|
```
|
|
|
|
### Testing Monophyly
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Test if a group is monophyletic
|
|
target_species = ["species1", "species2", "species3"]
|
|
is_mono, clade_type, base_node = tree.check_monophyly(
|
|
values=target_species,
|
|
target_attr="name"
|
|
)
|
|
|
|
if is_mono:
|
|
print(f"Group is monophyletic")
|
|
print(f"MRCA: {base_node.name}")
|
|
elif clade_type == "paraphyletic":
|
|
print(f"Group is paraphyletic")
|
|
elif clade_type == "polyphyletic":
|
|
print(f"Group is polyphyletic")
|
|
|
|
# Get all monophyletic clades of a specific type
|
|
# Annotate leaves first
|
|
for leaf in tree:
|
|
if leaf.name.startswith("species"):
|
|
leaf.add_feature("type", "typeA")
|
|
else:
|
|
leaf.add_feature("type", "typeB")
|
|
|
|
mono_clades = tree.get_monophyletic(values=["typeA"], target_attr="type")
|
|
print(f"Found {len(mono_clades)} monophyletic clades of typeA")
|
|
```
|
|
|
|
---
|
|
|
|
## Tree Comparison
|
|
|
|
### Computing Robinson-Foulds Distance
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree1 = Tree("tree1.nw")
|
|
tree2 = Tree("tree2.nw")
|
|
|
|
# Compute RF distance
|
|
rf, max_rf, common_leaves, parts_t1, parts_t2 = tree1.robinson_foulds(tree2)
|
|
|
|
print(f"Robinson-Foulds distance: {rf}")
|
|
print(f"Maximum RF distance: {max_rf}")
|
|
print(f"Normalized RF: {rf/max_rf:.3f}")
|
|
print(f"Common leaves: {len(common_leaves)}")
|
|
|
|
# Find unique partitions
|
|
unique_in_t1 = parts_t1 - parts_t2
|
|
unique_in_t2 = parts_t2 - parts_t1
|
|
|
|
print(f"\nPartitions unique to tree1: {len(unique_in_t1)}")
|
|
print(f"Partitions unique to tree2: {len(unique_in_t2)}")
|
|
```
|
|
|
|
### Comparing Multiple Trees
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
import numpy as np
|
|
|
|
# Load multiple trees
|
|
tree_files = ["tree1.nw", "tree2.nw", "tree3.nw", "tree4.nw"]
|
|
trees = [Tree(f) for f in tree_files]
|
|
|
|
# Create distance matrix
|
|
n = len(trees)
|
|
dist_matrix = np.zeros((n, n))
|
|
|
|
for i in range(n):
|
|
for j in range(i+1, n):
|
|
rf, max_rf, _, _, _ = trees[i].robinson_foulds(trees[j])
|
|
norm_rf = rf / max_rf if max_rf > 0 else 0
|
|
dist_matrix[i, j] = norm_rf
|
|
dist_matrix[j, i] = norm_rf
|
|
|
|
print("Normalized RF distance matrix:")
|
|
print(dist_matrix)
|
|
|
|
# Find most similar pair
|
|
min_dist = float('inf')
|
|
best_pair = None
|
|
|
|
for i in range(n):
|
|
for j in range(i+1, n):
|
|
if dist_matrix[i, j] < min_dist:
|
|
min_dist = dist_matrix[i, j]
|
|
best_pair = (i, j)
|
|
|
|
print(f"\nMost similar trees: {tree_files[best_pair[0]]} and {tree_files[best_pair[1]]}")
|
|
print(f"Distance: {min_dist:.3f}")
|
|
```
|
|
|
|
### Finding Consensus Topology
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
# Load multiple bootstrap trees
|
|
bootstrap_trees = [Tree(f"bootstrap_{i}.nw") for i in range(100)]
|
|
|
|
# Get reference tree (first tree)
|
|
ref_tree = bootstrap_trees[0].copy()
|
|
|
|
# Count bipartitions
|
|
bipartition_counts = {}
|
|
|
|
for tree in bootstrap_trees:
|
|
rf, max_rf, common, parts_ref, parts_tree = ref_tree.robinson_foulds(tree)
|
|
for partition in parts_tree:
|
|
bipartition_counts[partition] = bipartition_counts.get(partition, 0) + 1
|
|
|
|
# Filter by support threshold
|
|
threshold = 70 # 70% support
|
|
supported_bipartitions = {
|
|
k: v for k, v in bipartition_counts.items()
|
|
if (v / len(bootstrap_trees)) * 100 >= threshold
|
|
}
|
|
|
|
print(f"Bipartitions with >{threshold}% support: {len(supported_bipartitions)}")
|
|
```
|
|
|
|
---
|
|
|
|
## Taxonomy Integration
|
|
|
|
### Building Species Trees from NCBI Taxonomy
|
|
|
|
```python
|
|
from ete3 import NCBITaxa
|
|
|
|
ncbi = NCBITaxa()
|
|
|
|
# Define species of interest
|
|
species = ["Homo sapiens", "Pan troglodytes", "Gorilla gorilla",
|
|
"Mus musculus", "Rattus norvegicus"]
|
|
|
|
# Get taxids
|
|
name2taxid = ncbi.get_name_translator(species)
|
|
taxids = [name2taxid[sp][0] for sp in species]
|
|
|
|
# Build tree
|
|
tree = ncbi.get_topology(taxids)
|
|
|
|
# Annotate with taxonomy info
|
|
for node in tree.traverse():
|
|
if hasattr(node, "sci_name"):
|
|
print(f"{node.sci_name} - Rank: {node.rank} - TaxID: {node.taxid}")
|
|
|
|
# Save tree
|
|
tree.write(outfile="species_tree.nw")
|
|
```
|
|
|
|
### Annotating Existing Tree with NCBI Taxonomy
|
|
|
|
```python
|
|
from ete3 import Tree, NCBITaxa
|
|
|
|
tree = Tree("species_tree.nw")
|
|
ncbi = NCBITaxa()
|
|
|
|
# Map leaf names to species names (adjust as needed)
|
|
leaf_to_species = {
|
|
"Hsap_gene1": "Homo sapiens",
|
|
"Ptro_gene1": "Pan troglodytes",
|
|
"Mmur_gene1": "Microcebus murinus",
|
|
}
|
|
|
|
# Get taxids
|
|
all_species = list(set(leaf_to_species.values()))
|
|
name2taxid = ncbi.get_name_translator(all_species)
|
|
|
|
# Annotate leaves
|
|
for leaf in tree:
|
|
if leaf.name in leaf_to_species:
|
|
species_name = leaf_to_species[leaf.name]
|
|
taxid = name2taxid[species_name][0]
|
|
|
|
# Add taxonomy info
|
|
leaf.add_feature("species", species_name)
|
|
leaf.add_feature("taxid", taxid)
|
|
|
|
# Get full lineage
|
|
lineage = ncbi.get_lineage(taxid)
|
|
names = ncbi.get_taxid_translator(lineage)
|
|
leaf.add_feature("lineage", [names[t] for t in lineage])
|
|
|
|
print(f"{leaf.name}: {species_name} (taxid: {taxid})")
|
|
```
|
|
|
|
### Querying NCBI Taxonomy
|
|
|
|
```python
|
|
from ete3 import NCBITaxa
|
|
|
|
ncbi = NCBITaxa()
|
|
|
|
# Get all primates
|
|
primates_taxid = ncbi.get_name_translator(["Primates"])["Primates"][0]
|
|
all_primates = ncbi.get_descendant_taxa(primates_taxid, collapse_subspecies=True)
|
|
|
|
print(f"Total primate species: {len(all_primates)}")
|
|
|
|
# Get names for subset
|
|
taxid2name = ncbi.get_taxid_translator(all_primates[:10])
|
|
for taxid, name in taxid2name.items():
|
|
rank = ncbi.get_rank([taxid])[taxid]
|
|
print(f"{name} ({rank})")
|
|
|
|
# Get lineage for specific species
|
|
human_taxid = 9606
|
|
lineage = ncbi.get_lineage(human_taxid)
|
|
ranks = ncbi.get_rank(lineage)
|
|
names = ncbi.get_taxid_translator(lineage)
|
|
|
|
print("\nHuman lineage:")
|
|
for taxid in lineage:
|
|
print(f"{ranks[taxid]:15s} {names[taxid]}")
|
|
```
|
|
|
|
---
|
|
|
|
## Clustering Analysis
|
|
|
|
### Analyzing Hierarchical Clustering Results
|
|
|
|
```python
|
|
from ete3 import ClusterTree
|
|
|
|
# Load clustering tree with data matrix
|
|
matrix = """#Names\tSample1\tSample2\tSample3\tSample4
|
|
Gene1\t1.5\t2.3\t0.8\t1.2
|
|
Gene2\t0.9\t1.1\t1.8\t2.1
|
|
Gene3\t2.1\t2.5\t0.5\t0.9
|
|
Gene4\t0.7\t0.9\t2.2\t2.4"""
|
|
|
|
tree = ClusterTree("((Gene1,Gene2),(Gene3,Gene4));", text_array=matrix)
|
|
|
|
# Calculate cluster quality metrics
|
|
for node in tree.traverse():
|
|
if not node.is_leaf():
|
|
# Silhouette coefficient
|
|
silhouette = node.get_silhouette()
|
|
|
|
# Dunn index
|
|
dunn = node.get_dunn()
|
|
|
|
# Distances
|
|
inter = node.intercluster_dist
|
|
intra = node.intracluster_dist
|
|
|
|
print(f"Node: {node.name}")
|
|
print(f" Silhouette: {silhouette:.3f}")
|
|
print(f" Dunn index: {dunn:.3f}")
|
|
print(f" Intercluster distance: {inter:.3f}")
|
|
print(f" Intracluster distance: {intra:.3f}")
|
|
```
|
|
|
|
### Validating Clusters
|
|
|
|
```python
|
|
from ete3 import ClusterTree
|
|
|
|
matrix = """#Names\tCol1\tCol2\tCol3
|
|
ItemA\t1.2\t0.5\t0.8
|
|
ItemB\t1.3\t0.6\t0.9
|
|
ItemC\t0.1\t2.5\t2.3
|
|
ItemD\t0.2\t2.6\t2.4"""
|
|
|
|
tree = ClusterTree("((ItemA,ItemB),(ItemC,ItemD));", text_array=matrix)
|
|
|
|
# Test different distance metrics
|
|
metrics = ["euclidean", "pearson", "spearman"]
|
|
|
|
for metric in metrics:
|
|
print(f"\nUsing {metric} distance:")
|
|
|
|
for node in tree.traverse():
|
|
if not node.is_leaf():
|
|
silhouette = node.get_silhouette(distance=metric)
|
|
|
|
# Positive silhouette = good clustering
|
|
# Negative silhouette = poor clustering
|
|
quality = "good" if silhouette > 0 else "poor"
|
|
|
|
print(f" Cluster {node.name}: {silhouette:.3f} ({quality})")
|
|
```
|
|
|
|
---
|
|
|
|
## Tree Visualization
|
|
|
|
### Basic Tree Rendering
|
|
|
|
```python
|
|
from ete3 import Tree, TreeStyle
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Create tree style
|
|
ts = TreeStyle()
|
|
ts.show_leaf_name = True
|
|
ts.show_branch_length = True
|
|
ts.show_branch_support = True
|
|
ts.scale = 50 # pixels per branch length unit
|
|
|
|
# Render to file
|
|
tree.render("tree_output.pdf", tree_style=ts)
|
|
tree.render("tree_output.png", tree_style=ts, w=800, h=600, units="px")
|
|
tree.render("tree_output.svg", tree_style=ts)
|
|
```
|
|
|
|
### Customizing Node Appearance
|
|
|
|
```python
|
|
from ete3 import Tree, TreeStyle, NodeStyle
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Define node styles
|
|
for node in tree.traverse():
|
|
nstyle = NodeStyle()
|
|
|
|
if node.is_leaf():
|
|
nstyle["fgcolor"] = "blue"
|
|
nstyle["size"] = 10
|
|
else:
|
|
nstyle["fgcolor"] = "red"
|
|
nstyle["size"] = 5
|
|
|
|
if node.support > 0.9:
|
|
nstyle["shape"] = "sphere"
|
|
else:
|
|
nstyle["shape"] = "circle"
|
|
|
|
node.set_style(nstyle)
|
|
|
|
# Render
|
|
ts = TreeStyle()
|
|
tree.render("styled_tree.pdf", tree_style=ts)
|
|
```
|
|
|
|
### Adding Faces to Nodes
|
|
|
|
```python
|
|
from ete3 import Tree, TreeStyle, TextFace, CircleFace, AttrFace
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Add features to nodes
|
|
for leaf in tree:
|
|
leaf.add_feature("habitat", "marine" if "fish" in leaf.name else "terrestrial")
|
|
leaf.add_feature("temp", 20)
|
|
|
|
# Layout function to add faces
|
|
def layout(node):
|
|
if node.is_leaf():
|
|
# Add text face
|
|
name_face = TextFace(node.name, fsize=10)
|
|
node.add_face(name_face, column=0, position="branch-right")
|
|
|
|
# Add colored circle based on habitat
|
|
color = "blue" if node.habitat == "marine" else "green"
|
|
circle_face = CircleFace(radius=5, color=color)
|
|
node.add_face(circle_face, column=1, position="branch-right")
|
|
|
|
# Add attribute face
|
|
temp_face = AttrFace("temp", fsize=8)
|
|
node.add_face(temp_face, column=2, position="branch-right")
|
|
|
|
ts = TreeStyle()
|
|
ts.layout_fn = layout
|
|
ts.show_leaf_name = False # We're adding custom names
|
|
|
|
tree.render("tree_with_faces.pdf", tree_style=ts)
|
|
```
|
|
|
|
### Circular Tree Layout
|
|
|
|
```python
|
|
from ete3 import Tree, TreeStyle
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
ts = TreeStyle()
|
|
ts.mode = "c" # Circular mode
|
|
ts.arc_start = 0 # Degrees
|
|
ts.arc_span = 360 # Full circle
|
|
ts.show_leaf_name = True
|
|
|
|
tree.render("circular_tree.pdf", tree_style=ts)
|
|
```
|
|
|
|
### Interactive Exploration
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
|
|
tree = Tree("tree.nw")
|
|
|
|
# Launch GUI (allows zooming, searching, modifying)
|
|
# Changes persist after closing
|
|
tree.show()
|
|
|
|
# Can save changes made in GUI
|
|
tree.write(outfile="modified_tree.nw")
|
|
```
|
|
|
|
---
|
|
|
|
## Advanced Workflows
|
|
|
|
### Complete Phylogenomic Pipeline
|
|
|
|
```python
|
|
from ete3 import PhyloTree, NCBITaxa, TreeStyle
|
|
|
|
# 1. Load gene tree
|
|
gene_tree = PhyloTree("gene_tree.nw", alignment="alignment.fasta")
|
|
|
|
# 2. Set species naming
|
|
gene_tree.set_species_naming_function(lambda x: x.split("_")[0])
|
|
|
|
# 3. Detect evolutionary events
|
|
gene_tree.get_descendant_evol_events()
|
|
|
|
# 4. Annotate with NCBI taxonomy
|
|
ncbi = NCBITaxa()
|
|
species_set = set([leaf.species for leaf in gene_tree])
|
|
name2taxid = ncbi.get_name_translator(list(species_set))
|
|
|
|
for leaf in gene_tree:
|
|
if leaf.species in name2taxid:
|
|
taxid = name2taxid[leaf.species][0]
|
|
lineage = ncbi.get_lineage(taxid)
|
|
names = ncbi.get_taxid_translator(lineage)
|
|
leaf.add_feature("lineage", [names[t] for t in lineage])
|
|
|
|
# 5. Identify and save ortholog groups
|
|
ortho_groups = gene_tree.get_speciation_trees()
|
|
|
|
for i, ortho_tree in enumerate(ortho_groups):
|
|
ortho_tree.write(outfile=f"ortholog_group_{i}.nw")
|
|
|
|
# 6. Visualize with evolutionary events marked
|
|
def layout(node):
|
|
from ete3 import TextFace
|
|
if hasattr(node, "evoltype"):
|
|
if node.evoltype == "D":
|
|
dup_face = TextFace("DUPLICATION", fsize=8, fgcolor="red")
|
|
node.add_face(dup_face, column=0, position="branch-top")
|
|
|
|
ts = TreeStyle()
|
|
ts.layout_fn = layout
|
|
ts.show_leaf_name = True
|
|
gene_tree.render("annotated_gene_tree.pdf", tree_style=ts)
|
|
|
|
print(f"Pipeline complete. Found {len(ortho_groups)} ortholog groups.")
|
|
```
|
|
|
|
### Batch Processing Multiple Trees
|
|
|
|
```python
|
|
from ete3 import Tree
|
|
import os
|
|
|
|
input_dir = "input_trees"
|
|
output_dir = "processed_trees"
|
|
os.makedirs(output_dir, exist_ok=True)
|
|
|
|
for filename in os.listdir(input_dir):
|
|
if filename.endswith(".nw"):
|
|
# Load tree
|
|
tree = Tree(os.path.join(input_dir, filename))
|
|
|
|
# Process: root, prune, annotate
|
|
midpoint = tree.get_midpoint_outgroup()
|
|
tree.set_outgroup(midpoint)
|
|
|
|
# Filter by branch length
|
|
to_remove = []
|
|
for node in tree.traverse():
|
|
if node.dist < 0.001 and not node.is_root():
|
|
to_remove.append(node)
|
|
|
|
for node in to_remove:
|
|
node.delete()
|
|
|
|
# Save processed tree
|
|
output_file = os.path.join(output_dir, f"processed_{filename}")
|
|
tree.write(outfile=output_file)
|
|
|
|
print(f"Processed {filename}")
|
|
```
|