567 lines
14 KiB
Markdown
567 lines
14 KiB
Markdown
# Phylogenetics with Bio.Phylo
|
|
|
|
## Overview
|
|
|
|
Bio.Phylo provides a unified toolkit for reading, writing, analyzing, and visualizing phylogenetic trees. It supports multiple file formats including Newick, NEXUS, phyloXML, NeXML, and CDAO.
|
|
|
|
## Supported File Formats
|
|
|
|
- **Newick** - Simple tree representation (most common)
|
|
- **NEXUS** - Extended format with additional data
|
|
- **phyloXML** - XML-based format with rich annotations
|
|
- **NeXML** - Modern XML format
|
|
- **CDAO** - Comparative Data Analysis Ontology
|
|
|
|
## Reading and Writing Trees
|
|
|
|
### Reading Trees
|
|
|
|
```python
|
|
from Bio import Phylo
|
|
|
|
# Read a tree from file
|
|
tree = Phylo.read("tree.nwk", "newick")
|
|
|
|
# Parse multiple trees from a file
|
|
trees = list(Phylo.parse("trees.nwk", "newick"))
|
|
print(f"Found {len(trees)} trees")
|
|
```
|
|
|
|
### Writing Trees
|
|
|
|
```python
|
|
# Write tree to file
|
|
Phylo.write(tree, "output.nwk", "newick")
|
|
|
|
# Write multiple trees
|
|
Phylo.write(trees, "output.nex", "nexus")
|
|
```
|
|
|
|
### Format Conversion
|
|
|
|
```python
|
|
# Convert between formats
|
|
count = Phylo.convert("input.nwk", "newick", "output.xml", "phyloxml")
|
|
print(f"Converted {count} trees")
|
|
```
|
|
|
|
## Tree Structure and Navigation
|
|
|
|
### Basic Tree Components
|
|
|
|
Trees consist of:
|
|
- **Clade** - A node (internal or terminal) in the tree
|
|
- **Terminal clades** - Leaves/tips (taxa)
|
|
- **Internal clades** - Internal nodes
|
|
- **Branch length** - Evolutionary distance
|
|
|
|
### Accessing Tree Properties
|
|
|
|
```python
|
|
# Tree root
|
|
root = tree.root
|
|
|
|
# Terminal nodes (leaves)
|
|
terminals = tree.get_terminals()
|
|
print(f"Number of taxa: {len(terminals)}")
|
|
|
|
# Non-terminal nodes
|
|
nonterminals = tree.get_nonterminals()
|
|
print(f"Number of internal nodes: {len(nonterminals)}")
|
|
|
|
# All clades
|
|
all_clades = list(tree.find_clades())
|
|
print(f"Total clades: {len(all_clades)}")
|
|
```
|
|
|
|
### Traversing Trees
|
|
|
|
```python
|
|
# Iterate through all clades
|
|
for clade in tree.find_clades():
|
|
if clade.name:
|
|
print(f"Clade: {clade.name}, Branch length: {clade.branch_length}")
|
|
|
|
# Iterate through terminals only
|
|
for terminal in tree.get_terminals():
|
|
print(f"Taxon: {terminal.name}")
|
|
|
|
# Depth-first traversal
|
|
for clade in tree.find_clades(order="preorder"):
|
|
print(clade.name)
|
|
|
|
# Level-order (breadth-first) traversal
|
|
for clade in tree.find_clades(order="level"):
|
|
print(clade.name)
|
|
```
|
|
|
|
### Finding Specific Clades
|
|
|
|
```python
|
|
# Find clade by name
|
|
clade = tree.find_any(name="Species_A")
|
|
|
|
# Find all clades matching criteria
|
|
def is_long_branch(clade, threshold=0.5):
    """Return True when *clade* has a branch length greater than *threshold*.

    Args:
        clade: Object exposing a ``branch_length`` attribute (may be None).
        threshold: Length a branch must exceed to count as long.

    Returns:
        A real ``bool``.  Clades without a branch length are never long;
        the result is ``False`` for them rather than ``None``.
    """
    length = clade.branch_length
    # Compare against None explicitly so the result is always a bool,
    # instead of leaking None or 0 through an ``and`` short-circuit.
    return length is not None and length > threshold
|
|
|
|
long_branches = tree.find_clades(is_long_branch)
|
|
```
|
|
|
|
## Tree Analysis
|
|
|
|
### Tree Statistics
|
|
|
|
```python
|
|
# Total branch length
|
|
total_length = tree.total_branch_length()
|
|
print(f"Total tree length: {total_length:.3f}")
|
|
|
|
# Tree depth (root to furthest leaf)
|
|
depths = tree.depths()
|
|
max_depth = max(depths.values())
|
|
print(f"Maximum depth: {max_depth:.3f}")
|
|
|
|
# Terminal count
|
|
terminal_count = tree.count_terminals()
|
|
print(f"Number of taxa: {terminal_count}")
|
|
```
|
|
|
|
### Distance Calculations
|
|
|
|
```python
|
|
# Distance between two taxa
|
|
distance = tree.distance("Species_A", "Species_B")
|
|
print(f"Distance: {distance:.3f}")
|
|
|
|
# Create distance matrix
|
|
from Bio import Phylo
|
|
|
|
terminals = tree.get_terminals()
|
|
taxa_names = [t.name for t in terminals]
|
|
|
|
print("Distance Matrix:")
|
|
for taxon1 in taxa_names:
|
|
row = []
|
|
for taxon2 in taxa_names:
|
|
if taxon1 == taxon2:
|
|
row.append(0)
|
|
else:
|
|
dist = tree.distance(taxon1, taxon2)
|
|
row.append(dist)
|
|
print(f"{taxon1}: {row}")
|
|
```
|
|
|
|
### Common Ancestors
|
|
|
|
```python
|
|
# Find common ancestor of two clades
|
|
clade1 = tree.find_any(name="Species_A")
|
|
clade2 = tree.find_any(name="Species_B")
|
|
ancestor = tree.common_ancestor(clade1, clade2)
|
|
print(f"Common ancestor: {ancestor.name}")
|
|
|
|
# Find common ancestor of multiple clades
|
|
clades = [tree.find_any(name=n) for n in ["Species_A", "Species_B", "Species_C"]]
|
|
ancestor = tree.common_ancestor(*clades)
|
|
```
|
|
|
|
### Tree Comparison
|
|
|
|
```python
|
|
# Compare tree topologies
|
|
def compare_trees(tree1, tree2, tolerance=0.01):
    """Compare two trees by taxon set and pairwise leaf-to-leaf distances.

    Note that this compares distances, not branching order: two trees are
    reported as equal when every pair of taxa is (almost) equally far
    apart in both.

    Args:
        tree1: First tree; must provide ``get_terminals()`` and
            ``distance(name_a, name_b)``.
        tree2: Second tree, same requirements.
        tolerance: Largest absolute difference between corresponding
            pairwise distances that is still treated as equal.

    Returns:
        ``(False, "Different taxa")`` when the sets of leaf names differ.
        Otherwise ``(is_same, differences)``, where ``differences`` is a
        list of ``(taxon1, taxon2, dist1, dist2)`` tuples, one for each
        pair whose distances differ by more than ``tolerance``.
    """
    from itertools import combinations

    # Get terminal names
    taxa1 = {t.name for t in tree1.get_terminals()}
    taxa2 = {t.name for t in tree2.get_terminals()}

    # Check if they have same taxa
    if taxa1 != taxa2:
        return False, "Different taxa"

    # Visit every unordered pair exactly once, in sorted order, so the
    # reported differences do not depend on set iteration order.
    differences = []
    for taxon1, taxon2 in combinations(sorted(taxa1), 2):
        dist1 = tree1.distance(taxon1, taxon2)
        dist2 = tree2.distance(taxon1, taxon2)
        if abs(dist1 - dist2) > tolerance:
            differences.append((taxon1, taxon2, dist1, dist2))

    return not differences, differences
|
|
```
|
|
|
|
## Tree Manipulation
|
|
|
|
### Pruning Trees
|
|
|
|
```python
|
|
import copy

# Prune (remove) specific taxa.
# Bio.Phylo trees have no .copy() method, so deep-copy the tree to keep
# the original intact.
tree_copy = copy.deepcopy(tree)
tree_copy.prune("Species_A")

# Keep only specific taxa
taxa_to_keep = ["Species_B", "Species_C", "Species_D"]
# Collect the terminals first; the loop below then prunes from the tree
# without iterating over the structure it is modifying.
terminals = tree_copy.get_terminals()
for terminal in terminals:
    if terminal.name not in taxa_to_keep:
        tree_copy.prune(terminal)
|
|
```
|
|
|
|
### Collapsing Short Branches
|
|
|
|
```python
|
|
# Collapse branches shorter than threshold
|
|
def collapse_short_branches(tree, threshold=0.01):
    """Collapse branches shorter than ``threshold``.

    Delegates to ``tree.collapse_all()`` with a predicate, so matching
    clades are actually collapsed.  Merely overwriting ``branch_length``
    with 0 would leave the topology unchanged and would also wipe the
    lengths of short terminal branches.

    Args:
        tree: Tree to modify in place; must provide ``collapse_all``.
        threshold: Clades whose branch length is strictly below this
            value are selected.  Clades without a branch length are
            never selected.

    Returns:
        The same ``tree`` object that was passed in.
    """
    def is_short(clade):
        length = clade.branch_length
        return length is not None and length < threshold

    tree.collapse_all(is_short)
    return tree
|
|
```
|
|
|
|
### Ladderizing Trees
|
|
|
|
```python
|
|
# Ladderize tree (sort branches by size)
|
|
tree.ladderize() # ascending order
|
|
tree.ladderize(reverse=True) # descending order
|
|
```
|
|
|
|
### Rerooting Trees
|
|
|
|
```python
|
|
# Reroot at midpoint
|
|
tree.root_at_midpoint()
|
|
|
|
# Reroot with outgroup
|
|
outgroup = tree.find_any(name="Outgroup_Species")
|
|
tree.root_with_outgroup(outgroup)
|
|
|
|
# Reroot at internal node
|
|
internal = tree.get_nonterminals()[0]
|
|
tree.root_with_outgroup(internal)
|
|
```
|
|
|
|
## Tree Visualization
|
|
|
|
### Basic ASCII Drawing
|
|
|
|
```python
|
|
# Draw tree to console
|
|
Phylo.draw_ascii(tree)
|
|
|
|
# Draw with custom format
|
|
Phylo.draw_ascii(tree, column_width=80)
|
|
```
|
|
|
|
### Matplotlib Visualization
|
|
|
|
```python
|
|
import matplotlib.pyplot as plt
|
|
from Bio import Phylo
|
|
|
|
# Simple plot
|
|
fig = plt.figure(figsize=(10, 8))
|
|
axes = fig.add_subplot(1, 1, 1)
|
|
Phylo.draw(tree, axes=axes)
|
|
plt.show()
|
|
|
|
# Customize plot
|
|
fig = plt.figure(figsize=(10, 8))
|
|
axes = fig.add_subplot(1, 1, 1)
|
|
Phylo.draw(tree, axes=axes, do_show=False)
|
|
axes.set_title("Phylogenetic Tree")
|
|
plt.tight_layout()
|
|
plt.savefig("tree.png", dpi=300)
|
|
```
|
|
|
|
### Advanced Visualization Options
|
|
|
|
```python
|
|
# Label each branch with its length (Phylo.draw has no radial/circular layout)
|
|
Phylo.draw(tree, branch_labels=lambda c: c.branch_length)
|
|
|
|
# Show branch support values
|
|
Phylo.draw(tree, label_func=lambda n: str(n.confidence) if n.confidence else "")
|
|
|
|
# Color branches
|
|
def color_by_length(clade):
    """Map a clade's branch length to a color name.

    Returns "red" above 0.5, "orange" above 0.2, and "black" otherwise,
    including when the branch length is missing or zero.
    """
    length = clade.branch_length
    if not length:
        return "black"
    # Cut-offs are checked from longest to shortest; the first hit wins.
    for cutoff, shade in ((0.5, "red"), (0.2, "orange")):
        if length > cutoff:
            return shade
    return "black"
|
|
|
|
# Note: Phylo.draw() honors each clade's `color` attribute, so set it before drawing, e.g. clade.color = color_by_length(clade)
|
|
```
|
|
|
|
## Building Trees
|
|
|
|
### From Distance Matrix
|
|
|
|
```python
|
|
from Bio import Phylo
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

# Create distance matrix.
# The matrix must be lower-triangular *including* the diagonal: row i
# holds i + 1 values and ends with the zero self-distance. Rows without
# the diagonal are rejected with a TypeError.
dm = DistanceMatrix(
    names=["Alpha", "Beta", "Gamma", "Delta"],
    matrix=[
        [0],
        [0.23, 0],
        [0.45, 0.34, 0],
        [0.67, 0.58, 0.29, 0],
    ],
)

# Build tree using UPGMA
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.draw_ascii(tree)

# Build tree using Neighbor-Joining
tree = constructor.nj(dm)
|
|
```
|
|
|
|
### From Multiple Sequence Alignment
|
|
|
|
```python
|
|
from Bio import AlignIO, Phylo
|
|
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
|
|
|
|
# Read alignment
|
|
alignment = AlignIO.read("alignment.fasta", "fasta")
|
|
|
|
# Calculate distance matrix
|
|
calculator = DistanceCalculator("identity")
|
|
distance_matrix = calculator.get_distance(alignment)
|
|
|
|
# Build tree
|
|
constructor = DistanceTreeConstructor()
|
|
tree = constructor.upgma(distance_matrix)
|
|
|
|
# Write tree
|
|
Phylo.write(tree, "output_tree.nwk", "newick")
|
|
```
|
|
|
|
### Distance Models
|
|
|
|
Available distance calculation models:
|
|
- **identity** - Simple identity
|
|
- **blastn** - BLASTN identity
|
|
- **trans** - Transition/transversion ratio
|
|
- **blosum62** - BLOSUM62 matrix
|
|
- **pam250** - PAM250 matrix
|
|
|
|
```python
|
|
# Use different model
|
|
calculator = DistanceCalculator("blosum62")
|
|
dm = calculator.get_distance(alignment)
|
|
```
|
|
|
|
## Consensus Trees
|
|
|
|
```python
|
|
from Bio.Phylo.Consensus import majority_consensus, strict_consensus
|
|
|
|
# Read multiple trees
|
|
trees = list(Phylo.parse("bootstrap_trees.nwk", "newick"))
|
|
|
|
# Majority-rule consensus
|
|
consensus = majority_consensus(trees, cutoff=0.5)
|
|
|
|
# Strict consensus
|
|
strict_cons = strict_consensus(trees)
|
|
|
|
# Write consensus tree
|
|
Phylo.write(consensus, "consensus.nwk", "newick")
|
|
```
|
|
|
|
## PhyloXML Features
|
|
|
|
PhyloXML format supports rich annotations:
|
|
|
|
```python
|
|
from Bio.Phylo.PhyloXML import Clade, Phylogeny, Taxonomy

# Create PhyloXML tree with an (unannotated) root clade to hang children on
tree = Phylogeny(root=Clade(), rooted=True)
tree.name = "Example Tree"
tree.description = "A sample phylogenetic tree"

# Add clades with rich annotations
clade = Clade(branch_length=0.5)
clade.name = "Species_A"
clade.color = "red"
clade.width = 2.0

# Add taxonomy information.
# Taxonomy takes `common_names` (a list of strings); it has no
# `common_name` keyword.
taxonomy = Taxonomy(scientific_name="Homo sapiens", common_names=["Human"])
clade.taxonomies.append(taxonomy)

# Attach the annotated clade to the tree; without this it is never part of it
tree.root.clades.append(clade)
|
|
```
|
|
|
|
## Bootstrap Support
|
|
|
|
```python
|
|
# Add bootstrap support values to tree
|
|
def add_bootstrap_support(tree, support_values):
    """Store support values on the tree's internal nodes as ``confidence``.

    Values are paired positionally with the nodes returned by
    ``tree.get_nonterminals()``; pairing stops as soon as either sequence
    runs out.  The tree is modified in place and then returned.
    """
    for internal, value in zip(tree.get_nonterminals(), support_values):
        setattr(internal, "confidence", value)
    return tree
|
|
|
|
# Example
|
|
support_values = [95, 87, 76, 92]
|
|
tree_with_support = add_bootstrap_support(tree, support_values)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
1. **Choose appropriate file format** - Newick for simple trees, phyloXML for annotations
|
|
2. **Validate tree topology** - Check for polytomies and negative branch lengths
|
|
3. **Root trees appropriately** - Use midpoint or outgroup rooting
|
|
4. **Handle bootstrap values** - Store as clade confidence
|
|
5. **Consider tree size** - Large trees may need special handling
|
|
6. **Use tree copies** - Call `copy.deepcopy(tree)` before modifications (trees have no `.copy()` method)
|
|
7. **Export publication-ready figures** - Use matplotlib for high-quality output
|
|
8. **Document tree construction** - Record alignment and parameters used
|
|
9. **Compare multiple trees** - Use consensus methods for bootstrap trees
|
|
10. **Validate taxon names** - Ensure consistent naming across files
|
|
|
|
## Common Use Cases
|
|
|
|
### Build Tree from Sequences
|
|
|
|
```python
|
|
import matplotlib.pyplot as plt
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

# Read aligned sequences
alignment = AlignIO.read("sequences.aln", "clustal")

# Calculate distances
calculator = DistanceCalculator("identity")
dm = calculator.get_distance(alignment)

# Build neighbor-joining tree
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)

# Root at midpoint
tree.root_at_midpoint()

# Save tree
Phylo.write(tree, "tree.nwk", "newick")

# Visualize.
# Pass the axes explicitly: without them Phylo.draw() opens its own
# figure, and the one created here (with its figsize) stays blank.
fig = plt.figure(figsize=(10, 8))
axes = fig.add_subplot(1, 1, 1)
Phylo.draw(tree, axes=axes, do_show=False)
plt.show()
|
|
```
|
|
|
|
### Extract Subtree
|
|
|
|
```python
|
|
def extract_subtree(tree, taxa_list):
    """Return a copy of *tree* pruned down to the taxa in *taxa_list*.

    The input tree is left untouched: pruning happens on a deep copy,
    because tree objects do not provide a ``copy()`` method.

    Args:
        tree: Source tree; must provide ``get_terminals()`` and
            ``prune(terminal)``.
        taxa_list: Names of the terminal taxa to keep.

    Returns:
        A new tree in which every terminal whose name is not in
        ``taxa_list`` has been pruned.
    """
    import copy

    keep = set(taxa_list)
    subtree = copy.deepcopy(tree)

    # Snapshot the terminals before pruning so the loop never iterates
    # over a structure that is being modified.
    terminals = list(subtree.get_terminals())
    for terminal in terminals:
        if terminal.name not in keep:
            subtree.prune(terminal)

    return subtree
|
|
|
|
# Use it
|
|
subtree = extract_subtree(tree, ["Species_A", "Species_B", "Species_C"])
|
|
Phylo.write(subtree, "subtree.nwk", "newick")
|
|
```
|
|
|
|
### Calculate Phylogenetic Diversity
|
|
|
|
```python
|
|
def phylogenetic_diversity(tree, taxa_subset=None):
    """Calculate phylogenetic diversity (sum of branch lengths).

    Args:
        tree: Tree to measure; must provide ``find_clades()``.
        taxa_subset: Optional collection of taxon names.  When given, the
            tree is first reduced to these taxa with ``extract_subtree``
            and the diversity of that subtree is returned.  ``None``
            (the default) measures the whole tree.

    Returns:
        Sum of all defined, non-zero branch lengths; ``0`` for an empty
        subset.
    """
    if taxa_subset is not None:
        if not taxa_subset:
            # No taxa selected, so there is nothing to measure.  Testing
            # only truthiness would fall through here and report the
            # diversity of the whole tree instead.
            return 0
        # Prune to subset
        tree = extract_subtree(tree, taxa_subset)

    # Sum all branch lengths, skipping clades without one
    return sum(
        clade.branch_length
        for clade in tree.find_clades()
        if clade.branch_length
    )
|
|
|
|
# Calculate PD for all taxa
|
|
pd_all = phylogenetic_diversity(tree)
|
|
print(f"Total phylogenetic diversity: {pd_all:.3f}")
|
|
|
|
# Calculate PD for subset
|
|
pd_subset = phylogenetic_diversity(tree, ["Species_A", "Species_B"])
|
|
print(f"Subset phylogenetic diversity: {pd_subset:.3f}")
|
|
```
|
|
|
|
### Annotate Tree with External Data
|
|
|
|
```python
|
|
def annotate_tree_from_csv(tree, csv_file):
    """Annotate tree leaves with data from a CSV file.

    The CSV needs a header row containing a ``species`` column.  For each
    terminal whose ``name`` equals a row's ``species`` value, every column
    of that row (``species`` included) is set as an attribute on the
    terminal.  Values are stored as read by the ``csv`` module.

    Args:
        tree: Tree to annotate in place; must provide ``get_terminals()``.
        csv_file: Path of the CSV file to read.

    Returns:
        The same ``tree`` object that was passed in.

    Raises:
        KeyError: If a data row has no ``species`` column.
    """
    import csv

    # Read annotation data.  newline="" leaves line-ending handling to the
    # csv module, which keeps line breaks inside quoted fields intact.
    with open(csv_file, newline="") as f:
        annotations = {row["species"]: row for row in csv.DictReader(f)}

    # Annotate tree
    for terminal in tree.get_terminals():
        row = annotations.get(terminal.name)
        if row is None:
            continue
        # Column names become attribute names verbatim, so a column that
        # shares a name with an existing attribute overwrites it.
        for key, value in row.items():
            setattr(terminal, key, value)

    return tree
|
|
```
|
|
|
|
### Compare Tree Topologies
|
|
|
|
```python
|
|
def robinson_foulds_distance(tree1, tree2):
    """Calculate the Robinson-Foulds distance between two trees.

    Each internal clade splits the taxa into the names below it and the
    remaining names.  The distance is the number of non-trivial splits
    found in exactly one of the two trees.

    Splits are stored as unordered pairs of sides, so the result does not
    depend on where either tree is rooted.  Trivial splits (a side with
    fewer than two taxa, which includes the root's all-taxa clade) are
    ignored.

    Args:
        tree1: First tree; must provide ``get_terminals()`` and
            ``get_nonterminals()``.
        tree2: Second tree, same requirements.

    Returns:
        Size of the symmetric difference of the two split sets.
    """
    def get_bipartitions(tree):
        all_taxa = frozenset(t.name for t in tree.get_terminals())
        bipartitions = set()
        for clade in tree.get_nonterminals():
            inside = frozenset(t.name for t in clade.get_terminals())
            outside = all_taxa - inside
            # A side with fewer than two taxa carries no topological
            # information, so it must not count towards the distance.
            if len(inside) < 2 or len(outside) < 2:
                continue
            # An unordered pair makes "inside" and "outside" interchangeable,
            # so both children of a root yield the same split.
            bipartitions.add(frozenset((inside, outside)))
        return bipartitions

    bp1 = get_bipartitions(tree1)
    bp2 = get_bipartitions(tree2)

    # Symmetric difference
    return len(bp1.symmetric_difference(bp2))
|
|
|
|
# Use it
|
|
tree1 = Phylo.read("tree1.nwk", "newick")
|
|
tree2 = Phylo.read("tree2.nwk", "newick")
|
|
rf_dist = robinson_foulds_distance(tree1, tree2)
|
|
print(f"Robinson-Foulds distance: {rf_dist}")
|
|
```
|