Initial commit
This commit is contained in:
566
skills/biopython/references/phylogenetics.md
Normal file
566
skills/biopython/references/phylogenetics.md
Normal file
@@ -0,0 +1,566 @@
|
||||
# Phylogenetics with Bio.Phylo
|
||||
|
||||
## Overview
|
||||
|
||||
Bio.Phylo provides a unified toolkit for reading, writing, analyzing, and visualizing phylogenetic trees. It supports multiple file formats including Newick, NEXUS, phyloXML, NeXML, and CDAO.
|
||||
|
||||
## Supported File Formats
|
||||
|
||||
- **Newick** - Simple tree representation (most common)
|
||||
- **NEXUS** - Extended format with additional data
|
||||
- **phyloXML** - XML-based format with rich annotations
|
||||
- **NeXML** - Modern XML format
|
||||
- **CDAO** - Comparative Data Analysis Ontology
|
||||
|
||||
## Reading and Writing Trees
|
||||
|
||||
### Reading Trees
|
||||
|
||||
```python
|
||||
from Bio import Phylo
|
||||
|
||||
# Read a tree from file
|
||||
tree = Phylo.read("tree.nwk", "newick")
|
||||
|
||||
# Parse multiple trees from a file
|
||||
trees = list(Phylo.parse("trees.nwk", "newick"))
|
||||
print(f"Found {len(trees)} trees")
|
||||
```
|
||||
|
||||
### Writing Trees
|
||||
|
||||
```python
|
||||
# Write tree to file
|
||||
Phylo.write(tree, "output.nwk", "newick")
|
||||
|
||||
# Write multiple trees
|
||||
Phylo.write(trees, "output.nex", "nexus")
|
||||
```
|
||||
|
||||
### Format Conversion
|
||||
|
||||
```python
|
||||
# Convert between formats
|
||||
count = Phylo.convert("input.nwk", "newick", "output.xml", "phyloxml")
|
||||
print(f"Converted {count} trees")
|
||||
```
|
||||
|
||||
## Tree Structure and Navigation
|
||||
|
||||
### Basic Tree Components
|
||||
|
||||
Trees consist of:
|
||||
- **Clade** - A node (internal or terminal) in the tree
|
||||
- **Terminal clades** - Leaves/tips (taxa)
|
||||
- **Internal clades** - Internal nodes
|
||||
- **Branch length** - Evolutionary distance
|
||||
|
||||
### Accessing Tree Properties
|
||||
|
||||
```python
|
||||
# Tree root
|
||||
root = tree.root
|
||||
|
||||
# Terminal nodes (leaves)
|
||||
terminals = tree.get_terminals()
|
||||
print(f"Number of taxa: {len(terminals)}")
|
||||
|
||||
# Non-terminal nodes
|
||||
nonterminals = tree.get_nonterminals()
|
||||
print(f"Number of internal nodes: {len(nonterminals)}")
|
||||
|
||||
# All clades
|
||||
all_clades = list(tree.find_clades())
|
||||
print(f"Total clades: {len(all_clades)}")
|
||||
```
|
||||
|
||||
### Traversing Trees
|
||||
|
||||
```python
|
||||
# Iterate through all clades
|
||||
for clade in tree.find_clades():
|
||||
if clade.name:
|
||||
print(f"Clade: {clade.name}, Branch length: {clade.branch_length}")
|
||||
|
||||
# Iterate through terminals only
|
||||
for terminal in tree.get_terminals():
|
||||
print(f"Taxon: {terminal.name}")
|
||||
|
||||
# Depth-first traversal
|
||||
for clade in tree.find_clades(order="preorder"):
|
||||
print(clade.name)
|
||||
|
||||
# Level-order (breadth-first) traversal
|
||||
for clade in tree.find_clades(order="level"):
|
||||
print(clade.name)
|
||||
```
|
||||
|
||||
### Finding Specific Clades
|
||||
|
||||
```python
|
||||
# Find clade by name
|
||||
clade = tree.find_any(name="Species_A")
|
||||
|
||||
# Find all clades matching criteria
|
||||
def is_long_branch(clade):
|
||||
return clade.branch_length and clade.branch_length > 0.5
|
||||
|
||||
long_branches = tree.find_clades(is_long_branch)
|
||||
```
|
||||
|
||||
## Tree Analysis
|
||||
|
||||
### Tree Statistics
|
||||
|
||||
```python
|
||||
# Total branch length
|
||||
total_length = tree.total_branch_length()
|
||||
print(f"Total tree length: {total_length:.3f}")
|
||||
|
||||
# Tree depth (root to furthest leaf)
|
||||
depths = tree.depths()
|
||||
max_depth = max(depths.values())
|
||||
print(f"Maximum depth: {max_depth:.3f}")
|
||||
|
||||
# Terminal count
|
||||
terminal_count = tree.count_terminals()
|
||||
print(f"Number of taxa: {terminal_count}")
|
||||
```
|
||||
|
||||
### Distance Calculations
|
||||
|
||||
```python
|
||||
# Distance between two taxa
|
||||
distance = tree.distance("Species_A", "Species_B")
|
||||
print(f"Distance: {distance:.3f}")
|
||||
|
||||
# Create distance matrix
|
||||
from Bio import Phylo
|
||||
|
||||
terminals = tree.get_terminals()
|
||||
taxa_names = [t.name for t in terminals]
|
||||
|
||||
print("Distance Matrix:")
|
||||
for taxon1 in taxa_names:
|
||||
row = []
|
||||
for taxon2 in taxa_names:
|
||||
if taxon1 == taxon2:
|
||||
row.append(0)
|
||||
else:
|
||||
dist = tree.distance(taxon1, taxon2)
|
||||
row.append(dist)
|
||||
print(f"{taxon1}: {row}")
|
||||
```
|
||||
|
||||
### Common Ancestors
|
||||
|
||||
```python
|
||||
# Find common ancestor of two clades
|
||||
clade1 = tree.find_any(name="Species_A")
|
||||
clade2 = tree.find_any(name="Species_B")
|
||||
ancestor = tree.common_ancestor(clade1, clade2)
|
||||
print(f"Common ancestor: {ancestor.name}")
|
||||
|
||||
# Find common ancestor of multiple clades
|
||||
clades = [tree.find_any(name=n) for n in ["Species_A", "Species_B", "Species_C"]]
|
||||
ancestor = tree.common_ancestor(*clades)
|
||||
```
|
||||
|
||||
### Tree Comparison
|
||||
|
||||
```python
|
||||
# Compare tree topologies
|
||||
def compare_trees(tree1, tree2):
|
||||
"""Compare two trees."""
|
||||
# Get terminal names
|
||||
taxa1 = set(t.name for t in tree1.get_terminals())
|
||||
taxa2 = set(t.name for t in tree2.get_terminals())
|
||||
|
||||
# Check if they have same taxa
|
||||
if taxa1 != taxa2:
|
||||
return False, "Different taxa"
|
||||
|
||||
# Compare distances
|
||||
differences = []
|
||||
for taxon1 in taxa1:
|
||||
for taxon2 in taxa1:
|
||||
if taxon1 < taxon2:
|
||||
dist1 = tree1.distance(taxon1, taxon2)
|
||||
dist2 = tree2.distance(taxon1, taxon2)
|
||||
if abs(dist1 - dist2) > 0.01:
|
||||
differences.append((taxon1, taxon2, dist1, dist2))
|
||||
|
||||
return len(differences) == 0, differences
|
||||
```
|
||||
|
||||
## Tree Manipulation
|
||||
|
||||
### Pruning Trees
|
||||
|
||||
```python
|
||||
# Prune (remove) specific taxa
|
||||
tree_copy = tree.copy()
|
||||
tree_copy.prune("Species_A")
|
||||
|
||||
# Keep only specific taxa
|
||||
taxa_to_keep = ["Species_B", "Species_C", "Species_D"]
|
||||
terminals = tree_copy.get_terminals()
|
||||
for terminal in terminals:
|
||||
if terminal.name not in taxa_to_keep:
|
||||
tree_copy.prune(terminal)
|
||||
```
|
||||
|
||||
### Collapsing Short Branches
|
||||
|
||||
```python
|
||||
# Collapse branches shorter than threshold
|
||||
def collapse_short_branches(tree, threshold=0.01):
|
||||
"""Collapse branches shorter than threshold."""
|
||||
for clade in tree.find_clades():
|
||||
if clade.branch_length and clade.branch_length < threshold:
|
||||
clade.branch_length = 0
|
||||
return tree
|
||||
```
|
||||
|
||||
### Ladderizing Trees
|
||||
|
||||
```python
|
||||
# Ladderize tree (sort branches by size)
|
||||
tree.ladderize() # ascending order
|
||||
tree.ladderize(reverse=True) # descending order
|
||||
```
|
||||
|
||||
### Rerooting Trees
|
||||
|
||||
```python
|
||||
# Reroot at midpoint
|
||||
tree.root_at_midpoint()
|
||||
|
||||
# Reroot with outgroup
|
||||
outgroup = tree.find_any(name="Outgroup_Species")
|
||||
tree.root_with_outgroup(outgroup)
|
||||
|
||||
# Reroot at internal node
|
||||
internal = tree.get_nonterminals()[0]
|
||||
tree.root_with_outgroup(internal)
|
||||
```
|
||||
|
||||
## Tree Visualization
|
||||
|
||||
### Basic ASCII Drawing
|
||||
|
||||
```python
|
||||
# Draw tree to console
|
||||
Phylo.draw_ascii(tree)
|
||||
|
||||
# Draw with custom format
|
||||
Phylo.draw_ascii(tree, column_width=80)
|
||||
```
|
||||
|
||||
### Matplotlib Visualization
|
||||
|
||||
```python
|
||||
import matplotlib.pyplot as plt
|
||||
from Bio import Phylo
|
||||
|
||||
# Simple plot
|
||||
fig = plt.figure(figsize=(10, 8))
|
||||
axes = fig.add_subplot(1, 1, 1)
|
||||
Phylo.draw(tree, axes=axes)
|
||||
plt.show()
|
||||
|
||||
# Customize plot
|
||||
fig = plt.figure(figsize=(10, 8))
|
||||
axes = fig.add_subplot(1, 1, 1)
|
||||
Phylo.draw(tree, axes=axes, do_show=False)
|
||||
axes.set_title("Phylogenetic Tree")
|
||||
plt.tight_layout()
|
||||
plt.savefig("tree.png", dpi=300)
|
||||
```
|
||||
|
||||
### Advanced Visualization Options
|
||||
|
||||
```python
|
||||
# Radial (circular) tree
|
||||
Phylo.draw(tree, branch_labels=lambda c: c.branch_length)
|
||||
|
||||
# Show branch support values
|
||||
Phylo.draw(tree, label_func=lambda n: str(n.confidence) if n.confidence else "")
|
||||
|
||||
# Color branches
|
||||
def color_by_length(clade):
|
||||
if clade.branch_length:
|
||||
if clade.branch_length > 0.5:
|
||||
return "red"
|
||||
elif clade.branch_length > 0.2:
|
||||
return "orange"
|
||||
return "black"
|
||||
|
||||
# Note: Direct branch coloring requires custom matplotlib code
|
||||
```
|
||||
|
||||
## Building Trees
|
||||
|
||||
### From Distance Matrix
|
||||
|
||||
```python
|
||||
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix
|
||||
|
||||
# Create distance matrix
|
||||
dm = DistanceMatrix(
|
||||
names=["Alpha", "Beta", "Gamma", "Delta"],
|
||||
matrix=[
|
||||
[],
|
||||
[0.23],
|
||||
[0.45, 0.34],
|
||||
[0.67, 0.58, 0.29]
|
||||
]
|
||||
)
|
||||
|
||||
# Build tree using UPGMA
|
||||
constructor = DistanceTreeConstructor()
|
||||
tree = constructor.upgma(dm)
|
||||
Phylo.draw_ascii(tree)
|
||||
|
||||
# Build tree using Neighbor-Joining
|
||||
tree = constructor.nj(dm)
|
||||
```
|
||||
|
||||
### From Multiple Sequence Alignment
|
||||
|
||||
```python
|
||||
from Bio import AlignIO, Phylo
|
||||
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
|
||||
|
||||
# Read alignment
|
||||
alignment = AlignIO.read("alignment.fasta", "fasta")
|
||||
|
||||
# Calculate distance matrix
|
||||
calculator = DistanceCalculator("identity")
|
||||
distance_matrix = calculator.get_distance(alignment)
|
||||
|
||||
# Build tree
|
||||
constructor = DistanceTreeConstructor()
|
||||
tree = constructor.upgma(distance_matrix)
|
||||
|
||||
# Write tree
|
||||
Phylo.write(tree, "output_tree.nwk", "newick")
|
||||
```
|
||||
|
||||
### Distance Models
|
||||
|
||||
Available distance calculation models:
|
||||
- **identity** - Simple identity
|
||||
- **blastn** - BLASTN identity
|
||||
- **trans** - Transition/transversion ratio
|
||||
- **blosum62** - BLOSUM62 matrix
|
||||
- **pam250** - PAM250 matrix
|
||||
|
||||
```python
|
||||
# Use different model
|
||||
calculator = DistanceCalculator("blosum62")
|
||||
dm = calculator.get_distance(alignment)
|
||||
```
|
||||
|
||||
## Consensus Trees
|
||||
|
||||
```python
|
||||
from Bio.Phylo.Consensus import majority_consensus, strict_consensus
|
||||
|
||||
# Read multiple trees
|
||||
trees = list(Phylo.parse("bootstrap_trees.nwk", "newick"))
|
||||
|
||||
# Majority-rule consensus
|
||||
consensus = majority_consensus(trees, cutoff=0.5)
|
||||
|
||||
# Strict consensus
|
||||
strict_cons = strict_consensus(trees)
|
||||
|
||||
# Write consensus tree
|
||||
Phylo.write(consensus, "consensus.nwk", "newick")
|
||||
```
|
||||
|
||||
## PhyloXML Features
|
||||
|
||||
PhyloXML format supports rich annotations:
|
||||
|
||||
```python
|
||||
from Bio.Phylo.PhyloXML import Phylogeny, Clade
|
||||
|
||||
# Create PhyloXML tree
|
||||
tree = Phylogeny(rooted=True)
|
||||
tree.name = "Example Tree"
|
||||
tree.description = "A sample phylogenetic tree"
|
||||
|
||||
# Add clades with rich annotations
|
||||
clade = Clade(branch_length=0.5)
|
||||
clade.name = "Species_A"
|
||||
clade.color = "red"
|
||||
clade.width = 2.0
|
||||
|
||||
# Add taxonomy information
|
||||
from Bio.Phylo.PhyloXML import Taxonomy
|
||||
taxonomy = Taxonomy(scientific_name="Homo sapiens", common_name="Human")
|
||||
clade.taxonomies.append(taxonomy)
|
||||
```
|
||||
|
||||
## Bootstrap Support
|
||||
|
||||
```python
|
||||
# Add bootstrap support values to tree
|
||||
def add_bootstrap_support(tree, support_values):
|
||||
"""Add bootstrap support to internal nodes."""
|
||||
internal_nodes = tree.get_nonterminals()
|
||||
for node, support in zip(internal_nodes, support_values):
|
||||
node.confidence = support
|
||||
return tree
|
||||
|
||||
# Example
|
||||
support_values = [95, 87, 76, 92]
|
||||
tree_with_support = add_bootstrap_support(tree, support_values)
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Choose appropriate file format** - Newick for simple trees, phyloXML for annotations
|
||||
2. **Validate tree topology** - Check for polytomies and negative branch lengths
|
||||
3. **Root trees appropriately** - Use midpoint or outgroup rooting
|
||||
4. **Handle bootstrap values** - Store as clade confidence
|
||||
5. **Consider tree size** - Large trees may need special handling
|
||||
6. **Use tree copies** - Call `.copy()` before modifications
|
||||
7. **Export publication-ready figures** - Use matplotlib for high-quality output
|
||||
8. **Document tree construction** - Record alignment and parameters used
|
||||
9. **Compare multiple trees** - Use consensus methods for bootstrap trees
|
||||
10. **Validate taxon names** - Ensure consistent naming across files
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Build Tree from Sequences
|
||||
|
||||
```python
|
||||
from Bio import AlignIO, Phylo
|
||||
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
|
||||
|
||||
# Read aligned sequences
|
||||
alignment = AlignIO.read("sequences.aln", "clustal")
|
||||
|
||||
# Calculate distances
|
||||
calculator = DistanceCalculator("identity")
|
||||
dm = calculator.get_distance(alignment)
|
||||
|
||||
# Build neighbor-joining tree
|
||||
constructor = DistanceTreeConstructor()
|
||||
tree = constructor.nj(dm)
|
||||
|
||||
# Root at midpoint
|
||||
tree.root_at_midpoint()
|
||||
|
||||
# Save tree
|
||||
Phylo.write(tree, "tree.nwk", "newick")
|
||||
|
||||
# Visualize
|
||||
import matplotlib.pyplot as plt
|
||||
fig = plt.figure(figsize=(10, 8))
|
||||
Phylo.draw(tree)
|
||||
plt.show()
|
||||
```
|
||||
|
||||
### Extract Subtree
|
||||
|
||||
```python
|
||||
def extract_subtree(tree, taxa_list):
|
||||
"""Extract subtree containing specific taxa."""
|
||||
# Create a copy
|
||||
subtree = tree.copy()
|
||||
|
||||
# Get all terminals
|
||||
all_terminals = subtree.get_terminals()
|
||||
|
||||
# Prune taxa not in list
|
||||
for terminal in all_terminals:
|
||||
if terminal.name not in taxa_list:
|
||||
subtree.prune(terminal)
|
||||
|
||||
return subtree
|
||||
|
||||
# Use it
|
||||
subtree = extract_subtree(tree, ["Species_A", "Species_B", "Species_C"])
|
||||
Phylo.write(subtree, "subtree.nwk", "newick")
|
||||
```
|
||||
|
||||
### Calculate Phylogenetic Diversity
|
||||
|
||||
```python
|
||||
def phylogenetic_diversity(tree, taxa_subset=None):
|
||||
"""Calculate phylogenetic diversity (sum of branch lengths)."""
|
||||
if taxa_subset:
|
||||
# Prune to subset
|
||||
tree = extract_subtree(tree, taxa_subset)
|
||||
|
||||
# Sum all branch lengths
|
||||
total = 0
|
||||
for clade in tree.find_clades():
|
||||
if clade.branch_length:
|
||||
total += clade.branch_length
|
||||
|
||||
return total
|
||||
|
||||
# Calculate PD for all taxa
|
||||
pd_all = phylogenetic_diversity(tree)
|
||||
print(f"Total phylogenetic diversity: {pd_all:.3f}")
|
||||
|
||||
# Calculate PD for subset
|
||||
pd_subset = phylogenetic_diversity(tree, ["Species_A", "Species_B"])
|
||||
print(f"Subset phylogenetic diversity: {pd_subset:.3f}")
|
||||
```
|
||||
|
||||
### Annotate Tree with External Data
|
||||
|
||||
```python
|
||||
def annotate_tree_from_csv(tree, csv_file):
|
||||
"""Annotate tree leaves with data from CSV."""
|
||||
import csv
|
||||
|
||||
# Read annotation data
|
||||
annotations = {}
|
||||
with open(csv_file) as f:
|
||||
reader = csv.DictReader(f)
|
||||
for row in reader:
|
||||
annotations[row["species"]] = row
|
||||
|
||||
# Annotate tree
|
||||
for terminal in tree.get_terminals():
|
||||
if terminal.name in annotations:
|
||||
# Add custom attributes
|
||||
for key, value in annotations[terminal.name].items():
|
||||
setattr(terminal, key, value)
|
||||
|
||||
return tree
|
||||
```
|
||||
|
||||
### Compare Tree Topologies
|
||||
|
||||
```python
|
||||
def robinson_foulds_distance(tree1, tree2):
|
||||
"""Calculate Robinson-Foulds distance between two trees."""
|
||||
# Get bipartitions for each tree
|
||||
def get_bipartitions(tree):
|
||||
bipartitions = set()
|
||||
for clade in tree.get_nonterminals():
|
||||
terminals = frozenset(t.name for t in clade.get_terminals())
|
||||
bipartitions.add(terminals)
|
||||
return bipartitions
|
||||
|
||||
bp1 = get_bipartitions(tree1)
|
||||
bp2 = get_bipartitions(tree2)
|
||||
|
||||
# Symmetric difference
|
||||
diff = len(bp1.symmetric_difference(bp2))
|
||||
return diff
|
||||
|
||||
# Use it
|
||||
tree1 = Phylo.read("tree1.nwk", "newick")
|
||||
tree2 = Phylo.read("tree2.nwk", "newick")
|
||||
rf_dist = robinson_foulds_distance(tree1, tree2)
|
||||
print(f"Robinson-Foulds distance: {rf_dist}")
|
||||
```
|
||||
Reference in New Issue
Block a user