Files
gh-k-dense-ai-claude-scient…/skills/biopython/references/phylogenetics.md
2025-11-30 08:30:10 +08:00

567 lines
14 KiB
Markdown

# Phylogenetics with Bio.Phylo
## Overview
Bio.Phylo provides a unified toolkit for reading, writing, analyzing, and visualizing phylogenetic trees. It supports multiple file formats including Newick, NEXUS, phyloXML, NeXML, and CDAO.
## Supported File Formats
- **Newick** - Simple tree representation (most common)
- **NEXUS** - Extended format with additional data
- **phyloXML** - XML-based format with rich annotations
- **NeXML** - Modern XML format
- **CDAO** - Comparative Data Analysis Ontology
## Reading and Writing Trees
### Reading Trees
```python
from Bio import Phylo
# Read one tree from a Newick file
tree = Phylo.read("tree.nwk", "newick")
# Parse every tree in a multi-tree file; list() collects the parsed trees
# so they can be counted with len() and reused later
trees = list(Phylo.parse("trees.nwk", "newick"))
print(f"Found {len(trees)} trees")
```
### Writing Trees
```python
# Write a single tree to a Newick file
Phylo.write(tree, "output.nwk", "newick")
# Write a list of trees to one NEXUS file
Phylo.write(trees, "output.nex", "nexus")
```
### Format Conversion
```python
# Convert a Newick file to phyloXML in one call; the return value is
# reported below as the number of trees converted
count = Phylo.convert("input.nwk", "newick", "output.xml", "phyloxml")
print(f"Converted {count} trees")
```
## Tree Structure and Navigation
### Basic Tree Components
Trees consist of:
- **Clade** - A node (internal or terminal) in the tree
- **Terminal clades** - Leaves/tips (taxa)
- **Internal clades** - Internal nodes
- **Branch length** - Evolutionary distance
### Accessing Tree Properties
```python
# Root clade of the tree
root = tree.root
# Terminal nodes (leaves), one per taxon
terminals = tree.get_terminals()
print(f"Number of taxa: {len(terminals)}")
# Internal (non-terminal) nodes
nonterminals = tree.get_nonterminals()
print(f"Number of internal nodes: {len(nonterminals)}")
# Every clade; find_clades() is wrapped in list() so the result can be
# counted with len()
all_clades = list(tree.find_clades())
print(f"Total clades: {len(all_clades)}")
```
### Traversing Trees
```python
# Iterate through all clades, reporting only the ones that carry a name
for clade in tree.find_clades():
    if clade.name:
        print(f"Clade: {clade.name}, Branch length: {clade.branch_length}")

# Iterate through terminals only
for terminal in tree.get_terminals():
    print(f"Taxon: {terminal.name}")

# Depth-first traversal
for clade in tree.find_clades(order="preorder"):
    print(clade.name)

# Level-order (breadth-first) traversal
for clade in tree.find_clades(order="level"):
    print(clade.name)
```
### Finding Specific Clades
```python
# Find clade by name
clade = tree.find_any(name="Species_A")


# Find all clades matching criteria
def is_long_branch(clade):
    """Return True when the clade has a branch length greater than 0.5.

    Clades without a branch length (None) never match.
    """
    length = clade.branch_length
    return length is not None and length > 0.5


# The predicate is passed as the search filter; wrap the result in list()
# if the matches need to be counted or iterated more than once
long_branches = tree.find_clades(is_long_branch)
```
## Tree Analysis
### Tree Statistics
```python
# Total branch length of the tree
total_length = tree.total_branch_length()
print(f"Total tree length: {total_length:.3f}")
# Tree depth (root to furthest leaf): depths() is used as a mapping here,
# and the largest of its values is reported
depths = tree.depths()
max_depth = max(depths.values())
print(f"Maximum depth: {max_depth:.3f}")
# Terminal (leaf) count
terminal_count = tree.count_terminals()
print(f"Number of taxa: {terminal_count}")
```
### Distance Calculations
```python
# Distance between two taxa
distance = tree.distance("Species_A", "Species_B")
print(f"Distance: {distance:.3f}")

# Create distance matrix
from Bio import Phylo

terminals = tree.get_terminals()
taxa_names = [t.name for t in terminals]
print("Distance Matrix:")
for taxon1 in taxa_names:
    row = []
    for taxon2 in taxa_names:
        if taxon1 == taxon2:
            # Diagonal entry: a taxon is at distance 0 from itself
            row.append(0)
        else:
            dist = tree.distance(taxon1, taxon2)
            row.append(dist)
    # Print once per taxon, after its full row has been built
    print(f"{taxon1}: {row}")
```
### Common Ancestors
```python
# Find common ancestor of two clades, each looked up by name first
clade1 = tree.find_any(name="Species_A")
clade2 = tree.find_any(name="Species_B")
ancestor = tree.common_ancestor(clade1, clade2)
print(f"Common ancestor: {ancestor.name}")
# Find common ancestor of multiple clades: look each one up by name, then
# unpack the list into positional arguments
clades = [tree.find_any(name=n) for n in ["Species_A", "Species_B", "Species_C"]]
ancestor = tree.common_ancestor(*clades)
```
### Tree Comparison
```python
# Compare tree topologies
def compare_trees(tree1, tree2, tolerance=0.01):
    """Compare two trees by their taxa and pairwise leaf-to-leaf distances.

    Args:
        tree1: First tree; must provide ``get_terminals()`` and
            ``distance(name_a, name_b)``.
        tree2: Second tree, same requirements.
        tolerance: Largest absolute difference between the two trees'
            distances for a taxon pair that still counts as equal.

    Returns:
        ``(False, "Different taxa")`` when the sets of terminal names differ.
        Otherwise ``(same, differences)`` where ``differences`` is a list of
        ``(taxon1, taxon2, dist1, dist2)`` tuples for every pair whose
        distances differ by more than ``tolerance`` and ``same`` is True
        when that list is empty.
    """
    from itertools import combinations

    # Get terminal names
    taxa1 = {t.name for t in tree1.get_terminals()}
    taxa2 = {t.name for t in tree2.get_terminals()}

    # Check if they have same taxa
    if taxa1 != taxa2:
        return False, "Different taxa"

    # Compare distances. Sorting makes the report order deterministic, and
    # combinations() visits each unordered pair once (first name < second).
    differences = []
    for taxon1, taxon2 in combinations(sorted(taxa1), 2):
        dist1 = tree1.distance(taxon1, taxon2)
        dist2 = tree2.distance(taxon1, taxon2)
        if abs(dist1 - dist2) > tolerance:
            differences.append((taxon1, taxon2, dist1, dist2))
    return not differences, differences
```
## Tree Manipulation
### Pruning Trees
```python
import copy

# Work on a deep copy so the original tree is left untouched
# (Bio.Phylo trees do not provide a .copy() method)
tree_copy = copy.deepcopy(tree)

# Prune (remove) specific taxa
tree_copy.prune("Species_A")

# Keep only specific taxa
taxa_to_keep = ["Species_B", "Species_C", "Species_D"]
# Collect the terminals before the loop so pruning does not disturb iteration
terminals = tree_copy.get_terminals()
for terminal in terminals:
    if terminal.name not in taxa_to_keep:
        tree_copy.prune(terminal)
```
### Collapsing Short Branches
```python
# Collapse branches shorter than threshold
def collapse_short_branches(tree, threshold=0.01):
    """Set every branch length below ``threshold`` to zero, in place.

    Only branch lengths are changed; the tree topology is left as it is.
    Clades whose branch length is missing (None) or already zero are
    skipped. To remove short internal nodes from the topology instead,
    see Bio.Phylo's ``collapse_all``.

    Args:
        tree: Tree whose clades are visited via ``find_clades()``.
        threshold: Branch lengths strictly below this value are zeroed.

    Returns:
        The same ``tree`` object, modified in place.
    """
    for clade in tree.find_clades():
        length = clade.branch_length
        # Truthiness check skips both None and 0 before comparing
        if length and length < threshold:
            clade.branch_length = 0
    return tree
```
### Ladderizing Trees
```python
# Ladderize tree (sort branches by size); called for its effect on `tree`,
# no return value is used
tree.ladderize() # default: ascending order
tree.ladderize(reverse=True) # reverse=True: descending order
```
### Rerooting Trees
```python
# Reroot at midpoint
tree.root_at_midpoint()
# Reroot with outgroup
outgroup = tree.find_any(name="Outgroup_Species")
tree.root_with_outgroup(outgroup)
# Reroot at internal node.
# get_nonterminals()[0] is the current root, and rerooting on the current
# root changes nothing, so use the next internal node instead.
internal = tree.get_nonterminals()[1]
tree.root_with_outgroup(internal)
```
## Tree Visualization
### Basic ASCII Drawing
```python
# Draw the tree as ASCII art on the console
Phylo.draw_ascii(tree)
# Same drawing with the column_width option set to 80
Phylo.draw_ascii(tree, column_width=80)
```
### Matplotlib Visualization
```python
import matplotlib.pyplot as plt
from Bio import Phylo

# Simple plot: one 10x8 figure holding a single axes for the tree
fig, axes = plt.subplots(figsize=(10, 8))
Phylo.draw(tree, axes=axes)
plt.show()

# Customize plot: draw without showing, then add a title and save to disk
fig, axes = plt.subplots(figsize=(10, 8))
Phylo.draw(tree, axes=axes, do_show=False)
axes.set_title("Phylogenetic Tree")
plt.tight_layout()
plt.savefig("tree.png", dpi=300)
```
### Advanced Visualization Options
```python
# Label each branch with its branch length
Phylo.draw(tree, branch_labels=lambda c: c.branch_length)
# Show branch support values: each node's label becomes its confidence
# value, or an empty string when the confidence is missing or zero
Phylo.draw(tree, label_func=lambda n: str(n.confidence) if n.confidence else "")
# Color branches
def color_by_length(clade):
    """Map a clade's branch length to a color name.

    Returns "red" for lengths above 0.5, "orange" for lengths above 0.2,
    and "black" otherwise, including when the branch length is missing.
    """
    length = clade.branch_length
    if length is None:
        return "black"
    if length > 0.5:
        return "red"
    if length > 0.2:
        return "orange"
    return "black"
# Note: to color the drawn branches, assign the result to each clade's `color` attribute (e.g. `clade.color = color_by_length(clade)`) before calling Phylo.draw
```
## Building Trees
### From Distance Matrix
```python
from Bio.Phylo.TreeConstruction import DistanceTreeConstructor, DistanceMatrix

# Create distance matrix.
# The matrix is lower-triangular and must include the diagonal, so row i
# holds i + 1 values and ends with the zero self-distance.
dm = DistanceMatrix(
    names=["Alpha", "Beta", "Gamma", "Delta"],
    matrix=[
        [0],
        [0.23, 0],
        [0.45, 0.34, 0],
        [0.67, 0.58, 0.29, 0],
    ],
)

# Build tree using UPGMA
constructor = DistanceTreeConstructor()
tree = constructor.upgma(dm)
Phylo.draw_ascii(tree)

# Build tree using Neighbor-Joining
tree = constructor.nj(dm)
```
### From Multiple Sequence Alignment
```python
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor
# Read a FASTA-formatted alignment
alignment = AlignIO.read("alignment.fasta", "fasta")
# Calculate the distance matrix using the "identity" model
calculator = DistanceCalculator("identity")
distance_matrix = calculator.get_distance(alignment)
# Build a UPGMA tree from the distance matrix
constructor = DistanceTreeConstructor()
tree = constructor.upgma(distance_matrix)
# Write the tree in Newick format
Phylo.write(tree, "output_tree.nwk", "newick")
```
### Distance Models
Available distance calculation models:
- **identity** - Simple identity
- **blastn** - BLASTN identity
- **trans** - Transition/transversion scoring matrix
- **blosum62** - BLOSUM62 matrix
- **pam250** - PAM250 matrix
```python
# Use a different model: the model is chosen by name when the calculator
# is created
calculator = DistanceCalculator("blosum62")
dm = calculator.get_distance(alignment)
```
## Consensus Trees
```python
from Bio.Phylo.Consensus import majority_consensus, strict_consensus
# Read multiple trees into a list so both consensus calls can reuse them
trees = list(Phylo.parse("bootstrap_trees.nwk", "newick"))
# Majority-rule consensus with a cutoff of 0.5
consensus = majority_consensus(trees, cutoff=0.5)
# Strict consensus
strict_cons = strict_consensus(trees)
# Write the majority-rule consensus tree in Newick format
Phylo.write(consensus, "consensus.nwk", "newick")
```
## PhyloXML Features
PhyloXML format supports rich annotations:
```python
from Bio.Phylo.PhyloXML import Phylogeny, Clade
# Create a rooted PhyloXML tree and give it a name and description
tree = Phylogeny(rooted=True)
tree.name = "Example Tree"
tree.description = "A sample phylogenetic tree"
# Create a clade with rich annotations: name, display color and line width
clade = Clade(branch_length=0.5)
clade.name = "Species_A"
clade.color = "red"
clade.width = 2.0
# Add taxonomy information to the clade's list of taxonomies
from Bio.Phylo.PhyloXML import Taxonomy
taxonomy = Taxonomy(scientific_name="Homo sapiens", common_name="Human")
clade.taxonomies.append(taxonomy)
```
## Bootstrap Support
```python
# Add bootstrap support values to tree
def add_bootstrap_support(tree, support_values):
    """Assign support values to the tree's internal nodes, in place.

    Values are paired with ``tree.get_nonterminals()`` in order. Pairing
    stops at the shorter of the two sequences, so surplus nodes keep their
    current confidence and surplus values are ignored.

    Args:
        tree: Tree whose internal nodes receive a ``confidence`` attribute.
        support_values: Iterable of support values, one per internal node.

    Returns:
        The same ``tree`` object, modified in place.
    """
    internal_nodes = tree.get_nonterminals()
    for node, support in zip(internal_nodes, support_values):
        node.confidence = support
    return tree
# Example: hand four support values to add_bootstrap_support, which assigns
# them to the tree's internal nodes in order
support_values = [95, 87, 76, 92]
tree_with_support = add_bootstrap_support(tree, support_values)
```
## Best Practices
1. **Choose appropriate file format** - Newick for simple trees, phyloXML for annotations
2. **Validate tree topology** - Check for polytomies and negative branch lengths
3. **Root trees appropriately** - Use midpoint or outgroup rooting
4. **Handle bootstrap values** - Store as clade confidence
5. **Consider tree size** - Large trees may need special handling
6. **Use tree copies** - Call `copy.deepcopy(tree)` before in-place modifications (Bio.Phylo trees have no `.copy()` method)
7. **Export publication-ready figures** - Use matplotlib for high-quality output
8. **Document tree construction** - Record alignment and parameters used
9. **Compare multiple trees** - Use consensus methods for bootstrap trees
10. **Validate taxon names** - Ensure consistent naming across files
## Common Use Cases
### Build Tree from Sequences
```python
import matplotlib.pyplot as plt
from Bio import AlignIO, Phylo
from Bio.Phylo.TreeConstruction import DistanceCalculator, DistanceTreeConstructor

# Read aligned sequences
alignment = AlignIO.read("sequences.aln", "clustal")

# Calculate distances
calculator = DistanceCalculator("identity")
dm = calculator.get_distance(alignment)

# Build neighbor-joining tree
constructor = DistanceTreeConstructor()
tree = constructor.nj(dm)

# Root at midpoint
tree.root_at_midpoint()

# Save tree
Phylo.write(tree, "tree.nwk", "newick")

# Visualize: pass the axes explicitly so the tree is drawn into the 10x8
# figure created here rather than into a separate default-sized figure
fig, axes = plt.subplots(figsize=(10, 8))
Phylo.draw(tree, axes=axes, do_show=False)
plt.show()
```
### Extract Subtree
```python
def extract_subtree(tree, taxa_list):
    """Return a pruned deep copy of ``tree`` containing only the given taxa.

    The input tree is not modified. Names in ``taxa_list`` that do not
    occur in the tree are ignored.

    Args:
        tree: Source tree; must support ``get_terminals()`` and ``prune()``.
        taxa_list: Names of the terminals to keep.

    Returns:
        A new tree from which every terminal not named in ``taxa_list``
        has been pruned.
    """
    import copy

    # Deep copy so pruning never touches the caller's tree
    # (Bio.Phylo trees do not provide a .copy() method)
    subtree = copy.deepcopy(tree)

    # Build the lookup set once instead of scanning the list per terminal
    keep = set(taxa_list)

    # Collect the terminals before pruning so the loop is not disturbed
    for terminal in subtree.get_terminals():
        if terminal.name not in keep:
            subtree.prune(terminal)
    return subtree
# Use it: keep three taxa and save the resulting subtree in Newick format
subtree = extract_subtree(tree, ["Species_A", "Species_B", "Species_C"])
Phylo.write(subtree, "subtree.nwk", "newick")
```
### Calculate Phylogenetic Diversity
```python
def phylogenetic_diversity(tree, taxa_subset=None):
    """Calculate phylogenetic diversity as the sum of branch lengths.

    Args:
        tree: Tree whose clades are visited via ``find_clades()``.
        taxa_subset: Optional list of taxon names. When given and
            non-empty, the tree is first reduced with ``extract_subtree``
            and the sum is taken over that reduced tree. None or an empty
            list means the whole tree.

    Returns:
        The sum of all non-missing branch lengths (0 when there are none).

    NOTE(review): for a subset, the result depends on how pruning treats
    the path from the original root down to the subset's common ancestor;
    confirm this matches the PD definition you need.
    """
    if taxa_subset:
        # Prune to subset
        tree = extract_subtree(tree, taxa_subset)

    # Sum all branch lengths, skipping clades whose length is None or zero
    return sum(
        clade.branch_length
        for clade in tree.find_clades()
        if clade.branch_length
    )
# Calculate PD for all taxa (no subset given, so the whole tree is summed)
pd_all = phylogenetic_diversity(tree)
print(f"Total phylogenetic diversity: {pd_all:.3f}")
# Calculate PD for a subset: only the two named taxa are kept before summing
pd_subset = phylogenetic_diversity(tree, ["Species_A", "Species_B"])
print(f"Subset phylogenetic diversity: {pd_subset:.3f}")
```
### Annotate Tree with External Data
```python
def annotate_tree_from_csv(tree, csv_file):
    """Annotate tree leaves with data from a CSV file, in place.

    The CSV must have a header row containing a ``species`` column; each
    row is matched to the terminal whose ``name`` equals that column.
    Every column of a matching row (including ``species``) is set on the
    terminal as an attribute named after the column, with the string value
    read from the file. A column called ``name`` would therefore overwrite
    the terminal's name.

    Args:
        tree: Tree whose terminals are visited via ``get_terminals()``.
        csv_file: Path of the CSV file to read.

    Returns:
        The same ``tree`` object, modified in place.

    Raises:
        KeyError: If the CSV has no ``species`` column.
    """
    import csv

    # Read annotation data. newline="" lets the csv module handle line
    # endings itself, as its documentation requires.
    with open(csv_file, newline="") as f:
        annotations = {row["species"]: row for row in csv.DictReader(f)}

    # Annotate tree
    for terminal in tree.get_terminals():
        row = annotations.get(terminal.name)
        if row is None:
            continue
        # Add custom attributes, one per CSV column
        for key, value in row.items():
            setattr(terminal, key, value)
    return tree
```
### Compare Tree Topologies
```python
def robinson_foulds_distance(tree1, tree2):
    """Calculate a Robinson-Foulds style distance between two trees.

    Each internal clade is represented by the set of terminal names below
    it. The distance is the number of such sets found in exactly one of
    the two trees. Because the sets come from clades, the result depends
    on where each tree is rooted.

    Args:
        tree1: First tree; must provide ``get_nonterminals()``.
        tree2: Second tree, same requirement.

    Returns:
        The size of the symmetric difference as an int (0 when both trees
        yield the same sets).
    """

    def get_bipartitions(tree):
        """Return the set of terminal-name sets, one per internal clade."""
        return {
            frozenset(leaf.name for leaf in clade.get_terminals())
            for clade in tree.get_nonterminals()
        }

    # Symmetric difference: groupings present in only one of the trees
    return len(get_bipartitions(tree1) ^ get_bipartitions(tree2))
# Use it: read two Newick trees and report the distance between them
tree1 = Phylo.read("tree1.nwk", "newick")
tree2 = Phylo.read("tree2.nwk", "newick")
rf_dist = robinson_foulds_distance(tree1, tree2)
print(f"Robinson-Foulds distance: {rf_dist}")
```