565 lines
13 KiB
Markdown
565 lines
13 KiB
Markdown
# Structural Bioinformatics with Bio.PDB
|
|
|
|
## Overview
|
|
|
|
Bio.PDB provides tools for working with macromolecular 3D structures from PDB and mmCIF files. The module uses the SMCRA (Structure/Model/Chain/Residue/Atom) architecture to represent protein structures hierarchically.
|
|
|
|
## SMCRA Architecture
|
|
|
|
The Bio.PDB module organizes structures hierarchically:
|
|
|
|
```
|
|
Structure
|
|
└── Model (multiple models for NMR structures)
|
|
└── Chain (e.g., chain A, B, C)
|
|
└── Residue (amino acids, nucleotides, heteroatoms)
|
|
└── Atom (individual atoms)
|
|
```
|
|
|
|
## Parsing Structure Files
|
|
|
|
### PDB Format
|
|
|
|
```python
|
|
from Bio.PDB import PDBParser
|
|
|
|
# Create parser
|
|
parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
|
|
|
|
# Parse structure
|
|
structure = parser.get_structure("1crn", "1crn.pdb")
|
|
|
|
# Access basic information
|
|
print(f"Structure ID: {structure.id}")
|
|
print(f"Number of models: {len(structure)}")
|
|
```
|
|
|
|
### mmCIF Format
|
|
|
|
mmCIF format is more modern and handles large structures better:
|
|
|
|
```python
|
|
from Bio.PDB import MMCIFParser
|
|
|
|
# Create parser
|
|
parser = MMCIFParser(QUIET=True)
|
|
|
|
# Parse structure
|
|
structure = parser.get_structure("1crn", "1crn.cif")
|
|
```
|
|
|
|
### Download from PDB
|
|
|
|
```python
|
|
from Bio.PDB import PDBList
|
|
|
|
# Create PDB list object
|
|
pdbl = PDBList()
|
|
|
|
# Download PDB file
|
|
pdbl.retrieve_pdb_file("1CRN", file_format="pdb", pdir="structures/")
|
|
|
|
# Download mmCIF file
|
|
pdbl.retrieve_pdb_file("1CRN", file_format="mmCif", pdir="structures/")
|
|
|
|
# Download obsolete structure
|
|
pdbl.retrieve_pdb_file("1CRN", obsolete=True, pdir="structures/")
|
|
```
|
|
|
|
## Navigating Structure Hierarchy
|
|
|
|
### Access Models
|
|
|
|
```python
|
|
# Get first model
|
|
model = structure[0]
|
|
|
|
# Iterate through all models
|
|
for model in structure:
|
|
print(f"Model {model.id}")
|
|
```
|
|
|
|
### Access Chains
|
|
|
|
```python
|
|
# Get specific chain
|
|
chain = model["A"]
|
|
|
|
# Iterate through all chains
|
|
for chain in model:
|
|
print(f"Chain {chain.id}")
|
|
```
|
|
|
|
### Access Residues
|
|
|
|
```python
|
|
# Iterate through residues in a chain
|
|
for residue in chain:
|
|
print(f"Residue: {residue.resname} {residue.id[1]}")
|
|
|
|
# Get specific residue by ID
|
|
# Residue ID is tuple: (hetfield, sequence_id, insertion_code)
|
|
residue = chain[(" ", 10, " ")] # Standard amino acid at position 10
|
|
```
|
|
|
|
### Access Atoms
|
|
|
|
```python
|
|
# Iterate through atoms in a residue
|
|
for atom in residue:
|
|
print(f"Atom: {atom.name}, Coordinates: {atom.coord}")
|
|
|
|
# Get specific atom
|
|
ca_atom = residue["CA"] # Alpha carbon
|
|
print(f"CA coordinates: {ca_atom.coord}")
|
|
```
|
|
|
|
### Alternative Access Patterns
|
|
|
|
```python
|
|
# Direct access through hierarchy
|
|
atom = structure[0]["A"][10]["CA"]
|
|
|
|
# Get all atoms
|
|
atoms = list(structure.get_atoms())
|
|
print(f"Total atoms: {len(atoms)}")
|
|
|
|
# Get all residues
|
|
residues = list(structure.get_residues())
|
|
|
|
# Get all chains
|
|
chains = list(structure.get_chains())
|
|
```
|
|
|
|
## Working with Atom Coordinates
|
|
|
|
### Accessing Coordinates
|
|
|
|
```python
|
|
# Get atom coordinates
|
|
coord = atom.coord
|
|
print(f"X: {coord[0]}, Y: {coord[1]}, Z: {coord[2]}")
|
|
|
|
# Get B-factor (temperature factor)
|
|
b_factor = atom.bfactor
|
|
|
|
# Get occupancy
|
|
occupancy = atom.occupancy
|
|
|
|
# Get element
|
|
element = atom.element
|
|
```
|
|
|
|
### Calculate Distances
|
|
|
|
```python
|
|
from Bio.PDB import Vector
|
|
|
|
# Calculate distance between two atoms
|
|
atom1 = residue1["CA"]
|
|
atom2 = residue2["CA"]
|
|
|
|
distance = atom1 - atom2 # Returns distance in Angstroms
|
|
print(f"Distance: {distance:.2f} Å")
|
|
```
|
|
|
|
### Calculate Angles
|
|
|
|
```python
|
|
from Bio.PDB.vectors import calc_angle
|
|
|
|
# Calculate angle between three atoms
|
|
angle = calc_angle(
|
|
atom1.get_vector(),
|
|
atom2.get_vector(),
|
|
atom3.get_vector()
|
|
)
|
|
print(f"Angle: {angle:.2f} radians")
|
|
```
|
|
|
|
### Calculate Dihedrals
|
|
|
|
```python
|
|
from Bio.PDB.vectors import calc_dihedral
|
|
|
|
# Calculate dihedral angle between four atoms
|
|
dihedral = calc_dihedral(
|
|
atom1.get_vector(),
|
|
atom2.get_vector(),
|
|
atom3.get_vector(),
|
|
atom4.get_vector()
|
|
)
|
|
print(f"Dihedral: {dihedral:.2f} radians")
|
|
```
|
|
|
|
## Structure Analysis
|
|
|
|
### Secondary Structure (DSSP)
|
|
|
|
DSSP assigns secondary structure to protein structures:
|
|
|
|
```python
|
|
from Bio.PDB import DSSP, PDBParser
|
|
|
|
# Parse structure
|
|
parser = PDBParser()
|
|
structure = parser.get_structure("1crn", "1crn.pdb")
|
|
|
|
# Run DSSP (requires DSSP executable installed)
|
|
model = structure[0]
|
|
dssp = DSSP(model, "1crn.pdb")
|
|
|
|
# Access results
|
|
for residue_key in dssp:
|
|
dssp_data = dssp[residue_key]
|
|
residue_id = residue_key[1]
|
|
ss = dssp_data[2] # Secondary structure code
|
|
phi = dssp_data[4] # Phi angle
|
|
psi = dssp_data[5] # Psi angle
|
|
print(f"Residue {residue_id}: {ss}, φ={phi:.1f}°, ψ={psi:.1f}°")
|
|
```
|
|
|
|
Secondary structure codes:
|
|
- `H` - Alpha helix
|
|
- `B` - Beta bridge
|
|
- `E` - Strand
|
|
- `G` - 3-10 helix
|
|
- `I` - Pi helix
|
|
- `T` - Turn
|
|
- `S` - Bend
|
|
- `-` - Coil/loop
|
|
|
|
### Solvent Accessibility (DSSP)
|
|
|
|
```python
|
|
# Get relative solvent accessibility
|
|
for residue_key in dssp:
|
|
acc = dssp[residue_key][3] # Relative accessibility
|
|
print(f"Residue {residue_key[1]}: {acc:.2f} relative accessibility")
|
|
```
|
|
|
|
### Neighbor Search
|
|
|
|
Find nearby atoms efficiently:
|
|
|
|
```python
|
|
from Bio.PDB import NeighborSearch
|
|
|
|
# Get all atoms
|
|
atoms = list(structure.get_atoms())
|
|
|
|
# Create neighbor search object
|
|
ns = NeighborSearch(atoms)
|
|
|
|
# Find atoms within radius
|
|
center_atom = structure[0]["A"][10]["CA"]
|
|
nearby_atoms = ns.search(center_atom.coord, 5.0) # 5 Å radius
|
|
print(f"Found {len(nearby_atoms)} atoms within 5 Å")
|
|
|
|
# Find residues within radius
|
|
nearby_residues = ns.search(center_atom.coord, 5.0, level="R")
|
|
|
|
# Find chains within radius
|
|
nearby_chains = ns.search(center_atom.coord, 10.0, level="C")
|
|
```
|
|
|
|
### Contact Map
|
|
|
|
```python
|
|
def calculate_contact_map(chain, distance_threshold=8.0):
|
|
"""Calculate residue-residue contact map."""
|
|
residues = list(chain.get_residues())
|
|
n = len(residues)
|
|
contact_map = [[0] * n for _ in range(n)]
|
|
|
|
for i, res1 in enumerate(residues):
|
|
for j, res2 in enumerate(residues):
|
|
if i < j:
|
|
# Get CA atoms
|
|
if res1.has_id("CA") and res2.has_id("CA"):
|
|
dist = res1["CA"] - res2["CA"]
|
|
if dist < distance_threshold:
|
|
contact_map[i][j] = 1
|
|
contact_map[j][i] = 1
|
|
|
|
return contact_map
|
|
```
|
|
|
|
## Structure Quality Assessment
|
|
|
|
### Ramachandran Plot Data
|
|
|
|
```python
|
|
from Bio.PDB import Polypeptide
|
|
|
|
def get_phi_psi(structure):
|
|
"""Extract phi and psi angles for Ramachandran plot."""
|
|
phi_psi = []
|
|
|
|
for model in structure:
|
|
for chain in model:
|
|
polypeptides = Polypeptide.PPBuilder().build_peptides(chain)
|
|
for poly in polypeptides:
|
|
angles = poly.get_phi_psi_list()
|
|
for residue, (phi, psi) in zip(poly, angles):
|
|
if phi and psi: # Skip None values
|
|
phi_psi.append((residue.resname, phi, psi))
|
|
|
|
return phi_psi
|
|
```
|
|
|
|
### Check for Missing Atoms
|
|
|
|
```python
|
|
def check_missing_atoms(structure):
|
|
"""Identify residues with missing atoms."""
|
|
missing = []
|
|
|
|
for residue in structure.get_residues():
|
|
if residue.id[0] == " ": # Standard amino acid
|
|
resname = residue.resname
|
|
|
|
# Expected backbone atoms
|
|
expected = ["N", "CA", "C", "O"]
|
|
|
|
for atom_name in expected:
|
|
if not residue.has_id(atom_name):
|
|
missing.append((residue.full_id, atom_name))
|
|
|
|
return missing
|
|
```
|
|
|
|
## Structure Manipulation
|
|
|
|
### Select Specific Atoms
|
|
|
|
```python
|
|
from Bio.PDB import Select
|
|
|
|
class CASelect(Select):
|
|
"""Select only CA atoms."""
|
|
def accept_atom(self, atom):
|
|
return atom.name == "CA"
|
|
|
|
class ChainASelect(Select):
|
|
"""Select only chain A."""
|
|
def accept_chain(self, chain):
|
|
return chain.id == "A"
|
|
|
|
# Use with PDBIO
|
|
from Bio.PDB import PDBIO
|
|
|
|
io = PDBIO()
|
|
io.set_structure(structure)
|
|
io.save("ca_only.pdb", CASelect())
|
|
io.save("chain_a.pdb", ChainASelect())
|
|
```
|
|
|
|
### Transform Structures
|
|
|
|
```python
|
|
import numpy as np
|
|
|
|
# Rotate structure
|
|
from Bio.PDB.vectors import rotaxis
|
|
|
|
# Define rotation axis and angle
|
|
axis = Vector(1, 0, 0) # X-axis
|
|
angle = np.pi / 4 # 45 degrees
|
|
|
|
# Create rotation matrix
|
|
rotation = rotaxis(angle, axis)
|
|
|
|
# Apply rotation to all atoms
|
|
for atom in structure.get_atoms():
|
|
atom.transform(rotation, Vector(0, 0, 0))
|
|
```
|
|
|
|
### Superimpose Structures
|
|
|
|
```python
|
|
from Bio.PDB import Superimposer, PDBParser
|
|
|
|
# Parse two structures
|
|
parser = PDBParser()
|
|
structure1 = parser.get_structure("ref", "reference.pdb")
|
|
structure2 = parser.get_structure("mov", "mobile.pdb")
|
|
|
|
# Get CA atoms from both structures
|
|
ref_atoms = [atom for atom in structure1.get_atoms() if atom.name == "CA"]
|
|
mov_atoms = [atom for atom in structure2.get_atoms() if atom.name == "CA"]
|
|
|
|
# Superimpose
|
|
super_imposer = Superimposer()
|
|
super_imposer.set_atoms(ref_atoms, mov_atoms)
|
|
|
|
# Apply transformation
|
|
super_imposer.apply(structure2.get_atoms())
|
|
|
|
# Get RMSD
|
|
rmsd = super_imposer.rms
|
|
print(f"RMSD: {rmsd:.2f} Å")
|
|
|
|
# Save superimposed structure
|
|
from Bio.PDB import PDBIO
|
|
io = PDBIO()
|
|
io.set_structure(structure2)
|
|
io.save("superimposed.pdb")
|
|
```
|
|
|
|
## Writing Structure Files
|
|
|
|
### Save PDB Files
|
|
|
|
```python
|
|
from Bio.PDB import PDBIO
|
|
|
|
io = PDBIO()
|
|
io.set_structure(structure)
|
|
io.save("output.pdb")
|
|
```
|
|
|
|
### Save mmCIF Files
|
|
|
|
```python
|
|
from Bio.PDB import MMCIFIO
|
|
|
|
io = MMCIFIO()
|
|
io.set_structure(structure)
|
|
io.save("output.cif")
|
|
```
|
|
|
|
## Sequence from Structure
|
|
|
|
### Extract Sequence
|
|
|
|
```python
|
|
from Bio.PDB import Polypeptide
|
|
|
|
# Get polypeptides from structure
|
|
ppb = Polypeptide.PPBuilder()
|
|
|
|
for model in structure:
|
|
for chain in model:
|
|
for pp in ppb.build_peptides(chain):
|
|
sequence = pp.get_sequence()
|
|
print(f"Chain {chain.id}: {sequence}")
|
|
```
|
|
|
|
### Map to FASTA
|
|
|
|
```python
|
|
from Bio import SeqIO
|
|
from Bio.SeqRecord import SeqRecord
|
|
|
|
# Extract sequences and create FASTA
|
|
records = []
|
|
ppb = Polypeptide.PPBuilder()
|
|
|
|
for model in structure:
|
|
for chain in model:
|
|
for pp in ppb.build_peptides(chain):
|
|
seq_record = SeqRecord(
|
|
pp.get_sequence(),
|
|
id=f"{structure.id}_{chain.id}",
|
|
description=f"Chain {chain.id}"
|
|
)
|
|
records.append(seq_record)
|
|
|
|
SeqIO.write(records, "structure_sequences.fasta", "fasta")
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
1. **Use mmCIF** for large structures and modern data
|
|
2. **Set QUIET=True** to suppress parser warnings
|
|
3. **Check for missing atoms** before analysis
|
|
4. **Use NeighborSearch** for efficient spatial queries
|
|
5. **Validate structure quality** with DSSP or Ramachandran analysis
|
|
6. **Handle multiple models** appropriately (NMR structures)
|
|
7. **Be aware of heteroatoms** - they have different residue IDs
|
|
8. **Use Select classes** for targeted structure output
|
|
9. **Cache downloaded structures** locally
|
|
10. **Consider alternative conformations** - some residues have multiple positions
|
|
|
|
## Common Use Cases
|
|
|
|
### Calculate RMSD Between Structures
|
|
|
|
```python
|
|
from Bio.PDB import PDBParser, Superimposer
|
|
|
|
parser = PDBParser()
|
|
structure1 = parser.get_structure("s1", "structure1.pdb")
|
|
structure2 = parser.get_structure("s2", "structure2.pdb")
|
|
|
|
# Get CA atoms
|
|
atoms1 = [atom for atom in structure1[0]["A"].get_atoms() if atom.name == "CA"]
|
|
atoms2 = [atom for atom in structure2[0]["A"].get_atoms() if atom.name == "CA"]
|
|
|
|
# Ensure same number of atoms
|
|
min_len = min(len(atoms1), len(atoms2))
|
|
atoms1 = atoms1[:min_len]
|
|
atoms2 = atoms2[:min_len]
|
|
|
|
# Calculate RMSD
|
|
sup = Superimposer()
|
|
sup.set_atoms(atoms1, atoms2)
|
|
print(f"RMSD: {sup.rms:.3f} Å")
|
|
```
|
|
|
|
### Find Binding Site Residues
|
|
|
|
```python
|
|
def find_binding_site(structure, ligand_chain, ligand_res_id, distance=5.0):
|
|
"""Find residues near a ligand."""
|
|
from Bio.PDB import NeighborSearch
|
|
|
|
# Get ligand atoms
|
|
ligand = structure[0][ligand_chain][ligand_res_id]
|
|
ligand_atoms = list(ligand.get_atoms())
|
|
|
|
# Get all protein atoms
|
|
protein_atoms = []
|
|
for chain in structure[0]:
|
|
if chain.id != ligand_chain:
|
|
for residue in chain:
|
|
if residue.id[0] == " ": # Standard residue
|
|
protein_atoms.extend(residue.get_atoms())
|
|
|
|
# Find nearby atoms
|
|
ns = NeighborSearch(protein_atoms)
|
|
binding_site = set()
|
|
|
|
for ligand_atom in ligand_atoms:
|
|
nearby = ns.search(ligand_atom.coord, distance, level="R")
|
|
binding_site.update(nearby)
|
|
|
|
return list(binding_site)
|
|
```
|
|
|
|
### Calculate Center of Mass
|
|
|
|
```python
|
|
import numpy as np
|
|
|
|
def center_of_mass(entity):
|
|
"""Calculate center of mass for structure entity."""
|
|
masses = []
|
|
coords = []
|
|
|
|
# Atomic masses (simplified)
|
|
mass_dict = {"C": 12.0, "N": 14.0, "O": 16.0, "S": 32.0}
|
|
|
|
for atom in entity.get_atoms():
|
|
mass = mass_dict.get(atom.element, 12.0)
|
|
masses.append(mass)
|
|
coords.append(atom.coord)
|
|
|
|
masses = np.array(masses)
|
|
coords = np.array(coords)
|
|
|
|
com = np.sum(coords * masses[:, np.newaxis], axis=0) / np.sum(masses)
|
|
return com
|
|
```
|