Initial commit
This commit is contained in:
564
skills/biopython/references/structure.md
Normal file
564
skills/biopython/references/structure.md
Normal file
@@ -0,0 +1,564 @@
|
||||
# Structural Bioinformatics with Bio.PDB
|
||||
|
||||
## Overview
|
||||
|
||||
Bio.PDB provides tools for working with macromolecular 3D structures from PDB and mmCIF files. The module uses the SMCRA (Structure/Model/Chain/Residue/Atom) architecture to represent protein structures hierarchically.
|
||||
|
||||
## SMCRA Architecture
|
||||
|
||||
The Bio.PDB module organizes structures hierarchically:
|
||||
|
||||
```
|
||||
Structure
|
||||
└── Model (multiple models for NMR structures)
|
||||
└── Chain (e.g., chain A, B, C)
|
||||
└── Residue (amino acids, nucleotides, heteroatoms)
|
||||
└── Atom (individual atoms)
|
||||
```
|
||||
|
||||
## Parsing Structure Files
|
||||
|
||||
### PDB Format
|
||||
|
||||
```python
|
||||
from Bio.PDB import PDBParser
|
||||
|
||||
# Create parser
|
||||
parser = PDBParser(QUIET=True) # QUIET=True suppresses warnings
|
||||
|
||||
# Parse structure
|
||||
structure = parser.get_structure("1crn", "1crn.pdb")
|
||||
|
||||
# Access basic information
|
||||
print(f"Structure ID: {structure.id}")
|
||||
print(f"Number of models: {len(structure)}")
|
||||
```
|
||||
|
||||
### mmCIF Format
|
||||
|
||||
mmCIF format is more modern and handles large structures better:
|
||||
|
||||
```python
|
||||
from Bio.PDB import MMCIFParser
|
||||
|
||||
# Create parser
|
||||
parser = MMCIFParser(QUIET=True)
|
||||
|
||||
# Parse structure
|
||||
structure = parser.get_structure("1crn", "1crn.cif")
|
||||
```
|
||||
|
||||
### Download from PDB
|
||||
|
||||
```python
|
||||
from Bio.PDB import PDBList
|
||||
|
||||
# Create PDB list object
|
||||
pdbl = PDBList()
|
||||
|
||||
# Download PDB file
|
||||
pdbl.retrieve_pdb_file("1CRN", file_format="pdb", pdir="structures/")
|
||||
|
||||
# Download mmCIF file
|
||||
pdbl.retrieve_pdb_file("1CRN", file_format="mmCif", pdir="structures/")
|
||||
|
||||
# Download obsolete structure
|
||||
pdbl.retrieve_pdb_file("1CRN", obsolete=True, pdir="structures/")
|
||||
```
|
||||
|
||||
## Navigating Structure Hierarchy
|
||||
|
||||
### Access Models
|
||||
|
||||
```python
|
||||
# Get first model
|
||||
model = structure[0]
|
||||
|
||||
# Iterate through all models
|
||||
for model in structure:
|
||||
print(f"Model {model.id}")
|
||||
```
|
||||
|
||||
### Access Chains
|
||||
|
||||
```python
|
||||
# Get specific chain
|
||||
chain = model["A"]
|
||||
|
||||
# Iterate through all chains
|
||||
for chain in model:
|
||||
print(f"Chain {chain.id}")
|
||||
```
|
||||
|
||||
### Access Residues
|
||||
|
||||
```python
|
||||
# Iterate through residues in a chain
|
||||
for residue in chain:
|
||||
print(f"Residue: {residue.resname} {residue.id[1]}")
|
||||
|
||||
# Get specific residue by ID
|
||||
# Residue ID is tuple: (hetfield, sequence_id, insertion_code)
|
||||
residue = chain[(" ", 10, " ")] # Standard amino acid at position 10
|
||||
```
|
||||
|
||||
### Access Atoms
|
||||
|
||||
```python
|
||||
# Iterate through atoms in a residue
|
||||
for atom in residue:
|
||||
print(f"Atom: {atom.name}, Coordinates: {atom.coord}")
|
||||
|
||||
# Get specific atom
|
||||
ca_atom = residue["CA"] # Alpha carbon
|
||||
print(f"CA coordinates: {ca_atom.coord}")
|
||||
```
|
||||
|
||||
### Alternative Access Patterns
|
||||
|
||||
```python
|
||||
# Direct access through hierarchy
|
||||
atom = structure[0]["A"][10]["CA"]
|
||||
|
||||
# Get all atoms
|
||||
atoms = list(structure.get_atoms())
|
||||
print(f"Total atoms: {len(atoms)}")
|
||||
|
||||
# Get all residues
|
||||
residues = list(structure.get_residues())
|
||||
|
||||
# Get all chains
|
||||
chains = list(structure.get_chains())
|
||||
```
|
||||
|
||||
## Working with Atom Coordinates
|
||||
|
||||
### Accessing Coordinates
|
||||
|
||||
```python
|
||||
# Get atom coordinates
|
||||
coord = atom.coord
|
||||
print(f"X: {coord[0]}, Y: {coord[1]}, Z: {coord[2]}")
|
||||
|
||||
# Get B-factor (temperature factor)
|
||||
b_factor = atom.bfactor
|
||||
|
||||
# Get occupancy
|
||||
occupancy = atom.occupancy
|
||||
|
||||
# Get element
|
||||
element = atom.element
|
||||
```
|
||||
|
||||
### Calculate Distances
|
||||
|
||||
```python
|
||||
from Bio.PDB import Vector
|
||||
|
||||
# Calculate distance between two atoms
|
||||
atom1 = residue1["CA"]
|
||||
atom2 = residue2["CA"]
|
||||
|
||||
distance = atom1 - atom2 # Returns distance in Angstroms
|
||||
print(f"Distance: {distance:.2f} Å")
|
||||
```
|
||||
|
||||
### Calculate Angles
|
||||
|
||||
```python
|
||||
from Bio.PDB.vectors import calc_angle
|
||||
|
||||
# Calculate angle between three atoms
|
||||
angle = calc_angle(
|
||||
atom1.get_vector(),
|
||||
atom2.get_vector(),
|
||||
atom3.get_vector()
|
||||
)
|
||||
print(f"Angle: {angle:.2f} radians")
|
||||
```
|
||||
|
||||
### Calculate Dihedrals
|
||||
|
||||
```python
|
||||
from Bio.PDB.vectors import calc_dihedral
|
||||
|
||||
# Calculate dihedral angle between four atoms
|
||||
dihedral = calc_dihedral(
|
||||
atom1.get_vector(),
|
||||
atom2.get_vector(),
|
||||
atom3.get_vector(),
|
||||
atom4.get_vector()
|
||||
)
|
||||
print(f"Dihedral: {dihedral:.2f} radians")
|
||||
```
|
||||
|
||||
## Structure Analysis
|
||||
|
||||
### Secondary Structure (DSSP)
|
||||
|
||||
DSSP assigns secondary structure to protein structures:
|
||||
|
||||
```python
|
||||
from Bio.PDB import DSSP, PDBParser
|
||||
|
||||
# Parse structure
|
||||
parser = PDBParser()
|
||||
structure = parser.get_structure("1crn", "1crn.pdb")
|
||||
|
||||
# Run DSSP (requires DSSP executable installed)
|
||||
model = structure[0]
|
||||
dssp = DSSP(model, "1crn.pdb")
|
||||
|
||||
# Access results
|
||||
for residue_key in dssp:
|
||||
dssp_data = dssp[residue_key]
|
||||
residue_id = residue_key[1]
|
||||
ss = dssp_data[2] # Secondary structure code
|
||||
phi = dssp_data[4] # Phi angle
|
||||
psi = dssp_data[5] # Psi angle
|
||||
print(f"Residue {residue_id}: {ss}, φ={phi:.1f}°, ψ={psi:.1f}°")
|
||||
```
|
||||
|
||||
Secondary structure codes:
|
||||
- `H` - Alpha helix
|
||||
- `B` - Beta bridge
|
||||
- `E` - Strand
|
||||
- `G` - 3-10 helix
|
||||
- `I` - Pi helix
|
||||
- `T` - Turn
|
||||
- `S` - Bend
|
||||
- `-` - Coil/loop
|
||||
|
||||
### Solvent Accessibility (DSSP)
|
||||
|
||||
```python
|
||||
# Get relative solvent accessibility
|
||||
for residue_key in dssp:
|
||||
acc = dssp[residue_key][3] # Relative accessibility
|
||||
print(f"Residue {residue_key[1]}: {acc:.2f} relative accessibility")
|
||||
```
|
||||
|
||||
### Neighbor Search
|
||||
|
||||
Find nearby atoms efficiently:
|
||||
|
||||
```python
|
||||
from Bio.PDB import NeighborSearch
|
||||
|
||||
# Get all atoms
|
||||
atoms = list(structure.get_atoms())
|
||||
|
||||
# Create neighbor search object
|
||||
ns = NeighborSearch(atoms)
|
||||
|
||||
# Find atoms within radius
|
||||
center_atom = structure[0]["A"][10]["CA"]
|
||||
nearby_atoms = ns.search(center_atom.coord, 5.0) # 5 Å radius
|
||||
print(f"Found {len(nearby_atoms)} atoms within 5 Å")
|
||||
|
||||
# Find residues within radius
|
||||
nearby_residues = ns.search(center_atom.coord, 5.0, level="R")
|
||||
|
||||
# Find chains within radius
|
||||
nearby_chains = ns.search(center_atom.coord, 10.0, level="C")
|
||||
```
|
||||
|
||||
### Contact Map
|
||||
|
||||
```python
|
||||
def calculate_contact_map(chain, distance_threshold=8.0):
|
||||
"""Calculate residue-residue contact map."""
|
||||
residues = list(chain.get_residues())
|
||||
n = len(residues)
|
||||
contact_map = [[0] * n for _ in range(n)]
|
||||
|
||||
for i, res1 in enumerate(residues):
|
||||
for j, res2 in enumerate(residues):
|
||||
if i < j:
|
||||
# Get CA atoms
|
||||
if res1.has_id("CA") and res2.has_id("CA"):
|
||||
dist = res1["CA"] - res2["CA"]
|
||||
if dist < distance_threshold:
|
||||
contact_map[i][j] = 1
|
||||
contact_map[j][i] = 1
|
||||
|
||||
return contact_map
|
||||
```
|
||||
|
||||
## Structure Quality Assessment
|
||||
|
||||
### Ramachandran Plot Data
|
||||
|
||||
```python
|
||||
from Bio.PDB import Polypeptide
|
||||
|
||||
def get_phi_psi(structure):
|
||||
"""Extract phi and psi angles for Ramachandran plot."""
|
||||
phi_psi = []
|
||||
|
||||
for model in structure:
|
||||
for chain in model:
|
||||
polypeptides = Polypeptide.PPBuilder().build_peptides(chain)
|
||||
for poly in polypeptides:
|
||||
angles = poly.get_phi_psi_list()
|
||||
for residue, (phi, psi) in zip(poly, angles):
|
||||
if phi and psi: # Skip None values
|
||||
phi_psi.append((residue.resname, phi, psi))
|
||||
|
||||
return phi_psi
|
||||
```
|
||||
|
||||
### Check for Missing Atoms
|
||||
|
||||
```python
|
||||
def check_missing_atoms(structure):
|
||||
"""Identify residues with missing atoms."""
|
||||
missing = []
|
||||
|
||||
for residue in structure.get_residues():
|
||||
if residue.id[0] == " ": # Standard amino acid
|
||||
resname = residue.resname
|
||||
|
||||
# Expected backbone atoms
|
||||
expected = ["N", "CA", "C", "O"]
|
||||
|
||||
for atom_name in expected:
|
||||
if not residue.has_id(atom_name):
|
||||
missing.append((residue.full_id, atom_name))
|
||||
|
||||
return missing
|
||||
```
|
||||
|
||||
## Structure Manipulation
|
||||
|
||||
### Select Specific Atoms
|
||||
|
||||
```python
|
||||
from Bio.PDB import Select
|
||||
|
||||
class CASelect(Select):
|
||||
"""Select only CA atoms."""
|
||||
def accept_atom(self, atom):
|
||||
return atom.name == "CA"
|
||||
|
||||
class ChainASelect(Select):
|
||||
"""Select only chain A."""
|
||||
def accept_chain(self, chain):
|
||||
return chain.id == "A"
|
||||
|
||||
# Use with PDBIO
|
||||
from Bio.PDB import PDBIO
|
||||
|
||||
io = PDBIO()
|
||||
io.set_structure(structure)
|
||||
io.save("ca_only.pdb", CASelect())
|
||||
io.save("chain_a.pdb", ChainASelect())
|
||||
```
|
||||
|
||||
### Transform Structures
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
# Rotate structure
|
||||
from Bio.PDB.vectors import rotaxis
|
||||
|
||||
# Define rotation axis and angle
|
||||
axis = Vector(1, 0, 0) # X-axis
|
||||
angle = np.pi / 4 # 45 degrees
|
||||
|
||||
# Create rotation matrix
|
||||
rotation = rotaxis(angle, axis)
|
||||
|
||||
# Apply rotation to all atoms
|
||||
for atom in structure.get_atoms():
|
||||
atom.transform(rotation, Vector(0, 0, 0))
|
||||
```
|
||||
|
||||
### Superimpose Structures
|
||||
|
||||
```python
|
||||
from Bio.PDB import Superimposer, PDBParser
|
||||
|
||||
# Parse two structures
|
||||
parser = PDBParser()
|
||||
structure1 = parser.get_structure("ref", "reference.pdb")
|
||||
structure2 = parser.get_structure("mov", "mobile.pdb")
|
||||
|
||||
# Get CA atoms from both structures
|
||||
ref_atoms = [atom for atom in structure1.get_atoms() if atom.name == "CA"]
|
||||
mov_atoms = [atom for atom in structure2.get_atoms() if atom.name == "CA"]
|
||||
|
||||
# Superimpose
|
||||
super_imposer = Superimposer()
|
||||
super_imposer.set_atoms(ref_atoms, mov_atoms)
|
||||
|
||||
# Apply transformation
|
||||
super_imposer.apply(structure2.get_atoms())
|
||||
|
||||
# Get RMSD
|
||||
rmsd = super_imposer.rms
|
||||
print(f"RMSD: {rmsd:.2f} Å")
|
||||
|
||||
# Save superimposed structure
|
||||
from Bio.PDB import PDBIO
|
||||
io = PDBIO()
|
||||
io.set_structure(structure2)
|
||||
io.save("superimposed.pdb")
|
||||
```
|
||||
|
||||
## Writing Structure Files
|
||||
|
||||
### Save PDB Files
|
||||
|
||||
```python
|
||||
from Bio.PDB import PDBIO
|
||||
|
||||
io = PDBIO()
|
||||
io.set_structure(structure)
|
||||
io.save("output.pdb")
|
||||
```
|
||||
|
||||
### Save mmCIF Files
|
||||
|
||||
```python
|
||||
from Bio.PDB import MMCIFIO
|
||||
|
||||
io = MMCIFIO()
|
||||
io.set_structure(structure)
|
||||
io.save("output.cif")
|
||||
```
|
||||
|
||||
## Sequence from Structure
|
||||
|
||||
### Extract Sequence
|
||||
|
||||
```python
|
||||
from Bio.PDB import Polypeptide
|
||||
|
||||
# Get polypeptides from structure
|
||||
ppb = Polypeptide.PPBuilder()
|
||||
|
||||
for model in structure:
|
||||
for chain in model:
|
||||
for pp in ppb.build_peptides(chain):
|
||||
sequence = pp.get_sequence()
|
||||
print(f"Chain {chain.id}: {sequence}")
|
||||
```
|
||||
|
||||
### Map to FASTA
|
||||
|
||||
```python
|
||||
from Bio import SeqIO
|
||||
from Bio.SeqRecord import SeqRecord
|
||||
|
||||
# Extract sequences and create FASTA
|
||||
records = []
|
||||
ppb = Polypeptide.PPBuilder()
|
||||
|
||||
for model in structure:
|
||||
for chain in model:
|
||||
for pp in ppb.build_peptides(chain):
|
||||
seq_record = SeqRecord(
|
||||
pp.get_sequence(),
|
||||
id=f"{structure.id}_{chain.id}",
|
||||
description=f"Chain {chain.id}"
|
||||
)
|
||||
records.append(seq_record)
|
||||
|
||||
SeqIO.write(records, "structure_sequences.fasta", "fasta")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Use mmCIF** for large structures and modern data
|
||||
2. **Set QUIET=True** to suppress parser warnings
|
||||
3. **Check for missing atoms** before analysis
|
||||
4. **Use NeighborSearch** for efficient spatial queries
|
||||
5. **Validate structure quality** with DSSP or Ramachandran analysis
|
||||
6. **Handle multiple models** appropriately (NMR structures)
|
||||
7. **Be aware of heteroatoms** - they have different residue IDs
|
||||
8. **Use Select classes** for targeted structure output
|
||||
9. **Cache downloaded structures** locally
|
||||
10. **Consider alternative conformations** - some residues have multiple positions
|
||||
|
||||
## Common Use Cases
|
||||
|
||||
### Calculate RMSD Between Structures
|
||||
|
||||
```python
|
||||
from Bio.PDB import PDBParser, Superimposer
|
||||
|
||||
parser = PDBParser()
|
||||
structure1 = parser.get_structure("s1", "structure1.pdb")
|
||||
structure2 = parser.get_structure("s2", "structure2.pdb")
|
||||
|
||||
# Get CA atoms
|
||||
atoms1 = [atom for atom in structure1[0]["A"].get_atoms() if atom.name == "CA"]
|
||||
atoms2 = [atom for atom in structure2[0]["A"].get_atoms() if atom.name == "CA"]
|
||||
|
||||
# Ensure same number of atoms
|
||||
min_len = min(len(atoms1), len(atoms2))
|
||||
atoms1 = atoms1[:min_len]
|
||||
atoms2 = atoms2[:min_len]
|
||||
|
||||
# Calculate RMSD
|
||||
sup = Superimposer()
|
||||
sup.set_atoms(atoms1, atoms2)
|
||||
print(f"RMSD: {sup.rms:.3f} Å")
|
||||
```
|
||||
|
||||
### Find Binding Site Residues
|
||||
|
||||
```python
|
||||
def find_binding_site(structure, ligand_chain, ligand_res_id, distance=5.0):
|
||||
"""Find residues near a ligand."""
|
||||
from Bio.PDB import NeighborSearch
|
||||
|
||||
# Get ligand atoms
|
||||
ligand = structure[0][ligand_chain][ligand_res_id]
|
||||
ligand_atoms = list(ligand.get_atoms())
|
||||
|
||||
# Get all protein atoms
|
||||
protein_atoms = []
|
||||
for chain in structure[0]:
|
||||
if chain.id != ligand_chain:
|
||||
for residue in chain:
|
||||
if residue.id[0] == " ": # Standard residue
|
||||
protein_atoms.extend(residue.get_atoms())
|
||||
|
||||
# Find nearby atoms
|
||||
ns = NeighborSearch(protein_atoms)
|
||||
binding_site = set()
|
||||
|
||||
for ligand_atom in ligand_atoms:
|
||||
nearby = ns.search(ligand_atom.coord, distance, level="R")
|
||||
binding_site.update(nearby)
|
||||
|
||||
return list(binding_site)
|
||||
```
|
||||
|
||||
### Calculate Center of Mass
|
||||
|
||||
```python
|
||||
import numpy as np
|
||||
|
||||
def center_of_mass(entity):
|
||||
"""Calculate center of mass for structure entity."""
|
||||
masses = []
|
||||
coords = []
|
||||
|
||||
# Atomic masses (simplified)
|
||||
mass_dict = {"C": 12.0, "N": 14.0, "O": 16.0, "S": 32.0}
|
||||
|
||||
for atom in entity.get_atoms():
|
||||
mass = mass_dict.get(atom.element, 12.0)
|
||||
masses.append(mass)
|
||||
coords.append(atom.coord)
|
||||
|
||||
masses = np.array(masses)
|
||||
coords = np.array(coords)
|
||||
|
||||
com = np.sum(coords * masses[:, np.newaxis], axis=0) / np.sum(masses)
|
||||
return com
|
||||
```
|
||||
Reference in New Issue
Block a user