Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/biopython/references/structure.md
+++ b/skills/biopython/references/structure.md
@@ -0,0 +1,564 @@
+# Structural Bioinformatics with Bio.PDB
+
+## Overview
+
+Bio.PDB provides tools for working with macromolecular 3D structures from PDB and mmCIF files. The module uses the SMCRA (Structure/Model/Chain/Residue/Atom) architecture to represent protein structures hierarchically.
+
+## SMCRA Architecture
+
+The Bio.PDB module organizes structures hierarchically:
+
+```
+Structure
+  └── Model       (multiple models for NMR structures)
+      └── Chain   (e.g., chain A, B, C)
+          └── Residue  (amino acids, nucleotides, heteroatoms)
+              └── Atom (individual atoms)
+```
+
+## Parsing Structure Files
+
+### PDB Format
+
+```python
+from Bio.PDB import PDBParser
+
+# Create parser
+parser = PDBParser(QUIET=True)  # QUIET=True suppresses warnings
+
+# Parse structure
+structure = parser.get_structure("1crn", "1crn.pdb")
+
+# Access basic information
+print(f"Structure ID: {structure.id}")
+print(f"Number of models: {len(structure)}")
+```
+
+### mmCIF Format
+
+mmCIF format is more modern and handles large structures better:
+
+```python
+from Bio.PDB import MMCIFParser
+
+# Create parser
+parser = MMCIFParser(QUIET=True)
+
+# Parse structure
+structure = parser.get_structure("1crn", "1crn.cif")
+```
+
+### Download from PDB
+
+```python
+from Bio.PDB import PDBList
+
+# Create PDB list object
+pdbl = PDBList()
+
+# Download PDB file
+pdbl.retrieve_pdb_file("1CRN", file_format="pdb", pdir="structures/")
+
+# Download mmCIF file
+pdbl.retrieve_pdb_file("1CRN", file_format="mmCif", pdir="structures/")
+
+# Download obsolete structure
+pdbl.retrieve_pdb_file("1CRN", obsolete=True, pdir="structures/")
+```
+
+## Navigating Structure Hierarchy
+
+### Access Models
+
+```python
+# Get first model
+model = structure[0]
+
+# Iterate through all models
+for model in structure:
+    print(f"Model {model.id}")
+```
+
+### Access Chains
+
+```python
+# Get specific chain
+chain = model["A"]
+
+# Iterate through all chains
+for chain in model:
+    print(f"Chain {chain.id}")
+```
+
+### Access Residues
+
+```python
+# Iterate through residues in a chain
+for residue in chain:
+    print(f"Residue: {residue.resname} {residue.id[1]}")
+
+# Get specific residue by ID
+# Residue ID is tuple: (hetfield, sequence_id, insertion_code)
+residue = chain[(" ", 10, " ")]  # Standard amino acid at position 10
+```
+
+### Access Atoms
+
+```python
+# Iterate through atoms in a residue
+for atom in residue:
+    print(f"Atom: {atom.name}, Coordinates: {atom.coord}")
+
+# Get specific atom
+ca_atom = residue["CA"]  # Alpha carbon
+print(f"CA coordinates: {ca_atom.coord}")
+```
+
+### Alternative Access Patterns
+
+```python
+# Direct access through hierarchy
+atom = structure[0]["A"][10]["CA"]
+
+# Get all atoms
+atoms = list(structure.get_atoms())
+print(f"Total atoms: {len(atoms)}")
+
+# Get all residues
+residues = list(structure.get_residues())
+
+# Get all chains
+chains = list(structure.get_chains())
+```
+
+## Working with Atom Coordinates
+
+### Accessing Coordinates
+
+```python
+# Get atom coordinates
+coord = atom.coord
+print(f"X: {coord[0]}, Y: {coord[1]}, Z: {coord[2]}")
+
+# Get B-factor (temperature factor)
+b_factor = atom.bfactor
+
+# Get occupancy
+occupancy = atom.occupancy
+
+# Get element
+element = atom.element
+```
+
+### Calculate Distances
+
+```python
+from Bio.PDB import Vector
+
+# Calculate distance between two atoms
+atom1 = residue1["CA"]
+atom2 = residue2["CA"]
+
+distance = atom1 - atom2  # Returns distance in Angstroms
+print(f"Distance: {distance:.2f} Å")
+```
+
+### Calculate Angles
+
+```python
+from Bio.PDB.vectors import calc_angle
+
+# Calculate angle between three atoms
+angle = calc_angle(
+    atom1.get_vector(),
+    atom2.get_vector(),
+    atom3.get_vector()
+)
+print(f"Angle: {angle:.2f} radians")
+```
+
+### Calculate Dihedrals
+
+```python
+from Bio.PDB.vectors import calc_dihedral
+
+# Calculate dihedral angle between four atoms
+dihedral = calc_dihedral(
+    atom1.get_vector(),
+    atom2.get_vector(),
+    atom3.get_vector(),
+    atom4.get_vector()
+)
+print(f"Dihedral: {dihedral:.2f} radians")
+```
+
+## Structure Analysis
+
+### Secondary Structure (DSSP)
+
+DSSP assigns secondary structure to protein structures:
+
+```python
+from Bio.PDB import DSSP, PDBParser
+
+# Parse structure
+parser = PDBParser()
+structure = parser.get_structure("1crn", "1crn.pdb")
+
+# Run DSSP (requires DSSP executable installed)
+model = structure[0]
+dssp = DSSP(model, "1crn.pdb")
+
+# Access results
+for residue_key in dssp:
+    dssp_data = dssp[residue_key]
+    residue_id = residue_key[1]
+    ss = dssp_data[2]  # Secondary structure code
+    phi = dssp_data[4]  # Phi angle
+    psi = dssp_data[5]  # Psi angle
+    print(f"Residue {residue_id}: {ss}, φ={phi:.1f}°, ψ={psi:.1f}°")
+```
+
+Secondary structure codes:
+- `H` - Alpha helix
+- `B` - Beta bridge
+- `E` - Strand
+- `G` - 3-10 helix
+- `I` - Pi helix
+- `T` - Turn
+- `S` - Bend
+- `-` - Coil/loop
+
+### Solvent Accessibility (DSSP)
+
+```python
+# Get relative solvent accessibility
+for residue_key in dssp:
+    acc = dssp[residue_key][3]  # Relative accessibility
+    print(f"Residue {residue_key[1]}: {acc:.2f} relative accessibility")
+```
+
+### Neighbor Search
+
+Find nearby atoms efficiently:
+
+```python
+from Bio.PDB import NeighborSearch
+
+# Get all atoms
+atoms = list(structure.get_atoms())
+
+# Create neighbor search object
+ns = NeighborSearch(atoms)
+
+# Find atoms within radius
+center_atom = structure[0]["A"][10]["CA"]
+nearby_atoms = ns.search(center_atom.coord, 5.0)  # 5 Å radius
+print(f"Found {len(nearby_atoms)} atoms within 5 Å")
+
+# Find residues within radius
+nearby_residues = ns.search(center_atom.coord, 5.0, level="R")
+
+# Find chains within radius
+nearby_chains = ns.search(center_atom.coord, 10.0, level="C")
+```
+
+### Contact Map
+
+```python
+def calculate_contact_map(chain, distance_threshold=8.0):
+    """Calculate residue-residue contact map."""
+    residues = list(chain.get_residues())
+    n = len(residues)
+    contact_map = [[0] * n for _ in range(n)]
+
+    for i, res1 in enumerate(residues):
+        for j, res2 in enumerate(residues):
+            if i < j:
+                # Get CA atoms
+                if res1.has_id("CA") and res2.has_id("CA"):
+                    dist = res1["CA"] - res2["CA"]
+                    if dist < distance_threshold:
+                        contact_map[i][j] = 1
+                        contact_map[j][i] = 1
+
+    return contact_map
+```
+
+## Structure Quality Assessment
+
+### Ramachandran Plot Data
+
+```python
+from Bio.PDB import Polypeptide
+
+def get_phi_psi(structure):
+    """Extract phi and psi angles for Ramachandran plot."""
+    phi_psi = []
+
+    for model in structure:
+        for chain in model:
+            polypeptides = Polypeptide.PPBuilder().build_peptides(chain)
+            for poly in polypeptides:
+                angles = poly.get_phi_psi_list()
+                for residue, (phi, psi) in zip(poly, angles):
+                    if phi and psi:  # Skip None values
+                        phi_psi.append((residue.resname, phi, psi))
+
+    return phi_psi
+```
+
+### Check for Missing Atoms
+
+```python
+def check_missing_atoms(structure):
+    """Identify residues with missing atoms."""
+    missing = []
+
+    for residue in structure.get_residues():
+        if residue.id[0] == " ":  # Standard amino acid
+            resname = residue.resname
+
+            # Expected backbone atoms
+            expected = ["N", "CA", "C", "O"]
+
+            for atom_name in expected:
+                if not residue.has_id(atom_name):
+                    missing.append((residue.full_id, atom_name))
+
+    return missing
+```
+
+## Structure Manipulation
+
+### Select Specific Atoms
+
+```python
+from Bio.PDB import Select
+
+class CASelect(Select):
+    """Select only CA atoms."""
+    def accept_atom(self, atom):
+        return atom.name == "CA"
+
+class ChainASelect(Select):
+    """Select only chain A."""
+    def accept_chain(self, chain):
+        return chain.id == "A"
+
+# Use with PDBIO
+from Bio.PDB import PDBIO
+
+io = PDBIO()
+io.set_structure(structure)
+io.save("ca_only.pdb", CASelect())
+io.save("chain_a.pdb", ChainASelect())
+```
+
+### Transform Structures
+
+```python
+import numpy as np
+
+# Rotate structure
+from Bio.PDB.vectors import rotaxis
+
+# Define rotation axis and angle
+axis = Vector(1, 0, 0)  # X-axis
+angle = np.pi / 4  # 45 degrees
+
+# Create rotation matrix
+rotation = rotaxis(angle, axis)
+
+# Apply rotation to all atoms
+for atom in structure.get_atoms():
+    atom.transform(rotation, Vector(0, 0, 0))
+```
+
+### Superimpose Structures
+
+```python
+from Bio.PDB import Superimposer, PDBParser
+
+# Parse two structures
+parser = PDBParser()
+structure1 = parser.get_structure("ref", "reference.pdb")
+structure2 = parser.get_structure("mov", "mobile.pdb")
+
+# Get CA atoms from both structures
+ref_atoms = [atom for atom in structure1.get_atoms() if atom.name == "CA"]
+mov_atoms = [atom for atom in structure2.get_atoms() if atom.name == "CA"]
+
+# Superimpose
+super_imposer = Superimposer()
+super_imposer.set_atoms(ref_atoms, mov_atoms)
+
+# Apply transformation
+super_imposer.apply(structure2.get_atoms())
+
+# Get RMSD
+rmsd = super_imposer.rms
+print(f"RMSD: {rmsd:.2f} Å")
+
+# Save superimposed structure
+from Bio.PDB import PDBIO
+io = PDBIO()
+io.set_structure(structure2)
+io.save("superimposed.pdb")
+```
+
+## Writing Structure Files
+
+### Save PDB Files
+
+```python
+from Bio.PDB import PDBIO
+
+io = PDBIO()
+io.set_structure(structure)
+io.save("output.pdb")
+```
+
+### Save mmCIF Files
+
+```python
+from Bio.PDB import MMCIFIO
+
+io = MMCIFIO()
+io.set_structure(structure)
+io.save("output.cif")
+```
+
+## Sequence from Structure
+
+### Extract Sequence
+
+```python
+from Bio.PDB import Polypeptide
+
+# Get polypeptides from structure
+ppb = Polypeptide.PPBuilder()
+
+for model in structure:
+    for chain in model:
+        for pp in ppb.build_peptides(chain):
+            sequence = pp.get_sequence()
+            print(f"Chain {chain.id}: {sequence}")
+```
+
+### Map to FASTA
+
+```python
+from Bio import SeqIO
+from Bio.SeqRecord import SeqRecord
+
+# Extract sequences and create FASTA
+records = []
+ppb = Polypeptide.PPBuilder()
+
+for model in structure:
+    for chain in model:
+        for pp in ppb.build_peptides(chain):
+            seq_record = SeqRecord(
+                pp.get_sequence(),
+                id=f"{structure.id}_{chain.id}",
+                description=f"Chain {chain.id}"
+            )
+            records.append(seq_record)
+
+SeqIO.write(records, "structure_sequences.fasta", "fasta")
+```
+
+## Best Practices
+
+1. **Use mmCIF** for large structures and modern data
+2. **Set QUIET=True** to suppress parser warnings
+3. **Check for missing atoms** before analysis
+4. **Use NeighborSearch** for efficient spatial queries
+5. **Validate structure quality** with DSSP or Ramachandran analysis
+6. **Handle multiple models** appropriately (NMR structures)
+7. **Be aware of heteroatoms** - they have different residue IDs
+8. **Use Select classes** for targeted structure output
+9. **Cache downloaded structures** locally
+10. **Consider alternative conformations** - some residues have multiple positions
+
+## Common Use Cases
+
+### Calculate RMSD Between Structures
+
+```python
+from Bio.PDB import PDBParser, Superimposer
+
+parser = PDBParser()
+structure1 = parser.get_structure("s1", "structure1.pdb")
+structure2 = parser.get_structure("s2", "structure2.pdb")
+
+# Get CA atoms
+atoms1 = [atom for atom in structure1[0]["A"].get_atoms() if atom.name == "CA"]
+atoms2 = [atom for atom in structure2[0]["A"].get_atoms() if atom.name == "CA"]
+
+# Ensure same number of atoms
+min_len = min(len(atoms1), len(atoms2))
+atoms1 = atoms1[:min_len]
+atoms2 = atoms2[:min_len]
+
+# Calculate RMSD
+sup = Superimposer()
+sup.set_atoms(atoms1, atoms2)
+print(f"RMSD: {sup.rms:.3f} Å")
+```
+
+### Find Binding Site Residues
+
+```python
+def find_binding_site(structure, ligand_chain, ligand_res_id, distance=5.0):
+    """Find residues near a ligand."""
+    from Bio.PDB import NeighborSearch
+
+    # Get ligand atoms
+    ligand = structure[0][ligand_chain][ligand_res_id]
+    ligand_atoms = list(ligand.get_atoms())
+
+    # Get all protein atoms
+    protein_atoms = []
+    for chain in structure[0]:
+        if chain.id != ligand_chain:
+            for residue in chain:
+                if residue.id[0] == " ":  # Standard residue
+                    protein_atoms.extend(residue.get_atoms())
+
+    # Find nearby atoms
+    ns = NeighborSearch(protein_atoms)
+    binding_site = set()
+
+    for ligand_atom in ligand_atoms:
+        nearby = ns.search(ligand_atom.coord, distance, level="R")
+        binding_site.update(nearby)
+
+    return list(binding_site)
+```
+
+### Calculate Center of Mass
+
+```python
+import numpy as np
+
+def center_of_mass(entity):
+    """Calculate center of mass for structure entity."""
+    masses = []
+    coords = []
+
+    # Atomic masses (simplified)
+    mass_dict = {"C": 12.0, "N": 14.0, "O": 16.0, "S": 32.0}
+
+    for atom in entity.get_atoms():
+        mass = mass_dict.get(atom.element, 12.0)
+        masses.append(mass)
+        coords.append(atom.coord)
+
+    masses = np.array(masses)
+    coords = np.array(coords)
+
+    com = np.sum(coords * masses[:, np.newaxis], axis=0) / np.sum(masses)
+    return com
+```