# Peptide and Protein Identification

## Overview

PyOpenMS supports peptide and protein identification through integration with external search engines and provides tools for post-processing the identification results, including FDR control, protein inference, and annotation.
## Supported Search Engines

PyOpenMS integrates with these search engines:

- **Comet**: Fast tandem MS search
- **Mascot**: Commercial search engine
- **MSGFPlus**: Spectral probability-based search
- **XTandem**: Open-source search tool
- **OMSSA**: NCBI search engine
- **MyriMatch**: High-throughput search
- **MSFragger**: Ultra-fast search

Each of these engines runs as an external program and is driven through the corresponding OpenMS adapter tool; a sketch of this pattern follows below.
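A common pattern is to call the adapter from Python and then load the resulting idXML for post-processing. A minimal sketch, assuming a CometAdapter (and Comet) installation is available on the PATH and using placeholder file names:

```python
import subprocess
import pyopenms as ms

# Run the external search via the CometAdapter TOPP tool (assumed to be installed)
subprocess.run([
    "CometAdapter",
    "-in", "spectra.mzML",              # centroided MS/MS data
    "-database", "target_decoy.fasta",  # target-decoy protein database
    "-out", "raw_identifications.idXML",
], check=True)

# Load the search results back into pyopenms for post-processing
protein_ids, peptide_ids = [], []
ms.IdXMLFile().load("raw_identifications.idXML", protein_ids, peptide_ids)
print(f"Loaded {len(peptide_ids)} PSMs")
```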
## Reading Identification Data

### idXML Format

```python
import pyopenms as ms

# Load identification results
protein_ids = []
peptide_ids = []

ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)

print(f"Protein identifications: {len(protein_ids)}")
print(f"Peptide identifications: {len(peptide_ids)}")
```
### Access Peptide Identifications

```python
# Iterate through peptide IDs
for peptide_id in peptide_ids:
    # Spectrum metadata
    print(f"RT: {peptide_id.getRT():.2f}")
    print(f"m/z: {peptide_id.getMZ():.4f}")

    # Get peptide hits (ranked by score)
    hits = peptide_id.getHits()
    print(f"Number of hits: {len(hits)}")

    for hit in hits:
        sequence = hit.getSequence()
        print(f"  Sequence: {sequence.toString()}")
        print(f"  Score: {hit.getScore()}")
        print(f"  Charge: {hit.getCharge()}")

        # Meta values are engine-dependent; guard against missing keys
        if hit.metaValueExists("mass_error_ppm"):
            print(f"  Mass error (ppm): {hit.getMetaValue('mass_error_ppm')}")

        # Get modifications
        if sequence.isModified():
            for i in range(sequence.size()):
                residue = sequence.getResidue(i)
                if residue.isModified():
                    print(f"  Modification at position {i}: {residue.getModificationName()}")
```
### Access Protein Identifications

```python
# Access protein-level information
for protein_id in protein_ids:
    # Search parameters recorded for this run
    search_params = protein_id.getSearchParameters()
    print(f"Search engine: {protein_id.getSearchEngine()}")
    print(f"Database: {search_params.db}")

    # Protein hits
    hits = protein_id.getHits()
    for hit in hits:
        print(f"  Accession: {hit.getAccession()}")
        print(f"  Score: {hit.getScore()}")
        print(f"  Coverage: {hit.getCoverage()}")
        print(f"  Sequence: {hit.getSequence()}")
```
## False Discovery Rate (FDR)

### FDR Filtering

Apply FDR filtering to control false positives. `FalseDiscoveryRate.apply()` uses the target/decoy annotations on the hits and replaces the search engine scores with q-values, so hits can then be filtered by a q-value threshold:

```python
# Create FDR object
fdr = ms.FalseDiscoveryRate()

# Apply FDR at PSM level (hit scores become q-values)
fdr.apply(peptide_ids)

# Filter by FDR threshold
fdr_threshold = 0.01  # 1% FDR
filtered_peptide_ids = []

for peptide_id in peptide_ids:
    # Keep hits at or below the q-value threshold (lower q-value = better)
    filtered_hits = []
    for hit in peptide_id.getHits():
        if hit.getScore() <= fdr_threshold:
            filtered_hits.append(hit)

    if filtered_hits:
        peptide_id.setHits(filtered_hits)
        filtered_peptide_ids.append(peptide_id)

print(f"Peptides passing FDR: {len(filtered_peptide_ids)}")
```
### Score Transformation

After `apply()`, each hit's score holds its q-value and the score type of the identification is updated accordingly:

```python
# Apply score transformation
fdr.apply(peptide_ids)

# Access q-values (stored as the hit score after apply())
for peptide_id in peptide_ids:
    print(f"Score type: {peptide_id.getScoreType()}")  # e.g. "q-value"
    for hit in peptide_id.getHits():
        q_value = hit.getScore()
        print(f"Sequence: {hit.getSequence().toString()}, q-value: {q_value}")
```
## Protein Inference

### ID Mapper

Map peptide identifications onto detected features:

```python
# Create mapper
mapper = ms.IDMapper()

# Load the features to annotate
feature_map = ms.FeatureMap()
ms.FeatureXMLFile().load("features.featureXML", feature_map)

# Annotate features with IDs (RT/m/z tolerances come from the mapper's parameters;
# the flags control whether the feature centroid is used for matching)
mapper.annotate(feature_map, peptide_ids, protein_ids,
                use_centroid_rt=False, use_centroid_mz=False)

# Check annotated features
for feature in feature_map:
    pep_ids = feature.getPeptideIdentifications()
    if pep_ids:
        for pep_id in pep_ids:
            for hit in pep_id.getHits():
                print(f"Feature {feature.getMZ():.4f}: {hit.getSequence().toString()}")
```
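The annotated feature map is typically written back to featureXML so downstream quantification steps can use the attached identifications; the output name below is just a placeholder:

```python
# Persist the ID-annotated features for later quantification steps
ms.FeatureXMLFile().store("features_with_ids.featureXML", feature_map)
```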
### Protein Grouping

Run protein inference to aggregate peptide evidence into protein-level scores; access to indistinguishable protein groups is sketched after this block:

```python
# Create protein inference algorithm
inference = ms.BasicProteinInferenceAlgorithm()

# Run inference (aggregates peptide evidence per protein)
inference.run(peptide_ids, protein_ids)

# Inspect inferred proteins; all protein hits of a search run are stored
# in a single ProteinIdentification object
for protein_id in protein_ids:
    for hit in protein_id.getHits():
        print(f"  {hit.getAccession()}: score {hit.getScore()}")
```
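If the inference step has annotated indistinguishable protein groups, they can be read from the run-level `ProteinIdentification`. A minimal sketch, assuming the wrapped `ProteinGroup` objects expose the `probability` and `accessions` fields of the underlying OpenMS struct:

```python
# Indistinguishable groups: proteins sharing the same peptide evidence
for protein_id in protein_ids:
    for group in protein_id.getIndistinguishableProteins():
        # accessions may be returned as bytes depending on the pyopenms version
        accs = [a.decode() if isinstance(a, bytes) else a for a in group.accessions]
        print(f"Protein group (score {group.probability}): {accs}")
```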
## Peptide Sequence Handling

### AASequence Object

Work with peptide sequences:

```python
# Create peptide sequence
seq = ms.AASequence.fromString("PEPTIDE")

print(f"Sequence: {seq.toString()}")
print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}")
print(f"Average mass: {seq.getAverageWeight():.4f}")
print(f"Length: {seq.size()}")

# Access individual amino acids
for i in range(seq.size()):
    residue = seq.getResidue(i)
    print(f"Position {i}: {residue.getOneLetterCode()}, mass: {residue.getMonoWeight():.4f}")
```
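The same object also gives access to charged precursor masses; the usual pyopenms idiom is to request the full (protonated) monoisotopic weight at a given charge and divide by that charge:

```python
# m/z of the doubly protonated peptide
charge = 2
mz = seq.getMonoWeight(ms.Residue.ResidueType.Full, charge) / charge
print(f"[M+2H]2+ m/z: {mz:.4f}")
```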
### Modified Sequences

Handle post-translational modifications:

```python
# Sequence with modifications
mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K")

print(f"Modified sequence: {mod_seq.toString()}")
print(f"Mass with mods: {mod_seq.getMonoWeight():.4f}")

# Check if modified
print(f"Is modified: {mod_seq.isModified()}")

# Get modification info
for i in range(mod_seq.size()):
    residue = mod_seq.getResidue(i)
    if residue.isModified():
        print(f"Residue {residue.getOneLetterCode()} at position {i}")
        print(f"  Modification: {residue.getModificationName()}")
```
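Modified sequences can be exported in alternative string notations when exchanging data with other tools; both accessors below are part of the `AASequence` API:

```python
# UniMod accession notation, e.g. PEPTIDEM(UniMod:35)K
print(mod_seq.toUniModString())
# Bracket notation with modification masses
print(mod_seq.toBracketString())
```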
### Peptide Digestion

Simulate enzymatic digestion:

```python
# Create digestion enzyme
enzyme = ms.ProteaseDigestion()
enzyme.setEnzyme("Trypsin")

# Set missed cleavages
enzyme.setMissedCleavages(2)

# Digest protein sequence
protein_seq = "MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVKALPDAQFEVVHSLAKWKRQTLGQHDFSAGEGLYTHMKALRPDEDRLSPLHSVYVDQWDWERVMGDGERQFSTLKSTVEAIWAGIKATEAAVSEEFGLAPFLPDQIHFVHSQELLSRYPDLDAKGRERAIAKDLGAVFLVGIGGKLSDGHRHDVRAPDYDDWSTPSELGHAGLNGDILVWNPVLEDAFELSSMGIRVDADTLKHQLALTGDEDRLELEWHQALLRGEMPQTIGGGIGQSRLTMLLLQLPHIGQVQAGVWPAAVRESVPSLL"

# Get peptides
peptides = []
enzyme.digest(ms.AASequence.fromString(protein_seq), peptides)

print(f"Generated {len(peptides)} peptides")
for peptide in peptides[:5]:  # Show first 5
    print(f"  {peptide.toString()}, mass: {peptide.getMonoWeight():.2f}")
```
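Search workflows usually keep only peptides within a sensible length range; because `digest()` fills the list with `AASequence` objects, a plain list comprehension is enough (the bounds here are illustrative):

```python
# Keep peptides of search-friendly length
searchable = [p for p in peptides if 7 <= p.size() <= 40]
print(f"{len(searchable)} peptides between 7 and 40 residues")
```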
## Theoretical Spectrum Generation

### Fragment Ion Calculation

Generate theoretical fragment ions:

```python
# Create peptide
peptide = ms.AASequence.fromString("PEPTIDE")

# Generate fragment ions for charge 1 into an MSSpectrum
tsg = ms.TheoreticalSpectrumGenerator()
fragments = ms.MSSpectrum()
tsg.getSpectrum(fragments, peptide, 1, 1)  # min_charge, max_charge

print(f"Generated {fragments.size()} fragment ions")

# Access fragments
mz, intensity = fragments.get_peaks()
for m, i in zip(mz[:10], intensity[:10]):  # Show first 10
    print(f"m/z: {m:.4f}, intensity: {i}")
```
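The generator is configured through its `Param` object, e.g. to choose ion series and to attach ion names as meta information before calling `getSpectrum()`:

```python
# Configure which ion series and annotations to generate
tsg = ms.TheoreticalSpectrumGenerator()
params = tsg.getParameters()
params.setValue("add_b_ions", "true")
params.setValue("add_y_ions", "true")
params.setValue("add_metainfo", "true")  # adds ion names as a string data array
tsg.setParameters(params)

spec = ms.MSSpectrum()
tsg.getSpectrum(spec, peptide, 1, 2)

# With add_metainfo enabled, ion names are stored alongside the peaks
if spec.getStringDataArrays():
    for ion_name, peak in zip(spec.getStringDataArrays()[0], spec):
        print(ion_name.decode(), peak.getMZ())
```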
## Complete Identification Workflow

### End-to-End Example

```python
import pyopenms as ms

def identification_workflow(spectrum_file, fasta_file, output_file):
    """
    Complete identification workflow with FDR control.

    Args:
        spectrum_file: Input mzML file
        fasta_file: Protein database (FASTA)
        output_file: Output idXML file
    """

    # Step 1: Load spectra
    exp = ms.MSExperiment()
    ms.MzMLFile().load(spectrum_file, exp)
    print(f"Loaded {exp.getNrSpectra()} spectra")

    # Step 2: Record search parameters (stored with the identification run;
    # the search engine itself is configured separately)
    search_params = ms.SearchParameters()
    search_params.db = fasta_file
    search_params.precursor_mass_tolerance = 10.0  # ppm
    search_params.precursor_mass_tolerance_ppm = True
    search_params.fragment_mass_tolerance = 0.5  # Da
    search_params.missed_cleavages = 2
    search_params.fixed_modifications = ["Carbamidomethyl (C)"]
    search_params.variable_modifications = ["Oxidation (M)"]
    # (the enzyme itself is stored in the digestion_enzyme member)

    # Step 3: Run the search
    # Note: the search engines are external tools; typically the corresponding
    # TOPP adapter (e.g. CometAdapter) is run on spectrum_file and fasta_file
    # and writes an idXML file, which is loaded here.
    protein_ids = []
    peptide_ids = []
    ms.IdXMLFile().load("raw_identifications.idXML", protein_ids, peptide_ids)

    # Attach the search parameters to the identification run
    if protein_ids:
        protein_ids[0].setSearchParameters(search_params)

    print(f"Initial peptide IDs: {len(peptide_ids)}")

    # Step 4: Apply FDR filtering (hit scores become q-values)
    fdr = ms.FalseDiscoveryRate()
    fdr.apply(peptide_ids)

    # Filter by 1% FDR
    filtered_peptide_ids = []
    for peptide_id in peptide_ids:
        filtered_hits = []
        for hit in peptide_id.getHits():
            q_value = hit.getScore()  # q-value after fdr.apply()
            if q_value <= 0.01:
                filtered_hits.append(hit)

        if filtered_hits:
            peptide_id.setHits(filtered_hits)
            filtered_peptide_ids.append(peptide_id)

    print(f"Peptides after FDR (1%): {len(filtered_peptide_ids)}")

    # Step 5: Protein inference
    inference = ms.BasicProteinInferenceAlgorithm()
    inference.run(filtered_peptide_ids, protein_ids)

    print(f"Identified proteins: {len(protein_ids)}")

    # Step 6: Save results
    ms.IdXMLFile().store(output_file, protein_ids, filtered_peptide_ids)
    print(f"Results saved to {output_file}")

    return protein_ids, filtered_peptide_ids


# Run workflow
protein_ids, peptide_ids = identification_workflow(
    "spectra.mzML",
    "database.fasta",
    "identifications_fdr.idXML"
)
```
## Spectral Library Search

### Library Matching

```python
# Load spectral library (MSP): peptide annotations and spectra are read together
library = ms.MSPFile()
library_ids = []                 # peptide identifications from the library
library_exp = ms.MSExperiment()  # the library spectra
library.load("spectral_library.msp", library_ids, library_exp)

# Load experimental spectra
exp = ms.MSExperiment()
ms.MzMLFile().load("data.mzML", exp)

# Compare spectra (the score object is callable on two spectra)
spectra_compare = ms.SpectraSTSimilarityScore()

for exp_spec in exp:
    if exp_spec.getMSLevel() == 2:
        best_match_score = 0
        best_match_lib = None

        for lib_spec in library_exp:
            score = spectra_compare(exp_spec, lib_spec)
            if score > best_match_score:
                best_match_score = score
                best_match_lib = lib_spec

        if best_match_score > 0.7:  # Threshold
            print(f"Match found: score {best_match_score:.3f}")
```
## Best Practices

### Decoy Database

Use the target-decoy approach for FDR calculation. Decoy proteins are typically reversed (or shuffled) copies of the target entries, marked with a recognizable accession prefix:

```python
# Generate decoy database
decoy_generator = ms.DecoyGenerator()

# Load target database
fasta_entries = []
ms.FASTAFile().load("target.fasta", fasta_entries)

# Generate decoys by reversing each protein sequence
decoy_entries = []
for entry in fasta_entries:
    # identifier/description/sequence are plain strings in recent pyopenms releases
    reversed_seq = decoy_generator.reverseProtein(ms.AASequence.fromString(entry.sequence))
    decoy = ms.FASTAEntry()
    decoy.identifier = "DECOY_" + entry.identifier
    decoy.description = entry.description
    decoy.sequence = reversed_seq.toString()
    decoy_entries.append(decoy)

# Save combined database
all_entries = fasta_entries + decoy_entries
ms.FASTAFile().store("target_decoy.fasta", all_entries)
```
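`FalseDiscoveryRate` relies on a `target_decoy` annotation on each hit. If the search engine did not set it, the usual step is to index the identifications against the combined target-decoy FASTA; a minimal sketch using `PeptideIndexer`, assuming its default parameters otherwise:

```python
# Annotate target/decoy status of each PSM against the combined database
indexer = ms.PeptideIndexer()
params = indexer.getParameters()
params.setValue("decoy_string", "DECOY_")
params.setValue("decoy_string_position", "prefix")
indexer.setParameters(params)

fasta_entries = []
ms.FASTAFile().load("target_decoy.fasta", fasta_entries)
indexer.run(fasta_entries, protein_ids, peptide_ids)  # adds 'target_decoy' meta values
```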
### Score Interpretation

Score types and directions differ between engines, so check what a score means before comparing or filtering:

```python
# Interpret scores based on the identification run and its score type
for peptide_id in peptide_ids:
    search_engine = peptide_id.getIdentifier()  # usually contains the engine name
    score_type = peptide_id.getScoreType()      # e.g. "expect", "Mascot score", "q-value"

    for hit in peptide_id.getHits():
        score = hit.getScore()

        # Score interpretation varies by engine
        if "Comet" in search_engine:
            # Comet: lower E-value = better
            print(f"E-value: {score}")
        elif "Mascot" in search_engine:
            # Mascot: higher ion score = better
            print(f"Ion score: {score}")
```
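For engine-agnostic code it is usually more robust to rely on the score direction stored with each identification run than on matching engine names:

```python
# Generic check of score direction, independent of the search engine
for peptide_id in peptide_ids:
    better_is_higher = peptide_id.isHigherScoreBetter()
    for hit in peptide_id.getHits():
        print(f"{hit.getSequence().toString()}: {hit.getScore()} "
              f"({'higher' if better_is_higher else 'lower'} is better)")
```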