498 lines
12 KiB
Markdown
498 lines
12 KiB
Markdown
# Core Data Structures
|
|
|
|
## Overview
|
|
|
|
PyOpenMS exposes the OpenMS C++ data structures to Python through bindings. Understanding these core objects is essential for effective data manipulation.
|
|
|
|
## Spectrum and Experiment Objects
|
|
|
|
### MSExperiment
|
|
|
|
Container for complete LC-MS experiment data (spectra and chromatograms).
|
|
|
|
```python
|
|
import pyopenms as ms
|
|
|
|
# Create experiment
|
|
exp = ms.MSExperiment()
|
|
|
|
# Load from file
|
|
ms.MzMLFile().load("data.mzML", exp)
|
|
|
|
# Access properties
|
|
print(f"Number of spectra: {exp.getNrSpectra()}")
|
|
print(f"Number of chromatograms: {exp.getNrChromatograms()}")
|
|
|
|
# Get RT range
|
|
rts = [spec.getRT() for spec in exp]
|
|
print(f"RT range: {min(rts):.1f} - {max(rts):.1f} seconds")
|
|
|
|
# Access individual spectrum
|
|
spec = exp.getSpectrum(0)
|
|
|
|
# Iterate through spectra
|
|
for spec in exp:
|
|
    if spec.getMSLevel() == 2:
|
|
        print(f"MS2 spectrum at RT {spec.getRT():.2f}")
|
|
|
|
# Get metadata
|
|
exp_settings = exp.getExperimentalSettings()
|
|
instrument = exp_settings.getInstrument()
|
|
print(f"Instrument: {instrument.getName()}")
|
|
```
|
|
|
|
### MSSpectrum
|
|
|
|
Individual mass spectrum with m/z and intensity arrays.
|
|
|
|
```python
|
|
# Create empty spectrum
|
|
spec = ms.MSSpectrum()
|
|
|
|
# Get from experiment
|
|
exp = ms.MSExperiment()
|
|
ms.MzMLFile().load("data.mzML", exp)
|
|
spec = exp.getSpectrum(0)
|
|
|
|
# Basic properties
|
|
print(f"MS level: {spec.getMSLevel()}")
|
|
print(f"Retention time: {spec.getRT():.2f} seconds")
|
|
print(f"Number of peaks: {spec.size()}")
|
|
|
|
# Get peak data as numpy arrays
|
|
mz, intensity = spec.get_peaks()
|
|
print(f"m/z range: {mz.min():.2f} - {mz.max():.2f}")
|
|
print(f"Max intensity: {intensity.max():.0f}")
|
|
|
|
# Access individual peaks
|
|
for i in range(min(5, spec.size())):  # First 5 peaks
|
|
    print(f"Peak {i}: m/z={mz[i]:.4f}, intensity={intensity[i]:.0f}")
|
|
|
|
# Precursor information (for MS2)
|
|
if spec.getMSLevel() == 2:
|
|
    precursors = spec.getPrecursors()
|
|
    if precursors:
|
|
        precursor = precursors[0]
|
|
        print(f"Precursor m/z: {precursor.getMZ():.4f}")
|
|
        print(f"Precursor charge: {precursor.getCharge()}")
|
|
        print(f"Precursor intensity: {precursor.getIntensity():.0f}")
|
|
|
|
# Set peak data
|
|
new_mz = [100.0, 200.0, 300.0]
|
|
new_intensity = [1000.0, 2000.0, 1500.0]
|
|
spec.set_peaks((new_mz, new_intensity))
|
|
```
|
|
|
|
### MSChromatogram
|
|
|
|
Chromatographic trace (TIC, XIC, or SRM transition).
|
|
|
|
```python
|
|
# Access chromatogram from experiment
|
|
for chrom in exp.getChromatograms():
|
|
    print(f"Chromatogram ID: {chrom.getNativeID()}")
|
|
|
|
    # Get data
|
|
    rt, intensity = chrom.get_peaks()
|
|
|
|
    print(f" RT points: {len(rt)}")
|
|
    print(f" Max intensity: {intensity.max():.0f}")
|
|
|
|
    # Precursor info (for XIC)
|
|
    precursor = chrom.getPrecursor()
|
|
    print(f" Precursor m/z: {precursor.getMZ():.4f}")
|
|
```
|
|
|
|
## Feature Objects
|
|
|
|
### Feature
|
|
|
|
Detected chromatographic peak with 2D spatial extent (RT-m/z).
|
|
|
|
```python
|
|
# Load features
|
|
feature_map = ms.FeatureMap()
|
|
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
|
|
|
# Access individual feature
|
|
feature = feature_map[0]
|
|
|
|
# Core properties
|
|
print(f"m/z: {feature.getMZ():.4f}")
|
|
print(f"RT: {feature.getRT():.2f} seconds")
|
|
print(f"Intensity: {feature.getIntensity():.0f}")
|
|
print(f"Charge: {feature.getCharge()}")
|
|
|
|
# Quality metrics
|
|
print(f"Overall quality: {feature.getOverallQuality():.3f}")
|
|
print(f"Width (RT): {feature.getWidth():.2f}")
|
|
|
|
# Convex hull (spatial extent)
|
|
hull = feature.getConvexHull()
|
|
print(f"Hull points: {hull.getHullPoints().size()}")
|
|
|
|
# Bounding box
|
|
bbox = hull.getBoundingBox()
|
|
print(f"RT range: {bbox.minPosition()[0]:.2f} - {bbox.maxPosition()[0]:.2f}")
|
|
print(f"m/z range: {bbox.minPosition()[1]:.4f} - {bbox.maxPosition()[1]:.4f}")
|
|
|
|
# Subordinate features (isotopes)
|
|
subordinates = feature.getSubordinates()
|
|
if subordinates:
|
|
    print(f"Isotopic features: {len(subordinates)}")
|
|
    for sub in subordinates:
|
|
        print(f" m/z: {sub.getMZ():.4f}, intensity: {sub.getIntensity():.0f}")
|
|
|
|
# Metadata values
|
|
if feature.metaValueExists("label"):
|
|
    label = feature.getMetaValue("label")
|
|
    print(f"Label: {label}")
|
|
```
|
|
|
|
### FeatureMap
|
|
|
|
Collection of features from a single LC-MS run.
|
|
|
|
```python
|
|
# Create feature map
|
|
feature_map = ms.FeatureMap()
|
|
|
|
# Load from file
|
|
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
|
|
|
# Access properties
|
|
print(f"Number of features: {feature_map.size()}")
|
|
|
|
# Get the map's unique ID (an identifier, not a count of features)
|
|
print(f"Unique ID: {feature_map.getUniqueId()}")
|
|
|
|
# Metadata
|
|
primary_path = feature_map.getPrimaryMSRunPath()
|
|
if primary_path:
|
|
    print(f"Source file: {primary_path[0].decode()}")
|
|
|
|
# Iterate through features
|
|
for feature in feature_map:
|
|
    print(f"Feature: m/z={feature.getMZ():.4f}, RT={feature.getRT():.2f}")
|
|
|
|
# Add new feature
|
|
new_feature = ms.Feature()
|
|
new_feature.setMZ(500.0)
|
|
new_feature.setRT(300.0)
|
|
new_feature.setIntensity(10000.0)
|
|
feature_map.push_back(new_feature)
|
|
|
|
# Sort features
|
|
feature_map.sortByRT() # or sortByMZ(), sortByIntensity()
|
|
|
|
# Export to pandas
|
|
df = feature_map.get_df()
|
|
print(df.head())
|
|
```
|
|
|
|
### ConsensusFeature
|
|
|
|
Feature linked across multiple samples.
|
|
|
|
```python
|
|
# Load consensus map
|
|
consensus_map = ms.ConsensusMap()
|
|
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
|
|
|
# Access consensus feature
|
|
cons_feature = consensus_map[0]
|
|
|
|
# Consensus properties
|
|
print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
|
|
print(f"Consensus RT: {cons_feature.getRT():.2f}")
|
|
print(f"Consensus intensity: {cons_feature.getIntensity():.0f}")
|
|
|
|
# Get feature handles (individual map features)
|
|
feature_list = cons_feature.getFeatureList()
|
|
print(f"Present in {len(feature_list)} maps")
|
|
|
|
for handle in feature_list:
|
|
    map_idx = handle.getMapIndex()
|
|
    intensity = handle.getIntensity()
|
|
    mz = handle.getMZ()
|
|
    rt = handle.getRT()
|
|
|
|
    print(f" Map {map_idx}: m/z={mz:.4f}, RT={rt:.2f}, intensity={intensity:.0f}")
|
|
|
|
# Get unique ID in originating map
|
|
for handle in feature_list:
|
|
    unique_id = handle.getUniqueId()
|
|
    print(f"Unique ID: {unique_id}")
|
|
```
|
|
|
|
### ConsensusMap
|
|
|
|
Collection of consensus features across samples.
|
|
|
|
```python
|
|
# Create consensus map
|
|
consensus_map = ms.ConsensusMap()
|
|
|
|
# Load from file
|
|
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
|
|
|
# Access properties
|
|
print(f"Consensus features: {consensus_map.size()}")
|
|
|
|
# Column headers (file descriptions)
|
|
headers = consensus_map.getColumnHeaders()
|
|
print(f"Number of files: {len(headers)}")
|
|
|
|
for map_idx, description in headers.items():
|
|
    print(f"Map {map_idx}:")
|
|
    print(f" Filename: {description.filename}")
|
|
    print(f" Label: {description.label}")
|
|
    print(f" Size: {description.size}")
|
|
|
|
# Iterate through consensus features
|
|
for cons_feature in consensus_map:
|
|
    print(f"Consensus feature: m/z={cons_feature.getMZ():.4f}")
|
|
|
|
# Export to DataFrame
|
|
df = consensus_map.get_df()
|
|
```
|
|
|
|
## Identification Objects
|
|
|
|
### PeptideIdentification
|
|
|
|
Identification results for a single spectrum.
|
|
|
|
```python
|
|
# Load identifications
|
|
protein_ids = []
|
|
peptide_ids = []
|
|
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
|
|
|
# Access peptide identification
|
|
peptide_id = peptide_ids[0]
|
|
|
|
# Spectrum metadata
|
|
print(f"RT: {peptide_id.getRT():.2f}")
|
|
print(f"m/z: {peptide_id.getMZ():.4f}")
|
|
|
|
# Identification metadata
|
|
print(f"Identifier: {peptide_id.getIdentifier()}")
|
|
print(f"Score type: {peptide_id.getScoreType()}")
|
|
print(f"Higher score better: {peptide_id.isHigherScoreBetter()}")
|
|
|
|
# Get peptide hits
|
|
hits = peptide_id.getHits()
|
|
print(f"Number of hits: {len(hits)}")
|
|
|
|
for hit in hits:
|
|
    print(f" Sequence: {hit.getSequence().toString()}")
|
|
    print(f" Score: {hit.getScore()}")
|
|
    print(f" Charge: {hit.getCharge()}")
|
|
```
|
|
|
|
### PeptideHit
|
|
|
|
Individual peptide match to a spectrum.
|
|
|
|
```python
|
|
# Access hit
|
|
hit = peptide_id.getHits()[0]
|
|
|
|
# Sequence information
|
|
sequence = hit.getSequence()
|
|
print(f"Sequence: {sequence.toString()}")
|
|
print(f"Mass: {sequence.getMonoWeight():.4f}")
|
|
|
|
# Score and rank
|
|
print(f"Score: {hit.getScore()}")
|
|
print(f"Rank: {hit.getRank()}")
|
|
|
|
# Charge state
|
|
print(f"Charge: {hit.getCharge()}")
|
|
|
|
# Protein accessions
|
|
accessions = hit.extractProteinAccessionsSet()
|
|
for acc in accessions:
|
|
    print(f"Protein: {acc.decode()}")
|
|
|
|
# Meta values (additional scores, errors)
|
|
if hit.metaValueExists("MS:1002252"):  # mass error
|
|
    mass_error = hit.getMetaValue("MS:1002252")
|
|
    print(f"Mass error: {mass_error:.4f} ppm")
|
|
```
|
|
|
|
### ProteinIdentification
|
|
|
|
Protein-level identification information.
|
|
|
|
```python
|
|
# Access protein identification
|
|
protein_id = protein_ids[0]
|
|
|
|
# Search engine info
|
|
print(f"Search engine: {protein_id.getSearchEngine()}")
|
|
print(f"Search engine version: {protein_id.getSearchEngineVersion()}")
|
|
|
|
# Search parameters
|
|
search_params = protein_id.getSearchParameters()
|
|
print(f"Database: {search_params.db}")
|
|
print(f"Enzyme: {search_params.digestion_enzyme.getName()}")
|
|
print(f"Missed cleavages: {search_params.missed_cleavages}")
|
|
print(f"Precursor tolerance: {search_params.precursor_mass_tolerance}")
|
|
|
|
# Protein hits
|
|
hits = protein_id.getHits()
|
|
for hit in hits:
|
|
    print(f"Accession: {hit.getAccession()}")
|
|
    print(f"Score: {hit.getScore()}")
|
|
    print(f"Coverage: {hit.getCoverage():.1f}%")
|
|
```
|
|
|
|
### ProteinHit
|
|
|
|
Individual protein identification.
|
|
|
|
```python
|
|
# Access protein hit
|
|
protein_hit = protein_id.getHits()[0]
|
|
|
|
# Protein information
|
|
print(f"Accession: {protein_hit.getAccession()}")
|
|
print(f"Description: {protein_hit.getDescription()}")
|
|
print(f"Sequence: {protein_hit.getSequence()}")
|
|
|
|
# Scoring
|
|
print(f"Score: {protein_hit.getScore()}")
|
|
print(f"Coverage: {protein_hit.getCoverage():.1f}%")
|
|
|
|
# Rank
|
|
print(f"Rank: {protein_hit.getRank()}")
|
|
```
|
|
|
|
## Sequence Objects
|
|
|
|
### AASequence
|
|
|
|
Amino acid sequence with modifications.
|
|
|
|
```python
|
|
# Create sequence from string
|
|
seq = ms.AASequence.fromString("PEPTIDE")
|
|
|
|
# Basic properties
|
|
print(f"Sequence: {seq.toString()}")
|
|
print(f"Length: {seq.size()}")
|
|
print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}")
|
|
print(f"Average mass: {seq.getAverageWeight():.4f}")
|
|
|
|
# Individual residues
|
|
for i in range(seq.size()):
|
|
    residue = seq.getResidue(i)
|
|
    print(f"Position {i}: {residue.getOneLetterCode()}")
|
|
    print(f" Mass: {residue.getMonoWeight():.4f}")
|
|
    print(f" Formula: {residue.getFormula().toString()}")
|
|
|
|
# Modified sequence
|
|
mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K")
|
|
print(f"Modified: {mod_seq.isModified()}")
|
|
|
|
# Check modifications
|
|
for i in range(mod_seq.size()):
|
|
    residue = mod_seq.getResidue(i)
|
|
    if residue.isModified():
|
|
        print(f"Modification at {i}: {residue.getModificationName()}")
|
|
|
|
# N-terminal and C-terminal modifications
|
|
term_mod_seq = ms.AASequence.fromString("(Acetyl)PEPTIDE(Amidated)")
|
|
```
|
|
|
|
### EmpiricalFormula
|
|
|
|
Molecular formula representation.
|
|
|
|
```python
|
|
# Create formula
|
|
formula = ms.EmpiricalFormula("C6H12O6") # Glucose
|
|
|
|
# Properties
|
|
print(f"Formula: {formula.toString()}")
|
|
print(f"Monoisotopic mass: {formula.getMonoWeight():.4f}")
|
|
print(f"Average mass: {formula.getAverageWeight():.4f}")
|
|
|
|
# Element composition
|
|
print(f"Carbon atoms: {formula.getNumberOf(b'C')}")
|
|
print(f"Hydrogen atoms: {formula.getNumberOf(b'H')}")
|
|
print(f"Oxygen atoms: {formula.getNumberOf(b'O')}")
|
|
|
|
# Arithmetic operations
|
|
formula2 = ms.EmpiricalFormula("H2O")
|
|
combined = formula + formula2 # Add water
|
|
print(f"Combined: {combined.toString()}")
|
|
```
|
|
|
|
## Parameter Objects
|
|
|
|
### Param
|
|
|
|
Generic parameter container used by algorithms.
|
|
|
|
```python
|
|
# Get algorithm parameters
|
|
algo = ms.GaussFilter()
|
|
params = algo.getParameters()
|
|
|
|
# List all parameters
|
|
for key in params.keys():
|
|
    value = params.getValue(key)
|
|
    print(f"{key}: {value}")
|
|
|
|
# Get specific parameter
|
|
gaussian_width = params.getValue("gaussian_width")
|
|
print(f"Gaussian width: {gaussian_width}")
|
|
|
|
# Set parameter
|
|
params.setValue("gaussian_width", 0.2)
|
|
|
|
# Apply modified parameters
|
|
algo.setParameters(params)
|
|
|
|
# Copy parameters
|
|
params_copy = ms.Param(params)
|
|
```
|
|
|
|
## Best Practices
|
|
|
|
### Memory Management
|
|
|
|
```python
|
|
# For large files, use indexed access instead of full loading
|
|
indexed_mzml = ms.IndexedMzMLFileLoader()
|
|
indexed_mzml.load("large_file.mzML")
|
|
|
|
# Access specific spectrum without loading entire file
|
|
spec = indexed_mzml.getSpectrumById(100)
|
|
```
|
|
|
|
### Type Conversion
|
|
|
|
```python
|
|
# Peak arrays are returned as numpy arrays (no conversion needed)
|
|
import numpy as np
|
|
|
|
mz, intensity = spec.get_peaks()
|
|
# These are already numpy arrays
|
|
|
|
# Can perform numpy operations
|
|
filtered_mz = mz[intensity > 1000]
|
|
```
|
|
|
|
### Object Copying
|
|
|
|
```python
|
|
# Create deep copy
|
|
exp_copy = ms.MSExperiment(exp)
|
|
|
|
# Modifications to copy don't affect original
|
|
```
|