# Core Data Structures ## Overview PyOpenMS uses C++ objects with Python bindings. Understanding these core data structures is essential for effective data manipulation. ## Spectrum and Experiment Objects ### MSExperiment Container for complete LC-MS experiment data (spectra and chromatograms). ```python import pyopenms as ms # Create experiment exp = ms.MSExperiment() # Load from file ms.MzMLFile().load("data.mzML", exp) # Access properties print(f"Number of spectra: {exp.getNrSpectra()}") print(f"Number of chromatograms: {exp.getNrChromatograms()}") # Get RT range rts = [spec.getRT() for spec in exp] print(f"RT range: {min(rts):.1f} - {max(rts):.1f} seconds") # Access individual spectrum spec = exp.getSpectrum(0) # Iterate through spectra for spec in exp: if spec.getMSLevel() == 2: print(f"MS2 spectrum at RT {spec.getRT():.2f}") # Get metadata exp_settings = exp.getExperimentalSettings() instrument = exp_settings.getInstrument() print(f"Instrument: {instrument.getName()}") ``` ### MSSpectrum Individual mass spectrum with m/z and intensity arrays. ```python # Create empty spectrum spec = ms.MSSpectrum() # Get from experiment exp = ms.MSExperiment() ms.MzMLFile().load("data.mzML", exp) spec = exp.getSpectrum(0) # Basic properties print(f"MS level: {spec.getMSLevel()}") print(f"Retention time: {spec.getRT():.2f} seconds") print(f"Number of peaks: {spec.size()}") # Get peak data as numpy arrays mz, intensity = spec.get_peaks() print(f"m/z range: {mz.min():.2f} - {mz.max():.2f}") print(f"Max intensity: {intensity.max():.0f}") # Access individual peaks for i in range(min(5, spec.size())): # First 5 peaks print(f"Peak {i}: m/z={mz[i]:.4f}, intensity={intensity[i]:.0f}") # Precursor information (for MS2) if spec.getMSLevel() == 2: precursors = spec.getPrecursors() if precursors: precursor = precursors[0] print(f"Precursor m/z: {precursor.getMZ():.4f}") print(f"Precursor charge: {precursor.getCharge()}") print(f"Precursor intensity: {precursor.getIntensity():.0f}") # Set peak data new_mz = [100.0, 200.0, 300.0] new_intensity = [1000.0, 2000.0, 1500.0] spec.set_peaks((new_mz, new_intensity)) ``` ### MSChromatogram Chromatographic trace (TIC, XIC, or SRM transition). ```python # Access chromatogram from experiment for chrom in exp.getChromatograms(): print(f"Chromatogram ID: {chrom.getNativeID()}") # Get data rt, intensity = chrom.get_peaks() print(f" RT points: {len(rt)}") print(f" Max intensity: {intensity.max():.0f}") # Precursor info (for XIC) precursor = chrom.getPrecursor() print(f" Precursor m/z: {precursor.getMZ():.4f}") ``` ## Feature Objects ### Feature Detected chromatographic peak with 2D spatial extent (RT-m/z). ```python # Load features feature_map = ms.FeatureMap() ms.FeatureXMLFile().load("features.featureXML", feature_map) # Access individual feature feature = feature_map[0] # Core properties print(f"m/z: {feature.getMZ():.4f}") print(f"RT: {feature.getRT():.2f} seconds") print(f"Intensity: {feature.getIntensity():.0f}") print(f"Charge: {feature.getCharge()}") # Quality metrics print(f"Overall quality: {feature.getOverallQuality():.3f}") print(f"Width (RT): {feature.getWidth():.2f}") # Convex hull (spatial extent) hull = feature.getConvexHull() print(f"Hull points: {hull.getHullPoints().size()}") # Bounding box bbox = hull.getBoundingBox() print(f"RT range: {bbox.minPosition()[0]:.2f} - {bbox.maxPosition()[0]:.2f}") print(f"m/z range: {bbox.minPosition()[1]:.4f} - {bbox.maxPosition()[1]:.4f}") # Subordinate features (isotopes) subordinates = feature.getSubordinates() if subordinates: print(f"Isotopic features: {len(subordinates)}") for sub in subordinates: print(f" m/z: {sub.getMZ():.4f}, intensity: {sub.getIntensity():.0f}") # Metadata values if feature.metaValueExists("label"): label = feature.getMetaValue("label") print(f"Label: {label}") ``` ### FeatureMap Collection of features from a single LC-MS run. ```python # Create feature map feature_map = ms.FeatureMap() # Load from file ms.FeatureXMLFile().load("features.featureXML", feature_map) # Access properties print(f"Number of features: {feature_map.size()}") # Get unique features print(f"Unique features: {feature_map.getUniqueId()}") # Metadata primary_path = feature_map.getPrimaryMSRunPath() if primary_path: print(f"Source file: {primary_path[0].decode()}") # Iterate through features for feature in feature_map: print(f"Feature: m/z={feature.getMZ():.4f}, RT={feature.getRT():.2f}") # Add new feature new_feature = ms.Feature() new_feature.setMZ(500.0) new_feature.setRT(300.0) new_feature.setIntensity(10000.0) feature_map.push_back(new_feature) # Sort features feature_map.sortByRT() # or sortByMZ(), sortByIntensity() # Export to pandas df = feature_map.get_df() print(df.head()) ``` ### ConsensusFeature Feature linked across multiple samples. ```python # Load consensus map consensus_map = ms.ConsensusMap() ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) # Access consensus feature cons_feature = consensus_map[0] # Consensus properties print(f"Consensus m/z: {cons_feature.getMZ():.4f}") print(f"Consensus RT: {cons_feature.getRT():.2f}") print(f"Consensus intensity: {cons_feature.getIntensity():.0f}") # Get feature handles (individual map features) feature_list = cons_feature.getFeatureList() print(f"Present in {len(feature_list)} maps") for handle in feature_list: map_idx = handle.getMapIndex() intensity = handle.getIntensity() mz = handle.getMZ() rt = handle.getRT() print(f" Map {map_idx}: m/z={mz:.4f}, RT={rt:.2f}, intensity={intensity:.0f}") # Get unique ID in originating map for handle in feature_list: unique_id = handle.getUniqueId() print(f"Unique ID: {unique_id}") ``` ### ConsensusMap Collection of consensus features across samples. ```python # Create consensus map consensus_map = ms.ConsensusMap() # Load from file ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map) # Access properties print(f"Consensus features: {consensus_map.size()}") # Column headers (file descriptions) headers = consensus_map.getColumnHeaders() print(f"Number of files: {len(headers)}") for map_idx, description in headers.items(): print(f"Map {map_idx}:") print(f" Filename: {description.filename}") print(f" Label: {description.label}") print(f" Size: {description.size}") # Iterate through consensus features for cons_feature in consensus_map: print(f"Consensus feature: m/z={cons_feature.getMZ():.4f}") # Export to DataFrame df = consensus_map.get_df() ``` ## Identification Objects ### PeptideIdentification Identification results for a single spectrum. ```python # Load identifications protein_ids = [] peptide_ids = [] ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids) # Access peptide identification peptide_id = peptide_ids[0] # Spectrum metadata print(f"RT: {peptide_id.getRT():.2f}") print(f"m/z: {peptide_id.getMZ():.4f}") # Identification metadata print(f"Identifier: {peptide_id.getIdentifier()}") print(f"Score type: {peptide_id.getScoreType()}") print(f"Higher score better: {peptide_id.isHigherScoreBetter()}") # Get peptide hits hits = peptide_id.getHits() print(f"Number of hits: {len(hits)}") for hit in hits: print(f" Sequence: {hit.getSequence().toString()}") print(f" Score: {hit.getScore()}") print(f" Charge: {hit.getCharge()}") ``` ### PeptideHit Individual peptide match to a spectrum. ```python # Access hit hit = peptide_id.getHits()[0] # Sequence information sequence = hit.getSequence() print(f"Sequence: {sequence.toString()}") print(f"Mass: {sequence.getMonoWeight():.4f}") # Score and rank print(f"Score: {hit.getScore()}") print(f"Rank: {hit.getRank()}") # Charge state print(f"Charge: {hit.getCharge()}") # Protein accessions accessions = hit.extractProteinAccessionsSet() for acc in accessions: print(f"Protein: {acc.decode()}") # Meta values (additional scores, errors) if hit.metaValueExists("MS:1002252"): # mass error mass_error = hit.getMetaValue("MS:1002252") print(f"Mass error: {mass_error:.4f} ppm") ``` ### ProteinIdentification Protein-level identification information. ```python # Access protein identification protein_id = protein_ids[0] # Search engine info print(f"Search engine: {protein_id.getSearchEngine()}") print(f"Search engine version: {protein_id.getSearchEngineVersion()}") # Search parameters search_params = protein_id.getSearchParameters() print(f"Database: {search_params.db}") print(f"Enzyme: {search_params.digestion_enzyme.getName()}") print(f"Missed cleavages: {search_params.missed_cleavages}") print(f"Precursor tolerance: {search_params.precursor_mass_tolerance}") # Protein hits hits = protein_id.getHits() for hit in hits: print(f"Accession: {hit.getAccession()}") print(f"Score: {hit.getScore()}") print(f"Coverage: {hit.getCoverage():.1f}%") ``` ### ProteinHit Individual protein identification. ```python # Access protein hit protein_hit = protein_id.getHits()[0] # Protein information print(f"Accession: {protein_hit.getAccession()}") print(f"Description: {protein_hit.getDescription()}") print(f"Sequence: {protein_hit.getSequence()}") # Scoring print(f"Score: {protein_hit.getScore()}") print(f"Coverage: {protein_hit.getCoverage():.1f}%") # Rank print(f"Rank: {protein_hit.getRank()}") ``` ## Sequence Objects ### AASequence Amino acid sequence with modifications. ```python # Create sequence from string seq = ms.AASequence.fromString("PEPTIDE") # Basic properties print(f"Sequence: {seq.toString()}") print(f"Length: {seq.size()}") print(f"Monoisotopic mass: {seq.getMonoWeight():.4f}") print(f"Average mass: {seq.getAverageWeight():.4f}") # Individual residues for i in range(seq.size()): residue = seq.getResidue(i) print(f"Position {i}: {residue.getOneLetterCode()}") print(f" Mass: {residue.getMonoWeight():.4f}") print(f" Formula: {residue.getFormula().toString()}") # Modified sequence mod_seq = ms.AASequence.fromString("PEPTIDEM(Oxidation)K") print(f"Modified: {mod_seq.isModified()}") # Check modifications for i in range(mod_seq.size()): residue = mod_seq.getResidue(i) if residue.isModified(): print(f"Modification at {i}: {residue.getModificationName()}") # N-terminal and C-terminal modifications term_mod_seq = ms.AASequence.fromString("(Acetyl)PEPTIDE(Amidated)") ``` ### EmpiricalFormula Molecular formula representation. ```python # Create formula formula = ms.EmpiricalFormula("C6H12O6") # Glucose # Properties print(f"Formula: {formula.toString()}") print(f"Monoisotopic mass: {formula.getMonoWeight():.4f}") print(f"Average mass: {formula.getAverageWeight():.4f}") # Element composition print(f"Carbon atoms: {formula.getNumberOf(b'C')}") print(f"Hydrogen atoms: {formula.getNumberOf(b'H')}") print(f"Oxygen atoms: {formula.getNumberOf(b'O')}") # Arithmetic operations formula2 = ms.EmpiricalFormula("H2O") combined = formula + formula2 # Add water print(f"Combined: {combined.toString()}") ``` ## Parameter Objects ### Param Generic parameter container used by algorithms. ```python # Get algorithm parameters algo = ms.GaussFilter() params = algo.getParameters() # List all parameters for key in params.keys(): value = params.getValue(key) print(f"{key}: {value}") # Get specific parameter gaussian_width = params.getValue("gaussian_width") print(f"Gaussian width: {gaussian_width}") # Set parameter params.setValue("gaussian_width", 0.2) # Apply modified parameters algo.setParameters(params) # Copy parameters params_copy = ms.Param(params) ``` ## Best Practices ### Memory Management ```python # For large files, use indexed access instead of full loading indexed_mzml = ms.IndexedMzMLFileLoader() indexed_mzml.load("large_file.mzML") # Access specific spectrum without loading entire file spec = indexed_mzml.getSpectrumById(100) ``` ### Type Conversion ```python # Convert peak arrays to numpy import numpy as np mz, intensity = spec.get_peaks() # These are already numpy arrays # Can perform numpy operations filtered_mz = mz[intensity > 1000] ``` ### Object Copying ```python # Create deep copy exp_copy = ms.MSExperiment(exp) # Modifications to copy don't affect original ```