10 KiB
10 KiB
Feature Detection and Linking
Overview
Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison.
Feature Detection Basics
A feature represents a chromatographic peak characterized by:
- m/z value (mass-to-charge ratio)
- Retention time (RT)
- Intensity
- Quality score
- Convex hull (spatial extent in RT-m/z space)
Feature Finding
Feature Finder (centroided algorithm)
Standard algorithm for feature detection in centroided data:
import pyopenms as ms

# Read the centroided input data from disk.
exp = ms.MSExperiment()
ms.MzMLFile().load("centroided.mzML", exp)

# Set up the feature finder and fetch the defaults of the
# "centroided" algorithm.
finder = ms.FeatureFinder()
parameters = finder.getParameters("centroided")

# Tune the key parameters for this data set.
parameters.setValue("mass_trace:mz_tolerance", 10.0)  # ppm
parameters.setValue("mass_trace:min_spectra", 7)  # min scans per feature
parameters.setValue("isotopic_pattern:charge_low", 1)
parameters.setValue("isotopic_pattern:charge_high", 4)

# Run the detection; the final argument is an (empty) seed feature map.
features = ms.FeatureMap()
finder.run("centroided", exp, features, parameters, ms.FeatureMap())
print(f"Detected {features.size()} features")

# Persist the detected features.
ms.FeatureXMLFile().store("features.featureXML", features)
Feature Finder for Metabolomics
Optimized for small molecules:
# Feature finder configured for small-molecule (metabolomics) data.
finder = ms.FeatureFinder()
parameters = finder.getParameters("centroided")

# Metabolomics-oriented settings: tighter m/z tolerance, shorter traces,
# and low charge states (small molecules are mostly singly charged).
parameters.setValue("mass_trace:mz_tolerance", 5.0)
parameters.setValue("mass_trace:min_spectra", 5)
parameters.setValue("isotopic_pattern:charge_low", 1)
parameters.setValue("isotopic_pattern:charge_high", 2)

# Run detection on the previously loaded experiment `exp`.
features = ms.FeatureMap()
finder.run("centroided", exp, features, parameters, ms.FeatureMap())
Accessing Feature Data
Iterate Through Features
# Load previously detected features from disk.
feature_map = ms.FeatureMap()
ms.FeatureXMLFile().load("features.featureXML", feature_map)

# Inspect each feature: position, intensity, charge, quality and extent.
for feature in feature_map:
    print(f"m/z: {feature.getMZ():.4f}")
    print(f"RT: {feature.getRT():.2f}")
    print(f"Intensity: {feature.getIntensity():.0f}")
    print(f"Charge: {feature.getCharge()}")
    print(f"Quality: {feature.getOverallQuality():.3f}")
    print(f"Width (RT): {feature.getWidth():.2f}")

    # The convex hull describes the feature's footprint in RT/m/z space.
    # NOTE(review): in some pyopenms versions getHullPoints() returns a
    # NumPy array, where `.size` is an attribute (use `.size` or `len()`)
    # rather than a callable — confirm against the installed version.
    hull = feature.getConvexHull()
    print(f"Hull points: {hull.getHullPoints().size()}")
Feature Subordinates (Isotope Pattern)
# Walk the isotopic pattern attached to each feature.
for feature in feature_map:
    # Subordinate features hold the individual isotope traces.
    subordinates = feature.getSubordinates()
    if subordinates:
        print(f"Main feature m/z: {feature.getMZ():.4f}")
        for sub in subordinates:
            print(f" Isotope m/z: {sub.getMZ():.4f}")
            print(f" Isotope intensity: {sub.getIntensity():.0f}")
Export to Pandas
import pandas as pd

# Flatten the feature map into a pandas DataFrame.
df = feature_map.get_df()
print(df.columns)
# Typical columns: RT, mz, intensity, charge, quality

# Quick summary statistics over all features.
mean_intensity = df["intensity"].mean()
rt_low = df["RT"].min()
rt_high = df["RT"].max()
print(f"Mean intensity: {mean_intensity}")
print(f"RT range: {rt_low:.1f} - {rt_high:.1f}")
Feature Linking
Map Alignment
Align retention times before linking:
# Read the per-sample feature maps from disk.
fm1 = ms.FeatureMap()
fm2 = ms.FeatureMap()
ms.FeatureXMLFile().load("sample1.featureXML", fm1)
ms.FeatureXMLFile().load("sample2.featureXML", fm2)

# Pose clustering estimates one RT transformation per input map.
aligner = ms.MapAlignmentAlgorithmPoseClustering()

# NOTE(review): confirm this align() overload against the installed
# pyopenms version; newer releases align one map at a time.
fm_aligned = []
transformations = []
aligner.align([fm1, fm2], fm_aligned, transformations)
Feature Linking Algorithm
Link features across samples:
# QT clustering groups corresponding features across samples.
grouper = ms.FeatureGroupingAlgorithmQT()

# Matching tolerances between maps.
link_params = grouper.getParameters()
link_params.setValue("distance_RT:max_difference", 30.0)  # max RT difference (s)
link_params.setValue("distance_MZ:max_difference", 10.0)  # max m/z difference (ppm)
link_params.setValue("distance_MZ:unit", "ppm")
grouper.setParameters(link_params)

# Maps to link (fm3 is assumed to have been loaded like fm1 and fm2).
feature_maps = [fm1, fm2, fm3]

# The grouping result is written into a ConsensusMap.
consensus_map = ms.ConsensusMap()
grouper.group(feature_maps, consensus_map)
print(f"Created {consensus_map.size()} consensus features")

# Persist the linked result.
ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map)
Consensus Features
Access Consensus Data
# Load a consensus map from disk.
consensus_map = ms.ConsensusMap()
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)

# Each consensus feature aggregates matched features from several maps.
for cons_feature in consensus_map:
    print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
    print(f"Consensus RT: {cons_feature.getRT():.2f}")

    # Feature handles point back to the contributing input maps.
    for handle in cons_feature.getFeatureList():
        map_idx = handle.getMapIndex()
        intensity = handle.getIntensity()
        print(f" Sample {map_idx}: intensity {intensity:.0f}")
Consensus Map Metadata
# Column headers describe the input map behind each map index.
file_descriptions = consensus_map.getColumnHeaders()
for map_idx, description in file_descriptions.items():
    print(f"Map {map_idx}:")
    print(f" Filename: {description.filename}")
    print(f" Label: {description.label}")
    print(f" Size: {description.size}")
Adduct Detection
Identify different ionization forms of the same molecule:
# The decharger groups co-eluting features that are different
# adducts of the same neutral molecule.
adduct_detector = ms.MetaboliteAdductDecharger()

# Which adducts to consider and the charge range to search.
cfg = adduct_detector.getParameters()
cfg.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M-H]-")
cfg.setValue("charge_min", 1)
cfg.setValue("charge_max", 1)
cfg.setValue("max_neutrals", 1)
adduct_detector.setParameters(cfg)

# NOTE(review): confirm the compute() signature against the installed
# pyopenms version.
feature_map_out = ms.FeatureMap()
adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap())
Complete Feature Detection Workflow
End-to-End Example
import pyopenms as ms


def feature_detection_workflow(input_files, output_consensus):
    """
    Complete workflow: feature detection and linking across samples.

    Args:
        input_files: List of mzML file paths.
        output_consensus: Output consensusXML file path.

    Returns:
        The linked ConsensusMap (also written to ``output_consensus``).
    """
    feature_maps = []

    # Step 1: Detect features in each file.
    for mzml_file in input_files:
        print(f"Processing {mzml_file}...")

        # Load experiment.
        exp = ms.MSExperiment()
        ms.MzMLFile().load(mzml_file, exp)

        # Find features with the "centroided" algorithm.
        ff = ms.FeatureFinder()
        params = ff.getParameters("centroided")
        params.setValue("mass_trace:mz_tolerance", 10.0)
        params.setValue("mass_trace:min_spectra", 7)
        features = ms.FeatureMap()
        ff.run("centroided", exp, features, params, ms.FeatureMap())

        # Record the source file so downstream tools can trace provenance.
        features.setPrimaryMSRunPath([mzml_file.encode()])
        feature_maps.append(features)
        print(f" Found {features.size()} features")

    # Step 2: Align retention times across runs.
    print("Aligning retention times...")
    aligner = ms.MapAlignmentAlgorithmPoseClustering()
    aligned_maps = []
    transformations = []
    # NOTE(review): confirm this align() overload against the installed
    # pyopenms version; newer releases align one map at a time.
    aligner.align(feature_maps, aligned_maps, transformations)

    # Step 3: Link corresponding features across samples.
    print("Linking features across samples...")
    grouper = ms.FeatureGroupingAlgorithmQT()
    params = grouper.getParameters()
    params.setValue("distance_RT:max_difference", 30.0)
    params.setValue("distance_MZ:max_difference", 10.0)
    params.setValue("distance_MZ:unit", "ppm")
    grouper.setParameters(params)
    consensus_map = ms.ConsensusMap()
    grouper.group(aligned_maps, consensus_map)

    # Save results.
    ms.ConsensusXMLFile().store(output_consensus, consensus_map)
    print(f"Created {consensus_map.size()} consensus features")
    print(f"Results saved to {output_consensus}")
    return consensus_map


# Run workflow.
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
consensus = feature_detection_workflow(input_files, "consensus.consensusXML")
Feature Filtering
Filter by Quality
# Keep only features whose overall quality exceeds the threshold.
filtered_features = ms.FeatureMap()
for feature in feature_map:
    if feature.getOverallQuality() > 0.5:  # quality threshold
        filtered_features.push_back(feature)
print(f"Kept {filtered_features.size()} high-quality features")
Filter by Intensity
# Keep only features at or above the intensity cutoff.
min_intensity = 10000
filtered_features = ms.FeatureMap()
for feature in feature_map:
    if feature.getIntensity() >= min_intensity:
        filtered_features.push_back(feature)
Filter by m/z Range
# Extract features whose m/z lies inside a specific window (inclusive).
mz_min = 200.0
mz_max = 800.0
filtered_features = ms.FeatureMap()
for feature in feature_map:
    mz = feature.getMZ()
    if mz_min <= mz <= mz_max:
        filtered_features.push_back(feature)
Feature Annotation
Add Identification Information
# Annotate features with peptide identifications.

# Load identifications from idXML into plain lists.
protein_ids = []
peptide_ids = []
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)

# Map identifications onto features.
# NOTE(review): some pyopenms versions require extra boolean arguments
# (use_centroid_rt / use_centroid_mz) on annotate() — confirm.
mapper = ms.IDMapper()
mapper.annotate(feature_map, peptide_ids, protein_ids)

# Report which features received an identification.
for feature in feature_map:
    peptide_ids_for_feature = feature.getPeptideIdentifications()
    if peptide_ids_for_feature:
        print(f"Feature at {feature.getMZ():.4f} m/z identified")
Best Practices
Parameter Optimization
Optimize parameters for your data type:
# Sweep m/z tolerance values to see how sensitive detection is to them.
mz_tolerances = [5.0, 10.0, 20.0]  # ppm
for tol in mz_tolerances:
    ff = ms.FeatureFinder()
    params = ff.getParameters("centroided")
    params.setValue("mass_trace:mz_tolerance", tol)
    features = ms.FeatureMap()
    ff.run("centroided", exp, features, params, ms.FeatureMap())
    print(f"Tolerance {tol} ppm: {features.size()} features")
Visual Inspection
Export features for visualization:
# Visualize the feature map: RT vs m/z, point size and color by intensity.
df = feature_map.get_df()
import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
# Color the points by intensity so colorbar() has a mappable to draw —
# a scatter without `c=` would make the colorbar() call below fail.
sc = plt.scatter(
    df['RT'], df['mz'], s=df['intensity'] / 1000, c=df['intensity'], alpha=0.5
)
plt.xlabel('Retention Time (s)')
plt.ylabel('m/z')
plt.title('Feature Map')
plt.colorbar(sc, label='Intensity (scaled)')
plt.show()