# Feature Detection and Linking

## Overview

Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison.

## Feature Detection Basics

A feature represents a chromatographic peak characterized by:

- m/z value (mass-to-charge ratio)
- Retention time (RT)
- Intensity
- Quality score
- Convex hull (spatial extent in RT-m/z space)

## Feature Finding

### Feature Finder Multiples (FFM)

Standard algorithm for feature detection in centroided data:

```python
import pyopenms as ms

# Load centroided data
exp = ms.MSExperiment()
ms.MzMLFile().load("centroided.mzML", exp)

# Create feature finder
ff = ms.FeatureFinder()

# Get default parameters
params = ff.getParameters("centroided")

# Modify key parameters
params.setValue("mass_trace:mz_tolerance", 10.0)  # ppm
params.setValue("mass_trace:min_spectra", 7)  # Min scans per feature
params.setValue("isotopic_pattern:charge_low", 1)
params.setValue("isotopic_pattern:charge_high", 4)

# Run feature detection
features = ms.FeatureMap()
ff.run("centroided", exp, features, params, ms.FeatureMap())

print(f"Detected {features.size()} features")

# Save features
ms.FeatureXMLFile().store("features.featureXML", features)
```

### Feature Finder for Metabolomics

Optimized for small molecules:

```python
# Create feature finder for metabolomics
ff = ms.FeatureFinder()

# Get metabolomics-specific parameters
params = ff.getParameters("centroided")

# Configure for metabolomics
params.setValue("mass_trace:mz_tolerance", 5.0)  # Lower tolerance
params.setValue("mass_trace:min_spectra", 5)
params.setValue("isotopic_pattern:charge_low", 1)  # Mostly singly charged
params.setValue("isotopic_pattern:charge_high", 2)

# Run detection
features = ms.FeatureMap()
ff.run("centroided", exp, features, params, ms.FeatureMap())
```

## Accessing Feature Data

### Iterate Through Features

```python
# Load features
feature_map = ms.FeatureMap()
ms.FeatureXMLFile().load("features.featureXML", feature_map)

# Access individual features
for feature in feature_map:
    print(f"m/z: {feature.getMZ():.4f}")
    print(f"RT: {feature.getRT():.2f}")
    print(f"Intensity: {feature.getIntensity():.0f}")
    print(f"Charge: {feature.getCharge()}")
    print(f"Quality: {feature.getOverallQuality():.3f}")
    print(f"Width (RT): {feature.getWidth():.2f}")

    # Get convex hull
    hull = feature.getConvexHull()
    print(f"Hull points: {hull.getHullPoints().size()}")
```

### Feature Subordinates (Isotope Pattern)

```python
# Access isotopic pattern
for feature in feature_map:
    # Get subordinate features (isotopes)
    subordinates = feature.getSubordinates()

    if subordinates:
        print(f"Main feature m/z: {feature.getMZ():.4f}")
        for sub in subordinates:
            print(f"  Isotope m/z: {sub.getMZ():.4f}")
            print(f"  Isotope intensity: {sub.getIntensity():.0f}")
```

### Export to Pandas

```python
import pandas as pd

# Convert to DataFrame
df = feature_map.get_df()

print(df.columns)
# Typical columns: RT, mz, intensity, charge, quality

# Analyze features
print(f"Mean intensity: {df['intensity'].mean()}")
print(f"RT range: {df['RT'].min():.1f} - {df['RT'].max():.1f}")
```

## Feature Linking

### Map Alignment

Align retention times before linking:

```python
# Load multiple feature maps
fm1 = ms.FeatureMap()
fm2 = ms.FeatureMap()
ms.FeatureXMLFile().load("sample1.featureXML", fm1)
ms.FeatureXMLFile().load("sample2.featureXML", fm2)

# Create aligner
aligner = ms.MapAlignmentAlgorithmPoseClustering()

# Align maps
fm_aligned = []
transformations = []
aligner.align([fm1, fm2], fm_aligned, transformations)
```

### Feature Linking Algorithm

Link features across samples:

```python
# Create feature grouping algorithm
grouper = ms.FeatureGroupingAlgorithmQT()

# Configure parameters
params = grouper.getParameters()
params.setValue("distance_RT:max_difference", 30.0)  # Max RT difference (s)
params.setValue("distance_MZ:max_difference", 10.0)  # Max m/z difference (ppm)
params.setValue("distance_MZ:unit", "ppm")
grouper.setParameters(params)

# Prepare feature maps
feature_maps = [fm1, fm2, fm3]

# Create consensus map
consensus_map = ms.ConsensusMap()

# Link features
grouper.group(feature_maps, consensus_map)

print(f"Created {consensus_map.size()} consensus features")

# Save consensus map
ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map)
```

## Consensus Features

### Access Consensus Data

```python
# Load consensus map
consensus_map = ms.ConsensusMap()
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)

# Iterate through consensus features
for cons_feature in consensus_map:
    print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
    print(f"Consensus RT: {cons_feature.getRT():.2f}")

    # Get features from individual maps
    for handle in cons_feature.getFeatureList():
        map_idx = handle.getMapIndex()
        intensity = handle.getIntensity()
        print(f"  Sample {map_idx}: intensity {intensity:.0f}")
```

### Consensus Map Metadata

```python
# Access file descriptions (map metadata)
file_descriptions = consensus_map.getColumnHeaders()

for map_idx, description in file_descriptions.items():
    print(f"Map {map_idx}:")
    print(f"  Filename: {description.filename}")
    print(f"  Label: {description.label}")
    print(f"  Size: {description.size}")
```

## Adduct Detection

Identify different ionization forms of the same molecule:

```python
# Create adduct detector
adduct_detector = ms.MetaboliteAdductDecharger()

# Configure parameters
params = adduct_detector.getParameters()
params.setValue("potential_adducts", "[M+H]+,[M+Na]+,[M+K]+,[M-H]-")
params.setValue("charge_min", 1)
params.setValue("charge_max", 1)
params.setValue("max_neutrals", 1)
adduct_detector.setParameters(params)

# Detect adducts
feature_map_out = ms.FeatureMap()
adduct_detector.compute(feature_map, feature_map_out, ms.ConsensusMap())
```

## Complete Feature Detection Workflow

### End-to-End Example

```python
import pyopenms as ms

def feature_detection_workflow(input_files, output_consensus):
    """
    Complete workflow: feature detection and linking across samples.

    Args:
        input_files: List of mzML file paths
        output_consensus: Output consensusXML file path
    """
    feature_maps = []

    # Step 1: Detect features in each file
    for mzml_file in input_files:
        print(f"Processing {mzml_file}...")

        # Load experiment
        exp = ms.MSExperiment()
        ms.MzMLFile().load(mzml_file, exp)

        # Find features
        ff = ms.FeatureFinder()
        params = ff.getParameters("centroided")
        params.setValue("mass_trace:mz_tolerance", 10.0)
        params.setValue("mass_trace:min_spectra", 7)

        features = ms.FeatureMap()
        ff.run("centroided", exp, features, params, ms.FeatureMap())

        # Store filename in feature map
        features.setPrimaryMSRunPath([mzml_file.encode()])

        feature_maps.append(features)
        print(f"  Found {features.size()} features")

    # Step 2: Align retention times
    print("Aligning retention times...")
    aligner = ms.MapAlignmentAlgorithmPoseClustering()
    aligned_maps = []
    transformations = []
    aligner.align(feature_maps, aligned_maps, transformations)

    # Step 3: Link features
    print("Linking features across samples...")
    grouper = ms.FeatureGroupingAlgorithmQT()
    params = grouper.getParameters()
    params.setValue("distance_RT:max_difference", 30.0)
    params.setValue("distance_MZ:max_difference", 10.0)
    params.setValue("distance_MZ:unit", "ppm")
    grouper.setParameters(params)

    consensus_map = ms.ConsensusMap()
    grouper.group(aligned_maps, consensus_map)

    # Save results
    ms.ConsensusXMLFile().store(output_consensus, consensus_map)

    print(f"Created {consensus_map.size()} consensus features")
    print(f"Results saved to {output_consensus}")

    return consensus_map

# Run workflow
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
consensus = feature_detection_workflow(input_files, "consensus.consensusXML")
```

## Feature Filtering

### Filter by Quality

```python
# Filter features by quality score
filtered_features = ms.FeatureMap()

for feature in feature_map:
    if feature.getOverallQuality() > 0.5:  # Quality threshold
        filtered_features.push_back(feature)

print(f"Kept {filtered_features.size()} high-quality features")
```

### Filter by Intensity

```python
# Keep only intense features
min_intensity = 10000

filtered_features = ms.FeatureMap()
for feature in feature_map:
    if feature.getIntensity() >= min_intensity:
        filtered_features.push_back(feature)
```

### Filter by m/z Range

```python
# Extract features in specific m/z range
mz_min = 200.0
mz_max = 800.0

filtered_features = ms.FeatureMap()
for feature in feature_map:
    mz = feature.getMZ()
    if mz_min <= mz <= mz_max:
        filtered_features.push_back(feature)
```

## Feature Annotation

### Add Identification Information

```python
# Annotate features with peptide identifications
# Load identifications
protein_ids = []
peptide_ids = []
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)

# Create ID mapper
mapper = ms.IDMapper()

# Map IDs to features
mapper.annotate(feature_map, peptide_ids, protein_ids)

# Check annotations
for feature in feature_map:
    peptide_ids_for_feature = feature.getPeptideIdentifications()
    if peptide_ids_for_feature:
        print(f"Feature at {feature.getMZ():.4f} m/z identified")
```

## Best Practices

### Parameter Optimization

Optimize parameters for your data type:

```python
# Test different tolerance values
mz_tolerances = [5.0, 10.0, 20.0]  # ppm

for tol in mz_tolerances:
    ff = ms.FeatureFinder()
    params = ff.getParameters("centroided")
    params.setValue("mass_trace:mz_tolerance", tol)

    features = ms.FeatureMap()
    ff.run("centroided", exp, features, params, ms.FeatureMap())

    print(f"Tolerance {tol} ppm: {features.size()} features")
```

### Visual Inspection

Export features for visualization:

```python
# Convert to DataFrame for plotting
df = feature_map.get_df()

import matplotlib.pyplot as plt

plt.figure(figsize=(10, 6))
plt.scatter(df['RT'], df['mz'], s=df['intensity']/1000, alpha=0.5)
plt.xlabel('Retention Time (s)')
plt.ylabel('m/z')
plt.title('Feature Map')
plt.colorbar(label='Intensity (scaled)')
plt.show()
```