Initial commit
This commit is contained in:
410
skills/pyopenms/references/feature_detection.md
Normal file
410
skills/pyopenms/references/feature_detection.md
Normal file
@@ -0,0 +1,410 @@
|
||||
# Feature Detection and Linking
|
||||
|
||||
## Overview
|
||||
|
||||
Feature detection identifies persistent signals (chromatographic peaks) in LC-MS data. Feature linking combines features across multiple samples for quantitative comparison.
|
||||
|
||||
## Feature Detection Basics
|
||||
|
||||
A feature represents a chromatographic peak characterized by:
|
||||
- m/z value (mass-to-charge ratio)
|
||||
- Retention time (RT)
|
||||
- Intensity
|
||||
- Quality score
|
||||
- Convex hull (spatial extent in RT-m/z space)
|
||||
|
||||
## Feature Finding
|
||||
|
||||
### FeatureFinderCentroided
|
||||
|
||||
Standard algorithm for feature detection in centroided data:
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
# Load centroided data
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load("centroided.mzML", exp)
|
||||
|
||||
# Create feature finder
|
||||
ff = ms.FeatureFinder()
|
||||
|
||||
# Get default parameters
|
||||
params = ff.getParameters("centroided")
|
||||
|
||||
# Modify key parameters
|
||||
params.setValue("mass_trace:mz_tolerance", 10.0) # ppm
|
||||
params.setValue("mass_trace:min_spectra", 7) # Min scans per feature
|
||||
params.setValue("isotopic_pattern:charge_low", 1)
|
||||
params.setValue("isotopic_pattern:charge_high", 4)
|
||||
|
||||
# Run feature detection
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
print(f"Detected {features.size()} features")
|
||||
|
||||
# Save features
|
||||
ms.FeatureXMLFile().store("features.featureXML", features)
|
||||
```
|
||||
|
||||
### Feature Finder for Metabolomics
|
||||
|
||||
Optimized for small molecules:
|
||||
|
||||
```python
|
||||
# Create feature finder for metabolomics
|
||||
ff = ms.FeatureFinder()
|
||||
|
||||
# Get metabolomics-specific parameters
|
||||
params = ff.getParameters("centroided")
|
||||
|
||||
# Configure for metabolomics
|
||||
params.setValue("mass_trace:mz_tolerance", 5.0) # Lower tolerance
|
||||
params.setValue("mass_trace:min_spectra", 5)
|
||||
params.setValue("isotopic_pattern:charge_low", 1) # Mostly singly charged
|
||||
params.setValue("isotopic_pattern:charge_high", 2)
|
||||
|
||||
# Run detection
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
```
|
||||
|
||||
## Accessing Feature Data
|
||||
|
||||
### Iterate Through Features
|
||||
|
||||
```python
|
||||
# Load features
|
||||
feature_map = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("features.featureXML", feature_map)
|
||||
|
||||
# Access individual features
|
||||
for feature in feature_map:
|
||||
print(f"m/z: {feature.getMZ():.4f}")
|
||||
print(f"RT: {feature.getRT():.2f}")
|
||||
print(f"Intensity: {feature.getIntensity():.0f}")
|
||||
print(f"Charge: {feature.getCharge()}")
|
||||
print(f"Quality: {feature.getOverallQuality():.3f}")
|
||||
print(f"Width (RT): {feature.getWidth():.2f}")
|
||||
|
||||
# Get convex hull
|
||||
hull = feature.getConvexHull()
|
||||
print(f"Hull points: {hull.getHullPoints().size()}")
|
||||
```
|
||||
|
||||
### Feature Subordinates (Isotope Pattern)
|
||||
|
||||
```python
|
||||
# Access isotopic pattern
|
||||
for feature in feature_map:
|
||||
# Get subordinate features (isotopes)
|
||||
subordinates = feature.getSubordinates()
|
||||
|
||||
if subordinates:
|
||||
print(f"Main feature m/z: {feature.getMZ():.4f}")
|
||||
for sub in subordinates:
|
||||
print(f" Isotope m/z: {sub.getMZ():.4f}")
|
||||
print(f" Isotope intensity: {sub.getIntensity():.0f}")
|
||||
```
|
||||
|
||||
### Export to Pandas
|
||||
|
||||
```python
|
||||
import pandas as pd
|
||||
|
||||
# Convert to DataFrame
|
||||
df = feature_map.get_df()
|
||||
|
||||
print(df.columns)
|
||||
# Typical columns: RT, mz, intensity, charge, quality
|
||||
|
||||
# Analyze features
|
||||
print(f"Mean intensity: {df['intensity'].mean()}")
|
||||
print(f"RT range: {df['RT'].min():.1f} - {df['RT'].max():.1f}")
|
||||
```
|
||||
|
||||
## Feature Linking
|
||||
|
||||
### Map Alignment
|
||||
|
||||
Align retention times before linking:
|
||||
|
||||
```python
|
||||
# Load multiple feature maps
|
||||
fm1 = ms.FeatureMap()
|
||||
fm2 = ms.FeatureMap()
|
||||
ms.FeatureXMLFile().load("sample1.featureXML", fm1)
|
||||
ms.FeatureXMLFile().load("sample2.featureXML", fm2)
|
||||
|
||||
# Create aligner and pick a reference map
aligner = ms.MapAlignmentAlgorithmPoseClustering()
aligner.setReference(fm1)  # align other maps to fm1's RT scale

# Compute the RT transformation for the second map and apply it
trafo = ms.TransformationDescription()
aligner.align(fm2, trafo)
ms.MapAlignmentTransformer().transformRetentionTimes(fm2, trafo, True)
|
||||
```
|
||||
|
||||
### Feature Linking Algorithm
|
||||
|
||||
Link features across samples:
|
||||
|
||||
```python
|
||||
# Create feature grouping algorithm
|
||||
grouper = ms.FeatureGroupingAlgorithmQT()
|
||||
|
||||
# Configure parameters
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 30.0) # Max RT difference (s)
|
||||
params.setValue("distance_MZ:max_difference", 10.0) # Max m/z difference (ppm)
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
# Prepare feature maps
|
||||
feature_maps = [fm1, fm2, fm3]
|
||||
|
||||
# Create consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
|
||||
# Link features
|
||||
grouper.group(feature_maps, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
|
||||
# Save consensus map
|
||||
ms.ConsensusXMLFile().store("consensus.consensusXML", consensus_map)
|
||||
```
|
||||
|
||||
## Consensus Features
|
||||
|
||||
### Access Consensus Data
|
||||
|
||||
```python
|
||||
# Load consensus map
|
||||
consensus_map = ms.ConsensusMap()
|
||||
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)
|
||||
|
||||
# Iterate through consensus features
|
||||
for cons_feature in consensus_map:
|
||||
print(f"Consensus m/z: {cons_feature.getMZ():.4f}")
|
||||
print(f"Consensus RT: {cons_feature.getRT():.2f}")
|
||||
|
||||
# Get features from individual maps
|
||||
for handle in cons_feature.getFeatureList():
|
||||
map_idx = handle.getMapIndex()
|
||||
intensity = handle.getIntensity()
|
||||
print(f" Sample {map_idx}: intensity {intensity:.0f}")
|
||||
```
|
||||
|
||||
### Consensus Map Metadata
|
||||
|
||||
```python
|
||||
# Access file descriptions (map metadata)
|
||||
file_descriptions = consensus_map.getColumnHeaders()
|
||||
|
||||
for map_idx, description in file_descriptions.items():
|
||||
print(f"Map {map_idx}:")
|
||||
print(f" Filename: {description.filename}")
|
||||
print(f" Label: {description.label}")
|
||||
print(f" Size: {description.size}")
|
||||
```
|
||||
|
||||
## Adduct Detection
|
||||
|
||||
Identify different ionization forms of the same molecule:
|
||||
|
||||
```python
|
||||
# Create adduct detector (the MetaboliteAdductDecharger tool is exposed
# in pyOpenMS as MetaboliteFeatureDeconvolution)
adduct_detector = ms.MetaboliteFeatureDeconvolution()

# Configure parameters
params = adduct_detector.getParameters()
# Adducts are specified as "element:charge:probability" strings
params.setValue("potential_adducts", [b"H:+:0.6", b"Na:+:0.2", b"K:+:0.2"])
params.setValue("charge_min", 1)
params.setValue("charge_max", 1)
params.setValue("max_neutrals", 1)
adduct_detector.setParameters(params)

# Detect adducts; compute() also fills consensus maps of feature
# groups and of feature pairs
feature_map_out = ms.FeatureMap()
groups = ms.ConsensusMap()
pairs = ms.ConsensusMap()
adduct_detector.compute(feature_map, feature_map_out, groups, pairs)
|
||||
```
|
||||
|
||||
## Complete Feature Detection Workflow
|
||||
|
||||
### End-to-End Example
|
||||
|
||||
```python
|
||||
import pyopenms as ms
|
||||
|
||||
def feature_detection_workflow(input_files, output_consensus):
|
||||
"""
|
||||
Complete workflow: feature detection and linking across samples.
|
||||
|
||||
Args:
|
||||
input_files: List of mzML file paths
|
||||
output_consensus: Output consensusXML file path
|
||||
"""
|
||||
|
||||
feature_maps = []
|
||||
|
||||
# Step 1: Detect features in each file
|
||||
for mzml_file in input_files:
|
||||
print(f"Processing {mzml_file}...")
|
||||
|
||||
# Load experiment
|
||||
exp = ms.MSExperiment()
|
||||
ms.MzMLFile().load(mzml_file, exp)
|
||||
|
||||
# Find features
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
params.setValue("mass_trace:mz_tolerance", 10.0)
|
||||
params.setValue("mass_trace:min_spectra", 7)
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
# Store filename in feature map
|
||||
features.setPrimaryMSRunPath([mzml_file.encode()])
|
||||
|
||||
feature_maps.append(features)
|
||||
print(f" Found {features.size()} features")
|
||||
|
||||
    # Step 2: Align retention times (first map serves as the reference)
    print("Aligning retention times...")
    aligner = ms.MapAlignmentAlgorithmPoseClustering()
    aligner.setReference(feature_maps[0])
    for fmap in feature_maps[1:]:
        trafo = ms.TransformationDescription()
        aligner.align(fmap, trafo)
        ms.MapAlignmentTransformer().transformRetentionTimes(fmap, trafo, True)
    aligned_maps = feature_maps  # maps are now on a common RT scale
|
||||
|
||||
# Step 3: Link features
|
||||
print("Linking features across samples...")
|
||||
grouper = ms.FeatureGroupingAlgorithmQT()
|
||||
params = grouper.getParameters()
|
||||
params.setValue("distance_RT:max_difference", 30.0)
|
||||
params.setValue("distance_MZ:max_difference", 10.0)
|
||||
params.setValue("distance_MZ:unit", "ppm")
|
||||
grouper.setParameters(params)
|
||||
|
||||
consensus_map = ms.ConsensusMap()
|
||||
grouper.group(aligned_maps, consensus_map)
|
||||
|
||||
# Save results
|
||||
ms.ConsensusXMLFile().store(output_consensus, consensus_map)
|
||||
|
||||
print(f"Created {consensus_map.size()} consensus features")
|
||||
print(f"Results saved to {output_consensus}")
|
||||
|
||||
return consensus_map
|
||||
|
||||
# Run workflow
|
||||
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
|
||||
consensus = feature_detection_workflow(input_files, "consensus.consensusXML")
|
||||
```
|
||||
|
||||
## Feature Filtering
|
||||
|
||||
### Filter by Quality
|
||||
|
||||
```python
|
||||
# Filter features by quality score
|
||||
filtered_features = ms.FeatureMap()
|
||||
|
||||
for feature in feature_map:
|
||||
if feature.getOverallQuality() > 0.5: # Quality threshold
|
||||
filtered_features.push_back(feature)
|
||||
|
||||
print(f"Kept {filtered_features.size()} high-quality features")
|
||||
```
|
||||
|
||||
### Filter by Intensity
|
||||
|
||||
```python
|
||||
# Keep only intense features
|
||||
min_intensity = 10000
|
||||
|
||||
filtered_features = ms.FeatureMap()
|
||||
for feature in feature_map:
|
||||
if feature.getIntensity() >= min_intensity:
|
||||
filtered_features.push_back(feature)
|
||||
```
|
||||
|
||||
### Filter by m/z Range
|
||||
|
||||
```python
|
||||
# Extract features in specific m/z range
|
||||
mz_min = 200.0
|
||||
mz_max = 800.0
|
||||
|
||||
filtered_features = ms.FeatureMap()
|
||||
for feature in feature_map:
|
||||
mz = feature.getMZ()
|
||||
if mz_min <= mz <= mz_max:
|
||||
filtered_features.push_back(feature)
|
||||
```
|
||||
|
||||
## Feature Annotation
|
||||
|
||||
### Add Identification Information
|
||||
|
||||
```python
|
||||
# Annotate features with peptide identifications
|
||||
# Load identifications
|
||||
protein_ids = []
|
||||
peptide_ids = []
|
||||
ms.IdXMLFile().load("identifications.idXML", protein_ids, peptide_ids)
|
||||
|
||||
# Create ID mapper
|
||||
mapper = ms.IDMapper()
|
||||
|
||||
# Map IDs to features
|
||||
mapper.annotate(feature_map, peptide_ids, protein_ids)
|
||||
|
||||
# Check annotations
|
||||
for feature in feature_map:
|
||||
peptide_ids_for_feature = feature.getPeptideIdentifications()
|
||||
if peptide_ids_for_feature:
|
||||
print(f"Feature at {feature.getMZ():.4f} m/z identified")
|
||||
```
|
||||
|
||||
## Best Practices
|
||||
|
||||
### Parameter Optimization
|
||||
|
||||
Optimize parameters for your data type:
|
||||
|
||||
```python
|
||||
# Test different tolerance values
|
||||
mz_tolerances = [5.0, 10.0, 20.0] # ppm
|
||||
|
||||
for tol in mz_tolerances:
|
||||
ff = ms.FeatureFinder()
|
||||
params = ff.getParameters("centroided")
|
||||
params.setValue("mass_trace:mz_tolerance", tol)
|
||||
|
||||
features = ms.FeatureMap()
|
||||
ff.run("centroided", exp, features, params, ms.FeatureMap())
|
||||
|
||||
print(f"Tolerance {tol} ppm: {features.size()} features")
|
||||
```
|
||||
|
||||
### Visual Inspection
|
||||
|
||||
Export features for visualization:
|
||||
|
||||
```python
|
||||
# Convert to DataFrame for plotting
|
||||
df = feature_map.get_df()
|
||||
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
plt.figure(figsize=(10, 6))
|
||||
plt.scatter(df['RT'], df['mz'], s=df['intensity']/1000, c=df['intensity'], alpha=0.5)
|
||||
plt.xlabel('Retention Time (s)')
|
||||
plt.ylabel('m/z')
|
||||
plt.title('Feature Map')
|
||||
plt.colorbar(label='Intensity (scaled)')
|
||||
plt.show()
|
||||
```
|
||||
Reference in New Issue
Block a user