# Metabolomics Workflows

## Overview

PyOpenMS provides specialized tools for untargeted metabolomics analysis, including feature detection optimized for small molecules, adduct grouping, compound identification, and integration with metabolomics databases.

## Untargeted Metabolomics Pipeline

### Complete Workflow

```python
import pyopenms as ms


def metabolomics_pipeline(input_files, output_dir):
    """
    Complete untargeted metabolomics workflow.

    Args:
        input_files: List of mzML file paths (one per sample)
        output_dir: Directory for output files
    """

    # Step 1: Peak picking and feature detection
    feature_maps = []

    for mzml_file in input_files:
        print(f"Processing {mzml_file}...")

        # Load data
        exp = ms.MSExperiment()
        ms.MzMLFile().load(mzml_file, exp)

        # Peak picking if the data is still in profile mode
        if exp.getSpectrum(0).getType() == ms.SpectrumSettings.SpectrumType.PROFILE:
            picker = ms.PeakPickerHiRes()
            exp_picked = ms.MSExperiment()
            picker.pickExperiment(exp, exp_picked)
            exp = exp_picked

        # Feature detection
        ff = ms.FeatureFinder()
        params = ff.getParameters("centroided")

        # Metabolomics-specific parameters
        # (mass_trace:mz_tolerance is an absolute m/z tolerance,
        # roughly 5 ppm at m/z 1000; it is not given in ppm)
        params.setValue("mass_trace:mz_tolerance", 0.005)
        params.setValue("mass_trace:min_spectra", 5)
        params.setValue("isotopic_pattern:charge_low", 1)
        params.setValue("isotopic_pattern:charge_high", 2)  # Metabolites are mostly singly charged

        features = ms.FeatureMap()
        ff.run("centroided", exp, features, params, ms.FeatureMap())

        features.setPrimaryMSRunPath([mzml_file.encode()])
        feature_maps.append(features)

        print(f"  Detected {features.size()} features")

    # Step 2: Adduct detection and grouping
    # (MetaboliteFeatureDeconvolution is the class behind the
    # MetaboliteAdductDecharger TOPP tool)
    print("Detecting adducts...")
    adduct_grouped_maps = []

    adduct_detector = ms.MetaboliteFeatureDeconvolution()
    params = adduct_detector.getParameters()
    # Adducts are given as "element:charge:probability" strings
    params.setValue("potential_adducts",
                    [b"H:+:0.6", b"Na:+:0.2", b"K:+:0.1", b"NH4:+:0.1"])
    params.setValue("charge_min", 1)
    params.setValue("charge_max", 1)
    adduct_detector.setParameters(params)

    for fm in feature_maps:
        fm_out = ms.FeatureMap()
        groups = ms.ConsensusMap()
        edges = ms.ConsensusMap()
        adduct_detector.compute(fm, fm_out, groups, edges)
        adduct_grouped_maps.append(fm_out)

    # Step 3: RT alignment
    print("Aligning retention times...")
    aligner = ms.MapAlignmentAlgorithmPoseClustering()

    params = aligner.getParameters()
    params.setValue("max_num_peaks_considered", 1000)
    params.setValue("pairfinder:distance_MZ:max_difference", 10.0)
    params.setValue("pairfinder:distance_MZ:unit", "ppm")
    aligner.setParameters(params)

    # Align all maps against the map with the most features
    ref_index = max(range(len(adduct_grouped_maps)),
                    key=lambda i: adduct_grouped_maps[i].size())
    aligner.setReference(adduct_grouped_maps[ref_index])

    for i, fm in enumerate(adduct_grouped_maps):
        if i == ref_index:
            continue
        trafo = ms.TransformationDescription()
        aligner.align(fm, trafo)
        transformer = ms.MapAlignmentTransformer()
        transformer.transformRetentionTimes(fm, trafo, True)  # True keeps original RTs as meta data

    aligned_maps = adduct_grouped_maps

    # Step 4: Feature linking
    print("Linking features...")
    grouper = ms.FeatureGroupingAlgorithmQT()

    params = grouper.getParameters()
    params.setValue("distance_RT:max_difference", 60.0)  # seconds
    params.setValue("distance_MZ:max_difference", 5.0)   # ppm
    params.setValue("distance_MZ:unit", "ppm")
    grouper.setParameters(params)

    consensus_map = ms.ConsensusMap()
    grouper.group(aligned_maps, consensus_map)

    # Record which input file belongs to each consensus map column
    headers = consensus_map.getColumnHeaders()
    for i, fm in enumerate(aligned_maps):
        header = headers.get(i, ms.ColumnHeader())
        header.filename = input_files[i]
        header.size = fm.size()
        headers[i] = header
    consensus_map.setColumnHeaders(headers)

    print(f"Created {consensus_map.size()} consensus features")

    # Step 5: Gap filling (fill missing values)
    print("Filling gaps...")
    # Gap filling is typically done with the FeatureFinderMetaboIdent
    # TOPP tool; see the sketch after this code block

    # Step 6: Export results
    consensus_file = f"{output_dir}/consensus.consensusXML"
    ms.ConsensusXMLFile().store(consensus_file, consensus_map)

    # Export to CSV for downstream analysis
    df = consensus_map.get_df()
    csv_file = f"{output_dir}/metabolite_table.csv"
    df.to_csv(csv_file, index=False)

    print(f"Results saved to {output_dir}")

    return consensus_map


# Run pipeline
input_files = ["sample1.mzML", "sample2.mzML", "sample3.mzML"]
consensus = metabolomics_pipeline(input_files, "output")
```
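
Step 5 above is intentionally a stub: gap filling (targeted re-quantification of features missing in individual samples) is typically done with the FeatureFinderMetaboIdent TOPP tool. A minimal sketch, assuming the OpenMS command-line tools are installed and that `compounds.tsv` is a hypothetical target library built from features identified in the other samples:

```python
import subprocess

# Targeted re-extraction of expected compounds from one raw file.
# compounds.tsv lists the targets (columns such as CompoundName,
# SumFormula, Mass, Charge, RetentionTime); its content is illustrative.
subprocess.run([
    "FeatureFinderMetaboIdent",
    "-in", "sample1.mzML",
    "-id", "compounds.tsv",
    "-out", "sample1_requant.featureXML",
], check=True)
```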

## Adduct Detection

### Configure Adduct Types

```python
# Create adduct detector (MetaboliteFeatureDeconvolution is the class
# behind the MetaboliteAdductDecharger TOPP tool)
adduct_detector = ms.MetaboliteFeatureDeconvolution()

# Configure common adducts
params = adduct_detector.getParameters()

# Adducts are specified as "element:charge:probability" strings.
# Multimer adducts such as [2M+H]+ or [2M-H]- in bracket notation
# have no direct equivalent in this syntax.

# Positive mode adducts
positive_adducts = [
    b"H:+:0.4",      # [M+H]+
    b"Na:+:0.2",     # [M+Na]+
    b"K:+:0.1",      # [M+K]+
    b"NH4:+:0.2",    # [M+NH4]+
    b"H-2O-1:0:0.1"  # neutral water loss, i.e. [M+H-H2O]+
]

# Negative mode adducts (also set the "negative_mode" parameter)
negative_adducts = [
    b"H-1:-:0.8",    # [M-H]-
    b"Cl:-:0.1",     # [M+Cl]-
    b"CH2O2:0:0.1"   # neutral formic acid, i.e. [M+FA-H]-
]

# Set for positive mode
params.setValue("potential_adducts", positive_adducts)
params.setValue("charge_min", 1)
params.setValue("charge_max", 1)
params.setValue("max_neutrals", 1)
adduct_detector.setParameters(params)

# Apply adduct detection; compute() also fills two consensus maps with
# the grouped features and the pairwise edges
feature_map_out = ms.FeatureMap()
groups = ms.ConsensusMap()
edges = ms.ConsensusMap()
adduct_detector.compute(feature_map, feature_map_out, groups, edges)
```

### Access Adduct Information

```python
# Check adduct annotations
for feature in feature_map_out:
    # The decharger stores the inferred adduct composition in the
    # "dc_charge_adducts" meta value
    if feature.metaValueExists("dc_charge_adducts"):
        adduct = feature.getMetaValue("dc_charge_adducts")
        print(f"m/z: {feature.getMZ():.4f}")
        print(f"  Adduct: {adduct}")
        print(f"  Charge: {feature.getCharge()}")
```

## Compound Identification

### Mass-Based Annotation

```python
# Annotate features by matching neutral masses against a compound database

# Example in-memory database; in practice, query an external database
# such as HMDB or METLIN
compound_db = [
    {"name": "Glucose", "formula": "C6H12O6", "mass": 180.0634},
    {"name": "Citric acid", "formula": "C6H8O7", "mass": 192.0270},
    # ... more compounds
]

# Annotate features
mass_tolerance = 5.0  # ppm

for feature in feature_map:
    observed_mz = feature.getMZ()

    # Calculate neutral mass (assuming [M+H]+)
    neutral_mass = observed_mz - 1.007276  # Proton mass

    # Search database
    for compound in compound_db:
        mass_error_ppm = abs(neutral_mass - compound["mass"]) / compound["mass"] * 1e6

        if mass_error_ppm <= mass_tolerance:
            print(f"Potential match: {compound['name']}")
            print(f"  Observed m/z: {observed_mz:.4f}")
            print(f"  Expected mass: {compound['mass']:.4f}")
            print(f"  Error: {mass_error_ppm:.2f} ppm")
```
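
The loop above assumes every feature is an [M+H]+ ion. A small extension is to test several adduct hypotheses per feature before the database lookup; the table below uses standard adduct mass shifts, and the helper function is illustrative rather than part of pyOpenMS:

```python
# Adduct mass shifts and M-multipliers for common positive-mode ions
ADDUCT_SHIFTS = {
    "[M+H]+":   (1.007276, 1),
    "[M+Na]+":  (22.989221, 1),
    "[M+K]+":   (38.963158, 1),
    "[M+NH4]+": (18.033823, 1),
    "[2M+H]+":  (1.007276, 2),
}

def neutral_mass_candidates(observed_mz):
    """Return (adduct, neutral_mass) candidates for a singly charged ion."""
    return [(adduct, (observed_mz - shift) / multiplier)
            for adduct, (shift, multiplier) in ADDUCT_SHIFTS.items()]

# Example: glucose [M+H]+ at m/z 181.0707
for adduct, mass in neutral_mass_candidates(181.0707):
    print(f"{adduct}: neutral mass {mass:.4f}")
```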

### MS/MS-Based Identification

```python
# Load MS2 data
exp = ms.MSExperiment()
ms.MzMLFile().load("data_with_ms2.mzML", exp)

# Extract MS2 spectra
ms2_spectra = []
for spec in exp:
    if spec.getMSLevel() == 2:
        ms2_spectra.append(spec)

print(f"Found {len(ms2_spectra)} MS2 spectra")

# Match to spectral library (requires an external tool or a custom
# implementation; see the sketch below)
```
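
A full spectral library search requires an external tool, but a simple peak-binning cosine score against reference spectra is easy to prototype. A minimal sketch, assuming library spectra are available as (m/z array, intensity array) tuples; the bin width and the library entry are illustrative:

```python
import numpy as np

def cosine_similarity(spec_a, spec_b, bin_width=0.01):
    """Bin two (mz, intensity) spectra and compute their cosine similarity."""
    mz_a, int_a = spec_a
    mz_b, int_b = spec_b
    bins = np.arange(0.0, max(mz_a.max(), mz_b.max()) + bin_width, bin_width)
    vec_a, _ = np.histogram(mz_a, bins=bins, weights=int_a)
    vec_b, _ = np.histogram(mz_b, bins=bins, weights=int_b)
    norm = np.linalg.norm(vec_a) * np.linalg.norm(vec_b)
    return float(np.dot(vec_a, vec_b) / norm) if norm > 0 else 0.0

# Score one measured MS2 spectrum against a hypothetical library entry
query = ms2_spectra[0].get_peaks()  # (mz array, intensity array)
library_entry = (np.array([85.03, 127.04, 145.05]),
                 np.array([30.0, 60.0, 100.0]))
print(f"Cosine similarity: {cosine_similarity(query, library_entry):.3f}")
```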

## Data Normalization

### Total Ion Current (TIC) Normalization

```python
import numpy as np

# Load consensus map
consensus_map = ms.ConsensusMap()
ms.ConsensusXMLFile().load("consensus.consensusXML", consensus_map)

# Calculate TIC per sample (assumes map indices run 0..n-1)
n_samples = len(consensus_map.getColumnHeaders())
tic_per_sample = np.zeros(n_samples)

for cons_feature in consensus_map:
    for handle in cons_feature.getFeatureList():
        map_idx = handle.getMapIndex()
        tic_per_sample[map_idx] += handle.getIntensity()

print("TIC per sample:", tic_per_sample)

# Normalize to median TIC
median_tic = np.median(tic_per_sample)
normalization_factors = median_tic / tic_per_sample

print("Normalization factors:", normalization_factors)

# Apply normalization
# Note: FeatureHandle objects returned by getFeatureList() are copies,
# so setting intensities on them would not modify the ConsensusMap.
# Apply the factors to the exported intensity table instead.
df = consensus_map.get_df()
headers = consensus_map.getColumnHeaders()
for map_idx, header in headers.items():
    col = header.filename  # get_df() names intensity columns after the input files
    if col in df.columns:
        df[col] = df[col] * normalization_factors[map_idx]
```

## Quality Control

### Coefficient of Variation (CV) Filtering

```python
import pandas as pd
import numpy as np

# Export to pandas
df = consensus_map.get_df()

# Assume QC samples are columns with 'QC' in the name
qc_cols = [col for col in df.columns if 'QC' in col]

if qc_cols:
    # Calculate CV for each feature in QC samples
    qc_data = df[qc_cols]
    cv = (qc_data.std(axis=1) / qc_data.mean(axis=1)) * 100

    # Keep features with CV < 30% in QC samples
    good_features = df[cv < 30]

    print(f"Features before CV filter: {len(df)}")
    print(f"Features after CV filter: {len(good_features)}")
```

### Blank Filtering

```python
# Remove features present in blank samples
blank_cols = [col for col in df.columns if 'Blank' in col]
sample_cols = [col for col in df.columns if 'Sample' in col]

if blank_cols and sample_cols:
    # Calculate mean intensity in blanks and samples
    blank_mean = df[blank_cols].mean(axis=1)
    sample_mean = df[sample_cols].mean(axis=1)

    # Keep features with 3x higher intensity in samples than blanks
    ratio = sample_mean / (blank_mean + 1)  # Add 1 to avoid division by zero
    filtered_df = df[ratio > 3]

    print(f"Features before blank filtering: {len(df)}")
    print(f"Features after blank filtering: {len(filtered_df)}")
```

## Missing Value Imputation

```python
import pandas as pd
import numpy as np

# Load data
df = consensus_map.get_df()

# Replace zeros with NaN
df = df.replace(0, np.nan)

# Count missing values per feature (row)
missing_per_feature = df.isnull().sum(axis=1)
print(f"Features with >50% missing: {sum(missing_per_feature > len(df.columns) / 2)}")

# Simple imputation: replace missing values with half of each column's minimum
for col in df.select_dtypes(include=[np.number]).columns:
    min_val = df[col].min() / 2  # Half minimum
    df[col] = df[col].fillna(min_val)
```

## Metabolite Table Export

### Create Analysis-Ready Table

```python
import pandas as pd

def create_metabolite_table(consensus_map, output_file):
    """
    Create metabolite quantification table for statistical analysis.
    """

    # Get column headers (file descriptions)
    headers = consensus_map.getColumnHeaders()

    # Initialize data structure
    data = {
        'mz': [],
        'rt': [],
        'feature_id': []
    }

    # Add sample columns
    for map_idx, header in headers.items():
        sample_name = header.label or f"Sample_{map_idx}"
        data[sample_name] = []

    # Extract feature data
    for idx, cons_feature in enumerate(consensus_map):
        data['mz'].append(cons_feature.getMZ())
        data['rt'].append(cons_feature.getRT())
        data['feature_id'].append(f"F{idx:06d}")

        # Initialize intensities
        intensities = {map_idx: 0.0 for map_idx in headers.keys()}

        # Fill in measured intensities
        for handle in cons_feature.getFeatureList():
            map_idx = handle.getMapIndex()
            intensities[map_idx] = handle.getIntensity()

        # Add to data structure
        for map_idx, header in headers.items():
            sample_name = header.label or f"Sample_{map_idx}"
            data[sample_name].append(intensities[map_idx])

    # Create DataFrame
    df = pd.DataFrame(data)

    # Sort by RT
    df = df.sort_values('rt')

    # Save to CSV
    df.to_csv(output_file, index=False)

    print(f"Metabolite table with {len(df)} features saved to {output_file}")

    return df

# Create table
df = create_metabolite_table(consensus_map, "metabolite_table.csv")
```

## Integration with External Tools

### Export for MetaboAnalyst

```python
def export_for_metaboanalyst(df, output_file):
    """
    Format data for MetaboAnalyst input.

    Produces a table with samples as rows and features as columns.
    """

    # Remove metadata columns
    sample_cols = [col for col in df.columns if col not in ['mz', 'rt', 'feature_id']]

    # Extract sample data
    sample_data = df[sample_cols]

    # Transpose (samples as rows, features as columns)
    df_transposed = sample_data.T

    # Add feature identifiers as column names
    df_transposed.columns = df['feature_id']

    # Save
    df_transposed.to_csv(output_file)

    print(f"MetaboAnalyst format saved to {output_file}")

# Export
export_for_metaboanalyst(df, "for_metaboanalyst.csv")
```
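
MetaboAnalyst additionally expects a class label for each sample. A short follow-up, assuming group membership can be parsed from the sample names (the naming convention here is an assumption):

```python
# Insert a "Label" column with one group name per sample row;
# the substring-based grouping is illustrative
labels = ["QC" if "QC" in name else "Blank" if "Blank" in name else "Sample"
          for name in df_transposed.index]
df_transposed.insert(0, "Label", labels)
df_transposed.to_csv("for_metaboanalyst.csv")
```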

## Best Practices

### Sample Size and Replicates

- Include QC samples (a pooled sample) every 5-10 injections
- Run blank samples to identify contamination
- Use at least 3 biological replicates per group
- Randomize the sample injection order (see the sketch below)
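
A minimal sketch of the last point, assuming the study sample names are already known; it shuffles the biological samples and interleaves a pooled QC every fifth injection:

```python
import random

samples = [f"sample_{i:02d}" for i in range(1, 19)]  # hypothetical sample names
random.seed(42)          # fixed seed so the run order is reproducible
random.shuffle(samples)  # randomize biological sample order

# Interleave a pooled QC injection every 5 runs, plus one at start and end
injection_order = ["QC_pool"]
for i, sample in enumerate(samples, start=1):
    injection_order.append(sample)
    if i % 5 == 0:
        injection_order.append("QC_pool")
if injection_order[-1] != "QC_pool":
    injection_order.append("QC_pool")

print("\n".join(injection_order))
```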

### Parameter Optimization

Test parameters on a pooled QC sample:

```python
# Test different mass trace parameters
# (mass_trace:mz_tolerance is an absolute m/z tolerance,
# roughly 3/5/10 ppm at m/z 1000)
mz_tolerances = [0.003, 0.005, 0.01]
min_spectra_values = [3, 5, 7]

for tol in mz_tolerances:
    for min_spec in min_spectra_values:
        ff = ms.FeatureFinder()
        params = ff.getParameters("centroided")
        params.setValue("mass_trace:mz_tolerance", tol)
        params.setValue("mass_trace:min_spectra", min_spec)

        features = ms.FeatureMap()
        ff.run("centroided", exp, features, params, ms.FeatureMap())

        print(f"tol={tol}, min_spec={min_spec}: {features.size()} features")
```

### Retention Time Windows

Adjust the feature-linking RT tolerance to the chromatographic method:

```python
# For a 10-minute LC gradient
params.setValue("distance_RT:max_difference", 30.0)  # 30 seconds

# For a 60-minute LC gradient
params.setValue("distance_RT:max_difference", 90.0)  # 90 seconds
```