# Matchms Common Workflows This document provides detailed examples of common mass spectrometry analysis workflows using matchms. ## Workflow 1: Basic Spectral Library Matching Match unknown spectra against a reference library to identify compounds. ```python from matchms.importing import load_from_mgf from matchms.filtering import default_filters, normalize_intensities from matchms.filtering import select_by_relative_intensity, require_minimum_number_of_peaks from matchms import calculate_scores from matchms.similarity import CosineGreedy # Load reference library print("Loading reference library...") library = list(load_from_mgf("reference_library.mgf")) # Load query spectra (unknowns) print("Loading query spectra...") queries = list(load_from_mgf("unknown_spectra.mgf")) # Process library spectra print("Processing library...") processed_library = [] for spectrum in library: spectrum = default_filters(spectrum) spectrum = normalize_intensities(spectrum) spectrum = select_by_relative_intensity(spectrum, intensity_from=0.01) spectrum = require_minimum_number_of_peaks(spectrum, n_required=5) if spectrum is not None: processed_library.append(spectrum) # Process query spectra print("Processing queries...") processed_queries = [] for spectrum in queries: spectrum = default_filters(spectrum) spectrum = normalize_intensities(spectrum) spectrum = select_by_relative_intensity(spectrum, intensity_from=0.01) spectrum = require_minimum_number_of_peaks(spectrum, n_required=5) if spectrum is not None: processed_queries.append(spectrum) # Calculate similarities print("Calculating similarities...") scores = calculate_scores(references=processed_library, queries=processed_queries, similarity_function=CosineGreedy(tolerance=0.1)) # Get top matches for each query print("\nTop matches:") for i, query in enumerate(processed_queries): top_matches = scores.scores_by_query(query, sort=True)[:5] query_name = query.get("compound_name", f"Query {i}") print(f"\n{query_name}:") for ref_idx, score in top_matches: ref_spectrum = processed_library[ref_idx] ref_name = ref_spectrum.get("compound_name", f"Ref {ref_idx}") print(f" {ref_name}: {score:.4f}") ``` --- ## Workflow 2: Quality Control and Data Cleaning Filter and clean spectral data before analysis. ```python from matchms.importing import load_from_mgf from matchms.exporting import save_as_mgf from matchms.filtering import (default_filters, normalize_intensities, require_precursor_mz, require_minimum_number_of_peaks, require_minimum_number_of_high_peaks, select_by_relative_intensity, remove_peaks_around_precursor_mz) # Load spectra spectra = list(load_from_mgf("raw_data.mgf")) print(f"Loaded {len(spectra)} raw spectra") # Apply quality filters cleaned_spectra = [] for spectrum in spectra: # Harmonize metadata spectrum = default_filters(spectrum) # Quality requirements spectrum = require_precursor_mz(spectrum, minimum_accepted_mz=50.0) if spectrum is None: continue spectrum = require_minimum_number_of_peaks(spectrum, n_required=10) if spectrum is None: continue # Clean peaks spectrum = normalize_intensities(spectrum) spectrum = remove_peaks_around_precursor_mz(spectrum, mz_tolerance=17) spectrum = select_by_relative_intensity(spectrum, intensity_from=0.01) # Require high-quality peaks spectrum = require_minimum_number_of_high_peaks(spectrum, n_required=5, intensity_threshold=0.05) if spectrum is None: continue cleaned_spectra.append(spectrum) print(f"Retained {len(cleaned_spectra)} high-quality spectra") print(f"Removed {len(spectra) - len(cleaned_spectra)} low-quality spectra") # Save cleaned data save_as_mgf(cleaned_spectra, "cleaned_data.mgf") ``` --- ## Workflow 3: Multi-Metric Similarity Scoring Combine multiple similarity metrics for robust compound identification. ```python from matchms.importing import load_from_mgf from matchms.filtering import (default_filters, normalize_intensities, derive_inchi_from_smiles, add_fingerprint, add_losses) from matchms import calculate_scores from matchms.similarity import (CosineGreedy, ModifiedCosine, NeutralLossesCosine, FingerprintSimilarity) import numpy as np # Load spectra library = list(load_from_mgf("library.mgf")) queries = list(load_from_mgf("queries.mgf")) # Process with multiple features def process_for_multimetric(spectrum): spectrum = default_filters(spectrum) spectrum = normalize_intensities(spectrum) # Add chemical fingerprints spectrum = derive_inchi_from_smiles(spectrum) spectrum = add_fingerprint(spectrum, fingerprint_type="morgan2", nbits=2048) # Add neutral losses spectrum = add_losses(spectrum, loss_mz_from=5.0, loss_mz_to=200.0) return spectrum processed_library = [process_for_multimetric(s) for s in library if s is not None] processed_queries = [process_for_multimetric(s) for s in queries if s is not None] # Calculate multiple similarity scores print("Calculating Cosine similarity...") cosine_scores = calculate_scores(processed_library, processed_queries, CosineGreedy(tolerance=0.1)) print("Calculating Modified Cosine similarity...") modified_cosine_scores = calculate_scores(processed_library, processed_queries, ModifiedCosine(tolerance=0.1)) print("Calculating Neutral Losses similarity...") neutral_losses_scores = calculate_scores(processed_library, processed_queries, NeutralLossesCosine(tolerance=0.1)) print("Calculating Fingerprint similarity...") fingerprint_scores = calculate_scores(processed_library, processed_queries, FingerprintSimilarity(similarity_measure="jaccard")) # Combine scores with weights weights = { 'cosine': 0.4, 'modified_cosine': 0.3, 'neutral_losses': 0.2, 'fingerprint': 0.1 } # Get combined scores for each query for i, query in enumerate(processed_queries): query_name = query.get("compound_name", f"Query {i}") combined_scores = [] for j, ref in enumerate(processed_library): combined = (weights['cosine'] * cosine_scores.scores[j, i] + weights['modified_cosine'] * modified_cosine_scores.scores[j, i] + weights['neutral_losses'] * neutral_losses_scores.scores[j, i] + weights['fingerprint'] * fingerprint_scores.scores[j, i]) combined_scores.append((j, combined)) # Sort by combined score combined_scores.sort(key=lambda x: x[1], reverse=True) print(f"\n{query_name} - Top 3 matches:") for ref_idx, score in combined_scores[:3]: ref_name = processed_library[ref_idx].get("compound_name", f"Ref {ref_idx}") print(f" {ref_name}: {score:.4f}") ``` --- ## Workflow 4: Precursor-Filtered Library Search Pre-filter by precursor mass before spectral matching for faster searches. ```python from matchms.importing import load_from_mgf from matchms.filtering import default_filters, normalize_intensities from matchms import calculate_scores from matchms.similarity import PrecursorMzMatch, CosineGreedy import numpy as np # Load data library = list(load_from_mgf("large_library.mgf")) queries = list(load_from_mgf("queries.mgf")) # Process spectra processed_library = [normalize_intensities(default_filters(s)) for s in library] processed_queries = [normalize_intensities(default_filters(s)) for s in queries] # Step 1: Fast precursor mass filtering print("Filtering by precursor mass...") mass_filter = calculate_scores(processed_library, processed_queries, PrecursorMzMatch(tolerance=0.1, tolerance_type="Dalton")) # Step 2: Calculate cosine only for matching precursors print("Calculating cosine similarity for filtered candidates...") cosine_scores = calculate_scores(processed_library, processed_queries, CosineGreedy(tolerance=0.1)) # Step 3: Apply mass filter to cosine scores for i, query in enumerate(processed_queries): candidates = [] for j, ref in enumerate(processed_library): # Only consider if precursor matches if mass_filter.scores[j, i] > 0: cosine_score = cosine_scores.scores[j, i] candidates.append((j, cosine_score)) # Sort by cosine score candidates.sort(key=lambda x: x[1], reverse=True) query_name = query.get("compound_name", f"Query {i}") print(f"\n{query_name} - Top 5 matches (from {len(candidates)} candidates):") for ref_idx, score in candidates[:5]: ref_name = processed_library[ref_idx].get("compound_name", f"Ref {ref_idx}") ref_mz = processed_library[ref_idx].get("precursor_mz", "N/A") print(f" {ref_name} (m/z {ref_mz}): {score:.4f}") ``` --- ## Workflow 5: Building a Reusable Processing Pipeline Create a standardized pipeline for consistent processing. ```python from matchms import SpectrumProcessor from matchms.filtering import (default_filters, normalize_intensities, select_by_relative_intensity, remove_peaks_around_precursor_mz, require_minimum_number_of_peaks, derive_inchi_from_smiles, add_fingerprint) from matchms.importing import load_from_mgf from matchms.exporting import save_as_pickle # Define custom processing pipeline def create_standard_pipeline(): """Create a reusable processing pipeline""" return SpectrumProcessor([ default_filters, normalize_intensities, lambda s: remove_peaks_around_precursor_mz(s, mz_tolerance=17), lambda s: select_by_relative_intensity(s, intensity_from=0.01), lambda s: require_minimum_number_of_peaks(s, n_required=5), derive_inchi_from_smiles, lambda s: add_fingerprint(s, fingerprint_type="morgan2") ]) # Create pipeline instance pipeline = create_standard_pipeline() # Process multiple datasets with same pipeline datasets = ["dataset1.mgf", "dataset2.mgf", "dataset3.mgf"] for dataset_file in datasets: print(f"\nProcessing {dataset_file}...") # Load spectra spectra = list(load_from_mgf(dataset_file)) # Apply pipeline processed = [] for spectrum in spectra: result = pipeline(spectrum) if result is not None: processed.append(result) print(f" Loaded: {len(spectra)}") print(f" Processed: {len(processed)}") # Save processed data output_file = dataset_file.replace(".mgf", "_processed.pkl") save_as_pickle(processed, output_file) print(f" Saved to: {output_file}") ``` --- ## Workflow 6: Format Conversion and Standardization Convert between different mass spectrometry file formats. ```python from matchms.importing import load_from_mzml, load_from_mgf from matchms.exporting import save_as_mgf, save_as_msp, save_as_json from matchms.filtering import default_filters, normalize_intensities def convert_and_standardize(input_file, output_format="mgf"): """ Load, standardize, and convert mass spectrometry data Parameters: ----------- input_file : str Input file path (supports .mzML, .mzXML, .mgf) output_format : str Output format ('mgf', 'msp', or 'json') """ # Determine input format and load if input_file.endswith('.mzML') or input_file.endswith('.mzXML'): from matchms.importing import load_from_mzml spectra = list(load_from_mzml(input_file, ms_level=2)) elif input_file.endswith('.mgf'): spectra = list(load_from_mgf(input_file)) else: raise ValueError(f"Unsupported format: {input_file}") print(f"Loaded {len(spectra)} spectra from {input_file}") # Standardize processed = [] for spectrum in spectra: spectrum = default_filters(spectrum) spectrum = normalize_intensities(spectrum) if spectrum is not None: processed.append(spectrum) print(f"Standardized {len(processed)} spectra") # Export output_file = input_file.rsplit('.', 1)[0] + f'_standardized.{output_format}' if output_format == 'mgf': save_as_mgf(processed, output_file) elif output_format == 'msp': save_as_msp(processed, output_file) elif output_format == 'json': save_as_json(processed, output_file) else: raise ValueError(f"Unsupported output format: {output_format}") print(f"Saved to {output_file}") return processed # Convert mzML to MGF convert_and_standardize("raw_data.mzML", output_format="mgf") # Convert MGF to MSP library format convert_and_standardize("library.mgf", output_format="msp") ``` --- ## Workflow 7: Metadata Enrichment and Validation Enrich spectra with chemical structure information and validate annotations. ```python from matchms.importing import load_from_mgf from matchms.exporting import save_as_mgf from matchms.filtering import (default_filters, derive_inchi_from_smiles, derive_inchikey_from_inchi, derive_smiles_from_inchi, add_fingerprint, repair_not_matching_annotation, require_valid_annotation) # Load spectra spectra = list(load_from_mgf("spectra.mgf")) # Enrich and validate enriched_spectra = [] validation_failures = [] for i, spectrum in enumerate(spectra): # Basic harmonization spectrum = default_filters(spectrum) # Derive chemical structures spectrum = derive_inchi_from_smiles(spectrum) spectrum = derive_inchikey_from_inchi(spectrum) spectrum = derive_smiles_from_inchi(spectrum) # Repair mismatches spectrum = repair_not_matching_annotation(spectrum) # Add molecular fingerprints spectrum = add_fingerprint(spectrum, fingerprint_type="morgan2", nbits=2048) # Validate validated = require_valid_annotation(spectrum) if validated is not None: enriched_spectra.append(validated) else: validation_failures.append(i) print(f"Successfully enriched: {len(enriched_spectra)}") print(f"Validation failures: {len(validation_failures)}") # Save enriched data save_as_mgf(enriched_spectra, "enriched_spectra.mgf") # Report failures if validation_failures: print("\nSpectra that failed validation:") for idx in validation_failures[:10]: # Show first 10 original = spectra[idx] name = original.get("compound_name", f"Spectrum {idx}") print(f" - {name}") ``` --- ## Workflow 8: Large-Scale Library Comparison Compare two large spectral libraries efficiently. ```python from matchms.importing import load_from_mgf from matchms.filtering import default_filters, normalize_intensities from matchms import calculate_scores from matchms.similarity import CosineGreedy import numpy as np # Load two libraries print("Loading libraries...") library1 = list(load_from_mgf("library1.mgf")) library2 = list(load_from_mgf("library2.mgf")) # Process processed_lib1 = [normalize_intensities(default_filters(s)) for s in library1] processed_lib2 = [normalize_intensities(default_filters(s)) for s in library2] # Calculate all-vs-all similarities print("Calculating similarities...") scores = calculate_scores(processed_lib1, processed_lib2, CosineGreedy(tolerance=0.1)) # Find high-similarity pairs (potential duplicates or similar compounds) threshold = 0.8 similar_pairs = [] for i, spec1 in enumerate(processed_lib1): for j, spec2 in enumerate(processed_lib2): score = scores.scores[i, j] if score >= threshold: similar_pairs.append({ 'lib1_idx': i, 'lib2_idx': j, 'lib1_name': spec1.get("compound_name", f"L1_{i}"), 'lib2_name': spec2.get("compound_name", f"L2_{j}"), 'similarity': score }) # Sort by similarity similar_pairs.sort(key=lambda x: x['similarity'], reverse=True) print(f"\nFound {len(similar_pairs)} pairs with similarity >= {threshold}") print("\nTop 10 most similar pairs:") for pair in similar_pairs[:10]: print(f"{pair['lib1_name']} <-> {pair['lib2_name']}: {pair['similarity']:.4f}") # Export to CSV import pandas as pd df = pd.DataFrame(similar_pairs) df.to_csv("library_comparison.csv", index=False) print("\nFull results saved to library_comparison.csv") ``` --- ## Workflow 9: Ion Mode Specific Processing Process positive and negative mode spectra separately. ```python from matchms.importing import load_from_mgf from matchms.filtering import (default_filters, normalize_intensities, require_correct_ionmode, derive_ionmode) from matchms.exporting import save_as_mgf # Load mixed mode spectra spectra = list(load_from_mgf("mixed_modes.mgf")) # Separate by ion mode positive_spectra = [] negative_spectra = [] unknown_mode = [] for spectrum in spectra: # Harmonize and derive ion mode spectrum = default_filters(spectrum) spectrum = derive_ionmode(spectrum) # Separate by mode ionmode = spectrum.get("ionmode") if ionmode == "positive": spectrum = normalize_intensities(spectrum) positive_spectra.append(spectrum) elif ionmode == "negative": spectrum = normalize_intensities(spectrum) negative_spectra.append(spectrum) else: unknown_mode.append(spectrum) print(f"Positive mode: {len(positive_spectra)}") print(f"Negative mode: {len(negative_spectra)}") print(f"Unknown mode: {len(unknown_mode)}") # Save separated data save_as_mgf(positive_spectra, "positive_mode.mgf") save_as_mgf(negative_spectra, "negative_mode.mgf") # Process mode-specific analyses from matchms import calculate_scores from matchms.similarity import CosineGreedy if len(positive_spectra) > 1: print("\nCalculating positive mode similarities...") pos_scores = calculate_scores(positive_spectra, positive_spectra, CosineGreedy(tolerance=0.1)) if len(negative_spectra) > 1: print("Calculating negative mode similarities...") neg_scores = calculate_scores(negative_spectra, negative_spectra, CosineGreedy(tolerance=0.1)) ``` --- ## Workflow 10: Automated Compound Identification Report Generate a detailed compound identification report. ```python from matchms.importing import load_from_mgf from matchms.filtering import default_filters, normalize_intensities from matchms import calculate_scores from matchms.similarity import CosineGreedy, ModifiedCosine import pandas as pd def identify_compounds(query_file, library_file, output_csv="identification_report.csv"): """ Automated compound identification with detailed report """ # Load data print("Loading data...") queries = list(load_from_mgf(query_file)) library = list(load_from_mgf(library_file)) # Process proc_queries = [normalize_intensities(default_filters(s)) for s in queries] proc_library = [normalize_intensities(default_filters(s)) for s in library] # Calculate similarities print("Calculating similarities...") cosine_scores = calculate_scores(proc_library, proc_queries, CosineGreedy()) modified_scores = calculate_scores(proc_library, proc_queries, ModifiedCosine()) # Generate report results = [] for i, query in enumerate(proc_queries): query_name = query.get("compound_name", f"Unknown_{i}") query_mz = query.get("precursor_mz", "N/A") # Get top 5 matches cosine_matches = cosine_scores.scores_by_query(query, sort=True)[:5] for rank, (lib_idx, cos_score) in enumerate(cosine_matches, 1): ref = proc_library[lib_idx] mod_score = modified_scores.scores[lib_idx, i] results.append({ 'Query': query_name, 'Query_mz': query_mz, 'Rank': rank, 'Match': ref.get("compound_name", f"Ref_{lib_idx}"), 'Match_mz': ref.get("precursor_mz", "N/A"), 'Cosine_Score': cos_score, 'Modified_Cosine': mod_score, 'InChIKey': ref.get("inchikey", "N/A") }) # Create DataFrame and save df = pd.DataFrame(results) df.to_csv(output_csv, index=False) print(f"\nReport saved to {output_csv}") # Summary statistics print("\nSummary:") high_confidence = len(df[df['Cosine_Score'] >= 0.8]) medium_confidence = len(df[(df['Cosine_Score'] >= 0.6) & (df['Cosine_Score'] < 0.8)]) low_confidence = len(df[df['Cosine_Score'] < 0.6]) print(f" High confidence (≥0.8): {high_confidence}") print(f" Medium confidence (0.6-0.8): {medium_confidence}") print(f" Low confidence (<0.6): {low_confidence}") return df # Run identification report = identify_compounds("unknowns.mgf", "reference_library.mgf") ``` --- ## Best Practices 1. **Always process both queries and references**: Apply the same filters to ensure consistent comparison 2. **Save intermediate results**: Use pickle format for fast reloading of processed spectra 3. **Monitor memory usage**: Use generators for large files instead of loading all at once 4. **Validate data quality**: Apply quality filters before similarity calculations 5. **Choose appropriate similarity metrics**: CosineGreedy for speed, ModifiedCosine for related compounds 6. **Combine multiple metrics**: Use multiple similarity scores for robust identification 7. **Filter by precursor mass first**: Dramatically speeds up large library searches 8. **Document your pipeline**: Save processing parameters for reproducibility ## Further Resources - matchms documentation: https://matchms.readthedocs.io - GNPS platform: https://gnps.ucsd.edu - matchms GitHub: https://github.com/matchms/matchms