<!-- skills/esm/references/workflows.md -->
# ESM Workflows and Examples
|
||||
|
||||
## Overview
|
||||
|
||||
This document provides complete, end-to-end examples of common workflows using ESM3 and ESM C. Each workflow includes setup, execution, and analysis code.
|
||||
|
||||
## Workflow 1: Novel GFP Design with Chain-of-Thought
|
||||
|
||||
Design a novel fluorescent protein using ESM3's multimodal generation capabilities.
|
||||
|
||||
### Objective
|
||||
|
||||
Generate a green fluorescent protein (GFP) with specific properties using chain-of-thought reasoning across sequence, structure, and function.
|
||||
|
||||
### Complete Implementation
|
||||
|
||||
```python
|
||||
from esm.models.esm3 import ESM3
|
||||
from esm.sdk.api import ESMProtein, GenerationConfig, FunctionAnnotation
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Setup
|
||||
model = ESM3.from_pretrained("esm3-sm-open-v1").to("cuda")
|
||||
|
||||
# Step 1: Define target properties
|
||||
print("Step 1: Defining target GFP properties...")
|
||||
|
||||
# Create protein with desired function
|
||||
target_length = 238 # Typical GFP length
|
||||
protein = ESMProtein(
|
||||
sequence="_" * target_length,
|
||||
function_annotations=[
|
||||
FunctionAnnotation(
|
||||
label="green_fluorescent_protein",
|
||||
start=65,
|
||||
end=75 # Chromophore region
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
# Step 2: Generate initial sequence with function conditioning
|
||||
print("Step 2: Generating initial sequence...")
|
||||
|
||||
config = GenerationConfig(
|
||||
track="sequence",
|
||||
num_steps=target_length // 3, # Gradual generation
|
||||
temperature=0.7 # Moderate diversity
|
||||
)
|
||||
protein = model.generate(protein, config)
|
||||
print(f"Generated sequence: {protein.sequence[:50]}...")
|
||||
|
||||
# Step 3: Predict structure
|
||||
print("Step 3: Predicting structure...")
|
||||
|
||||
config = GenerationConfig(
|
||||
track="structure",
|
||||
num_steps=target_length // 2
|
||||
)
|
||||
protein = model.generate(protein, config)
|
||||
print(f"Structure predicted, coordinates shape: {protein.coordinates.shape}")
|
||||
|
||||
# Step 4: Refine sequence based on structure
|
||||
print("Step 4: Refining sequence based on structure...")
|
||||
|
||||
# Mask regions for refinement (e.g., surface residues)
|
||||
sequence_list = list(protein.sequence)
|
||||
# Keep chromophore region, refine others
|
||||
for i in range(0, 65):
|
||||
if i % 3 == 0: # Refine every third position
|
||||
sequence_list[i] = '_'
|
||||
for i in range(75, target_length):
|
||||
if i % 3 == 0:
|
||||
sequence_list[i] = '_'
|
||||
|
||||
protein.sequence = ''.join(sequence_list)
|
||||
|
||||
config = GenerationConfig(
|
||||
track="sequence",
|
||||
num_steps=50,
|
||||
temperature=0.5 # Lower temperature for refinement
|
||||
)
|
||||
protein = model.generate(protein, config)
|
||||
|
||||
# Step 5: Final validation
|
||||
print("Step 5: Final validation...")
|
||||
|
||||
# Predict final structure
|
||||
config = GenerationConfig(track="structure", num_steps=30)
|
||||
protein = model.generate(protein, config)
|
||||
|
||||
# Save results
|
||||
with open("novel_gfp.pdb", "w") as f:
|
||||
f.write(protein.to_pdb())
|
||||
|
||||
with open("novel_gfp_sequence.txt", "w") as f:
|
||||
f.write(f">Novel_GFP\n{protein.sequence}\n")
|
||||
|
||||
print(f"\nFinal GFP sequence:\n{protein.sequence}")
|
||||
print(f"\nFunction annotations: {protein.function_annotations}")
|
||||
print(f"Structure saved to: novel_gfp.pdb")
|
||||
```
|
||||
|
||||
### Validation Steps
|
||||
|
||||
```python
|
||||
# Analyze designed GFP
|
||||
def analyze_gfp(protein):
    """Analyze generated GFP properties.

    Prints the putative chromophore region and, when secondary structure
    is available, the beta-sheet content (GFPs fold into a beta-barrel).

    Args:
        protein: Object exposing .sequence (str), .secondary_structure
            (str of per-residue codes, or None), and .coordinates.

    Returns:
        dict with 'length', 'chromophore', and 'coordinates_available'.
    """
    # 0-based slice [64:68] covers residues 65-68, around the canonical
    # Ser65-Tyr66-Gly67 chromophore of avGFP.
    chromophore_region = protein.sequence[64:68]
    print(f"Chromophore region: {chromophore_region}")

    # Beta-barrel check: fraction of residues assigned 'E' (strand),
    # only when a secondary-structure string is present.
    ss = protein.secondary_structure
    if ss:
        strand_fraction = ss.count('E') / len(protein.sequence)
        print(f"Beta sheet content: {strand_fraction:.2%}")

    # Sequence similarity to known GFPs would require BLAST or an
    # alignment tool in practice.
    return {
        'length': len(protein.sequence),
        'chromophore': chromophore_region,
        'coordinates_available': protein.coordinates is not None,
    }
|
||||
|
||||
analysis = analyze_gfp(protein)
|
||||
print(f"\nAnalysis results: {analysis}")
|
||||
```
|
||||
|
||||
## Workflow 2: Protein Variant Library Generation
|
||||
|
||||
Generate and analyze a library of protein variants for directed evolution.
|
||||
|
||||
### Objective
|
||||
|
||||
Create variants of a parent protein by targeted mutagenesis while maintaining structural integrity.
|
||||
|
||||
### Complete Implementation
|
||||
|
||||
```python
|
||||
from esm.models.esm3 import ESM3
|
||||
from esm.sdk.api import ESMProtein, GenerationConfig
|
||||
import numpy as np
|
||||
from sklearn.cluster import KMeans
|
||||
|
||||
# Setup
|
||||
model = ESM3.from_pretrained("esm3-sm-open-v1").to("cuda")
|
||||
|
||||
# Parent protein
|
||||
parent_sequence = "MPRTKEINDAGLIVHSPQWFYKARNDTESLGKIVHEFPM"
|
||||
parent_protein = ESMProtein(sequence=parent_sequence)
|
||||
|
||||
# Define mutation parameters
|
||||
num_variants = 50
|
||||
positions_to_mutate = 5 # Number of positions per variant
|
||||
|
||||
# Step 1: Generate variant library
|
||||
print("Generating variant library...")
|
||||
|
||||
variants = []
|
||||
for i in range(num_variants):
|
||||
# Create masked sequence with random positions
|
||||
seq_list = list(parent_sequence)
|
||||
|
||||
# Select random positions to mutate
|
||||
mutation_positions = np.random.choice(
|
||||
len(seq_list),
|
||||
size=positions_to_mutate,
|
||||
replace=False
|
||||
)
|
||||
|
||||
for pos in mutation_positions:
|
||||
seq_list[pos] = '_'
|
||||
|
||||
# Generate variant
|
||||
variant_protein = ESMProtein(sequence=''.join(seq_list))
|
||||
|
||||
config = GenerationConfig(
|
||||
track="sequence",
|
||||
num_steps=positions_to_mutate * 2,
|
||||
temperature=0.8 # Higher diversity
|
||||
)
|
||||
|
||||
variant = model.generate(variant_protein, config)
|
||||
variants.append(variant.sequence)
|
||||
|
||||
if (i + 1) % 10 == 0:
|
||||
print(f"Generated {i + 1}/{num_variants} variants")
|
||||
|
||||
print(f"\nGenerated {len(variants)} variants")
|
||||
|
||||
# Step 2: Predict structures for variants
|
||||
print("\nPredicting structures...")
|
||||
|
||||
variant_proteins_with_structure = []
|
||||
for i, seq in enumerate(variants):
|
||||
protein = ESMProtein(sequence=seq)
|
||||
|
||||
config = GenerationConfig(
|
||||
track="structure",
|
||||
num_steps=len(seq) // 2
|
||||
)
|
||||
|
||||
protein_with_structure = model.generate(protein, config)
|
||||
variant_proteins_with_structure.append(protein_with_structure)
|
||||
|
||||
if (i + 1) % 10 == 0:
|
||||
print(f"Predicted structures for {i + 1}/{len(variants)} variants")
|
||||
|
||||
# Step 3: Analyze variant diversity
|
||||
print("\nAnalyzing variant diversity...")
|
||||
|
||||
# Calculate Hamming distances from parent
|
||||
def hamming_distance(seq1, seq2):
    """Return the Hamming distance between two equal-length sequences.

    Args:
        seq1: First sequence (string of one-letter amino-acid codes).
        seq2: Second sequence; must have the same length as *seq1*.

    Returns:
        Number of positions at which the two sequences differ.

    Raises:
        ValueError: If the sequences differ in length. (Hamming distance
            is only defined for equal-length strings; the previous
            zip-based version silently ignored trailing residues and
            under-reported the distance.)
    """
    if len(seq1) != len(seq2):
        raise ValueError(
            f"Sequences must have equal length: {len(seq1)} != {len(seq2)}"
        )
    return sum(c1 != c2 for c1, c2 in zip(seq1, seq2))
|
||||
|
||||
distances = [hamming_distance(parent_sequence, var) for var in variants]
|
||||
print(f"Average mutations per variant: {np.mean(distances):.1f}")
|
||||
print(f"Mutation range: {min(distances)}-{max(distances)}")
|
||||
|
||||
# Step 4: Get embeddings for clustering
|
||||
print("\nGenerating embeddings for clustering...")
|
||||
|
||||
from esm.models.esmc import ESMC
|
||||
|
||||
embedding_model = ESMC.from_pretrained("esmc-300m").to("cuda")
|
||||
|
||||
def get_embedding(sequence):
    """Get mean-pooled embedding for sequence.

    Encodes *sequence* with the module-level ESM C model and averages the
    per-residue representations into a single fixed-length vector.

    Args:
        sequence: Amino-acid sequence string.

    Returns:
        Flattened 1-D numpy array (mean over the residue dimension).
    """
    esm_protein = ESMProtein(sequence=sequence)
    encoded = embedding_model.encode(esm_protein)
    residue_reps = embedding_model.forward(encoded)
    pooled = residue_reps.mean(dim=1)
    return pooled.cpu().detach().numpy().flatten()
|
||||
|
||||
variant_embeddings = np.array([get_embedding(seq) for seq in variants])
|
||||
|
||||
# Step 5: Cluster variants
|
||||
print("Clustering variants...")
|
||||
|
||||
n_clusters = 5
|
||||
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
|
||||
cluster_labels = kmeans.fit_predict(variant_embeddings)
|
||||
|
||||
# Analyze clusters
|
||||
print("\nCluster analysis:")
|
||||
for i in range(n_clusters):
|
||||
cluster_variants = [var for var, label in zip(variants, cluster_labels) if label == i]
|
||||
cluster_distances = [hamming_distance(parent_sequence, var) for var in cluster_variants]
|
||||
|
||||
print(f"\nCluster {i}:")
|
||||
print(f" Size: {len(cluster_variants)}")
|
||||
print(f" Avg distance from parent: {np.mean(cluster_distances):.1f}")
|
||||
print(f" Representative: {cluster_variants[0][:40]}...")
|
||||
|
||||
# Step 6: Select diverse representatives
|
||||
print("\nSelecting diverse representatives...")
|
||||
|
||||
representatives = []
|
||||
for i in range(n_clusters):
|
||||
# Get centroid
|
||||
cluster_indices = np.where(cluster_labels == i)[0]
|
||||
cluster_embs = variant_embeddings[cluster_indices]
|
||||
|
||||
# Find closest to centroid
|
||||
centroid = cluster_embs.mean(axis=0)
|
||||
distances_to_centroid = np.linalg.norm(cluster_embs - centroid, axis=1)
|
||||
rep_idx = cluster_indices[np.argmin(distances_to_centroid)]
|
||||
|
||||
representatives.append(variants[rep_idx])
|
||||
|
||||
# Save results
|
||||
print("\nSaving results...")
|
||||
|
||||
with open("variant_library.fasta", "w") as f:
|
||||
f.write(f">Parent\n{parent_sequence}\n\n")
|
||||
for i, var in enumerate(variants):
|
||||
f.write(f">Variant_{i+1}_Cluster_{cluster_labels[i]}\n{var}\n")
|
||||
|
||||
with open("representative_variants.fasta", "w") as f:
|
||||
for i, rep in enumerate(representatives):
|
||||
f.write(f">Representative_Cluster_{i}\n{rep}\n")
|
||||
|
||||
print("Variant library saved to: variant_library.fasta")
|
||||
print("Representatives saved to: representative_variants.fasta")
|
||||
```
|
||||
|
||||
## Workflow 3: Structure-Based Sequence Optimization
|
||||
|
||||
Optimize a protein sequence to improve stability while maintaining function.
|
||||
|
||||
### Objective
|
||||
|
||||
Given a protein structure, design sequences that maintain the fold but have improved properties.
|
||||
|
||||
### Complete Implementation
|
||||
|
||||
```python
|
||||
from esm.models.esm3 import ESM3
|
||||
from esm.sdk.api import ESMProtein, GenerationConfig
|
||||
import numpy as np
|
||||
|
||||
# Setup
|
||||
model = ESM3.from_pretrained("esm3-sm-open-v1").to("cuda")
|
||||
|
||||
# Load target structure (e.g., from PDB)
|
||||
target_protein = ESMProtein.from_pdb("target_structure.pdb")
|
||||
original_sequence = target_protein.sequence
|
||||
|
||||
print(f"Original sequence: {original_sequence}")
|
||||
print(f"Structure loaded: {target_protein.coordinates.shape}")
|
||||
|
||||
# Step 1: Generate multiple sequence designs
|
||||
print("\nGenerating optimized sequences...")
|
||||
|
||||
num_designs = 20
|
||||
optimized_sequences = []
|
||||
|
||||
for i in range(num_designs):
|
||||
# Start with structure, remove sequence
|
||||
design_protein = ESMProtein(
|
||||
coordinates=target_protein.coordinates.copy(),
|
||||
secondary_structure=target_protein.secondary_structure
|
||||
)
|
||||
|
||||
# Generate sequence for this structure
|
||||
config = GenerationConfig(
|
||||
track="sequence",
|
||||
num_steps=len(original_sequence),
|
||||
temperature=0.7,
|
||||
condition_on_coordinates_only=True
|
||||
)
|
||||
|
||||
designed = model.generate(design_protein, config)
|
||||
optimized_sequences.append(designed.sequence)
|
||||
|
||||
if (i + 1) % 5 == 0:
|
||||
print(f"Generated {i + 1}/{num_designs} designs")
|
||||
|
||||
# Step 2: Validate structural compatibility
|
||||
print("\nValidating structural compatibility...")
|
||||
|
||||
validated_designs = []
|
||||
|
||||
for seq in optimized_sequences:
|
||||
# Predict structure for designed sequence
|
||||
test_protein = ESMProtein(sequence=seq)
|
||||
|
||||
config = GenerationConfig(
|
||||
track="structure",
|
||||
num_steps=len(seq) // 2
|
||||
)
|
||||
|
||||
predicted = model.generate(test_protein, config)
|
||||
|
||||
# Calculate RMSD (simplified - in practice use proper alignment)
|
||||
# Here we just check if structure prediction succeeds
|
||||
if predicted.coordinates is not None:
|
||||
validated_designs.append(seq)
|
||||
|
||||
print(f"Validated {len(validated_designs)}/{num_designs} designs")
|
||||
|
||||
# Step 3: Analyze sequence properties
|
||||
print("\nAnalyzing sequence properties...")
|
||||
|
||||
def calculate_properties(sequence):
    """Calculate basic physico-chemical properties of a sequence.

    Args:
        sequence: Amino-acid sequence (one-letter codes).

    Returns:
        dict with:
            'hydrophobic_fraction': fraction of residues in AILMFWYV
            'net_charge': count(K, R) minus count(D, E)
            'aromatic_fraction': fraction of residues in FWY

    Raises:
        ValueError: If *sequence* is empty — the fractions are undefined
            (the previous version raised ZeroDivisionError instead).
    """
    if not sequence:
        raise ValueError("sequence must be non-empty")

    length = len(sequence)

    # Hydrophobicity (simplified residue set)
    hydrophobic = "AILMFWYV"
    hydrophobic_fraction = sum(1 for aa in sequence if aa in hydrophobic) / length

    # Net charge (ignores histidine and the termini)
    positive = "KR"
    negative = "DE"
    net_charge = sum(1 for aa in sequence if aa in positive) - sum(
        1 for aa in sequence if aa in negative
    )

    # Aromatic content
    aromatic = "FWY"
    aromatic_fraction = sum(1 for aa in sequence if aa in aromatic) / length

    return {
        'hydrophobic_fraction': hydrophobic_fraction,
        'net_charge': net_charge,
        'aromatic_fraction': aromatic_fraction,
    }
|
||||
|
||||
# Compare to original
|
||||
original_props = calculate_properties(original_sequence)
|
||||
print(f"\nOriginal properties:")
|
||||
print(f" Hydrophobic: {original_props['hydrophobic_fraction']:.2%}")
|
||||
print(f" Net charge: {original_props['net_charge']:+d}")
|
||||
print(f" Aromatic: {original_props['aromatic_fraction']:.2%}")
|
||||
|
||||
# Analyze designs
|
||||
design_properties = [calculate_properties(seq) for seq in validated_designs]
|
||||
|
||||
avg_hydrophobic = np.mean([p['hydrophobic_fraction'] for p in design_properties])
|
||||
avg_charge = np.mean([p['net_charge'] for p in design_properties])
|
||||
avg_aromatic = np.mean([p['aromatic_fraction'] for p in design_properties])
|
||||
|
||||
print(f"\nDesigned sequences (average):")
|
||||
print(f" Hydrophobic: {avg_hydrophobic:.2%}")
|
||||
print(f" Net charge: {avg_charge:+.1f}")
|
||||
print(f" Aromatic: {avg_aromatic:.2%}")
|
||||
|
||||
# Step 4: Rank designs
|
||||
print("\nRanking designs...")
|
||||
|
||||
def score_design(sequence, original_props):
    """Score design based on desired properties.

    Combines two criteria into a weighted score:
      * hydrophobic content (weight 0.6) — proxy for core stability
      * closeness of net charge to the original (weight 0.4)

    Args:
        sequence: Candidate amino-acid sequence.
        original_props: Property dict of the parent sequence, as returned
            by calculate_properties().

    Returns:
        float score; higher is better.
    """
    candidate = calculate_properties(sequence)

    # Stability proxy: reward a hydrophobic core.
    stability_term = candidate['hydrophobic_fraction']

    # Similarity proxy: penalize charge drift away from the parent.
    charge_gap = abs(candidate['net_charge'] - original_props['net_charge'])
    similarity_term = 1.0 / (1.0 + charge_gap)

    return 0.6 * stability_term + 0.4 * similarity_term
|
||||
|
||||
scores = [(seq, score_design(seq, original_props)) for seq in validated_designs]
|
||||
scores.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
print("\nTop 5 designs:")
|
||||
for i, (seq, score) in enumerate(scores[:5]):
|
||||
print(f"\n{i+1}. Score: {score:.3f}")
|
||||
print(f" Sequence: {seq[:40]}...")
|
||||
|
||||
# Step 5: Save results
|
||||
print("\nSaving results...")
|
||||
|
||||
with open("optimized_sequences.fasta", "w") as f:
|
||||
f.write(f">Original\n{original_sequence}\n\n")
|
||||
|
||||
for i, (seq, score) in enumerate(scores):
|
||||
props = calculate_properties(seq)
|
||||
f.write(f">Design_{i+1}_Score_{score:.3f}\n")
|
||||
f.write(f"# Hydrophobic: {props['hydrophobic_fraction']:.2%}, ")
|
||||
f.write(f"Charge: {props['net_charge']:+d}, ")
|
||||
f.write(f"Aromatic: {props['aromatic_fraction']:.2%}\n")
|
||||
f.write(f"{seq}\n\n")
|
||||
|
||||
print("Results saved to: optimized_sequences.fasta")
|
||||
```
|
||||
|
||||
## Workflow 4: Function Prediction Pipeline
|
||||
|
||||
Predict protein function from sequence using ESM3 and ESM C.
|
||||
|
||||
### Objective
|
||||
|
||||
Build a pipeline that predicts protein function using both generative (ESM3) and embedding (ESM C) approaches.
|
||||
|
||||
### Complete Implementation
|
||||
|
||||
```python
|
||||
from esm.models.esm3 import ESM3
|
||||
from esm.models.esmc import ESMC
|
||||
from esm.sdk.api import ESMProtein, GenerationConfig
|
||||
import numpy as np
|
||||
from sklearn.ensemble import RandomForestClassifier
|
||||
from sklearn.model_selection import cross_val_score
|
||||
|
||||
# Setup models
|
||||
esm3_model = ESM3.from_pretrained("esm3-sm-open-v1").to("cuda")
|
||||
esmc_model = ESMC.from_pretrained("esmc-600m").to("cuda")
|
||||
|
||||
# Example: Predict if protein is an enzyme
|
||||
# (In practice, you'd have a labeled training set)
|
||||
|
||||
def predict_function_generative(sequence):
    """Predict function using ESM3 generative approach.

    Runs ESM3 on the function track at low temperature so the sampled
    annotations reflect the model's most confident predictions.

    Args:
        sequence: Amino-acid sequence string.

    Returns:
        The function annotations produced by the model.
    """
    query = ESMProtein(sequence=sequence)

    # Low temperature -> near-greedy, confident annotation sampling.
    function_config = GenerationConfig(
        track="function",
        num_steps=20,
        temperature=0.3
    )

    annotated = esm3_model.generate(query, function_config)
    return annotated.function_annotations
|
||||
|
||||
def predict_function_embedding(sequence, function_classifier):
    """Predict function using ESM C embeddings + classifier.

    Args:
        sequence: Amino-acid sequence string.
        function_classifier: Fitted scikit-learn-style classifier exposing
            predict() and predict_proba().

    Returns:
        Tuple of (predicted label, class-probability vector) for the
        single input sequence.
    """
    # Embed the sequence and mean-pool over residues.
    query = ESMProtein(sequence=sequence)
    encoded = esmc_model.encode(query)
    per_residue = esmc_model.forward(encoded)
    pooled = per_residue.mean(dim=1).cpu().detach().numpy()

    # Classify the pooled representation; unwrap the single-row batch.
    labels = function_classifier.predict(pooled)
    probabilities = function_classifier.predict_proba(pooled)
    return labels[0], probabilities[0]
|
||||
|
||||
# Example workflow with test sequences
|
||||
test_sequences = {
|
||||
"kinase": "MPRTKEINDAGLIVHSPQWFYKARNDTESLGKIVHEF",
|
||||
"protease": "AGLIVHSPQWFYKARNDTESLGKIVHEFPMCDEGH",
|
||||
"transporter": "KTEFLNDGRPMLIVHSPQWFYKARNDTESLGKIVH"
|
||||
}
|
||||
|
||||
print("Predicting functions...\n")
|
||||
|
||||
for name, sequence in test_sequences.items():
|
||||
print(f"{name.upper()}:")
|
||||
print(f"Sequence: {sequence[:30]}...")
|
||||
|
||||
# Method 1: Generative
|
||||
functions = predict_function_generative(sequence)
|
||||
print(f" Generative predictions: {functions}")
|
||||
|
||||
# Method 2: Embedding-based would require trained classifier
|
||||
# (Skipped in this example as it needs training data)
|
||||
|
||||
print()
|
||||
```
|
||||
|
||||
## Workflow 5: Embedding-Based Clustering and Analysis
|
||||
|
||||
Cluster and analyze a large protein dataset using ESM C embeddings.
|
||||
|
||||
### Complete Implementation
|
||||
|
||||
```python
|
||||
from esm.models.esmc import ESMC
|
||||
from esm.sdk.api import ESMProtein
|
||||
import numpy as np
|
||||
from sklearn.cluster import DBSCAN
|
||||
from sklearn.decomposition import PCA
|
||||
from sklearn.manifold import TSNE
|
||||
import matplotlib.pyplot as plt
|
||||
|
||||
# Setup
|
||||
model = ESMC.from_pretrained("esmc-600m").to("cuda")
|
||||
|
||||
# Load protein dataset (example)
|
||||
sequences = [
|
||||
# In practice, load from FASTA or database
|
||||
"MPRTKEINDAGLIVHSPQWFYK",
|
||||
"AGLIVHSPQWFYKARNDTESL",
|
||||
# ... more sequences
|
||||
]
|
||||
|
||||
print(f"Loaded {len(sequences)} sequences")
|
||||
|
||||
# Step 1: Generate embeddings
|
||||
print("Generating embeddings...")
|
||||
|
||||
embeddings = []
|
||||
for i, seq in enumerate(sequences):
|
||||
protein = ESMProtein(sequence=seq)
|
||||
tensor = model.encode(protein)
|
||||
emb = model.forward(tensor)
|
||||
|
||||
# Mean pooling
|
||||
emb_pooled = emb.mean(dim=1).cpu().detach().numpy().flatten()
|
||||
embeddings.append(emb_pooled)
|
||||
|
||||
if (i + 1) % 100 == 0:
|
||||
print(f"Processed {i + 1}/{len(sequences)}")
|
||||
|
||||
embeddings = np.array(embeddings)
|
||||
print(f"Embeddings shape: {embeddings.shape}")
|
||||
|
||||
# Step 2: Dimensionality reduction for visualization
|
||||
print("\nReducing dimensionality...")
|
||||
|
||||
# PCA for initial reduction
|
||||
pca = PCA(n_components=50)
|
||||
embeddings_pca = pca.fit_transform(embeddings)
|
||||
print(f"PCA explained variance: {pca.explained_variance_ratio_[:10].sum():.2%}")
|
||||
|
||||
# t-SNE for visualization
|
||||
tsne = TSNE(n_components=2, random_state=42)
|
||||
embeddings_2d = tsne.fit_transform(embeddings_pca)
|
||||
|
||||
# Step 3: Clustering
|
||||
print("\nClustering...")
|
||||
|
||||
# DBSCAN for density-based clustering
|
||||
clustering = DBSCAN(eps=0.5, min_samples=5)
|
||||
cluster_labels = clustering.fit_predict(embeddings)
|
||||
|
||||
n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
|
||||
n_noise = list(cluster_labels).count(-1)
|
||||
|
||||
print(f"Number of clusters: {n_clusters}")
|
||||
print(f"Number of noise points: {n_noise}")
|
||||
|
||||
# Step 4: Visualize
|
||||
print("\nGenerating visualization...")
|
||||
|
||||
plt.figure(figsize=(12, 8))
|
||||
scatter = plt.scatter(
|
||||
embeddings_2d[:, 0],
|
||||
embeddings_2d[:, 1],
|
||||
c=cluster_labels,
|
||||
cmap='viridis',
|
||||
alpha=0.6
|
||||
)
|
||||
plt.colorbar(scatter)
|
||||
plt.title("Protein Sequence Clustering (ESM C Embeddings)")
|
||||
plt.xlabel("t-SNE 1")
|
||||
plt.ylabel("t-SNE 2")
|
||||
plt.savefig("protein_clusters.png", dpi=300, bbox_inches='tight')
|
||||
print("Visualization saved to: protein_clusters.png")
|
||||
|
||||
# Step 5: Analyze clusters
|
||||
print("\nCluster analysis:")
|
||||
|
||||
for cluster_id in range(n_clusters):
|
||||
cluster_indices = np.where(cluster_labels == cluster_id)[0]
|
||||
cluster_seqs = [sequences[i] for i in cluster_indices]
|
||||
|
||||
print(f"\nCluster {cluster_id}:")
|
||||
print(f" Size: {len(cluster_seqs)}")
|
||||
print(f" Avg length: {np.mean([len(s) for s in cluster_seqs]):.1f}")
|
||||
print(f" Example: {cluster_seqs[0][:40]}...")
|
||||
|
||||
# Save cluster assignments
|
||||
with open("cluster_assignments.txt", "w") as f:
|
||||
for i, (seq, label) in enumerate(zip(sequences, cluster_labels)):
|
||||
f.write(f"Sequence_{i}\tCluster_{label}\t{seq}\n")
|
||||
|
||||
print("\nCluster assignments saved to: cluster_assignments.txt")
|
||||
```
|
||||
|
||||
## Additional Workflow Tips
|
||||
|
||||
### Memory Management for Large Datasets
|
||||
|
||||
```python
|
||||
def process_large_dataset(sequences, batch_size=32):
    """Process large dataset with memory management.

    Processes *sequences* in batches, clearing the CUDA cache and running
    garbage collection between batches to bound peak memory use.

    Args:
        sequences: Indexable collection of sequence strings.
        batch_size: Number of sequences per batch (default 32).

    Returns:
        List of per-sequence results from process_sequence(), in input
        order.
    """
    import gc
    import torch

    results = []
    total = len(sequences)

    for start in range(0, total, batch_size):
        batch = sequences[start:start + batch_size]

        # Process batch
        results.extend(process_sequence(seq) for seq in batch)

        # Clear memory between batches to keep peak usage bounded.
        torch.cuda.empty_cache()
        gc.collect()

        # Progress report after every batch. (The previous condition,
        # `(i + batch_size) % 100 == 0`, almost never fired for batch
        # sizes that do not divide 100 — e.g. never for batch_size=32
        # until 800 sequences in.)
        print(f"Processed {min(start + batch_size, total)}/{total}")

    return results
|
||||
```
|
||||
|
||||
### Parallel Processing
|
||||
|
||||
```python
|
||||
from concurrent.futures import ThreadPoolExecutor
|
||||
import asyncio
|
||||
|
||||
def parallel_workflow(sequences, n_workers=4):
    """Process sequences in parallel.

    Fans the per-sequence work out over a thread pool; useful when
    process_sequence() is I/O-bound or releases the GIL.

    Args:
        sequences: Iterable of sequence strings.
        n_workers: Maximum number of worker threads (default 4).

    Returns:
        List of results in the same order as the input.
    """
    executor = ThreadPoolExecutor(max_workers=n_workers)
    try:
        return list(executor.map(process_sequence, sequences))
    finally:
        # Mirrors the context-manager protocol: wait for all workers.
        executor.shutdown(wait=True)
|
||||
```
|
||||
|
||||
These workflows provide comprehensive examples for common ESM use cases. Adapt them to your specific needs and always validate results with appropriate biological experiments.
|
||||