# Molfeat Usage Examples

This document provides practical examples for common molfeat use cases.

## Installation

```bash
# Recommended: using conda/mamba
mamba install -c conda-forge molfeat

# Alternative: using pip
pip install molfeat

# With all optional dependencies
pip install "molfeat[all]"

# With specific extras
pip install "molfeat[dgl]"          # For GNN models
pip install "molfeat[graphormer]"   # For Graphormer
pip install "molfeat[transformer]"  # For ChemBERTa, ChemGPT
```
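
A quick way to confirm the installation succeeded is to print the installed version (assumes a standard Python environment on your PATH):

```bash
python -c "import molfeat; print(molfeat.__version__)"
```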

---

## Quick Start

### Basic Featurization Workflow

```python
import datamol as dm
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer

# Load sample data
data = dm.data.freesolv().sample(100).smiles.values

# Single-molecule featurization
calc = FPCalculator("ecfp")
features_single = calc(data[0])
print(f"Single molecule features shape: {features_single.shape}")
# Output: (2048,)

# Batch featurization with parallelization
transformer = MoleculeTransformer(calc, n_jobs=-1)
features_batch = transformer(data)
print(f"Batch features shape: {features_batch.shape}")
# Output: (100, 2048)
```

---

## Calculator Examples

### Fingerprint Calculators

```python
from molfeat.calc import FPCalculator

# ECFP (Extended-Connectivity Fingerprints)
ecfp = FPCalculator("ecfp", radius=3, fpSize=2048)
fp = ecfp("CCO")  # Ethanol
print(f"ECFP shape: {fp.shape}")  # (2048,)

# MACCS keys
maccs = FPCalculator("maccs")
fp = maccs("c1ccccc1")  # Benzene
print(f"MACCS shape: {fp.shape}")  # (167,)

# Count-based fingerprints (integer counts instead of bits)
ecfp_count = FPCalculator("ecfp-count", radius=3)
fp_count = ecfp_count("CC(C)CC(C)C")

# MAP4 fingerprints
map4 = FPCalculator("map4")
fp = map4("CC(=O)Oc1ccccc1C(=O)O")  # Aspirin
```

### Descriptor Calculators

```python
from molfeat.calc import RDKitDescriptors2D, MordredDescriptors

# RDKit 2D descriptors (200+ properties)
desc2d = RDKitDescriptors2D()
descriptors = desc2d("CCO")
print(f"Number of 2D descriptors: {len(descriptors)}")

# Get descriptor names
names = desc2d.columns
print(f"First 5 descriptors: {names[:5]}")

# Mordred descriptors (1800+ properties)
mordred = MordredDescriptors()
descriptors = mordred("c1ccccc1O")  # Phenol
print(f"Mordred descriptors: {len(descriptors)}")
```

### Pharmacophore Calculators

```python
from molfeat.calc import CATSCalculator

# 2D CATS descriptors
cats = CATSCalculator(mode="2D", scale="raw")
descriptors = cats("CC(C)Cc1ccc(C)cc1C")
print(f"CATS descriptors: {descriptors.shape}")

# 3D CATS descriptors (requires a conformer)
cats3d = CATSCalculator(mode="3D", scale="num")
```
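
The 3D variant needs a molecule with at least one embedded conformer; a plain SMILES string is not enough. A minimal sketch, assuming datamol's conformer API (`dm.conformers.generate`):

```python
import datamol as dm

# Embed a conformer, then compute 3D CATS descriptors
mol = dm.to_mol("CC(C)Cc1ccc(C)cc1C")
mol = dm.conformers.generate(mol, n_confs=1)
descriptors_3d = cats3d(mol)
```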

---

## Transformer Examples

### Basic Transformer Usage

```python
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
import datamol as dm

# Prepare data
smiles_list = [
    "CCO",
    "CC(=O)O",
    "c1ccccc1",
    "CC(C)O",
    "CCCC",
]

# Create transformer
calc = FPCalculator("ecfp")
transformer = MoleculeTransformer(calc, n_jobs=-1)

# Transform molecules
features = transformer(smiles_list)
print(f"Features shape: {features.shape}")  # (5, 2048)
```

### Error Handling

```python
# Handle invalid SMILES gracefully
smiles_with_errors = [
    "CCO",       # Valid
    "invalid",   # Invalid
    "CC(=O)O",   # Valid
    "xyz123",    # Invalid
]

transformer = MoleculeTransformer(
    FPCalculator("ecfp"),
    n_jobs=-1,
    verbose=True,       # Log errors
    ignore_errors=True  # Continue on failure
)

features = transformer(smiles_with_errors)
# Returns an array with None for failed molecules
print(features)  # [array(...), None, array(...), None]
```

### Concatenating Multiple Featurizers

```python
from molfeat.trans import FeatConcat, MoleculeTransformer
from molfeat.calc import FPCalculator

# Combine MACCS (167) + ECFP (2048) = 2215 dimensions
concat_calc = FeatConcat([
    FPCalculator("maccs"),
    FPCalculator("ecfp", radius=3, fpSize=2048)
])

transformer = MoleculeTransformer(concat_calc, n_jobs=-1)
features = transformer(smiles_list)
print(f"Combined features shape: {features.shape}")  # (n, 2215)

# Triple combination
triple_concat = FeatConcat([
    FPCalculator("maccs"),
    FPCalculator("ecfp"),
    FPCalculator("rdkit")
])
```
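
The molfeat documentation also shows `FeatConcat` built from featurizer names as plain strings, with an optional `:radius` suffix for ECFP variants. A sketch of that form (treat the exact string syntax as an assumption to verify against your installed version):

```python
import numpy as np
from molfeat.trans import FeatConcat

# String shorthand for the same kind of concatenation
# (syntax per molfeat docs; verify with your installed version)
featurizer = FeatConcat(["maccs", "ecfp:4"], dtype=np.float32)
features = featurizer.fit_transform(smiles_list)
```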

### Saving and Loading Configurations

```python
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator

# Create a transformer
transformer = MoleculeTransformer(
    FPCalculator("ecfp", radius=3, fpSize=2048),
    n_jobs=-1
)

# Save to YAML
transformer.to_state_yaml_file("my_featurizer.yml")

# Save to JSON
transformer.to_state_json_file("my_featurizer.json")

# Load from saved state
loaded_transformer = MoleculeTransformer.from_state_yaml_file("my_featurizer.yml")

# Use the loaded transformer
features = loaded_transformer(smiles_list)
```

---

## Pretrained Model Examples

### Using the ModelStore

```python
from molfeat.store.modelstore import ModelStore

# Initialize the model store
store = ModelStore()

# List all available models
print(f"Total available models: {len(store.available_models)}")

# Search for specific models
chemberta_models = store.search(name="ChemBERTa")
for model in chemberta_models:
    print(f"- {model.name}: {model.description}")

# Get model information
model_card = store.search(name="ChemBERTa-77M-MLM")[0]
print(f"Model: {model_card.name}")
print(f"Version: {model_card.version}")
print(f"Authors: {model_card.authors}")

# View usage instructions
model_card.usage()

# Load the model directly from the store
transformer = store.load("ChemBERTa-77M-MLM")
```

### ChemBERTa Embeddings

```python
from molfeat.trans.pretrained import PretrainedHFTransformer

# Load a ChemBERTa model (HuggingFace-based models use PretrainedHFTransformer)
chemberta = PretrainedHFTransformer(kind="ChemBERTa-77M-MLM", notation="smiles")

# Generate embeddings
smiles = ["CCO", "CC(=O)O", "c1ccccc1"]
embeddings = chemberta(smiles)
print(f"ChemBERTa embeddings shape: {embeddings.shape}")
# Output: (3, hidden_dim) - one fixed-size embedding per molecule

# Use in an ML pipeline (labels is assumed to be an array of targets)
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    embeddings, labels, test_size=0.2
)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)
predictions = clf.predict(X_test)
```

### ChemGPT Models

```python
# ChemGPT models operate on SELFIES strings
chemgpt_small = PretrainedHFTransformer(kind="ChemGPT-4.7M", notation="selfies")   # 4.7M parameters
chemgpt_medium = PretrainedHFTransformer(kind="ChemGPT-19M", notation="selfies")   # 19M parameters
chemgpt_large = PretrainedHFTransformer(kind="ChemGPT-1.2B", notation="selfies")   # 1.2B parameters

# Generate embeddings
embeddings = chemgpt_small(smiles)
```

### Graph Neural Network Models

```python
from molfeat.trans.pretrained import PretrainedDGLTransformer

# GIN models with different pre-training objectives
gin_masking = PretrainedDGLTransformer(kind="gin_supervised_masking")
gin_infomax = PretrainedDGLTransformer(kind="gin_supervised_infomax")
gin_edgepred = PretrainedDGLTransformer(kind="gin_supervised_edgepred")

# Generate graph embeddings
embeddings = gin_masking(smiles)
print(f"GIN embeddings shape: {embeddings.shape}")

# Graphormer (pretrained on the PCQM4Mv2 quantum chemistry dataset)
from molfeat.trans.pretrained import GraphormerTransformer

graphormer = GraphormerTransformer(kind="pcqm4mv2_graphormer_base")
embeddings = graphormer(smiles)
```

---

## Machine Learning Integration

### Scikit-learn Pipeline

```python
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator

# smiles_train/y_train, smiles_test, and smiles_all/y_all are assumed
# to be defined, e.g. loaded from your own dataset

# Create an ML pipeline
pipeline = Pipeline([
    ('featurizer', MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)),
    ('classifier', RandomForestClassifier(n_estimators=100))
])

# Train and evaluate
pipeline.fit(smiles_train, y_train)
predictions = pipeline.predict(smiles_test)

# Cross-validation
scores = cross_val_score(pipeline, smiles_all, y_all, cv=5)
print(f"CV scores: {scores.mean():.3f} (+/- {scores.std():.3f})")
```

### Grid Search for Hyperparameter Tuning

```python
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

# Define the pipeline
pipeline = Pipeline([
    ('featurizer', MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)),
    ('classifier', SVC())
])

# Define the parameter grid
param_grid = {
    'classifier__C': [0.1, 1, 10],
    'classifier__kernel': ['rbf', 'linear'],
    'classifier__gamma': ['scale', 'auto']
}

# Grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(smiles_train, y_train)

print(f"Best parameters: {grid_search.best_params_}")
print(f"Best score: {grid_search.best_score_:.3f}")
```
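
Since the featurization step has no tunable parameters here, every grid-search fit recomputes the same fingerprints. Precomputing the features once and searching only over the classifier avoids that redundant work (a sketch, reusing the names from the block above):

```python
# Featurize once, then grid-search only the classifier
X_train = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)(smiles_train)

param_grid = {
    'C': [0.1, 1, 10],
    'kernel': ['rbf', 'linear'],
    'gamma': ['scale', 'auto'],
}
grid_search = GridSearchCV(SVC(), param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)
```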

### Multiple Featurizer Comparison

```python
from sklearn.metrics import roc_auc_score
from molfeat.calc import FPCalculator, RDKitDescriptors2D
from molfeat.trans import FeatConcat, MoleculeTransformer

# Test different featurizers
featurizers = {
    'ECFP': FPCalculator("ecfp"),
    'MACCS': FPCalculator("maccs"),
    'RDKit': FPCalculator("rdkit"),
    'Descriptors': RDKitDescriptors2D(),
    'Combined': FeatConcat([
        FPCalculator("maccs"),
        FPCalculator("ecfp")
    ])
}

results = {}
for name, calc in featurizers.items():
    transformer = MoleculeTransformer(calc, n_jobs=-1)
    X_train = transformer(smiles_train)
    X_test = transformer(smiles_test)

    clf = RandomForestClassifier(n_estimators=100)
    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_test)[:, 1]
    auc = roc_auc_score(y_test, y_pred)
    results[name] = auc

    print(f"{name}: AUC = {auc:.3f}")
```

### PyTorch Deep Learning

```python
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator

# Custom dataset
class MoleculeDataset(Dataset):
    def __init__(self, smiles, labels, transformer):
        self.features = transformer(smiles)
        self.labels = torch.tensor(labels, dtype=torch.float32)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return (
            torch.tensor(self.features[idx], dtype=torch.float32),
            self.labels[idx]
        )

# Prepare data
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
train_dataset = MoleculeDataset(smiles_train, y_train, transformer)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Simple feed-forward classifier
class MoleculeClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 1),
            nn.Sigmoid()
        )

    def forward(self, x):
        return self.network(x)

# Train the model
model = MoleculeClassifier(input_dim=2048)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

for epoch in range(10):
    for batch_features, batch_labels in train_loader:
        optimizer.zero_grad()
        outputs = model(batch_features).squeeze()
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()
```
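
After training, a short evaluation pass rounds out the workflow. A sketch, assuming a `test_loader` built the same way as `train_loader` above:

```python
# Evaluate on held-out data (test_loader assumed to mirror train_loader)
model.eval()
correct, total = 0, 0
with torch.no_grad():
    for batch_features, batch_labels in test_loader:
        outputs = model(batch_features).squeeze()
        preds = (outputs > 0.5).float()  # threshold the sigmoid output
        correct += (preds == batch_labels).sum().item()
        total += batch_labels.size(0)
print(f"Test accuracy: {correct / total:.3f}")
```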

---

## Advanced Usage Patterns

### Custom Preprocessing

```python
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
import datamol as dm

class CustomTransformer(MoleculeTransformer):
    def preprocess(self, mol):
        """Custom preprocessing: standardize the molecule"""
        if isinstance(mol, str):
            mol = dm.to_mol(mol)

        # Standardize
        mol = dm.standardize_mol(mol)

        # Remove salts
        mol = dm.remove_salts(mol)

        return mol

# Use the custom transformer
transformer = CustomTransformer(FPCalculator("ecfp"), n_jobs=-1)
features = transformer(smiles_list)
```

### Featurization with Conformers

```python
import datamol as dm
from molfeat.calc import RDKitDescriptors3D

# Generate conformers
def prepare_3d_mol(smiles):
    mol = dm.to_mol(smiles)
    mol = dm.add_hs(mol)
    mol = dm.conformers.generate(mol, n_confs=1)
    return mol

# 3D descriptors
calc_3d = RDKitDescriptors3D()

smiles = "CC(C)Cc1ccc(C)cc1C"
mol_3d = prepare_3d_mol(smiles)
descriptors_3d = calc_3d(mol_3d)
```

### Parallel Batch Processing

```python
from molfeat.trans import MoleculeTransformer
from molfeat.calc import FPCalculator
import time

# Large dataset
smiles_large = load_large_dataset()  # e.g., 100,000 molecules

# Test different parallelization levels
for n_jobs in [1, 2, 4, -1]:
    transformer = MoleculeTransformer(
        FPCalculator("ecfp"),
        n_jobs=n_jobs
    )

    start = time.time()
    features = transformer(smiles_large)
    elapsed = time.time() - start

    print(f"n_jobs={n_jobs}: {elapsed:.2f}s")
```

### Caching for Expensive Operations

```python
from molfeat.trans.pretrained import PretrainedHFTransformer
import pickle

# Load an expensive pretrained model
transformer = PretrainedHFTransformer(kind="ChemBERTa-77M-MLM", notation="smiles")

# Cache embeddings for reuse
cache_file = "embeddings_cache.pkl"

try:
    # Try loading cached embeddings
    with open(cache_file, "rb") as f:
        embeddings = pickle.load(f)
    print("Loaded cached embeddings")
except FileNotFoundError:
    # Compute and cache
    embeddings = transformer(smiles_list)
    with open(cache_file, "wb") as f:
        pickle.dump(embeddings, f)
    print("Computed and cached embeddings")
```

---

## Common Workflows

### Virtual Screening Workflow

```python
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer
from sklearn.ensemble import RandomForestClassifier

# 1. Prepare training data (known actives/inactives)
train_smiles = load_training_data()
train_labels = load_training_labels()  # 1=active, 0=inactive

# 2. Featurize the training set
transformer = MoleculeTransformer(FPCalculator("ecfp"), n_jobs=-1)
X_train = transformer(train_smiles)

# 3. Train a classifier
clf = RandomForestClassifier(n_estimators=500, n_jobs=-1)
clf.fit(X_train, train_labels)

# 4. Featurize the screening library
screening_smiles = load_screening_library()  # e.g., 1M compounds
X_screen = transformer(screening_smiles)

# 5. Predict and rank
predictions = clf.predict_proba(X_screen)[:, 1]
ranked_indices = predictions.argsort()[::-1]

# 6. Get the top hits
top_n = 1000
top_hits = [screening_smiles[i] for i in ranked_indices[:top_n]]
```

### QSAR Model Building

```python
from molfeat.calc import RDKitDescriptors2D
from molfeat.trans import MoleculeTransformer
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
import numpy as np

# Load a QSAR dataset
smiles = load_molecules()
y = load_activity_values()  # e.g., IC50, logP

# Featurize with interpretable descriptors
transformer = MoleculeTransformer(RDKitDescriptors2D(), n_jobs=-1)
X = transformer(smiles)

# Standardize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Build a linear model
model = Ridge(alpha=1.0)
scores = cross_val_score(model, X_scaled, y, cv=5, scoring='r2')
print(f"R² = {scores.mean():.3f} (+/- {scores.std():.3f})")

# Fit the final model
model.fit(X_scaled, y)

# Interpret feature importance
feature_names = transformer.featurizer.columns
importance = np.abs(model.coef_)
top_features_idx = importance.argsort()[-10:][::-1]

print("Top 10 important features:")
for idx in top_features_idx:
    print(f"  {feature_names[idx]}: {model.coef_[idx]:.3f}")
```
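
RDKit 2D descriptors can contain NaNs or constant columns for some molecules, which breaks StandardScaler and dilutes the model. A cleanup sketch to run before the scaling step above (the masking logic is illustrative, not part of molfeat):

```python
import numpy as np

# Drop NaN-containing and zero-variance descriptor columns
X = np.asarray(X, dtype=float)
keep = ~np.isnan(X).any(axis=0) & (X.std(axis=0) > 0)
X = X[:, keep]
feature_names = [n for n, k in zip(transformer.featurizer.columns, keep) if k]
```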

### Similarity Search

```python
from molfeat.calc import FPCalculator
from molfeat.trans import MoleculeTransformer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Query molecule
query_smiles = "CC(=O)Oc1ccccc1C(=O)O"  # Aspirin

# Database of molecules
database_smiles = load_molecule_database()  # Large collection

# Compute fingerprints
calc = FPCalculator("ecfp")
query_fp = calc(query_smiles).reshape(1, -1)

transformer = MoleculeTransformer(calc, n_jobs=-1)
database_fps = transformer(database_smiles)

# Compute similarity
similarities = cosine_similarity(query_fp, database_fps)[0]

# Find the most similar molecules
top_k = 10
top_indices = similarities.argsort()[-top_k:][::-1]

print(f"Top {top_k} similar molecules:")
for i, idx in enumerate(top_indices, 1):
    print(f"{i}. {database_smiles[idx]} (similarity: {similarities[idx]:.3f})")
```
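
Cosine similarity works directly on the fingerprint matrix, but for binary fingerprints Tanimoto similarity is the more conventional metric, and it reduces to dot products. A sketch, assuming `query_fp` and `database_fps` are the binary arrays from above:

```python
# Tanimoto = |A ∩ B| / |A ∪ B|; for 0/1 vectors both terms come from dot products
q = query_fp.ravel()
intersection = database_fps @ q
union = database_fps.sum(axis=1) + q.sum() - intersection
tanimoto = intersection / union
```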

---

## Troubleshooting

### Handling Invalid Molecules

```python
# Use ignore_errors to skip invalid molecules
transformer = MoleculeTransformer(
    FPCalculator("ecfp"),
    ignore_errors=True,
    verbose=True
)

# Filter out None values after transformation
features = transformer(smiles_list)
valid_mask = [f is not None for f in features]
valid_features = [f for f in features if f is not None]
valid_smiles = [s for s, m in zip(smiles_list, valid_mask) if m]
```
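
Alternatively, unparseable SMILES can be filtered out before featurization. A sketch using datamol's `dm.to_mol`, which returns None for input RDKit cannot parse:

```python
import datamol as dm

# Keep only SMILES that RDKit can parse
valid_smiles = [s for s in smiles_list if dm.to_mol(s) is not None]
features = transformer(valid_smiles)
```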

### Memory Management for Large Datasets

```python
import numpy as np

# Process in chunks for very large datasets
def featurize_in_chunks(smiles_list, transformer, chunk_size=10000):
    all_features = []

    for i in range(0, len(smiles_list), chunk_size):
        chunk = smiles_list[i:i+chunk_size]
        features = transformer(chunk)
        all_features.append(features)
        print(f"Processed {i+len(chunk)}/{len(smiles_list)}")

    return np.vstack(all_features)

# Use with a large dataset
features = featurize_in_chunks(large_smiles_list, transformer)
```
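
When even the assembled feature matrix is too large for memory, each chunk can be written to disk instead of accumulated. A sketch using NumPy `.npy` files (the filename scheme is illustrative):

```python
import numpy as np

def featurize_to_disk(smiles_list, transformer, chunk_size=10000, prefix="features"):
    # Write one .npy file per chunk instead of holding everything in RAM
    for i in range(0, len(smiles_list), chunk_size):
        chunk = smiles_list[i:i + chunk_size]
        np.save(f"{prefix}_{i // chunk_size:04d}.npy", np.asarray(transformer(chunk)))
```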

### Reproducibility

```python
import random
import numpy as np
import torch

# Set all random seeds
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed(42)

# Save the exact configuration
transformer.to_state_yaml_file("config.yml")

# Document the version
import molfeat
print(f"molfeat version: {molfeat.__version__}")
```