Initial commit

This commit is contained in:
Zhongwei Li
2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions

View File

@@ -0,0 +1,243 @@
#!/usr/bin/env python3
"""
Molecular Properties Calculator
Calculate comprehensive molecular properties and descriptors for molecules.
Supports single molecules or batch processing from files.
Usage:
python molecular_properties.py "CCO"
python molecular_properties.py --file molecules.smi --output properties.csv
"""
import argparse
import sys
from pathlib import Path
try:
from rdkit import Chem
from rdkit.Chem import Descriptors, Lipinski
except ImportError:
print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
sys.exit(1)
def calculate_properties(mol):
    """Calculate a dictionary of molecular properties and descriptors.

    Args:
        mol: RDKit ``Mol`` object, or ``None``.

    Returns:
        Dict mapping property name to value, or ``None`` when ``mol`` is
        ``None``. Keys are inserted in a fixed order, which is also the
        column order used when the results are written to CSV.
    """
    if mol is None:
        return None

    # Imported explicitly so the formula lookup does not rely on the
    # submodule having been loaded as a side effect of another import.
    from rdkit.Chem import rdMolDescriptors

    properties = {
        # Basic properties
        'SMILES': Chem.MolToSmiles(mol),
        'Molecular_Formula': rdMolDescriptors.CalcMolFormula(mol),
        # Molecular weight
        'MW': Descriptors.MolWt(mol),
        'ExactMW': Descriptors.ExactMolWt(mol),
        # Lipophilicity
        'LogP': Descriptors.MolLogP(mol),
        'MR': Descriptors.MolMR(mol),
        # Polar surface area
        'TPSA': Descriptors.TPSA(mol),
        'LabuteASA': Descriptors.LabuteASA(mol),
        # Hydrogen bonding
        'HBD': Descriptors.NumHDonors(mol),
        'HBA': Descriptors.NumHAcceptors(mol),
        # Atom counts
        'Heavy_Atoms': Descriptors.HeavyAtomCount(mol),
        'Heteroatoms': Descriptors.NumHeteroatoms(mol),
        'Valence_Electrons': Descriptors.NumValenceElectrons(mol),
        # Ring information
        'Rings': Descriptors.RingCount(mol),
        'Aromatic_Rings': Descriptors.NumAromaticRings(mol),
        'Saturated_Rings': Descriptors.NumSaturatedRings(mol),
        'Aliphatic_Rings': Descriptors.NumAliphaticRings(mol),
        'Aromatic_Heterocycles': Descriptors.NumAromaticHeterocycles(mol),
        # Flexibility
        'Rotatable_Bonds': Descriptors.NumRotatableBonds(mol),
        # RDKit spells this descriptor "FractionCSP3"; the output key keeps
        # its original spelling so existing consumers are unaffected.
        'Fraction_Csp3': Descriptors.FractionCSP3(mol),
        # Complexity
        'BertzCT': Descriptors.BertzCT(mol),
        # Drug-likeness
        'QED': Descriptors.qed(mol),
    }

    # Lipinski's Rule of Five, applied strictly: all four criteria must hold.
    properties['Lipinski_Pass'] = (
        properties['MW'] <= 500 and
        properties['LogP'] <= 5 and
        properties['HBD'] <= 5 and
        properties['HBA'] <= 10
    )

    # Lead-likeness: molecular weight window plus LogP and flexibility caps.
    properties['Lead-like'] = (
        250 <= properties['MW'] <= 350 and
        properties['LogP'] <= 3.5 and
        properties['Rotatable_Bonds'] <= 7
    )
    return properties
def process_single_molecule(smiles):
    """Parse one SMILES string and compute its property dictionary.

    Prints an error message and returns ``None`` when the string cannot be
    parsed; otherwise returns whatever ``calculate_properties`` produces.
    """
    parsed = Chem.MolFromSmiles(smiles)
    if parsed is not None:
        return calculate_properties(parsed)

    print(f"Error: Failed to parse SMILES: {smiles}")
    return None
def process_file(input_file, output_file=None):
    """Compute properties for every molecule in a file.

    Args:
        input_file: Path to an SDF/MOL file (``.sdf``, ``.mol``) or a SMILES
            file (``.smi``, ``.smiles``, ``.txt``; no header line expected).
        output_file: Optional CSV path. When omitted, the properties are
            printed to the console instead.

    Returns:
        List of property dicts (each with an added 1-based ``'Index'`` of the
        record in the input file), or ``None`` when the file does not exist
        or has an unsupported extension.
    """
    input_path = Path(input_file)
    if not input_path.exists():
        print(f"Error: File not found: {input_file}")
        return None

    # Pick the RDKit supplier from the file extension.
    suffix = input_path.suffix.lower()
    if suffix in ('.sdf', '.mol'):
        suppl = Chem.SDMolSupplier(str(input_path))
    elif suffix in ('.smi', '.smiles', '.txt'):
        suppl = Chem.SmilesMolSupplier(str(input_path), titleLine=False)
    else:
        print(f"Error: Unsupported file format: {input_path.suffix}")
        return None

    results = []
    for idx, mol in enumerate(suppl, start=1):
        if mol is None:
            print(f"Warning: Failed to parse molecule {idx}")
            continue
        props = calculate_properties(mol)
        if props:
            props['Index'] = idx
            results.append(props)

    if output_file:
        write_csv(results, output_file)
        # write_csv creates no file for an empty result list, so only
        # announce success when something was actually written.
        if results:
            print(f"Results written to: {output_file}")
    else:
        # Print to console
        for props in results:
            print("\n" + "="*60)
            for key, value in props.items():
                print(f"{key:25s}: {value}")
    return results
def write_csv(results, output_file):
    """Dump a list of property dictionaries to a CSV file.

    The header row is taken from the keys of the first dictionary. When
    ``results`` is empty a notice is printed and no file is created.
    """
    import csv

    if results:
        columns = list(results[0])
        with open(output_file, 'w', newline='') as handle:
            table = csv.DictWriter(handle, fieldnames=columns)
            table.writeheader()
            for row in results:
                table.writerow(row)
        return

    print("No results to write")
def print_properties(props):
    """Print a property dictionary as a sectioned, human-readable report.

    Args:
        props: Dict as produced by ``calculate_properties``; every key named
            in the layout below must be present.
    """
    rule = "=" * 60

    # (section title, rows); each row is (label, property key, format spec).
    # An empty spec prints the value unchanged; 'yn' renders truthiness as
    # Yes/No.
    layout = (
        ("Basic Information", (
            ("SMILES", 'SMILES', ''),
            ("Formula", 'Molecular_Formula', ''),
        )),
        ("Size & Weight", (
            ("Molecular Weight", 'MW', '.2f'),
            ("Exact MW", 'ExactMW', '.4f'),
            ("Heavy Atoms", 'Heavy_Atoms', ''),
            ("Heteroatoms", 'Heteroatoms', ''),
        )),
        ("Lipophilicity", (
            ("LogP", 'LogP', '.2f'),
            ("Molar Refractivity", 'MR', '.2f'),
        )),
        ("Polarity", (
            ("TPSA", 'TPSA', '.2f'),
            ("Labute ASA", 'LabuteASA', '.2f'),
            ("H-bond Donors", 'HBD', ''),
            ("H-bond Acceptors", 'HBA', ''),
        )),
        ("Ring Systems", (
            ("Total Rings", 'Rings', ''),
            ("Aromatic Rings", 'Aromatic_Rings', ''),
            ("Saturated Rings", 'Saturated_Rings', ''),
            ("Aliphatic Rings", 'Aliphatic_Rings', ''),
            ("Aromatic Heterocycles", 'Aromatic_Heterocycles', ''),
        )),
        ("Flexibility & Complexity", (
            ("Rotatable Bonds", 'Rotatable_Bonds', ''),
            ("Fraction Csp3", 'Fraction_Csp3', '.3f'),
            ("Bertz Complexity", 'BertzCT', '.1f'),
        )),
        ("Drug-likeness", (
            ("QED Score", 'QED', '.3f'),
            ("Lipinski Pass", 'Lipinski_Pass', 'yn'),
            ("Lead-like", 'Lead-like', 'yn'),
        )),
    )

    print("\nMolecular Properties:")
    print(rule)
    for title, rows in layout:
        print(f"\n[{title}]")
        for label, key, spec in rows:
            value = props[key]
            if spec == 'yn':
                shown = 'Yes' if value else 'No'
            else:
                shown = format(value, spec)
            print(f" {label}: {shown}")
    print(rule)
def main():
    """Command-line entry point: analyze one SMILES string or a molecule file.

    A positional SMILES string takes precedence over ``--file``. The process
    exits with status 1 when no input is given, when the SMILES string cannot
    be parsed, or when the input file is missing or of an unsupported type.
    """
    parser = argparse.ArgumentParser(
        description='Calculate molecular properties for molecules',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Single molecule
python molecular_properties.py "CCO"
# From file
python molecular_properties.py --file molecules.smi
# Save to CSV
python molecular_properties.py --file molecules.sdf --output properties.csv
"""
    )
    parser.add_argument('smiles', nargs='?', help='SMILES string to analyze')
    parser.add_argument('--file', '-f', help='Input file (SDF or SMILES)')
    parser.add_argument('--output', '-o', help='Output CSV file')
    args = parser.parse_args()

    if not args.smiles and not args.file:
        parser.print_help()
        sys.exit(1)

    if args.smiles:
        # Process single molecule
        if args.file:
            # Make the precedence visible instead of dropping --file silently.
            print("Warning: Both a SMILES string and --file were given; ignoring --file")
        props = process_single_molecule(args.smiles)
        if props is None:
            # The parse error was already reported; signal failure to the shell.
            sys.exit(1)
        print_properties(props)
    else:
        # Process file; None means the file was missing or unsupported.
        if process_file(args.file, args.output) is None:
            sys.exit(1)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,295 @@
#!/usr/bin/env python3
"""
Molecular Similarity Search
Perform fingerprint-based similarity screening against a database of molecules.
Supports multiple fingerprint types and similarity metrics.
Usage:
python similarity_search.py "CCO" database.smi --threshold 0.7
python similarity_search.py query.smi database.sdf --method morgan --output hits.csv
"""
import argparse
import sys
from pathlib import Path
try:
from rdkit import Chem
from rdkit.Chem import AllChem, MACCSkeys
from rdkit import DataStructs
except ImportError:
print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
sys.exit(1)
# Supported fingerprint methods: command-line name -> human-readable
# description. The keys are the allowed values of --method and the
# descriptions are listed in the --help epilog built in main().
FINGERPRINT_METHODS = {
    'morgan': 'Morgan fingerprint (ECFP-like)',
    'rdkit': 'RDKit topological fingerprint',
    'maccs': 'MACCS structural keys',
    'atompair': 'Atom pair fingerprint',
    'torsion': 'Topological torsion fingerprint'
}
def generate_fingerprint(mol, method='morgan', radius=2, n_bits=2048):
    """Generate a bit-vector fingerprint for a molecule.

    Args:
        mol: RDKit ``Mol`` object, or ``None``.
        method: One of ``'morgan'``, ``'rdkit'``, ``'maccs'``, ``'atompair'``
            or ``'torsion'`` (case-insensitive).
        radius: Morgan fingerprint radius; only used by ``'morgan'``.
        n_bits: Fingerprint length; not passed to ``'maccs'``.

    Returns:
        The fingerprint bit vector, or ``None`` when ``mol`` is ``None``.

    Raises:
        ValueError: If ``method`` is not one of the supported names.
    """
    if mol is None:
        return None

    method = method.lower()
    if method == 'morgan':
        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
    if method == 'rdkit':
        return Chem.RDKFingerprint(mol, maxPath=7, fpSize=n_bits)
    if method == 'maccs':
        return MACCSkeys.GenMACCSKeys(mol)
    if method in ('atompair', 'torsion'):
        # Use the hashed, fixed-length variants from rdMolDescriptors: they
        # accept nBits, so every fingerprint in a search has the same size
        # and can be compared with the bit-vector similarity functions.
        from rdkit.Chem import rdMolDescriptors
        if method == 'atompair':
            return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
                mol, nBits=n_bits)
        return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
            mol, nBits=n_bits)
    raise ValueError(f"Unknown fingerprint method: {method}")
def load_molecules(file_path):
    """Load molecules from an SDF/MOL or SMILES file.

    Args:
        file_path: Path to a ``.sdf``/``.mol`` file or a ``.smi``/``.smiles``/
            ``.txt`` SMILES file (no header line expected).

    Returns:
        List of dicts with keys ``'index'`` (1-based record position in the
        file), ``'name'``, ``'smiles'`` and ``'mol'``. Records that fail to
        parse are reported and skipped. An empty list is returned when the
        file is missing or has an unsupported extension.
    """
    path = Path(file_path)
    if not path.exists():
        print(f"Error: File not found: {file_path}")
        return []

    suffix = path.suffix.lower()
    if suffix in ('.sdf', '.mol'):
        suppl = Chem.SDMolSupplier(str(path))
    elif suffix in ('.smi', '.smiles', '.txt'):
        suppl = Chem.SmilesMolSupplier(str(path), titleLine=False)
    else:
        print(f"Error: Unsupported file format: {path.suffix}")
        return []

    molecules = []
    for idx, mol in enumerate(suppl, start=1):
        if mol is None:
            print(f"Warning: Failed to parse molecule {idx}")
            continue
        # Try to get molecule name. A record may carry a '_Name' property
        # that is empty or blank, so fall back to a generated name in that
        # case too rather than only when the property is absent.
        name = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
        if not name.strip():
            name = f"Mol_{idx}"
        molecules.append({
            'index': idx,
            'name': name,
            'smiles': Chem.MolToSmiles(mol),
            'mol': mol
        })
    return molecules
def similarity_search(query_mol, database, method='morgan', threshold=0.7,
                      radius=2, n_bits=2048, metric='tanimoto'):
    """
    Screen a molecule database for entries similar to a query molecule.

    Args:
        query_mol: Query molecule (RDKit Mol object)
        database: List of dicts with 'index', 'name', 'smiles' and 'mol' keys
        method: Fingerprint method
        threshold: Minimum similarity for an entry to be reported
        radius: Morgan fingerprint radius
        n_bits: Fingerprint size
        metric: Similarity metric (tanimoto, dice, cosine)

    Returns:
        List of hit dicts ('index', 'name', 'smiles', 'similarity'), highest
        similarity first. Empty when the query molecule is None or its
        fingerprint cannot be generated.

    Raises:
        ValueError: If the metric name is not recognised.
    """
    if query_mol is None:
        print("Error: Invalid query molecule")
        return []

    reference_fp = generate_fingerprint(query_mol, method, radius, n_bits)
    if reference_fp is None:
        print("Error: Failed to generate query fingerprint")
        return []

    # Metric name -> name of the DataStructs similarity function.
    metric_functions = {
        'tanimoto': 'TanimotoSimilarity',
        'dice': 'DiceSimilarity',
        'cosine': 'CosineSimilarity',
    }
    function_name = metric_functions.get(metric.lower())
    if function_name is None:
        raise ValueError(f"Unknown similarity metric: {metric}")
    compare = getattr(DataStructs, function_name)

    matches = []
    for entry in database:
        candidate_fp = generate_fingerprint(entry['mol'], method, radius, n_bits)
        if candidate_fp is None:
            continue
        score = compare(reference_fp, candidate_fp)
        if score >= threshold:
            matches.append({
                'index': entry['index'],
                'name': entry['name'],
                'smiles': entry['smiles'],
                'similarity': score
            })

    # Best matches first; ties keep their database order.
    return sorted(matches, key=lambda hit: hit['similarity'], reverse=True)
def write_results(hits, output_file):
    """Save ranked similarity hits as CSV.

    Columns are Rank (1-based position in ``hits``), Index, Name, SMILES and
    Similarity (formatted to four decimals). The header is written even when
    ``hits`` is empty.
    """
    import csv

    columns = ['Rank', 'Index', 'Name', 'SMILES', 'Similarity']
    with open(output_file, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(
            {
                'Rank': position,
                'Index': entry['index'],
                'Name': entry['name'],
                'SMILES': entry['smiles'],
                'Similarity': f"{entry['similarity']:.4f}"
            }
            for position, entry in enumerate(hits, start=1)
        )
def print_results(hits, max_display=20):
    """Show the top similarity hits as a console table.

    At most ``max_display`` rows are printed; long names and SMILES strings
    are shortened, and a trailing line reports how many hits were omitted.
    """
    if not hits:
        print("\nNo hits found above threshold")
        return

    total = len(hits)
    print(f"\nFound {total} similar molecules:")
    print("="*80)
    print(f"{'Rank':<6} {'Index':<8} {'Similarity':<12} {'Name':<20} {'SMILES'}")
    print("-"*80)

    shown = hits[:max_display]
    for rank, hit in enumerate(shown, 1):
        # Shorten over-long fields so the columns stay roughly aligned.
        name = hit['name']
        if len(name) > 20:
            name = name[:18] + '..'
        smiles = hit['smiles']
        if len(smiles) > 43:
            smiles = smiles[:40] + '...'
        print(f"{rank:<6} {hit['index']:<8} {hit['similarity']:<12.4f} {name:<20} {smiles}")

    if total > max_display:
        print(f"\n... and {total - max_display} more")
    print("="*80)
def main():
    """Command-line entry point for fingerprint-based similarity search.

    The query argument is treated as a molecule file when a file of that name
    exists, otherwise as a SMILES string. Exits with status 1 when the query
    or the database yields no valid molecule, and with an argparse usage
    error when a numeric option is out of range.
    """
    parser = argparse.ArgumentParser(
        description='Molecular similarity search using fingerprints',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=f"""
Available fingerprint methods:
{chr(10).join(f' {k:12s} - {v}' for k, v in FINGERPRINT_METHODS.items())}
Similarity metrics:
tanimoto - Tanimoto coefficient (default)
dice - Dice coefficient
cosine - Cosine similarity
Examples:
# Search with SMILES query
python similarity_search.py "CCO" database.smi --threshold 0.7
# Use different fingerprint
python similarity_search.py query.smi database.sdf --method maccs
# Save results
python similarity_search.py "c1ccccc1" database.smi --output hits.csv
# Adjust Morgan radius
python similarity_search.py "CCO" database.smi --method morgan --radius 3
"""
    )
    parser.add_argument('query', help='Query SMILES or file')
    parser.add_argument('database', help='Database file (SDF or SMILES)')
    parser.add_argument('--method', '-m', default='morgan',
                        choices=FINGERPRINT_METHODS.keys(),
                        help='Fingerprint method (default: morgan)')
    parser.add_argument('--threshold', '-t', type=float, default=0.7,
                        help='Similarity threshold (default: 0.7)')
    parser.add_argument('--radius', '-r', type=int, default=2,
                        help='Morgan fingerprint radius (default: 2)')
    parser.add_argument('--bits', '-b', type=int, default=2048,
                        help='Fingerprint size (default: 2048)')
    parser.add_argument('--metric', default='tanimoto',
                        choices=['tanimoto', 'dice', 'cosine'],
                        help='Similarity metric (default: tanimoto)')
    parser.add_argument('--output', '-o', help='Output CSV file')
    parser.add_argument('--max-display', type=int, default=20,
                        help='Maximum hits to display (default: 20)')
    args = parser.parse_args()

    # Reject values that can only give meaningless or failing searches
    # (e.g. a threshold typed as a percentage silently returns no hits).
    if not 0.0 <= args.threshold <= 1.0:
        parser.error("--threshold must be between 0 and 1")
    if args.radius < 0:
        parser.error("--radius must be non-negative")
    if args.bits <= 0:
        parser.error("--bits must be a positive integer")

    # Load query
    query_path = Path(args.query)
    if query_path.exists():
        # Query is a file
        query_mols = load_molecules(args.query)
        if not query_mols:
            print("Error: No valid molecules in query file")
            sys.exit(1)
        if len(query_mols) > 1:
            # Only one query is searched; say so instead of dropping the rest silently.
            print(f"Warning: Query file contains {len(query_mols)} molecules; using the first one")
        query_mol = query_mols[0]['mol']
        query_smiles = query_mols[0]['smiles']
    else:
        # Query is SMILES string
        query_mol = Chem.MolFromSmiles(args.query)
        query_smiles = args.query
        if query_mol is None:
            print(f"Error: Failed to parse query SMILES: {args.query}")
            sys.exit(1)

    print(f"Query: {query_smiles}")
    print(f"Method: {args.method}")
    print(f"Threshold: {args.threshold}")
    print(f"Loading database: {args.database}...")

    # Load database
    database = load_molecules(args.database)
    if not database:
        print("Error: No valid molecules in database")
        sys.exit(1)
    print(f"Loaded {len(database)} molecules")
    print("Searching...")

    # Perform search
    hits = similarity_search(
        query_mol, database,
        method=args.method,
        threshold=args.threshold,
        radius=args.radius,
        n_bits=args.bits,
        metric=args.metric
    )

    # Output results
    if args.output:
        write_results(hits, args.output)
        print(f"\nResults written to: {args.output}")
    print_results(hits, args.max_display)


if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,386 @@
#!/usr/bin/env python3
"""
Substructure Filter
Filter molecules based on substructure patterns using SMARTS.
Supports inclusion and exclusion filters, and custom pattern libraries.
Usage:
python substructure_filter.py molecules.smi --pattern "c1ccccc1" --output filtered.smi
python substructure_filter.py database.sdf --exclude "C(=O)Cl" --filter-type functional-groups
"""
import argparse
import sys
from pathlib import Path
try:
from rdkit import Chem
except ImportError:
print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
sys.exit(1)
# Common SMARTS pattern libraries.
# Maps a library name (the values accepted by --filter-type) to a dict of
# {pattern name: SMARTS string}. The strings are compiled on demand with
# create_pattern_query() in main(); the pattern name is what appears in the
# "Matches" column of the report.
PATTERN_LIBRARIES = {
    # Functional groups.
    # NOTE(review): several of these are deliberately loose (e.g. '[NX3]'
    # is any three-connected nitrogen, so it presumably also hits amides) -
    # confirm they are specific enough for the intended use.
    'functional-groups': {
        'alcohol': '[OH][C]',
        'aldehyde': '[CH1](=O)',
        'ketone': '[C](=O)[C]',
        'carboxylic_acid': 'C(=O)[OH]',
        'ester': 'C(=O)O[C]',
        'amide': 'C(=O)N',
        'amine': '[NX3]',
        'ether': '[C][O][C]',
        'nitrile': 'C#N',
        'nitro': '[N+](=O)[O-]',
        'halide': '[C][F,Cl,Br,I]',
        'thiol': '[C][SH]',
        'sulfide': '[C][S][C]',
    },
    # Ring systems (lowercase atom symbols denote aromatic atoms).
    'rings': {
        'benzene': 'c1ccccc1',
        'pyridine': 'n1ccccc1',
        'pyrrole': 'n1cccc1',
        'furan': 'o1cccc1',
        'thiophene': 's1cccc1',
        'imidazole': 'n1cncc1',
        'indole': 'c1ccc2[nH]ccc2c1',
        'naphthalene': 'c1ccc2ccccc2c1',
    },
    # Substructures described as PAINS (Pan-Assay Interference) in the CLI help.
    # NOTE(review): only five patterns are listed - presumably an
    # illustrative subset rather than a complete PAINS filter; confirm
    # before relying on it for compound triage.
    'pains': {
        'rhodanine': 'S1C(=O)NC(=S)C1',
        'catechol': 'c1ccc(O)c(O)c1',
        'quinone': 'O=C1C=CC(=O)C=C1',
        'michael_acceptor': 'C=CC(=O)',
        'alkyl_halide': '[C][I,Br]',
    },
    # Scaffolds described as "Privileged structures" in the CLI help.
    'privileged': {
        'biphenyl': 'c1ccccc1-c2ccccc2',
        'piperazine': 'N1CCNCC1',
        'piperidine': 'N1CCCCC1',
        'morpholine': 'N1CCOCC1',
    }
}
def load_molecules(file_path, keep_props=True):
    """Read every parseable molecule from an SDF/MOL or SMILES file.

    Args:
        file_path: Path to a ``.sdf``/``.mol`` file or a ``.smi``/``.smiles``/
            ``.txt`` SMILES file (no header line expected).
        keep_props: Accepted but not used by the current implementation.

    Returns:
        List of RDKit Mol objects in file order. Records that fail to parse
        are reported and skipped. An empty list is returned when the file is
        missing or has an unsupported extension.
    """
    source = Path(file_path)
    if not source.exists():
        print(f"Error: File not found: {file_path}")
        return []

    extension = source.suffix.lower()
    if extension in ('.sdf', '.mol'):
        supplier = Chem.SDMolSupplier(str(source))
    elif extension in ('.smi', '.smiles', '.txt'):
        supplier = Chem.SmilesMolSupplier(str(source), titleLine=False)
    else:
        print(f"Error: Unsupported file format: {source.suffix}")
        return []

    loaded = []
    for position, candidate in enumerate(supplier, start=1):
        if candidate is not None:
            loaded.append(candidate)
        else:
            print(f"Warning: Failed to parse molecule {position}")
    return loaded
def create_pattern_query(pattern_string):
    """Compile a pattern string into an RDKit query molecule.

    The string is interpreted as SMARTS first and, if that fails, as SMILES.
    Prints an error and returns ``None`` when neither parser accepts it.
    """
    # SMARTS takes priority; SMILES is only the fallback.
    for parse in (Chem.MolFromSmarts, Chem.MolFromSmiles):
        compiled = parse(pattern_string)
        if compiled is not None:
            return compiled

    print(f"Error: Invalid pattern: {pattern_string}")
    return None
def filter_molecules(molecules, include_patterns=None, exclude_patterns=None,
                     match_all_include=False):
    """
    Select molecules by substructure inclusion/exclusion patterns.

    Exclusion is checked first: a molecule matching any exclude pattern is
    dropped regardless of the include patterns. Without include patterns,
    every non-excluded molecule is kept.

    Args:
        molecules: List of RDKit Mol objects (None entries are skipped)
        include_patterns: List of (name, pattern) tuples to include
        exclude_patterns: List of (name, pattern) tuples to exclude
        match_all_include: If True, molecule must match ALL include patterns
    Returns:
        Tuple of (filtered_molecules, match_info); match_info holds one dict
        per non-None molecule with 'index' (1-based input position),
        'smiles', 'status' ('excluded', 'included' or 'no_match') and
        'matches' (names of the patterns that decided the status).
    """
    kept = []
    report = []

    for position, mol in enumerate(molecules, start=1):
        if mol is None:
            continue

        # All matching exclude patterns are collected, not just the first.
        hit_excludes = [
            label for label, query in (exclude_patterns or [])
            if mol.HasSubstructMatch(query)
        ]

        if hit_excludes:
            status, matched = 'excluded', hit_excludes
        elif not include_patterns:
            # No inclusion patterns, keep all non-excluded
            status, matched = 'included', []
        else:
            hit_includes = [
                label for label, query in include_patterns
                if mol.HasSubstructMatch(query)
            ]
            if match_all_include:
                accepted = len(hit_includes) == len(include_patterns)
            else:
                accepted = len(hit_includes) > 0
            if accepted:
                status, matched = 'included', hit_includes
            else:
                status, matched = 'no_match', []

        if status == 'included':
            kept.append(mol)
        report.append({
            'index': position,
            'smiles': Chem.MolToSmiles(mol),
            'status': status,
            'matches': matched
        })

    return kept, report
def write_molecules(molecules, output_file):
    """Write molecules to an SDF or SMILES file, chosen by file extension.

    Args:
        molecules: List of RDKit Mol objects.
        output_file: Destination path; ``.sdf`` writes SD records, while
            ``.smi``/``.smiles``/``.txt`` write one ``SMILES name`` line per
            molecule (the name is omitted when the molecule has none).

    Prints an error and writes nothing for any other extension.
    """
    output_path = Path(output_file)
    suffix = output_path.suffix.lower()

    if suffix == '.sdf':
        writer = Chem.SDWriter(str(output_path))
        try:
            for mol in molecules:
                writer.write(mol)
        finally:
            # Always flush and release the file, even if a write fails.
            writer.close()
    elif suffix in ('.smi', '.smiles', '.txt'):
        with open(output_path, 'w') as f:
            for mol in molecules:
                smiles = Chem.MolToSmiles(mol)
                name = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
                # Avoid a dangling separator when there is no name to write.
                f.write(f"{smiles} {name}\n" if name else f"{smiles}\n")
    else:
        print(f"Error: Unsupported output format: {output_path.suffix}")
        return

    print(f"Wrote {len(molecules)} molecules to {output_file}")
def write_report(match_info, output_file):
    """Save the per-molecule filtering outcome as CSV.

    Columns are Index, SMILES, Status and Matches, where Matches is the
    list of matched pattern names joined with ", ".
    """
    import csv

    columns = ['Index', 'SMILES', 'Status', 'Matches']
    with open(output_file, 'w', newline='') as handle:
        table = csv.DictWriter(handle, fieldnames=columns)
        table.writeheader()
        table.writerows(
            {
                'Index': entry['index'],
                'SMILES': entry['smiles'],
                'Status': entry['status'],
                'Matches': ', '.join(entry['matches'])
            }
            for entry in match_info
        )
def print_summary(total, filtered, match_info):
    """Print a summary of a filtering run.

    Args:
        total: Number of molecules that were given to the filter.
        filtered: List of molecules that passed.
        match_info: Per-molecule dicts with a ``'status'`` key, as produced
            by ``filter_molecules``.
    """
    passed = len(filtered)
    # An empty input set has no meaningful rate; report 0.0% rather than
    # dividing by zero.
    pass_rate = passed / total * 100 if total else 0.0

    print("\n" + "="*60)
    print("Filtering Summary")
    print("="*60)
    print(f"Total molecules: {total}")
    print(f"Passed filter: {passed}")
    print(f"Filtered out: {total - passed}")
    print(f"Pass rate: {pass_rate:.1f}%")

    # Count by status, keeping the order in which statuses first appear.
    status_counts = {}
    for info in match_info:
        status = info['status']
        status_counts[status] = status_counts.get(status, 0) + 1

    print("\nBreakdown:")
    for status, count in status_counts.items():
        print(f" {status:15s}: {count}")
    print("="*60)
def main():
    """Command-line entry point for substructure filtering.

    Builds include/exclude pattern lists from ``--pattern``, ``--exclude``
    and an optional ``--filter-type`` library, filters the input molecules,
    prints a summary and optionally writes the kept molecules and a CSV
    report. ``--list-patterns`` prints the built-in libraries and exits
    without needing an input file.

    Exits with status 1 when no molecules load, when a user-supplied pattern
    is invalid, or when no pattern is specified.
    """
    parser = argparse.ArgumentParser(
        description='Filter molecules by substructure patterns',
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Pattern libraries:
--filter-type functional-groups Common functional groups
--filter-type rings Ring systems
--filter-type pains PAINS (Pan-Assay Interference)
--filter-type privileged Privileged structures
Examples:
# Include molecules with benzene ring
python substructure_filter.py molecules.smi --pattern "c1ccccc1" -o filtered.smi
# Exclude reactive groups
python substructure_filter.py database.sdf --exclude "C(=O)Cl" -o clean.sdf
# Filter by functional groups
python substructure_filter.py molecules.smi --filter-type functional-groups -o fg.smi
# Remove PAINS
python substructure_filter.py compounds.smi --filter-type pains --exclude-mode -o clean.smi
# Multiple patterns
python substructure_filter.py mol.smi --pattern "c1ccccc1" --pattern "N" -o aromatic_amines.smi
"""
    )
    # Optional at the parser level so that --list-patterns works on its own;
    # it is enforced below for every other mode.
    parser.add_argument('input', nargs='?', help='Input file (SDF or SMILES)')
    parser.add_argument('--pattern', '-p', action='append',
                        help='SMARTS/SMILES pattern to include (can specify multiple)')
    parser.add_argument('--exclude', '-e', action='append',
                        help='SMARTS/SMILES pattern to exclude (can specify multiple)')
    parser.add_argument('--filter-type', choices=PATTERN_LIBRARIES.keys(),
                        help='Use predefined pattern library')
    parser.add_argument('--exclude-mode', action='store_true',
                        help='Use filter-type patterns for exclusion instead of inclusion')
    parser.add_argument('--match-all', action='store_true',
                        help='Molecule must match ALL include patterns')
    parser.add_argument('--output', '-o', help='Output file')
    parser.add_argument('--report', '-r', help='Write detailed report to CSV')
    parser.add_argument('--list-patterns', action='store_true',
                        help='List available pattern libraries and exit')
    args = parser.parse_args()

    # List patterns mode
    if args.list_patterns:
        print("\nAvailable Pattern Libraries:")
        print("="*60)
        for lib_name, patterns in PATTERN_LIBRARIES.items():
            print(f"\n{lib_name}:")
            for name, pattern in patterns.items():
                print(f" {name:25s}: {pattern}")
        sys.exit(0)

    if not args.input:
        parser.error("the following arguments are required: input")

    # Load molecules
    print(f"Loading molecules from: {args.input}")
    molecules = load_molecules(args.input)
    if not molecules:
        print("Error: No valid molecules loaded")
        sys.exit(1)
    print(f"Loaded {len(molecules)} molecules")

    # Prepare patterns
    include_patterns = []
    exclude_patterns = []
    invalid_user_pattern = False

    # Add custom include patterns
    for pattern_str in args.pattern or []:
        query = create_pattern_query(pattern_str)
        if query is None:
            invalid_user_pattern = True
        else:
            include_patterns.append(('custom', query))

    # Add custom exclude patterns
    for pattern_str in args.exclude or []:
        query = create_pattern_query(pattern_str)
        if query is None:
            invalid_user_pattern = True
        else:
            exclude_patterns.append(('custom', query))

    # Dropping a bad pattern silently would run a different filter than the
    # one requested; each bad pattern was already reported, so stop here.
    if invalid_user_pattern:
        sys.exit(1)

    # Add library patterns
    if args.filter_type:
        lib_patterns = PATTERN_LIBRARIES[args.filter_type]
        target = exclude_patterns if args.exclude_mode else include_patterns
        for name, pattern_str in lib_patterns.items():
            query = create_pattern_query(pattern_str)
            if query is not None:
                target.append((name, query))

    if not include_patterns and not exclude_patterns:
        print("Error: No patterns specified")
        sys.exit(1)

    # Print filter configuration
    print("\nFilter configuration:")
    if include_patterns:
        print(f" Include patterns: {len(include_patterns)}")
        if args.match_all:
            print(" Mode: Match ALL")
        else:
            print(" Mode: Match ANY")
    if exclude_patterns:
        print(f" Exclude patterns: {len(exclude_patterns)}")

    # Perform filtering
    print("\nFiltering...")
    filtered, match_info = filter_molecules(
        molecules,
        include_patterns=include_patterns if include_patterns else None,
        exclude_patterns=exclude_patterns if exclude_patterns else None,
        match_all_include=args.match_all
    )

    # Print summary
    print_summary(len(molecules), filtered, match_info)

    # Write output
    if args.output:
        write_molecules(filtered, args.output)
    if args.report:
        write_report(match_info, args.report)
        print(f"Detailed report written to: {args.report}")


if __name__ == '__main__':
    main()