Initial commit

2025-11-30 08:30:10 +08:00
commit f0bd18fb4e
824 changed files with 331919 additions and 0 deletions
--- a/skills/rdkit/scripts/molecular_properties.py
+++ b/skills/rdkit/scripts/molecular_properties.py
@@ -0,0 +1,243 @@
+#!/usr/bin/env python3
+"""
+Molecular Properties Calculator
+
+Calculate comprehensive molecular properties and descriptors for molecules.
+Supports single molecules or batch processing from files.
+
+Usage:
+    python molecular_properties.py "CCO"
+    python molecular_properties.py --file molecules.smi --output properties.csv
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+try:
+    from rdkit import Chem
+    from rdkit.Chem import Descriptors, Lipinski
+except ImportError:
+    print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
+    sys.exit(1)
+
+
+def calculate_properties(mol):
+    """Calculate a dictionary of descriptors and drug-likeness flags.
+
+    Args:
+        mol: RDKit Mol object, or None.
+
+    Returns:
+        None when ``mol`` is None; otherwise a dict keyed by descriptor name
+        (``SMILES``, ``MW``, ``LogP``, ...) plus the boolean flags
+        ``Lipinski_Pass`` and ``Lead-like`` derived from those values.
+    """
+    if mol is None:
+        return None
+
+    # Imported explicitly rather than relying on ``Chem.rdMolDescriptors``
+    # having been loaded as a side effect of some other import.
+    from rdkit.Chem import rdMolDescriptors
+
+    properties = {
+        # Basic properties
+        'SMILES': Chem.MolToSmiles(mol),
+        'Molecular_Formula': rdMolDescriptors.CalcMolFormula(mol),
+
+        # Molecular weight
+        'MW': Descriptors.MolWt(mol),
+        'ExactMW': Descriptors.ExactMolWt(mol),
+
+        # Lipophilicity
+        'LogP': Descriptors.MolLogP(mol),
+        'MR': Descriptors.MolMR(mol),
+
+        # Polar surface area
+        'TPSA': Descriptors.TPSA(mol),
+        'LabuteASA': Descriptors.LabuteASA(mol),
+
+        # Hydrogen bonding
+        'HBD': Descriptors.NumHDonors(mol),
+        'HBA': Descriptors.NumHAcceptors(mol),
+
+        # Atom counts
+        'Heavy_Atoms': Descriptors.HeavyAtomCount(mol),
+        'Heteroatoms': Descriptors.NumHeteroatoms(mol),
+        'Valence_Electrons': Descriptors.NumValenceElectrons(mol),
+
+        # Ring information
+        'Rings': Descriptors.RingCount(mol),
+        'Aromatic_Rings': Descriptors.NumAromaticRings(mol),
+        'Saturated_Rings': Descriptors.NumSaturatedRings(mol),
+        'Aliphatic_Rings': Descriptors.NumAliphaticRings(mol),
+        'Aromatic_Heterocycles': Descriptors.NumAromaticHeterocycles(mol),
+
+        # Flexibility
+        'Rotatable_Bonds': Descriptors.NumRotatableBonds(mol),
+        # RDKit spells this descriptor ``FractionCSP3``; the output key keeps
+        # its original name so consumers of the dict are unaffected.
+        'Fraction_Csp3': Descriptors.FractionCSP3(mol),
+
+        # Complexity
+        'BertzCT': Descriptors.BertzCT(mol),
+
+        # Drug-likeness
+        'QED': Descriptors.qed(mol),
+    }
+
+    # Lipinski's Rule of Five (strict form: every criterion must hold)
+    properties['Lipinski_Pass'] = (
+        properties['MW'] <= 500 and
+        properties['LogP'] <= 5 and
+        properties['HBD'] <= 5 and
+        properties['HBA'] <= 10
+    )
+
+    # Lead-likeness
+    properties['Lead-like'] = (
+        250 <= properties['MW'] <= 350 and
+        properties['LogP'] <= 3.5 and
+        properties['Rotatable_Bonds'] <= 7
+    )
+
+    return properties
+
+
+def process_single_molecule(smiles):
+    """Parse one SMILES string and compute its properties.
+
+    Prints an error and returns None when RDKit cannot parse the string;
+    otherwise returns the dict produced by calculate_properties().
+    """
+    parsed = Chem.MolFromSmiles(smiles)
+    if parsed is not None:
+        return calculate_properties(parsed)
+
+    print(f"Error: Failed to parse SMILES: {smiles}")
+    return None
+
+
+def process_file(input_file, output_file=None):
+    """Compute properties for every molecule in an SDF or SMILES file.
+
+    Args:
+        input_file: Path to a .sdf/.mol file or a .smi/.smiles/.txt file.
+        output_file: Optional CSV path. When omitted, results are printed.
+
+    Returns:
+        List of property dicts, each with a 1-based ``Index`` giving the
+        record's position in the input, or None if the file is missing or
+        has an unsupported extension.
+    """
+    input_path = Path(input_file)
+
+    if not input_path.exists():
+        print(f"Error: File not found: {input_file}")
+        return
+
+    # Determine file type
+    suffix = input_path.suffix.lower()
+    if suffix in ['.sdf', '.mol']:
+        suppl = Chem.SDMolSupplier(str(input_path))
+    elif suffix in ['.smi', '.smiles', '.txt']:
+        suppl = Chem.SmilesMolSupplier(str(input_path), titleLine=False)
+    else:
+        print(f"Error: Unsupported file format: {input_path.suffix}")
+        return
+
+    results = []
+    for idx, mol in enumerate(suppl):
+        if mol is None:
+            print(f"Warning: Failed to parse molecule {idx+1}")
+            continue
+
+        props = calculate_properties(mol)
+        if props:
+            props['Index'] = idx + 1
+            results.append(props)
+
+    # Output results
+    if output_file:
+        write_csv(results, output_file)
+        # write_csv() creates no file for an empty result set, so only
+        # announce the file when something was actually written.
+        if results:
+            print(f"Results written to: {output_file}")
+    else:
+        # Print to console
+        for props in results:
+            print("\n" + "="*60)
+            for key, value in props.items():
+                print(f"{key:25s}: {value}")
+
+    return results
+
+
+def write_csv(results, output_file):
+    """Dump a list of property dicts to ``output_file`` as CSV.
+
+    The header row is taken from the keys of the first dict. When
+    ``results`` is empty a notice is printed and no file is created.
+    """
+    import csv
+
+    if results:
+        with open(output_file, 'w', newline='') as handle:
+            out = csv.DictWriter(handle, fieldnames=results[0].keys())
+            out.writeheader()
+            for row in results:
+                out.writerow(row)
+        return
+
+    print("No results to write")
+
+
+def print_properties(props):
+    """Pretty-print a property dict as grouped, labelled sections.
+
+    ``props`` must contain every key produced by calculate_properties();
+    a missing key raises KeyError at the point that line would be printed.
+    """
+    rule = "=" * 60
+
+    # (heading, [(label prefix, props key, format spec), ...]); an empty
+    # format spec renders the value exactly as a bare f-string field would.
+    sections = [
+        ("[Basic Information]", [
+            ("  SMILES:              ", 'SMILES', ''),
+            ("  Formula:             ", 'Molecular_Formula', ''),
+        ]),
+        ("[Size & Weight]", [
+            ("  Molecular Weight:    ", 'MW', '.2f'),
+            ("  Exact MW:            ", 'ExactMW', '.4f'),
+            ("  Heavy Atoms:         ", 'Heavy_Atoms', ''),
+            ("  Heteroatoms:         ", 'Heteroatoms', ''),
+        ]),
+        ("[Lipophilicity]", [
+            ("  LogP:                ", 'LogP', '.2f'),
+            ("  Molar Refractivity:  ", 'MR', '.2f'),
+        ]),
+        ("[Polarity]", [
+            ("  TPSA:                ", 'TPSA', '.2f'),
+            ("  Labute ASA:          ", 'LabuteASA', '.2f'),
+            ("  H-bond Donors:       ", 'HBD', ''),
+            ("  H-bond Acceptors:    ", 'HBA', ''),
+        ]),
+        ("[Ring Systems]", [
+            ("  Total Rings:         ", 'Rings', ''),
+            ("  Aromatic Rings:      ", 'Aromatic_Rings', ''),
+            ("  Saturated Rings:     ", 'Saturated_Rings', ''),
+            ("  Aliphatic Rings:     ", 'Aliphatic_Rings', ''),
+            ("  Aromatic Heterocycles: ", 'Aromatic_Heterocycles', ''),
+        ]),
+        ("[Flexibility & Complexity]", [
+            ("  Rotatable Bonds:     ", 'Rotatable_Bonds', ''),
+            ("  Fraction Csp3:       ", 'Fraction_Csp3', '.3f'),
+            ("  Bertz Complexity:    ", 'BertzCT', '.1f'),
+        ]),
+    ]
+
+    print("\nMolecular Properties:")
+    print(rule)
+
+    for heading, rows in sections:
+        print("\n" + heading)
+        for prefix, key, spec in rows:
+            print(prefix + format(props[key], spec))
+
+    # The final section mixes a numeric score with two Yes/No flags
+    print("\n[Drug-likeness]")
+    print("  QED Score:           " + format(props['QED'], '.3f'))
+    flags = (
+        ("  Lipinski Pass:       ", 'Lipinski_Pass'),
+        ("  Lead-like:           ", 'Lead-like'),
+    )
+    for prefix, key in flags:
+        print(prefix + ('Yes' if props[key] else 'No'))
+    print(rule)
+
+
+def main():
+    """Command-line entry point: analyse one SMILES string or a molecule file."""
+    parser = argparse.ArgumentParser(
+        description='Calculate molecular properties for molecules',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog="""
+Examples:
+  # Single molecule
+  python molecular_properties.py "CCO"
+
+  # From file
+  python molecular_properties.py --file molecules.smi
+
+  # Save to CSV
+  python molecular_properties.py --file molecules.sdf --output properties.csv
+        """
+    )
+
+    parser.add_argument('smiles', nargs='?', help='SMILES string to analyze')
+    parser.add_argument('--file', '-f', help='Input file (SDF or SMILES)')
+    parser.add_argument('--output', '-o', help='Output CSV file')
+
+    args = parser.parse_args()
+
+    # Neither input source was given: show usage and exit with an error code
+    if not args.smiles and not args.file:
+        parser.print_help()
+        sys.exit(1)
+
+    # A positional SMILES takes precedence over --file, and --output is not
+    # consulted in that mode: the result is only printed.
+    if args.smiles:
+        # Process single molecule
+        props = process_single_molecule(args.smiles)
+        if props:
+            print_properties(props)
+    elif args.file:
+        # Process file
+        process_file(args.file, args.output)
+
+
+# Run the CLI only when executed as a script, not when imported
+if __name__ == '__main__':
+    main()
--- a/skills/rdkit/scripts/similarity_search.py
+++ b/skills/rdkit/scripts/similarity_search.py
@@ -0,0 +1,295 @@
+#!/usr/bin/env python3
+"""
+Molecular Similarity Search
+
+Perform fingerprint-based similarity screening against a database of molecules.
+Supports multiple fingerprint types and similarity metrics.
+
+Usage:
+    python similarity_search.py "CCO" database.smi --threshold 0.7
+    python similarity_search.py query.smi database.sdf --method morgan --output hits.csv
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+try:
+    from rdkit import Chem
+    from rdkit.Chem import AllChem, MACCSkeys
+    from rdkit import DataStructs
+except ImportError:
+    print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
+    sys.exit(1)
+
+
+# Supported fingerprint names mapped to human-readable descriptions. The keys
+# are the accepted values of --method; the descriptions are listed in --help.
+FINGERPRINT_METHODS = {
+    'morgan': 'Morgan fingerprint (ECFP-like)',
+    'rdkit': 'RDKit topological fingerprint',
+    'maccs': 'MACCS structural keys',
+    'atompair': 'Atom pair fingerprint',
+    'torsion': 'Topological torsion fingerprint'
+}
+
+
+def generate_fingerprint(mol, method='morgan', radius=2, n_bits=2048):
+    """Generate a bit-vector fingerprint for a molecule.
+
+    Args:
+        mol: RDKit Mol object, or None.
+        method: One of the FINGERPRINT_METHODS keys (case-insensitive).
+        radius: Morgan radius; only used when ``method`` is 'morgan'.
+        n_bits: Fingerprint length; not used by 'maccs'.
+
+    Returns:
+        The fingerprint bit vector, or None when ``mol`` is None.
+
+    Raises:
+        ValueError: If ``method`` is not a recognised fingerprint name.
+    """
+    if mol is None:
+        return None
+
+    method = method.lower()
+
+    if method == 'morgan':
+        return AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
+    if method == 'rdkit':
+        return Chem.RDKFingerprint(mol, maxPath=7, fpSize=n_bits)
+    if method == 'maccs':
+        return MACCSkeys.GenMACCSKeys(mol)
+    if method in ('atompair', 'torsion'):
+        # The hashed, fixed-length bit-vector variants live in
+        # rdMolDescriptors and accept nBits; the previously used helpers in
+        # rdkit.Chem.AtomPairs did not provide that interface.
+        from rdkit.Chem import rdMolDescriptors
+
+        if method == 'atompair':
+            return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(
+                mol, nBits=n_bits)
+        return rdMolDescriptors.GetHashedTopologicalTorsionFingerprintAsBitVect(
+            mol, nBits=n_bits)
+
+    raise ValueError(f"Unknown fingerprint method: {method}")
+
+
+def load_molecules(file_path):
+    """Read every parseable molecule from an SDF or SMILES file.
+
+    Returns a list of dicts with keys ``index`` (1-based position in the
+    file), ``name``, ``smiles`` and ``mol``. Unparseable records are skipped
+    with a warning; a missing file or unsupported extension prints an error
+    and yields an empty list.
+    """
+    source = Path(file_path)
+    if not source.exists():
+        print(f"Error: File not found: {file_path}")
+        return []
+
+    extension = source.suffix.lower()
+    if extension in ('.sdf', '.mol'):
+        supplier = Chem.SDMolSupplier(str(source))
+    elif extension in ('.smi', '.smiles', '.txt'):
+        supplier = Chem.SmilesMolSupplier(str(source), titleLine=False)
+    else:
+        print(f"Error: Unsupported file format: {source.suffix}")
+        return []
+
+    loaded = []
+    for position, mol in enumerate(supplier, start=1):
+        if mol is None:
+            print(f"Warning: Failed to parse molecule {position}")
+            continue
+
+        # Fall back to a positional label when the record carries no title
+        if mol.HasProp('_Name'):
+            label = mol.GetProp('_Name')
+        else:
+            label = f"Mol_{position}"
+
+        loaded.append({
+            'index': position,
+            'name': label,
+            'smiles': Chem.MolToSmiles(mol),
+            'mol': mol,
+        })
+
+    return loaded
+
+
+def similarity_search(query_mol, database, method='morgan', threshold=0.7,
+                     radius=2, n_bits=2048, metric='tanimoto'):
+    """
+    Rank database molecules by fingerprint similarity to a query.
+
+    Args:
+        query_mol: Query molecule (RDKit Mol object)
+        database: List of entry dicts with 'index', 'name', 'smiles', 'mol'
+        method: Fingerprint method passed to generate_fingerprint()
+        threshold: Minimum similarity a hit must reach (inclusive)
+        radius: Morgan fingerprint radius
+        n_bits: Fingerprint size
+        metric: Similarity metric (tanimoto, dice, cosine), case-insensitive
+
+    Returns:
+        List of hit dicts (index, name, smiles, similarity), best first;
+        empty when the query molecule or its fingerprint is None
+
+    Raises:
+        ValueError: If metric is not one of the supported names
+    """
+    if query_mol is None:
+        print("Error: Invalid query molecule")
+        return []
+
+    reference_fp = generate_fingerprint(query_mol, method, radius, n_bits)
+    if reference_fp is None:
+        print("Error: Failed to generate query fingerprint")
+        return []
+
+    # Metric name -> name of the matching DataStructs similarity function
+    metric_functions = {
+        'tanimoto': 'TanimotoSimilarity',
+        'dice': 'DiceSimilarity',
+        'cosine': 'CosineSimilarity',
+    }
+    function_name = metric_functions.get(metric.lower())
+    if function_name is None:
+        raise ValueError(f"Unknown similarity metric: {metric}")
+    compare = getattr(DataStructs, function_name)
+
+    scored = []
+    for entry in database:
+        candidate_fp = generate_fingerprint(entry['mol'], method, radius, n_bits)
+        if candidate_fp is None:
+            continue
+
+        score = compare(reference_fp, candidate_fp)
+        if score >= threshold:
+            scored.append({
+                'index': entry['index'],
+                'name': entry['name'],
+                'smiles': entry['smiles'],
+                'similarity': score,
+            })
+
+    # Highest similarity first; ties keep their database order
+    return sorted(scored, key=lambda hit: hit['similarity'], reverse=True)
+
+
+def write_results(hits, output_file):
+    """Save ranked hits to a CSV file.
+
+    Columns are Rank (1-based position in ``hits``), Index, Name, SMILES and
+    Similarity (formatted to four decimal places). The header row is written
+    even when ``hits`` is empty.
+    """
+    import csv
+
+    columns = ['Rank', 'Index', 'Name', 'SMILES', 'Similarity']
+    with open(output_file, 'w', newline='') as handle:
+        out = csv.DictWriter(handle, fieldnames=columns)
+        out.writeheader()
+        out.writerows(
+            {
+                'Rank': position,
+                'Index': hit['index'],
+                'Name': hit['name'],
+                'SMILES': hit['smiles'],
+                'Similarity': f"{hit['similarity']:.4f}",
+            }
+            for position, hit in enumerate(hits, 1)
+        )
+
+
+def print_results(hits, max_display=20):
+    """Show the best hits as a fixed-width table on stdout.
+
+    At most ``max_display`` rows are printed, followed by a count of the
+    remainder. Names longer than 20 characters and SMILES longer than 43
+    are shortened with a trailing ellipsis marker.
+    """
+    if not hits:
+        print("\nNo hits found above threshold")
+        return
+
+    heavy_rule = "=" * 80
+    print(f"\nFound {len(hits)} similar molecules:")
+    print(heavy_rule)
+    print(f"{'Rank':<6} {'Index':<8} {'Similarity':<12} {'Name':<20} {'SMILES'}")
+    print("-" * 80)
+
+    for rank, hit in enumerate(hits[:max_display], 1):
+        label = hit['name']
+        if len(label) > 20:
+            label = label[:18] + '..'
+        structure = hit['smiles']
+        if len(structure) > 43:
+            structure = structure[:40] + '...'
+        cells = (
+            format(rank, '<6'),
+            format(hit['index'], '<8'),
+            format(hit['similarity'], '<12.4f'),
+            format(label, '<20'),
+            format(structure, ''),
+        )
+        print(" ".join(cells))
+
+    hidden = len(hits) - max_display
+    if hidden > 0:
+        print(f"\n... and {hidden} more")
+
+    print(heavy_rule)
+
+
+def main():
+    """Command-line entry point: screen a database file against one query."""
+    parser = argparse.ArgumentParser(
+        description='Molecular similarity search using fingerprints',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""
+Available fingerprint methods:
+{chr(10).join(f'  {k:12s} - {v}' for k, v in FINGERPRINT_METHODS.items())}
+
+Similarity metrics:
+  tanimoto    - Tanimoto coefficient (default)
+  dice        - Dice coefficient
+  cosine      - Cosine similarity
+
+Examples:
+  # Search with SMILES query
+  python similarity_search.py "CCO" database.smi --threshold 0.7
+
+  # Use different fingerprint
+  python similarity_search.py query.smi database.sdf --method maccs
+
+  # Save results
+  python similarity_search.py "c1ccccc1" database.smi --output hits.csv
+
+  # Adjust Morgan radius
+  python similarity_search.py "CCO" database.smi --method morgan --radius 3
+        """
+    )
+
+    parser.add_argument('query', help='Query SMILES or file')
+    parser.add_argument('database', help='Database file (SDF or SMILES)')
+    parser.add_argument('--method', '-m', default='morgan',
+                       choices=FINGERPRINT_METHODS.keys(),
+                       help='Fingerprint method (default: morgan)')
+    parser.add_argument('--threshold', '-t', type=float, default=0.7,
+                       help='Similarity threshold (default: 0.7)')
+    parser.add_argument('--radius', '-r', type=int, default=2,
+                       help='Morgan fingerprint radius (default: 2)')
+    parser.add_argument('--bits', '-b', type=int, default=2048,
+                       help='Fingerprint size (default: 2048)')
+    parser.add_argument('--metric', default='tanimoto',
+                       choices=['tanimoto', 'dice', 'cosine'],
+                       help='Similarity metric (default: tanimoto)')
+    parser.add_argument('--output', '-o', help='Output CSV file')
+    parser.add_argument('--max-display', type=int, default=20,
+                       help='Maximum hits to display (default: 20)')
+
+    args = parser.parse_args()
+
+    # Load query
+    # A query argument that names an existing path is read as a molecule
+    # file; anything else is parsed as a SMILES string.
+    query_path = Path(args.query)
+    if query_path.exists():
+        # Query is a file
+        query_mols = load_molecules(args.query)
+        if not query_mols:
+            print("Error: No valid molecules in query file")
+            sys.exit(1)
+        # Only the first molecule of a query file is used
+        query_mol = query_mols[0]['mol']
+        query_smiles = query_mols[0]['smiles']
+    else:
+        # Query is SMILES string
+        query_mol = Chem.MolFromSmiles(args.query)
+        query_smiles = args.query
+        if query_mol is None:
+            print(f"Error: Failed to parse query SMILES: {args.query}")
+            sys.exit(1)
+
+    print(f"Query: {query_smiles}")
+    print(f"Method: {args.method}")
+    print(f"Threshold: {args.threshold}")
+    print(f"Loading database: {args.database}...")
+
+    # Load database
+    database = load_molecules(args.database)
+    if not database:
+        print("Error: No valid molecules in database")
+        sys.exit(1)
+
+    print(f"Loaded {len(database)} molecules")
+    print("Searching...")
+
+    # Perform search
+    hits = similarity_search(
+        query_mol, database,
+        method=args.method,
+        threshold=args.threshold,
+        radius=args.radius,
+        n_bits=args.bits,
+        metric=args.metric
+    )
+
+    # Output results
+    # The CSV (when requested) receives every hit; the console table below
+    # is limited to --max-display rows.
+    if args.output:
+        write_results(hits, args.output)
+        print(f"\nResults written to: {args.output}")
+
+    print_results(hits, args.max_display)
+
+
+# Run the CLI only when executed as a script, not when imported
+if __name__ == '__main__':
+    main()
--- a/skills/rdkit/scripts/substructure_filter.py
+++ b/skills/rdkit/scripts/substructure_filter.py
@@ -0,0 +1,386 @@
+#!/usr/bin/env python3
+"""
+Substructure Filter
+
+Filter molecules based on substructure patterns using SMARTS.
+Supports inclusion and exclusion filters, and custom pattern libraries.
+
+Usage:
+    python substructure_filter.py molecules.smi --pattern "c1ccccc1" --output filtered.smi
+    python substructure_filter.py database.sdf --exclude "C(=O)Cl" --filter-type functional-groups
+"""
+
+import argparse
+import sys
+from pathlib import Path
+
+try:
+    from rdkit import Chem
+except ImportError:
+    print("Error: RDKit not installed. Install with: conda install -c conda-forge rdkit")
+    sys.exit(1)
+
+
+# Common SMARTS pattern libraries.
+# Each library maps a pattern name to a SMARTS string; the library names are
+# the accepted values of --filter-type. The 'pains' library holds only a
+# handful of example patterns.
+PATTERN_LIBRARIES = {
+    'functional-groups': {
+        'alcohol': '[OH][C]',
+        'aldehyde': '[CH1](=O)',
+        'ketone': '[C](=O)[C]',
+        'carboxylic_acid': 'C(=O)[OH]',
+        'ester': 'C(=O)O[C]',
+        'amide': 'C(=O)N',
+        'amine': '[NX3]',
+        'ether': '[C][O][C]',
+        'nitrile': 'C#N',
+        'nitro': '[N+](=O)[O-]',
+        'halide': '[C][F,Cl,Br,I]',
+        'thiol': '[C][SH]',
+        'sulfide': '[C][S][C]',
+    },
+    'rings': {
+        'benzene': 'c1ccccc1',
+        'pyridine': 'n1ccccc1',
+        'pyrrole': 'n1cccc1',
+        'furan': 'o1cccc1',
+        'thiophene': 's1cccc1',
+        'imidazole': 'n1cncc1',
+        'indole': 'c1ccc2[nH]ccc2c1',
+        'naphthalene': 'c1ccc2ccccc2c1',
+    },
+    'pains': {
+        'rhodanine': 'S1C(=O)NC(=S)C1',
+        'catechol': 'c1ccc(O)c(O)c1',
+        'quinone': 'O=C1C=CC(=O)C=C1',
+        'michael_acceptor': 'C=CC(=O)',
+        'alkyl_halide': '[C][I,Br]',
+    },
+    'privileged': {
+        'biphenyl': 'c1ccccc1-c2ccccc2',
+        'piperazine': 'N1CCNCC1',
+        'piperidine': 'N1CCCCC1',
+        'morpholine': 'N1CCOCC1',
+    }
+}
+
+
+def load_molecules(file_path, keep_props=True):
+    """Read the parseable molecules of an SDF or SMILES file into a list.
+
+    ``keep_props`` is accepted but not consulted by this implementation.
+    Unparseable records are skipped with a warning; a missing file or an
+    unsupported extension prints an error and yields an empty list.
+    """
+    source = Path(file_path)
+    if not source.exists():
+        print(f"Error: File not found: {file_path}")
+        return []
+
+    extension = source.suffix.lower()
+    if extension in ('.sdf', '.mol'):
+        supplier = Chem.SDMolSupplier(str(source))
+    elif extension in ('.smi', '.smiles', '.txt'):
+        supplier = Chem.SmilesMolSupplier(str(source), titleLine=False)
+    else:
+        print(f"Error: Unsupported file format: {source.suffix}")
+        return []
+
+    loaded = []
+    for position, mol in enumerate(supplier, start=1):
+        if mol is not None:
+            loaded.append(mol)
+        else:
+            print(f"Warning: Failed to parse molecule {position}")
+
+    return loaded
+
+
+def create_pattern_query(pattern_string):
+    """Compile a pattern string into an RDKit query molecule.
+
+    The string is parsed as SMARTS first and as SMILES second; the first
+    parser that succeeds wins. Prints an error and returns None when
+    neither accepts it.
+    """
+    for parse in (Chem.MolFromSmarts, Chem.MolFromSmiles):
+        candidate = parse(pattern_string)
+        if candidate is not None:
+            return candidate
+
+    print(f"Error: Invalid pattern: {pattern_string}")
+    return None
+
+
+def filter_molecules(molecules, include_patterns=None, exclude_patterns=None,
+                    match_all_include=False):
+    """
+    Split molecules into kept and rejected according to substructure rules.
+
+    Args:
+        molecules: List of RDKit Mol objects (None entries are skipped)
+        include_patterns: List of (name, pattern) tuples to include
+        exclude_patterns: List of (name, pattern) tuples to exclude
+        match_all_include: If True, molecule must match ALL include patterns
+
+    Returns:
+        Tuple of (filtered_molecules, match_info); match_info holds one dict
+        per non-None molecule with its 1-based index, SMILES, status
+        ('included', 'excluded' or 'no_match') and matched pattern names
+    """
+    kept = []
+    report = []
+
+    for position, mol in enumerate(molecules, start=1):
+        if mol is None:
+            continue
+
+        # Exclusions take priority. Every exclusion pattern is evaluated so
+        # the report lists all offending names, not just the first.
+        rejected_by = [
+            label for label, query in (exclude_patterns or ())
+            if mol.HasSubstructMatch(query)
+        ]
+
+        if rejected_by:
+            status, names = 'excluded', rejected_by
+        elif not include_patterns:
+            # No inclusion criteria: whatever survived exclusion passes
+            status, names = 'included', []
+        else:
+            names = [
+                label for label, query in include_patterns
+                if mol.HasSubstructMatch(query)
+            ]
+            if match_all_include:
+                accepted = len(names) == len(include_patterns)
+            else:
+                accepted = len(names) > 0
+
+            if accepted:
+                status = 'included'
+            else:
+                # Partial matches are not reported for rejected molecules
+                status, names = 'no_match', []
+
+        if status == 'included':
+            kept.append(mol)
+
+        report.append({
+            'index': position,
+            'smiles': Chem.MolToSmiles(mol),
+            'status': status,
+            'matches': names,
+        })
+
+    return kept, report
+
+
+def write_molecules(molecules, output_file):
+    """Write molecules to an SDF or SMILES file, chosen by file extension.
+
+    Args:
+        molecules: Sequence of RDKit Mol objects.
+        output_file: Destination path. '.sdf' writes an SD file; '.smi',
+            '.smiles' or '.txt' writes one "SMILES name" line per molecule.
+
+    An unsupported extension prints an error and writes nothing.
+    """
+    output_path = Path(output_file)
+    suffix = output_path.suffix.lower()
+
+    if suffix in ['.sdf']:
+        writer = Chem.SDWriter(str(output_path))
+        try:
+            for mol in molecules:
+                writer.write(mol)
+        finally:
+            # Close even when a write fails so the handle is not leaked
+            writer.close()
+    elif suffix in ['.smi', '.smiles', '.txt']:
+        with open(output_path, 'w') as f:
+            for mol in molecules:
+                smiles = Chem.MolToSmiles(mol)
+                name = mol.GetProp('_Name') if mol.HasProp('_Name') else ''
+                f.write(f"{smiles} {name}\n")
+    else:
+        print(f"Error: Unsupported output format: {output_path.suffix}")
+        return
+
+    print(f"Wrote {len(molecules)} molecules to {output_file}")
+
+
+def write_report(match_info, output_file):
+    """Write the per-molecule filter outcome to a CSV file.
+
+    Columns are Index, SMILES, Status and Matches, where Matches is the
+    list of matched pattern names joined with ", ". The header row is
+    written even when ``match_info`` is empty.
+    """
+    import csv
+
+    with open(output_file, 'w', newline='') as handle:
+        out = csv.writer(handle)
+        out.writerow(['Index', 'SMILES', 'Status', 'Matches'])
+        for entry in match_info:
+            out.writerow([
+                entry['index'],
+                entry['smiles'],
+                entry['status'],
+                ', '.join(entry['matches']),
+            ])
+
+
+def print_summary(total, filtered, match_info):
+    """Print counts, pass rate and a per-status breakdown of a filter run.
+
+    Args:
+        total: Number of molecules that were submitted to the filter.
+        filtered: List of molecules that passed.
+        match_info: Per-molecule report dicts; only their 'status' is read.
+    """
+    from collections import Counter
+
+    passed = len(filtered)
+
+    print("\n" + "="*60)
+    print("Filtering Summary")
+    print("="*60)
+    print(f"Total molecules:     {total}")
+    print(f"Passed filter:       {passed}")
+    print(f"Filtered out:        {total - passed}")
+    if total:
+        print(f"Pass rate:           {passed/total*100:.1f}%")
+    else:
+        # A rate is undefined for an empty input; avoid ZeroDivisionError
+        print("Pass rate:           n/a")
+
+    # Count by status (Counter keeps first-seen order, like the report)
+    status_counts = Counter(info['status'] for info in match_info)
+
+    print("\nBreakdown:")
+    for status, count in status_counts.items():
+        print(f"  {status:15s}: {count}")
+
+    print("="*60)
+
+
+def main():
+    """Command-line entry point: filter a molecule file by substructure."""
+    parser = argparse.ArgumentParser(
+        description='Filter molecules by substructure patterns',
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+        epilog=f"""
+Pattern libraries:
+  --filter-type functional-groups    Common functional groups
+  --filter-type rings               Ring systems
+  --filter-type pains               PAINS (Pan-Assay Interference)
+  --filter-type privileged          Privileged structures
+
+Examples:
+  # Include molecules with benzene ring
+  python substructure_filter.py molecules.smi --pattern "c1ccccc1" -o filtered.smi
+
+  # Exclude reactive groups
+  python substructure_filter.py database.sdf --exclude "C(=O)Cl" -o clean.sdf
+
+  # Filter by functional groups
+  python substructure_filter.py molecules.smi --filter-type functional-groups -o fg.smi
+
+  # Remove PAINS
+  python substructure_filter.py compounds.smi --filter-type pains --exclude-mode -o clean.smi
+
+  # Multiple patterns
+  python substructure_filter.py mol.smi --pattern "c1ccccc1" --pattern "N" -o aromatic_amines.smi
+        """
+    )
+
+    parser.add_argument('input', help='Input file (SDF or SMILES)')
+    parser.add_argument('--pattern', '-p', action='append',
+                       help='SMARTS/SMILES pattern to include (can specify multiple)')
+    parser.add_argument('--exclude', '-e', action='append',
+                       help='SMARTS/SMILES pattern to exclude (can specify multiple)')
+    parser.add_argument('--filter-type', choices=PATTERN_LIBRARIES.keys(),
+                       help='Use predefined pattern library')
+    parser.add_argument('--exclude-mode', action='store_true',
+                       help='Use filter-type patterns for exclusion instead of inclusion')
+    parser.add_argument('--match-all', action='store_true',
+                       help='Molecule must match ALL include patterns')
+    parser.add_argument('--output', '-o', help='Output file')
+    parser.add_argument('--report', '-r', help='Write detailed report to CSV')
+    parser.add_argument('--list-patterns', action='store_true',
+                       help='List available pattern libraries and exit')
+
+    args = parser.parse_args()
+
+    # List patterns mode
+    # NOTE: 'input' is a required positional, so argparse still demands it
+    # on the command line even when only --list-patterns is wanted.
+    if args.list_patterns:
+        print("\nAvailable Pattern Libraries:")
+        print("="*60)
+        for lib_name, patterns in PATTERN_LIBRARIES.items():
+            print(f"\n{lib_name}:")
+            for name, pattern in patterns.items():
+                print(f"  {name:25s}: {pattern}")
+        sys.exit(0)
+
+    # Load molecules
+    print(f"Loading molecules from: {args.input}")
+    molecules = load_molecules(args.input)
+    if not molecules:
+        print("Error: No valid molecules loaded")
+        sys.exit(1)
+
+    print(f"Loaded {len(molecules)} molecules")
+
+    # Prepare patterns
+    # Both lists hold (name, compiled query) tuples. A pattern that fails to
+    # compile is reported by create_pattern_query() and then left out.
+    include_patterns = []
+    exclude_patterns = []
+
+    # Add custom include patterns
+    if args.pattern:
+        for pattern_str in args.pattern:
+            query = create_pattern_query(pattern_str)
+            if query:
+                include_patterns.append(('custom', query))
+
+    # Add custom exclude patterns
+    if args.exclude:
+        for pattern_str in args.exclude:
+            query = create_pattern_query(pattern_str)
+            if query:
+                exclude_patterns.append(('custom', query))
+
+    # Add library patterns
+    # --exclude-mode decides whether the whole library is used to reject or
+    # to select molecules.
+    if args.filter_type:
+        lib_patterns = PATTERN_LIBRARIES[args.filter_type]
+        for name, pattern_str in lib_patterns.items():
+            query = create_pattern_query(pattern_str)
+            if query:
+                if args.exclude_mode:
+                    exclude_patterns.append((name, query))
+                else:
+                    include_patterns.append((name, query))
+
+    if not include_patterns and not exclude_patterns:
+        print("Error: No patterns specified")
+        sys.exit(1)
+
+    # Print filter configuration
+    print(f"\nFilter configuration:")
+    if include_patterns:
+        print(f"  Include patterns: {len(include_patterns)}")
+        if args.match_all:
+            print("  Mode: Match ALL")
+        else:
+            print("  Mode: Match ANY")
+    if exclude_patterns:
+        print(f"  Exclude patterns: {len(exclude_patterns)}")
+
+    # Perform filtering
+    print("\nFiltering...")
+    filtered, match_info = filter_molecules(
+        molecules,
+        include_patterns=include_patterns if include_patterns else None,
+        exclude_patterns=exclude_patterns if exclude_patterns else None,
+        match_all_include=args.match_all
+    )
+
+    # Print summary
+    print_summary(len(molecules), filtered, match_info)
+
+    # Write output
+    if args.output:
+        write_molecules(filtered, args.output)
+
+    if args.report:
+        write_report(match_info, args.report)
+        print(f"Detailed report written to: {args.report}")
+
+
+# Run the CLI only when executed as a script, not when imported
+if __name__ == '__main__':
+    main()