Source code for lacan.protect

"""protect.py — Bond protection and SMARTS-based atom exclusion for LACAN.

This module provides two mechanisms:

**SMARTS-based atom exclusion** (stateless)
    Pass a ``protect_smarts`` SMARTS string to any operation in
    :mod:`lacan.replace`, :mod:`lacan.mutate`, or
    :func:`~lacan.gen.optimize_from_mol`.  Atoms matching the pattern are
    skipped by all mutation and fragment-replacement operations each call.
    Because the protected set is re-derived from the molecule on every call
    via :func:`get_protected_atoms`, it survives SMILES round-trips
    transparently, no atom properties are stored on the molecule.

**Bond protection** (stateful, stored as ``_lp`` bond property)
    Protected bonds are excluded from the LACAN score computation.  Use this
    when a molecule contains a required but chemically unusual motif (e.g. a
    Michael-acceptor covalent warhead) whose LACAN score would unfairly
    penalise the whole molecule.  Bond protection is stored as the ``_lp``
    boolean property on RDKit bond objects and is carried through all
    replacement operations via the internal ``_restore_bond_protection``
    helper in :mod:`lacan.replace`.

All functions return *new* molecules — the originals are never modified in place.

The :func:`mol_cleaner` utility uses bond protection internally to iteratively
repair LACAN-failing molecules while preserving the parts that already pass.
"""

from rdkit import Chem
from lacan import lacan

PROP = "_lp"
"""Name of the RDKit **bond** boolean property used to mark bond protection.

Note: atom protection is no longer stored as a mol property — use
``protect_smarts`` parameters instead (see :func:`get_protected_atoms`).
"""


# ---------------------------------------------------------------------------
# SMARTS-based atom exclusion helpers
# ---------------------------------------------------------------------------

[docs] def get_protected_atoms(mol, protect_smarts): """Return the set of atom indices matched by *protect_smarts* in *mol*. Call it once at the top of any function that needs to skip protected atoms, then pass the resulting frozenset to the per-atom checks. Parameters ---------- mol : RDKit Mol protect_smarts : str or None. If None, returns an empty frozenset. Returns ------- frozenset of int Atom indices matched by the SMARTS. Empty frozenset if *protect_smarts* is ``None`` or ``""``. Raises ------ ValueError If *protect_smarts* is not ``None`` and cannot be parsed. """ if not protect_smarts: return frozenset() pattern = Chem.MolFromSmarts(protect_smarts) if pattern is None: raise ValueError(f"Invalid SMARTS: {protect_smarts!r}") return frozenset( idx for match in mol.GetSubstructMatches(pattern) for idx in match )
[docs] def reaction_touches_protected(mol, rxn, protect_smarts): """Return True if any match of *rxn* overlaps an atom matched by *protect_smarts*. Called by :func:`~lacan.mutate.apply_mutations` before each reaction to skip operations that would modify a protected atom. Returns ``False`` immediately when *protect_smarts* is ``None`` or ``""``, adding no overhead for unprotected molecules. Parameters ---------- mol : RDKit Mol rxn : RDKit ChemicalReaction protect_smarts : str or None — SMARTS identifying atoms to exclude; ``None`` disables the check entirely. Returns ------- bool ``True`` if the reaction should be skipped; ``False`` if safe. """ protected = get_protected_atoms(mol, protect_smarts) if not protected: return False for i in range(rxn.GetNumReactantTemplates()): template = rxn.GetReactantTemplate(i) for match in mol.GetSubstructMatches(template): if set(match) & protected: return True return False
# --------------------------------------------------------------------------- # Bond protection — setting # ---------------------------------------------------------------------------
[docs] def protect_bonds_for_idx(mol, bond_indices): """Mark specific bonds as protected. Protected bonds are excluded from the LACAN score (see :func:`score_mol_ignoring_protected_bonds`). They do **not** block mutations — use ``protect_smarts`` in the operation functions for that. Parameters ---------- mol : RDKit Mol bond_indices : iterable of bond indices to protect Returns a new RDKit Mol. """ rwmol = Chem.RWMol(mol) for idx in bond_indices: rwmol.GetBondWithIdx(idx).SetBoolProp(PROP, True) return rwmol.GetMol()
[docs] def protect_rejected_bonds(mol, profile=None, t=0.05): """Protect all bonds that currently fail the LACAN score threshold. After calling this, :func:`score_mol_ignoring_protected_bonds` will score the molecule as if those bonds do not exist. This is useful when a molecule contains a structural motif that is required (e.g. a reactive warhead) but would otherwise cause the whole molecule to score 0. Parameters ---------- mol : RDKit Mol profile : LACAN profile dict; loads the default ChEMBL profile if None t : bond score threshold (default 0.05) Returns a new RDKit Mol with failing bonds protected. """ if profile is None: profile = lacan.load_profile("chembl") apb = lacan.assess_per_bond(mol, profile) bad_bond_indices = [i for i, score in enumerate(apb) if score < t] return protect_bonds_for_idx(mol, bad_bond_indices)
# --------------------------------------------------------------------------- # Bond protection — querying # ---------------------------------------------------------------------------
[docs] def get_protected_bond_indices(mol): """Return a list of indices of all protected bonds in *mol*.""" return [b.GetIdx() for b in mol.GetBonds() if b.HasProp(PROP) and b.GetBoolProp(PROP)]
[docs] def bond_is_protected(bond): """Return True if the RDKit Bond object has the ``_lp`` protection mark.""" return bond.HasProp(PROP) and bond.GetBoolProp(PROP)
# --------------------------------------------------------------------------- # Scoring with bond exclusion # ---------------------------------------------------------------------------
[docs] def score_mol_ignoring_protected_bonds(mol, profile=None, mode="score", t=0.05): """Score a molecule while ignoring any bonds marked as protected. This is a drop-in replacement for :func:`lacan.lacan.score_mol` that omits protected bonds from both the minimum-score calculation and the ``bad_bonds`` list. If all bonds are protected the function returns 1.0 (trivially passes). Parameters ---------- mol : RDKit Mol (may have protected bonds) profile : LACAN profile dict; loads ChEMBL default if None mode : ``"score"`` (continuous, 0–1) or ``"threshold"`` (0 or 1) t : bond score threshold (default 0.05) Returns ------- (score, info) where ``info["bad_bonds"]`` lists unprotected failing bond indices. """ if profile is None: profile = lacan.load_profile("chembl") apb = lacan.assess_per_bond(mol, profile) protected_bonds = set(get_protected_bond_indices(mol)) apb_active = [score for i, score in enumerate(apb) if i not in protected_bonds] if not apb_active: return 1.0, {"bad_bonds": []} # all bonds protected — trivially passes info = {} info["bad_bonds"] = [i for i, b in enumerate(apb) if b < t and i not in protected_bonds] if mode == "threshold": score = 0 if min(apb_active) < t else 1 else: score = min(apb_active) / (1 + min(apb_active)) return score, info
# --------------------------------------------------------------------------- # mol_cleaner # --------------------------------------------------------------------------- def _raw_mutations(mol, profile): """Generate all valid single-step mutation products without score filtering. Unlike :func:`~lacan.mutate.apply_mutations`, this function does **not** score or filter the products. It is used internally by :func:`mol_cleaner` so that partially-fixed intermediates (which still have some bad bonds) are not discarded before we get the chance to evaluate them ourselves. Degenerate products are rejected: * Disconnected molecules (contain ``"."`` in their canonical SMILES) * Molecules with zero bonds * Molecules with fewer than 4 heavy atoms. Parameters ---------- mol : RDKit Mol (typically has protected bonds from :func:`mol_cleaner`) profile : LACAN profile dict (unused here, kept for API consistency) Returns a list of sanitized, connected, non-degenerate RDKit Mol objects. """ from lacan.mutate import mutate_ops products = [] for rxn in mutate_ops.values(): for prod_tuple in rxn.RunReactants((mol,)): try: m = prod_tuple[0] Chem.SanitizeMol(m) if m.GetNumBonds() == 0: continue if m.GetNumHeavyAtoms() < 4: continue if "." in Chem.MolToSmiles(m): continue products.append(m) except Exception: pass return products
[docs] def mol_cleaner(mol, profile=None, score_threshold=0.5, t=0.05, max_iter=100, lateral_patience=5): """Iteratively mutate a molecule to eliminate all LACAN bond violations. This function is designed for molecules that mostly pass the LACAN profile but have one or more bad bond environments that need to be fixed. It freezes the parts that already pass (using bond protection) and mutates only the failing regions. Strategy -------- Each iteration the cleaner: 1. Generates **all** single-step mutation products via :func:`_raw_mutations` — crucially *without* score-filtering the products. Score-filtering at this stage would reject all partially-fixed intermediates (those that still have some bad bonds), preventing multi-step repair paths. 2. Evaluates each product by counting its unprotected LACAN violations. The parent molecule's protected bond mask is re-derived from the *product's own* bond scores (via :func:`_reprotect`), so each candidate is assessed on its own bond landscape. 3. Picks the best candidate: * **Improvement step** — a candidate with fewer violations than the current molecule. Resets the lateral counter. * **Lateral move** — if no improvement is available and the lateral patience budget allows, accept the highest-scoring candidate with the *same* violation count. 4. Re-protects the accepted candidate's newly-passing bonds so subsequent mutations stay focused on the remaining bad regions. Parameters ---------- mol : RDKit Mol to clean profile : LACAN profile dict (loads ChEMBL default if None) score_threshold : final LACAN score required to consider the molecule clean (default 0.5) t : bond score threshold used throughout (default 0.05) max_iter : hard cap on total iterations (default 100) lateral_patience : consecutive lateral steps allowed before giving up (default 5) Returns ------- RDKit Mol if a clean version is found, else None. """ if profile is None: profile = lacan.load_profile("chembl") min_heavy = max(4, mol.GetNumHeavyAtoms() - 6) def _reprotect(m): apb = lacan.assess_per_bond(m, profile) good = [i for i, sc in enumerate(apb) if sc >= t] return protect_bonds_for_idx(m, good) def _eval(m): if m.GetNumBonds() == 0: return 999, 0.0 _, info = score_mol_ignoring_protected_bonds(m, profile, t=t) s, _ = score_mol_ignoring_protected_bonds(m, profile, t=t) return len(info["bad_bonds"]), s working_mol = _reprotect(mol) current_v, current_s = _eval(working_mol) lateral_count = 0 visited_smis = {Chem.MolToSmiles(working_mol)} for _ in range(max_iter): if current_v == 0 and current_s >= score_threshold: return working_mol candidates = _raw_mutations(working_mol, profile) if not candidates: break best_improve_v = current_v best_improve_s = -1.0 best_improve_mol = None best_lateral_s = -1.0 best_lateral_mol = None for c in candidates: if c.GetNumHeavyAtoms() < min_heavy: continue smi = Chem.MolToSmiles(c) if smi in visited_smis: continue c_prot = _reprotect(c) v, s = _eval(c_prot) if v < best_improve_v or (v == best_improve_v and s > best_improve_s): best_improve_v = v best_improve_s = s best_improve_mol = c_prot if v == current_v and s > best_lateral_s: best_lateral_s = s best_lateral_mol = c_prot if best_improve_mol is not None and best_improve_v < current_v: visited_smis.add(Chem.MolToSmiles(best_improve_mol)) working_mol = best_improve_mol current_v = best_improve_v current_s = best_improve_s lateral_count = 0 elif best_lateral_mol is not None and lateral_count < lateral_patience: visited_smis.add(Chem.MolToSmiles(best_lateral_mol)) working_mol = best_lateral_mol current_s = best_lateral_s lateral_count += 1 else: break final_score, _ = score_mol_ignoring_protected_bonds(working_mol, profile, t=t) return working_mol if final_score >= score_threshold else None