Source code for lacan.replace

"""
replace.py — Coarse-grained fragment replacement operations for LACAN.

This module provides four scaffold-editing operations that swap entire fragments
(rings, substituents, linkers) for alternatives drawn from the fragment corpus
(rls.csv).  Together they form the EXPLORE step of the adaptive GA in gen.py.

All four operations share these behaviours:

* **Random site selection** — when a molecule has multiple rings, linkers, or
  substituents that could be operated on, one is chosen at random each call.
  This ensures diverse output across repeated calls and prevents the same site
  from being targeted every time.
* **Conservative sampling** (``conservative=True``, the default) — replacement
  fragments are drawn with probability proportional to both their corpus
  frequency *and* their structural similarity to the fragment being replaced
  (Morgan fp Tanimoto, radius 2, including dummy atoms).  Set
  ``conservative=False`` for purely frequency-weighted sampling.
* **Protection** — pass ``protect_smarts`` (a SMARTS string) to any function
  to skip atoms matching the pattern.  The exclusion is re-derived from the
  molecule on every call, so it survives SMILES round-trips transparently.
* **Deduplication** — output molecules are deduplicated by InChIKey and scored
  by the LACAN profile; molecules scoring 0 are discarded.

Reaction conventions
--------------------
The ``_breakbond`` SMARTS ``[!#0:0]!@[R:1]`` matches exo-bonds as the pair
``(non-ring atom, ring atom)``, so in every match tuple ``b[0]`` is the
**exo** atom and ``b[1]`` is the **ring** atom.  This is important for
:func:`replace_ring` and :func:`replace_linker` which use these indices to
identify which atoms belong to the scaffold vs the side chains.
"""

from rdkit import Chem
from lacan import decompose as decompose_module
from lacan import lacan
from lacan.protect import (
    get_protected_atoms,
    score_mol_ignoring_protected_bonds,
)
import random
import csv
import os
from rdkit import DataStructs
from rdkit.Chem import rdChemReactions, inchi, rdFingerprintGenerator

# ---------------------------------------------------------------------------
# Reactions
# ---------------------------------------------------------------------------

rxn1 = rdChemReactions.ReactionFromSmarts(
    "[*:0]-[#0:1].[*:2]-[#0:3]>>[*:0]-[*:2].[*:1]-[*:3]"
)
"""Connect two single-bond dummy attachment points."""

rxn2 = rdChemReactions.ReactionFromSmarts(
    "[*:0]=[#0:1].[*:2]=[#0:3]>>[*:0]=[*:2].[*:1]=[*:3]"
)
"""Connect two double-bond dummy attachment points."""

htodummy = rdChemReactions.ReactionFromSmarts("[H1,H2,H3:0]>>[*:0]-[#0]")
"""Convert one hydrogen to a single-bond dummy attachment point."""

decompose1 = rdChemReactions.ReactionFromSmarts(
    "[!#0:0]!@;-[R:1]>>[!R:0]-[#0].[R:1]-[#0]"
)
"""Cut a single-bond ring-exo bond, yielding (exo-fragment, ring-fragment)."""

singledummy = Chem.MolFromSmarts("[#0;$([*]-[*])]")
"""Matches a dummy atom attached by a single bond."""

doubledummy = Chem.MolFromSmarts("[#0;$([*]=[*])]")
"""Matches a dummy atom attached by a double bond."""

# _breakbond matches (non-ring-atom, ring-atom) pairs.
# In every match b=(b[0], b[1]): b[0] = exo atom, b[1] = ring atom.
_breakbond = Chem.MolFromSmarts("[!#0:0]!@[R:1]")

# ---------------------------------------------------------------------------
# Fragment corpus
# ---------------------------------------------------------------------------

_FRAGFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
"""Morgan generator (r=2) used for fragment similarity scoring.

Including dummy atoms at radius 2 captures the immediate attachment
environment, giving meaningful similarity values for fragments that
differ mainly in the context around their attachment points.
"""

_corpus_cache = {}  # keyed by min_count so different thresholds coexist



[docs]
def load_corpus(min_count=200, path=None):
    """Load a fragment corpus CSV, with caching.

    By default loads the built-in ChEMBL corpus from ``data/rls.csv``.  Pass
    ``path`` to load a custom corpus built with ``python -m lacan.decompose``
    (e.g. from COCONUT or your own dataset).

    The CSV is read only on the first call for a given ``(path, min_count)``
    combination; subsequent calls return the cached list immediately.

    Parameters
    ----------
    min_count : int
        Minimum occurrence count for a fragment to be included (default 200).
        Lower values allow rarer fragments into the replacement pool.
    path : str or None
        Path to a corpus CSV produced by :mod:`lacan.decompose`.  If ``None``
        (default) the built-in ChEMBL corpus is used.

    Returns
    -------
    list of [smiles, count, degree, ftype, bonds]
    """
    cache_key = (path, min_count)
    if cache_key in _corpus_cache:
        return _corpus_cache[cache_key]
    if path is None:
        _this_dir = os.path.dirname(__file__)
        path = os.path.join(_this_dir, "data/rls.csv")
    _entries = []
    with open(path, newline="") as _f:
        _reader = csv.reader(_f, delimiter=",", quotechar='"')
        for _e in _reader:
            if _e[0] != "smiles" and int(_e[1]) > min_count:
                _entries.append([_e[0], int(_e[1]), int(_e[2]), _e[3], _e[4]])
    _corpus_cache[cache_key] = _entries
    return _entries


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _frag_fp(smi):
    """Return a Morgan fingerprint for a fragment SMILES string, or None on failure.

    Dummy atoms (atomic number 0) are treated as regular heavy atoms, so the
    fingerprint captures the attachment-point chemical environment — exactly
    what we want when comparing fragments by their connection context.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        return None
    return _FRAGFPGEN.GetFingerprint(mol)


def _similarity_weights(query_smi, candidates, base_weight_power=0.5, sim_boost=2.0):
    """Compute conservative sampling weights for a list of replacement fragment entries.

    The weight assigned to each candidate is::

        weight = count ^ base_weight_power * (1 + sim_boost * tanimoto(query, candidate))

    This blends two signals:

    * **Frequency** (``count ^ base_weight_power``) — common corpus fragments
      are preferred, avoiding rare exotic scaffolds.
    * **Similarity** (Tanimoto of Morgan fps) — fragments resembling the one
      being replaced get a bonus, producing conservative bioisosteric swaps.

    Parameters
    ----------
    query_smi         : SMILES of the fragment currently in the molecule
    candidates        : list of corpus entries ``[smiles, count, degree, ftype, bonds]``
    base_weight_power : exponent applied to occurrence count (default 0.5)
    sim_boost         : multiplier for the similarity bonus term (default 2.0)

    Returns a list of floats parallel to ``candidates``.
    """
    qfp = _frag_fp(query_smi)
    if qfp is None:
        return [e[1] ** base_weight_power for e in candidates]
    cfps = [_frag_fp(e[0]) for e in candidates]
    weights = []
    for i, e in enumerate(candidates):
        occ_w = e[1] ** base_weight_power
        sim = DataStructs.TanimotoSimilarity(qfp, cfps[i]) if cfps[i] is not None else 0.0
        weights.append(occ_w * (1.0 + sim_boost * sim))
    return weights


def _filter_and_dedup(new_mols, p):
    """Sanitize, LACAN-score, and deduplicate a list of product molecules.

    A molecule is discarded if:

    * RDKit sanitization fails,
    * the LACAN score is exactly 0 (at least one bond is a hard alert), or
    * it shares an InChIKey with a molecule already in the output list.

    Protected bonds on the products are honoured by
    :func:`~lacan.protect.score_mol_ignoring_protected_bonds`.

    Returns a list of valid, unique RDKit Mol objects.
    """
    filtered = []
    iks = []
    for m in new_mols:
        try:
            Chem.SanitizeMol(m)
            score, _ = score_mol_ignoring_protected_bonds(m, p)
            if score > 0:
                ik = inchi.MolToInchiKey(m)
                if ik not in iks:
                    filtered.append(m)
                    iks.append(ik)
        except Exception:
            pass
    return filtered


def _get_ring_systems(mol):
    """Return fused ring systems as a list of frozensets of atom indices.

    Simple rings that share at least one atom (fused or bridged systems) are
    merged into a single frozenset, so naphthalene yields one system of 10
    atoms rather than two overlapping 6-membered rings.
    """
    ring_atom_sets = [set(r) for r in mol.GetRingInfo().AtomRings()]
    merged = True
    while merged:
        merged = False
        new_sets = []
        while ring_atom_sets:
            current = ring_atom_sets.pop(0)
            fused = False
            for i, other in enumerate(ring_atom_sets):
                if current & other:
                    ring_atom_sets[i] = current | other
                    fused = True
                    merged = True
                    break
            if not fused:
                new_sets.append(frozenset(current))
        ring_atom_sets = new_sets
    return ring_atom_sets


def _restore_bond_protection(product, orig_mol):
    """Re-apply ``_lp`` bond protection from *orig_mol* onto *product*.

    RDKit reactions do not carry bond properties to products, so ``_lp`` marks
    are lost after stitching.  This function restores them using substructure
    matching: for each protected bond in *orig_mol* a two-atom query (with
    ring-membership constraints on both endpoints) is matched against *product*,
    and the matched bond is re-stamped ``_lp=True``.

    Atom-index-pair matching is deliberately avoided because atom indices are
    not stable across ``RunReactants`` — previously non-dummy atom indices could
    coincidentally match a protected pair in an unrelated part of the product.

    Parameters
    ----------
    product  : RDKit Mol (assembled replacement product, may be RWMol)
    orig_mol : RDKit Mol with ``_lp`` bond properties set

    Returns a new RDKit Mol with protection restored (or *product* unchanged if
    *orig_mol* has no protected bonds).
    """
    protected_bonds = [b for b in orig_mol.GetBonds()
                       if b.HasProp("_lp") and b.GetBoolProp("_lp")]
    if not protected_bonds:
        return product

    rwmol = Chem.RWMol(product)

    for pb in protected_bonds:
        ai = pb.GetBeginAtomIdx()
        aj = pb.GetEndAtomIdx()
        a_i = orig_mol.GetAtomWithIdx(ai)
        a_j = orig_mol.GetAtomWithIdx(aj)

        # Build a two-atom query with ring constraints for both endpoints
        ri = "R" if a_i.IsInRing() else "!R"
        rj = "R" if a_j.IsInRing() else "!R"
        bt = pb.GetBondTypeAsDouble()
        bond_smarts = "-" if bt == 1.0 else (":" if bt == 1.5 else "=")
        smarts = f"[{a_i.GetSymbol()};{ri}]{bond_smarts}[{a_j.GetSymbol()};{rj}]"
        query = Chem.MolFromSmarts(smarts)
        if query is None:
            continue

        matches = rwmol.GetSubstructMatches(query)
        for match in matches:
            prod_bond = rwmol.GetBondBetweenAtoms(match[0], match[1])
            if prod_bond is not None:
                prod_bond.SetBoolProp("_lp", True)
            # Only mark the first match to avoid spurious over-protection
            break

    return rwmol.GetMol()



    """Return fused ring systems as a list of frozensets of atom indices.

    Simple rings that share at least one atom (fused or bridged systems) are
    merged into a single frozenset, so naphthalene yields one system of 10
    atoms rather than two overlapping 6-membered rings.

    Using RDKit's ``GetRingInfo().AtomRings()`` directly (rather than matching
    decompose-derived SMARTS back onto the molecule) avoids the ambiguity where
    the dummy ``*`` atom in a decompose fragment SMARTS can match non-ring atoms
    in the original molecule.
    """
    ring_atom_sets = [set(r) for r in mol.GetRingInfo().AtomRings()]
    merged = True
    while merged:
        merged = False
        new_sets = []
        while ring_atom_sets:
            current = ring_atom_sets.pop(0)
            fused = False
            for i, other in enumerate(ring_atom_sets):
                if current & other:
                    ring_atom_sets[i] = current | other
                    fused = True
                    merged = True
                    break
            if not fused:
                new_sets.append(frozenset(current))
        ring_atom_sets = new_sets
    return ring_atom_sets


# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------



[docs]
def decorate_scaffold(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
                      mode="Hydrogen", replacements_per_mol=1, chance_of_linker=0.5,
                      protect_smarts=None):
    """Add substituents to a scaffold by decorating free positions.

    Two modes are supported:

    **Hydrogen mode** (default)
        For each of the ``n_replacements`` attempts, one H-bearing atom is
        chosen at random and its hydrogen replaced with a dummy, then a
        single-attachment corpus fragment is joined at that position.
        ``replacements_per_mol`` controls how many H → fragment substitutions
        are made per attempt.  Protected atoms are skipped when selecting the
        H position.

    **Dummy mode**
        The scaffold must already contain ``*`` dummy atoms marking desired
        decoration sites (e.g. ``c1c(*)c(*)co1``).  Each dummy is filled with
        a corpus fragment.  With probability ``chance_of_linker`` a two-
        attachment linker fragment is inserted before the terminal substituent,
        extending the chain length.

    Parameters
    ----------
    mol               : RDKit Mol (scaffold to decorate)
    p                 : LACAN profile dict
    score_threshold   : minimum LACAN score required for output molecules;
                        set 0.0 in the GA explore phase to accept all structures
    fragcorpus        : list of corpus entries (default: ChEMBL rls.csv)
    n_replacements    : number of decorated molecules to attempt generating
    mode              : ``"Hydrogen"`` or ``"Dummy"``
    replacements_per_mol : (Hydrogen mode only) H → fragment substitutions per attempt
    chance_of_linker  : (Dummy mode only) probability of prepending a linker fragment
    protect_smarts    : SMARTS string; atoms matching it are skipped when selecting
                        decoration sites.  ``None`` disables the check (default).

    Returns
    -------
    list of RDKit Mol
        Deduplicated molecules that pass ``score_threshold``.
    """
    if fragcorpus is None:
        fragcorpus = load_corpus()
    protected_atoms = get_protected_atoms(mol, protect_smarts)
    fs = [e for e in fragcorpus if e[2] == 1 and e[4] == "-"]
    fd = [e for e in fragcorpus if e[2] == 1 and e[4] == "="]
    fl = [e for e in fragcorpus if e[2] == 2 and e[3] == "Linker" and e[4] == "--"]
    new_mols = []

    if mode == "Dummy":
        sdb_matches = [m for m in mol.GetSubstructMatches(singledummy)
                       if m[0] not in protected_atoms]
        ddb_matches = [m for m in mol.GetSubstructMatches(doubledummy)
                       if m[0] not in protected_atoms]
        sdb = len(sdb_matches)
        ddb = len(ddb_matches)
        for i in range(n_replacements):
            cmol = Chem.Mol(mol)
            fe1 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fs, [e[1] ** 0.5 for e in fs], k=sdb)]
            fe2 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fd, [e[1] ** 0.5 for e in fd], k=ddb)]
            fe3 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fl, [e[1] ** 0.5 for e in fl], k=sdb)]
            for j in range(sdb):
                if random.random() > chance_of_linker:
                    cmol = random.choice(rxn1.RunReactants((cmol, fe3[j])))[0]
                    Chem.SanitizeMol(cmol)
                cmol = random.choice(rxn1.RunReactants((cmol, fe1[j])))[0]
                Chem.SanitizeMol(cmol)
            for j in range(ddb):
                cmol = random.choice(rxn2.RunReactants((cmol, fe2[j])))[0]
                Chem.SanitizeMol(cmol)
            m = _restore_bond_protection(cmol, mol)
            new_mols.append(m)

    elif mode == "Hydrogen":
        for i in range(n_replacements):
            cmol = Chem.Mol(mol)
            for j in range(replacements_per_mol):
                candidates = [a.GetIdx() for a in cmol.GetAtoms()
                              if a.GetTotalNumHs() > 0 and a.GetIdx() not in protected_atoms]
                if not candidates:
                    break
                chosen = random.choice(candidates)
                prods = htodummy.RunReactants((cmol,))
                valid = []
                for prod in prods:
                    try:
                        Chem.SanitizeMol(prod[0])
                        for a in prod[0].GetAtoms():
                            if a.GetAtomicNum() == 0:
                                if any(n.GetIdx() == chosen for n in a.GetNeighbors()):
                                    valid.append(prod[0])
                                    break
                    except Exception:
                        pass
                if valid:
                    cmol = random.choice(valid)
                else:
                    if prods:
                        cmol = random.choice(prods)[0]
                        Chem.SanitizeMol(cmol)
            fe = [Chem.MolFromSmiles(e[0]) for e in random.choices(
                fs, [e[1] ** 0.5 for e in fs], k=replacements_per_mol)]
            for j in range(replacements_per_mol):
                cmol = random.choice(rxn1.RunReactants((cmol, fe[j])))[0]
                Chem.SanitizeMol(cmol)
            m = _restore_bond_protection(cmol, mol)
            new_mols.append(m)
    else:
        print("mode not supported")

    filtered_prods = []
    iks = []
    for m in new_mols:
        try:
            Chem.SanitizeMol(m)
            score, _ = score_mol_ignoring_protected_bonds(m, p)
            if score > score_threshold:
                ik = inchi.MolToInchiKey(m)
                if ik not in iks:
                    filtered_prods.append(m)
                    iks.append(ik)
        except Exception:
            pass
    return filtered_prods




[docs]
def replace_substituent(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
                        conservative=True, protect_smarts=None):
    """Replace one non-ring substituent with a randomly sampled corpus fragment.

    A substituent is defined here strictly: a fragment produced by cutting a
    single ring-exo bond that (a) has exactly one attachment point (one ``*``
    dummy) and (b) contains no ring atoms from the original molecule.  This
    prevents entire ring systems from being mistaken for substituents when the
    SMARTS fires on inter-ring bonds.

    Algorithm
    ---------
    1. Run ``decompose1`` (``[!#0]!@;-[R]>>[!R]-[*].[R]-[*]``) on the
       molecule to enumerate all possible ring-exo single-bond cuts.
    2. For each cut, classify the two fragments: smaller = substituent,
       larger = scaffold.
    3. **Filter**: discard any pair where the smaller fragment contains ring
       atoms from the original molecule (it is a ring system, not a sub).
    4. Also skip if the scaffold's attachment atom is protected.
    5. **Randomly pick** one valid substituent site for each of the
       ``n_replacements`` attempts, so all sites are sampled across repeated
       calls.
    6. Draw a replacement from corpus substituents (degree 1, single bond),
       optionally similarity-weighted, and join it to the scaffold dummy.

    Parameters
    ----------
    mol             : RDKit Mol
    p               : LACAN profile dict
    score_threshold : minimum LACAN score for output molecules
    fragcorpus      : fragment corpus
    n_replacements  : number of replacement attempts
    conservative    : if True, bias sampling toward structurally similar fragments
    protect_smarts  : SMARTS string; atoms matching it are excluded as substituent
                      sites and scaffold attachment points.  ``None`` = no exclusion.

    Returns
    -------
    list of RDKit Mol
    """
    # Pre-compute ring atom indices for the original molecule so we can reject
    # "substituents" that actually contain ring atoms.
    if fragcorpus is None:
        fragcorpus = load_corpus()
    ring_atoms = set()
    for ring in mol.GetRingInfo().AtomRings():
        ring_atoms.update(ring)
    protected_atoms = get_protected_atoms(mol, protect_smarts)

    prods = decompose1.RunReactants((mol,))
    pairs = []      # (substituent_mol, scaffold_mol) tuples
    sub_smis = []   # SMILES of each substituent for similarity weighting

    for prod in prods:
        if len(prod) != 2:
            continue
        try:
            [Chem.SanitizeMol(pp) for pp in prod]
        except Exception:
            continue
        sorted_prod = sorted(prod, key=lambda m: m.GetNumAtoms())
        substituent, scaffold = sorted_prod[0], sorted_prod[1]

        # Reject if the smaller fragment has ring atoms in the original molecule
        # (it is an inter-ring bond cut, not a true ring-exo substituent cut).
        sub_match = mol.GetSubstructMatch(substituent)
        if sub_match and any(idx in ring_atoms for idx in sub_match
                             if mol.GetAtomWithIdx(idx).GetAtomicNum() != 0):
            continue

        # Exactly one dummy and no rings in fragment
        sub_smi = Chem.MolToSmiles(substituent)
        if sub_smi.count("*") != 1:
            continue
        if substituent.GetRingInfo().NumRings() > 0:
            continue

        # Skip if the scaffold attachment point (neighbour of dummy) is protected
        dummy_neighbor = None
        for a in scaffold.GetAtoms():
            if a.GetAtomicNum() == 0:
                for n in a.GetNeighbors():
                    dummy_neighbor = n
        if dummy_neighbor and dummy_neighbor.GetIdx() in protected_atoms:
            continue

        # Skip if the substituent being replaced contains any protected atom
        sub_match = mol.GetSubstructMatch(substituent)
        if sub_match and any(idx in protected_atoms for idx in sub_match
                             if mol.GetAtomWithIdx(idx).GetAtomicNum() != 0):
            continue

        pairs.append((substituent, scaffold))
        sub_smis.append(sub_smi)

    if not pairs:
        return []

    fe = [e for e in fragcorpus if e[2] == 1 and e[4] == "-"]
    if not fe:
        return []

    new_mols = []
    for i in range(n_replacements):
        # Randomly pick one substituent site per attempt
        pair_idx = random.randrange(len(pairs))
        scaffold = pairs[pair_idx][1]
        if conservative:
            weights = _similarity_weights(sub_smis[pair_idx], fe)
        else:
            weights = [e[1] ** 0.5 for e in fe]
        replacement = random.choices(fe, weights=weights, k=1)[0]
        frag_mol = Chem.MolFromSmiles(replacement[0])
        try:
            new_mol = rxn1.RunReactants((frag_mol, scaffold))[0][0]
            m = _restore_bond_protection(new_mol, mol)
            new_mols.append(m)
        except Exception:
            pass
    return _filter_and_dedup(new_mols, p)




[docs]
def replace_linker(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
                   conservative=True, protect_smarts=None):
    """Replace one non-ring linker connecting two ring systems.

    A linker is a non-ring fragment with exactly two attachment points (degree 2)
    that bridges two ring systems.  The ``decompose_molecule`` utility identifies
    them as SMARTS strings with two ``*`` dummies.

    Algorithm
    ---------
    1. Decompose the molecule to get linker SMARTS strings.
    2. For each linker SMARTS, find all substructure matches in the original mol.
    3. For each match, identify the two exo-bonds using ``_breakbond``: atoms in
       the linker match set whose ``b[0]`` (the exo/linker atom) appears in the
       match are the cut points.  Exactly two such bonds are required.
    4. Collect all valid sites (linker SMARTS + two ring fragments after cutting).
    5. **Randomly pick one site** from all valid sites so that molecules with
       multiple linkers sample them uniformly across calls.
    6. Draw ``n_replacements`` replacement linkers from the corpus (degree 2,
       type "Linker", bond string "``--``") and stitch the two ring fragments
       back through each.

    ``_breakbond`` convention: ``b[0]`` = non-ring (linker) atom,
    ``b[1]`` = ring atom.  Only ``b[0]`` is checked against the linker match
    set to locate the two linker-to-ring attachment bonds.

    Parameters
    ----------
    mol             : RDKit Mol
    p               : LACAN profile dict
    score_threshold : minimum LACAN score for output molecules
    fragcorpus      : fragment corpus
    n_replacements  : number of replacement linkers to try at the chosen site
    conservative    : if True, similarity-bias the linker sampling
    protect_smarts  : SMARTS string; linker atoms or bond-endpoint atoms matching
                      it are skipped.  ``None`` = no exclusion (default).

    Returns
    -------
    list of RDKit Mol, or ``[]`` if no replaceable linker is found.
    """
    if fragcorpus is None:
        fragcorpus = load_corpus()
    dcmol = decompose_module.decompose_molecule(Chem.MolToSmiles(mol))
    linkers_smarts = dcmol[1]
    if not linkers_smarts:
        return []
    protected_atoms = get_protected_atoms(mol, protect_smarts)

    breakable = mol.GetSubstructMatches(_breakbond)
    # _breakbond: b[0] = exo/linker atom, b[1] = ring atom

    # Collect all valid linker sites across all linker types and matches
    valid_sites = []  # list of (linker_smi, [frag1, frag2])

    for linker_smi in linkers_smarts:
        linker_pattern = Chem.MolFromSmarts(linker_smi)
        if linker_pattern is None:
            continue
        for linker_match in mol.GetSubstructMatches(linker_pattern):
            linker_set = set(linker_match)
            # b[0] is the linker (non-ring) atom — find breakable bonds where
            # the exo atom belongs to this linker
            to_break = [b for b in breakable if b[0] in linker_set]
            if len(to_break) != 2:
                continue
            if any(idx in protected_atoms
                   for pair in to_break for idx in pair):
                continue
            # Also skip if any atom inside the linker itself is protected
            if any(idx in protected_atoms for idx in linker_set):
                continue
            bond_indices = [mol.GetBondBetweenAtoms(*b).GetIdx() for b in to_break]
            cmol = Chem.FragmentOnBonds(mol, bond_indices)
            # Keep the two ring-side fragments (each has exactly 1 dummy)
            frags = [Chem.MolFromSmiles(f) for f in Chem.MolToSmiles(cmol).split(".")
                     if f.count("*") == 1]
            if len(frags) != 2:
                continue
            valid_sites.append((linker_smi, frags))

    if not valid_sites:
        return []

    # Randomly pick one linker site so multi-linker molecules sample uniformly
    linker_smi, frags = random.choice(valid_sites)

    fe = [e for e in fragcorpus if e[2] == 2 and e[3] == "Linker" and e[4] == "--"]
    if not fe:
        return []

    if conservative:
        weights = _similarity_weights(linker_smi, fe)
    else:
        weights = [e[1] ** 0.5 for e in fe]
    fr = [Chem.MolFromSmiles(e[0]) for e in random.choices(fe, weights=weights, k=n_replacements)]

    new_mols = []
    for new_linker in fr:
        try:
            combined = random.choice(rxn1.RunReactants((new_linker, frags[0])))[0]
            Chem.SanitizeMol(combined)
            combined = random.choice(rxn1.RunReactants((combined, frags[1])))[0]
            Chem.SanitizeMol(combined)
            m = _restore_bond_protection(combined, mol)
            new_mols.append(m)
        except Exception:
            pass
    return _filter_and_dedup(new_mols, p)




[docs]
def replace_ring(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
                 conservative=True, protect_smarts=None):
    """Replace one ring system with a different ring drawn from the corpus.

    Algorithm
    ---------
    1. Find fused ring systems via :func:`_get_ring_systems`.
    2. For each ring system walk bonds to find exo-bonds (leaving the system,
       not themselves in a ring).
    3. Skip protected systems; randomly pick one.
    4. Cut the exo-bonds with ``FragmentOnBonds``.  Use ``GetMolFrags()``
       (without ``asMols``) to get tuples of *original* atom indices for each
       fragment — these are stable and comparable to ``ring_system``.  The
       fragment whose non-dummy atoms are all in ``ring_system`` is the old ring;
       everything else is a side chain to reattach.  Build side-chain Mol objects
       with ``GetMolFrags(asMols=True)`` in the same order.
    5. Sample corpus rings; stitch side chains back.

    Parameters
    ----------
    mol             : RDKit Mol
    p               : LACAN profile dict
    score_threshold : minimum LACAN score for output molecules
    fragcorpus      : fragment corpus (default: ChEMBL rls.csv)
    n_replacements  : number of replacement rings to sample
    conservative    : if True, bias sampling toward structurally similar rings
    protect_smarts  : SMARTS string; ring systems containing any matching atom are
                      skipped entirely.  ``None`` = no exclusion (default).

    Returns
    -------
    list of RDKit Mol
    """
    if fragcorpus is None:
        fragcorpus = load_corpus()

    ring_systems = _get_ring_systems(mol)
    if not ring_systems:
        return []
    protected_atoms = get_protected_atoms(mol, protect_smarts)

    valid = []
    for ring_system in ring_systems:
        if any(i in protected_atoms for i in ring_system):
            continue
        seen = set()
        exo_bonds = []
        for idx in ring_system:
            for bond in mol.GetAtomWithIdx(idx).GetBonds():
                bid = bond.GetIdx()
                if bid in seen:
                    continue
                seen.add(bid)
                if bond.IsInRing():
                    continue
                other = bond.GetOtherAtomIdx(idx)
                if other in ring_system:
                    continue
                btype = "-" if bond.GetBondTypeAsDouble() == 1.0 else "="
                exo_bonds.append((bid, btype))
        if not exo_bonds:
            continue
        ring_smi = Chem.MolFragmentToSmiles(mol, list(ring_system))
        valid.append((ring_system, exo_bonds, ring_smi))

    if not valid:
        return []

    ring_system, exo_bonds, ring_smi = random.choice(valid)
    degree     = len(exo_bonds)
    bond_idxs  = [e[0] for e in exo_bonds]
    bond_types = [e[1] for e in exo_bonds]

    # FragmentOnBonds adds new dummy atoms at indices >= mol.GetNumAtoms().
    # GetMolFrags() (no asMols) returns tuples of original-mol atom indices,
    # so we can compare directly against ring_system.
    cmol = Chem.FragmentOnBonds(mol, bond_idxs)
    orig_n = mol.GetNumAtoms()

    frag_idx_tuples = Chem.GetMolFrags(cmol)          # tuples of atom indices in cmol
    frag_mols       = Chem.GetMolFrags(cmol, asMols=True, sanitizeFrags=False)

    ring_frag_pos = None
    for i, idx_tuple in enumerate(frag_idx_tuples):
        # Non-dummy atoms in this fragment: those with index < orig_n
        heavy = [idx for idx in idx_tuple if idx < orig_n]
        if all(idx in ring_system for idx in heavy):
            ring_frag_pos = i
            break

    if ring_frag_pos is None:
        return []

    side_chains = [m for i, m in enumerate(frag_mols) if i != ring_frag_pos]

    if len(side_chains) != degree:
        return []

    sbc = bond_types.count("-")
    dbc = bond_types.count("=")
    fe = [e for e in fragcorpus
          if e[2] == degree and e[3] == "Ring"
          and e[4].count("-") == sbc and e[4].count("=") == dbc]
    if not fe:
        return []

    weights = _similarity_weights(ring_smi, fe) if conservative else [e[1] ** 0.5 for e in fe]

    new_mols = []
    for entry in random.choices(fe, weights=weights, k=n_replacements):
        new_ring = Chem.MolFromSmiles(entry[0])
        if new_ring is None:
            continue
        try:
            ok = True
            for sc, btype in zip(side_chains, bond_types):
                rxn = rxn1 if btype == "-" else rxn2
                products = rxn.RunReactants((new_ring, sc))
                if not products:
                    ok = False
                    break
                new_ring = random.choice(products)[0]
                Chem.SanitizeMol(new_ring)
            if ok:
                m = _restore_bond_protection(new_ring, mol)
            new_mols.append(m)
        except Exception:
            pass

    return _filter_and_dedup(new_mols, p)



if __name__ == "__main__":
    random.seed(123)
    decorate_scaffold(Chem.MolFromSmiles("c1c(*)c(*)co1"), mode="Dummy")