"""
replace.py — Coarse-grained fragment replacement operations for LACAN.
This module provides four scaffold-editing operations that swap entire fragments
(rings, substituents, linkers) for alternatives drawn from the fragment corpus
(rls.csv). Together they form the EXPLORE step of the adaptive GA in gen.py.
All four operations share these behaviours:
* **Random site selection** — when a molecule has multiple rings, linkers, or
substituents that could be operated on, one is chosen at random each call.
This ensures diverse output across repeated calls and prevents the same site
from being targeted every time.
* **Conservative sampling** (``conservative=True``, the default) — replacement
fragments are drawn with probability proportional to both their corpus
frequency *and* their structural similarity to the fragment being replaced
(Morgan fp Tanimoto, radius 2, including dummy atoms). Set
``conservative=False`` for purely frequency-weighted sampling.
* **Protection** — pass ``protect_smarts`` (a SMARTS string) to any function
to skip atoms matching the pattern. The exclusion is re-derived from the
molecule on every call, so it survives SMILES round-trips transparently.
* **Deduplication** — output molecules are deduplicated by InChIKey and scored
by the LACAN profile; molecules scoring 0 are discarded.
Reaction conventions
--------------------
The ``_breakbond`` SMARTS ``[!#0:0]!@[R:1]`` matches exo-bonds as the pair
``(non-ring atom, ring atom)``, so in every match tuple ``b[0]`` is the
**exo** atom and ``b[1]`` is the **ring** atom. This is important for
:func:`replace_ring` and :func:`replace_linker` which use these indices to
identify which atoms belong to the scaffold vs the side chains.
"""
from rdkit import Chem
from lacan import decompose as decompose_module
from lacan import lacan
from lacan.protect import (
get_protected_atoms,
score_mol_ignoring_protected_bonds,
)
import random
import csv
import os
from rdkit import DataStructs
from rdkit.Chem import rdChemReactions, inchi, rdFingerprintGenerator
# ---------------------------------------------------------------------------
# Reactions
# ---------------------------------------------------------------------------
rxn1 = rdChemReactions.ReactionFromSmarts(
"[*:0]-[#0:1].[*:2]-[#0:3]>>[*:0]-[*:2].[*:1]-[*:3]"
)
"""Connect two single-bond dummy attachment points."""
rxn2 = rdChemReactions.ReactionFromSmarts(
"[*:0]=[#0:1].[*:2]=[#0:3]>>[*:0]=[*:2].[*:1]=[*:3]"
)
"""Connect two double-bond dummy attachment points."""
htodummy = rdChemReactions.ReactionFromSmarts("[H1,H2,H3:0]>>[*:0]-[#0]")
"""Convert one hydrogen to a single-bond dummy attachment point."""
decompose1 = rdChemReactions.ReactionFromSmarts(
"[!#0:0]!@;-[R:1]>>[!R:0]-[#0].[R:1]-[#0]"
)
"""Cut a single-bond ring-exo bond, yielding (exo-fragment, ring-fragment)."""
singledummy = Chem.MolFromSmarts("[#0;$([*]-[*])]")
"""Matches a dummy atom attached by a single bond."""
doubledummy = Chem.MolFromSmarts("[#0;$([*]=[*])]")
"""Matches a dummy atom attached by a double bond."""
# _breakbond matches (non-ring-atom, ring-atom) pairs.
# In every match b=(b[0], b[1]): b[0] = exo atom, b[1] = ring atom.
_breakbond = Chem.MolFromSmarts("[!#0:0]!@[R:1]")
# ---------------------------------------------------------------------------
# Fragment corpus
# ---------------------------------------------------------------------------
_FRAGFPGEN = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=1024)
"""Morgan generator (r=2) used for fragment similarity scoring.
Including dummy atoms at radius 2 captures the immediate attachment
environment, giving meaningful similarity values for fragments that
differ mainly in the context around their attachment points.
"""
_corpus_cache = {} # keyed by min_count so different thresholds coexist
[docs]
def load_corpus(min_count=200, path=None):
"""Load a fragment corpus CSV, with caching.
By default loads the built-in ChEMBL corpus from ``data/rls.csv``. Pass
``path`` to load a custom corpus built with ``python -m lacan.decompose``
(e.g. from COCONUT or your own dataset).
The CSV is read only on the first call for a given ``(path, min_count)``
combination; subsequent calls return the cached list immediately.
Parameters
----------
min_count : int
Minimum occurrence count for a fragment to be included (default 200).
Lower values allow rarer fragments into the replacement pool.
path : str or None
Path to a corpus CSV produced by :mod:`lacan.decompose`. If ``None``
(default) the built-in ChEMBL corpus is used.
Returns
-------
list of [smiles, count, degree, ftype, bonds]
"""
cache_key = (path, min_count)
if cache_key in _corpus_cache:
return _corpus_cache[cache_key]
if path is None:
_this_dir = os.path.dirname(__file__)
path = os.path.join(_this_dir, "data/rls.csv")
_entries = []
with open(path, newline="") as _f:
_reader = csv.reader(_f, delimiter=",", quotechar='"')
for _e in _reader:
if _e[0] != "smiles" and int(_e[1]) > min_count:
_entries.append([_e[0], int(_e[1]), int(_e[2]), _e[3], _e[4]])
_corpus_cache[cache_key] = _entries
return _entries
# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------
def _frag_fp(smi):
"""Return a Morgan fingerprint for a fragment SMILES string, or None on failure.
Dummy atoms (atomic number 0) are treated as regular heavy atoms, so the
fingerprint captures the attachment-point chemical environment — exactly
what we want when comparing fragments by their connection context.
"""
mol = Chem.MolFromSmiles(smi)
if mol is None:
return None
return _FRAGFPGEN.GetFingerprint(mol)
def _similarity_weights(query_smi, candidates, base_weight_power=0.5, sim_boost=2.0):
"""Compute conservative sampling weights for a list of replacement fragment entries.
The weight assigned to each candidate is::
weight = count ^ base_weight_power * (1 + sim_boost * tanimoto(query, candidate))
This blends two signals:
* **Frequency** (``count ^ base_weight_power``) — common corpus fragments
are preferred, avoiding rare exotic scaffolds.
* **Similarity** (Tanimoto of Morgan fps) — fragments resembling the one
being replaced get a bonus, producing conservative bioisosteric swaps.
Parameters
----------
query_smi : SMILES of the fragment currently in the molecule
candidates : list of corpus entries ``[smiles, count, degree, ftype, bonds]``
base_weight_power : exponent applied to occurrence count (default 0.5)
sim_boost : multiplier for the similarity bonus term (default 2.0)
Returns a list of floats parallel to ``candidates``.
"""
qfp = _frag_fp(query_smi)
if qfp is None:
return [e[1] ** base_weight_power for e in candidates]
cfps = [_frag_fp(e[0]) for e in candidates]
weights = []
for i, e in enumerate(candidates):
occ_w = e[1] ** base_weight_power
sim = DataStructs.TanimotoSimilarity(qfp, cfps[i]) if cfps[i] is not None else 0.0
weights.append(occ_w * (1.0 + sim_boost * sim))
return weights
def _filter_and_dedup(new_mols, p):
"""Sanitize, LACAN-score, and deduplicate a list of product molecules.
A molecule is discarded if:
* RDKit sanitization fails,
* the LACAN score is exactly 0 (at least one bond is a hard alert), or
* it shares an InChIKey with a molecule already in the output list.
Protected bonds on the products are honoured by
:func:`~lacan.protect.score_mol_ignoring_protected_bonds`.
Returns a list of valid, unique RDKit Mol objects.
"""
filtered = []
iks = []
for m in new_mols:
try:
Chem.SanitizeMol(m)
score, _ = score_mol_ignoring_protected_bonds(m, p)
if score > 0:
ik = inchi.MolToInchiKey(m)
if ik not in iks:
filtered.append(m)
iks.append(ik)
except Exception:
pass
return filtered
def _get_ring_systems(mol):
"""Return fused ring systems as a list of frozensets of atom indices.
Simple rings that share at least one atom (fused or bridged systems) are
merged into a single frozenset, so naphthalene yields one system of 10
atoms rather than two overlapping 6-membered rings.
"""
ring_atom_sets = [set(r) for r in mol.GetRingInfo().AtomRings()]
merged = True
while merged:
merged = False
new_sets = []
while ring_atom_sets:
current = ring_atom_sets.pop(0)
fused = False
for i, other in enumerate(ring_atom_sets):
if current & other:
ring_atom_sets[i] = current | other
fused = True
merged = True
break
if not fused:
new_sets.append(frozenset(current))
ring_atom_sets = new_sets
return ring_atom_sets
def _restore_bond_protection(product, orig_mol):
"""Re-apply ``_lp`` bond protection from *orig_mol* onto *product*.
RDKit reactions do not carry bond properties to products, so ``_lp`` marks
are lost after stitching. This function restores them using substructure
matching: for each protected bond in *orig_mol* a two-atom query (with
ring-membership constraints on both endpoints) is matched against *product*,
and the matched bond is re-stamped ``_lp=True``.
Atom-index-pair matching is deliberately avoided because atom indices are
not stable across ``RunReactants`` — previously non-dummy atom indices could
coincidentally match a protected pair in an unrelated part of the product.
Parameters
----------
product : RDKit Mol (assembled replacement product, may be RWMol)
orig_mol : RDKit Mol with ``_lp`` bond properties set
Returns a new RDKit Mol with protection restored (or *product* unchanged if
*orig_mol* has no protected bonds).
"""
protected_bonds = [b for b in orig_mol.GetBonds()
if b.HasProp("_lp") and b.GetBoolProp("_lp")]
if not protected_bonds:
return product
rwmol = Chem.RWMol(product)
for pb in protected_bonds:
ai = pb.GetBeginAtomIdx()
aj = pb.GetEndAtomIdx()
a_i = orig_mol.GetAtomWithIdx(ai)
a_j = orig_mol.GetAtomWithIdx(aj)
# Build a two-atom query with ring constraints for both endpoints
ri = "R" if a_i.IsInRing() else "!R"
rj = "R" if a_j.IsInRing() else "!R"
bt = pb.GetBondTypeAsDouble()
bond_smarts = "-" if bt == 1.0 else (":" if bt == 1.5 else "=")
smarts = f"[{a_i.GetSymbol()};{ri}]{bond_smarts}[{a_j.GetSymbol()};{rj}]"
query = Chem.MolFromSmarts(smarts)
if query is None:
continue
matches = rwmol.GetSubstructMatches(query)
for match in matches:
prod_bond = rwmol.GetBondBetweenAtoms(match[0], match[1])
if prod_bond is not None:
prod_bond.SetBoolProp("_lp", True)
# Only mark the first match to avoid spurious over-protection
break
return rwmol.GetMol()
"""Return fused ring systems as a list of frozensets of atom indices.
Simple rings that share at least one atom (fused or bridged systems) are
merged into a single frozenset, so naphthalene yields one system of 10
atoms rather than two overlapping 6-membered rings.
Using RDKit's ``GetRingInfo().AtomRings()`` directly (rather than matching
decompose-derived SMARTS back onto the molecule) avoids the ambiguity where
the dummy ``*`` atom in a decompose fragment SMARTS can match non-ring atoms
in the original molecule.
"""
ring_atom_sets = [set(r) for r in mol.GetRingInfo().AtomRings()]
merged = True
while merged:
merged = False
new_sets = []
while ring_atom_sets:
current = ring_atom_sets.pop(0)
fused = False
for i, other in enumerate(ring_atom_sets):
if current & other:
ring_atom_sets[i] = current | other
fused = True
merged = True
break
if not fused:
new_sets.append(frozenset(current))
ring_atom_sets = new_sets
return ring_atom_sets
# ---------------------------------------------------------------------------
# Public API
# ---------------------------------------------------------------------------
[docs]
def decorate_scaffold(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
mode="Hydrogen", replacements_per_mol=1, chance_of_linker=0.5,
protect_smarts=None):
"""Add substituents to a scaffold by decorating free positions.
Two modes are supported:
**Hydrogen mode** (default)
For each of the ``n_replacements`` attempts, one H-bearing atom is
chosen at random and its hydrogen replaced with a dummy, then a
single-attachment corpus fragment is joined at that position.
``replacements_per_mol`` controls how many H → fragment substitutions
are made per attempt. Protected atoms are skipped when selecting the
H position.
**Dummy mode**
The scaffold must already contain ``*`` dummy atoms marking desired
decoration sites (e.g. ``c1c(*)c(*)co1``). Each dummy is filled with
a corpus fragment. With probability ``chance_of_linker`` a two-
attachment linker fragment is inserted before the terminal substituent,
extending the chain length.
Parameters
----------
mol : RDKit Mol (scaffold to decorate)
p : LACAN profile dict
score_threshold : minimum LACAN score required for output molecules;
set 0.0 in the GA explore phase to accept all structures
fragcorpus : list of corpus entries (default: ChEMBL rls.csv)
n_replacements : number of decorated molecules to attempt generating
mode : ``"Hydrogen"`` or ``"Dummy"``
replacements_per_mol : (Hydrogen mode only) H → fragment substitutions per attempt
chance_of_linker : (Dummy mode only) probability of prepending a linker fragment
protect_smarts : SMARTS string; atoms matching it are skipped when selecting
decoration sites. ``None`` disables the check (default).
Returns
-------
list of RDKit Mol
Deduplicated molecules that pass ``score_threshold``.
"""
if fragcorpus is None:
fragcorpus = load_corpus()
protected_atoms = get_protected_atoms(mol, protect_smarts)
fs = [e for e in fragcorpus if e[2] == 1 and e[4] == "-"]
fd = [e for e in fragcorpus if e[2] == 1 and e[4] == "="]
fl = [e for e in fragcorpus if e[2] == 2 and e[3] == "Linker" and e[4] == "--"]
new_mols = []
if mode == "Dummy":
sdb_matches = [m for m in mol.GetSubstructMatches(singledummy)
if m[0] not in protected_atoms]
ddb_matches = [m for m in mol.GetSubstructMatches(doubledummy)
if m[0] not in protected_atoms]
sdb = len(sdb_matches)
ddb = len(ddb_matches)
for i in range(n_replacements):
cmol = Chem.Mol(mol)
fe1 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fs, [e[1] ** 0.5 for e in fs], k=sdb)]
fe2 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fd, [e[1] ** 0.5 for e in fd], k=ddb)]
fe3 = [Chem.MolFromSmiles(e[0]) for e in random.choices(fl, [e[1] ** 0.5 for e in fl], k=sdb)]
for j in range(sdb):
if random.random() > chance_of_linker:
cmol = random.choice(rxn1.RunReactants((cmol, fe3[j])))[0]
Chem.SanitizeMol(cmol)
cmol = random.choice(rxn1.RunReactants((cmol, fe1[j])))[0]
Chem.SanitizeMol(cmol)
for j in range(ddb):
cmol = random.choice(rxn2.RunReactants((cmol, fe2[j])))[0]
Chem.SanitizeMol(cmol)
m = _restore_bond_protection(cmol, mol)
new_mols.append(m)
elif mode == "Hydrogen":
for i in range(n_replacements):
cmol = Chem.Mol(mol)
for j in range(replacements_per_mol):
candidates = [a.GetIdx() for a in cmol.GetAtoms()
if a.GetTotalNumHs() > 0 and a.GetIdx() not in protected_atoms]
if not candidates:
break
chosen = random.choice(candidates)
prods = htodummy.RunReactants((cmol,))
valid = []
for prod in prods:
try:
Chem.SanitizeMol(prod[0])
for a in prod[0].GetAtoms():
if a.GetAtomicNum() == 0:
if any(n.GetIdx() == chosen for n in a.GetNeighbors()):
valid.append(prod[0])
break
except Exception:
pass
if valid:
cmol = random.choice(valid)
else:
if prods:
cmol = random.choice(prods)[0]
Chem.SanitizeMol(cmol)
fe = [Chem.MolFromSmiles(e[0]) for e in random.choices(
fs, [e[1] ** 0.5 for e in fs], k=replacements_per_mol)]
for j in range(replacements_per_mol):
cmol = random.choice(rxn1.RunReactants((cmol, fe[j])))[0]
Chem.SanitizeMol(cmol)
m = _restore_bond_protection(cmol, mol)
new_mols.append(m)
else:
print("mode not supported")
filtered_prods = []
iks = []
for m in new_mols:
try:
Chem.SanitizeMol(m)
score, _ = score_mol_ignoring_protected_bonds(m, p)
if score > score_threshold:
ik = inchi.MolToInchiKey(m)
if ik not in iks:
filtered_prods.append(m)
iks.append(ik)
except Exception:
pass
return filtered_prods
[docs]
def replace_substituent(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
conservative=True, protect_smarts=None):
"""Replace one non-ring substituent with a randomly sampled corpus fragment.
A substituent is defined here strictly: a fragment produced by cutting a
single ring-exo bond that (a) has exactly one attachment point (one ``*``
dummy) and (b) contains no ring atoms from the original molecule. This
prevents entire ring systems from being mistaken for substituents when the
SMARTS fires on inter-ring bonds.
Algorithm
---------
1. Run ``decompose1`` (``[!#0]!@;-[R]>>[!R]-[*].[R]-[*]``) on the
molecule to enumerate all possible ring-exo single-bond cuts.
2. For each cut, classify the two fragments: smaller = substituent,
larger = scaffold.
3. **Filter**: discard any pair where the smaller fragment contains ring
atoms from the original molecule (it is a ring system, not a sub).
4. Also skip if the scaffold's attachment atom is protected.
5. **Randomly pick** one valid substituent site for each of the
``n_replacements`` attempts, so all sites are sampled across repeated
calls.
6. Draw a replacement from corpus substituents (degree 1, single bond),
optionally similarity-weighted, and join it to the scaffold dummy.
Parameters
----------
mol : RDKit Mol
p : LACAN profile dict
score_threshold : minimum LACAN score for output molecules
fragcorpus : fragment corpus
n_replacements : number of replacement attempts
conservative : if True, bias sampling toward structurally similar fragments
protect_smarts : SMARTS string; atoms matching it are excluded as substituent
sites and scaffold attachment points. ``None`` = no exclusion.
Returns
-------
list of RDKit Mol
"""
# Pre-compute ring atom indices for the original molecule so we can reject
# "substituents" that actually contain ring atoms.
if fragcorpus is None:
fragcorpus = load_corpus()
ring_atoms = set()
for ring in mol.GetRingInfo().AtomRings():
ring_atoms.update(ring)
protected_atoms = get_protected_atoms(mol, protect_smarts)
prods = decompose1.RunReactants((mol,))
pairs = [] # (substituent_mol, scaffold_mol) tuples
sub_smis = [] # SMILES of each substituent for similarity weighting
for prod in prods:
if len(prod) != 2:
continue
try:
[Chem.SanitizeMol(pp) for pp in prod]
except Exception:
continue
sorted_prod = sorted(prod, key=lambda m: m.GetNumAtoms())
substituent, scaffold = sorted_prod[0], sorted_prod[1]
# Reject if the smaller fragment has ring atoms in the original molecule
# (it is an inter-ring bond cut, not a true ring-exo substituent cut).
sub_match = mol.GetSubstructMatch(substituent)
if sub_match and any(idx in ring_atoms for idx in sub_match
if mol.GetAtomWithIdx(idx).GetAtomicNum() != 0):
continue
# Exactly one dummy and no rings in fragment
sub_smi = Chem.MolToSmiles(substituent)
if sub_smi.count("*") != 1:
continue
if substituent.GetRingInfo().NumRings() > 0:
continue
# Skip if the scaffold attachment point (neighbour of dummy) is protected
dummy_neighbor = None
for a in scaffold.GetAtoms():
if a.GetAtomicNum() == 0:
for n in a.GetNeighbors():
dummy_neighbor = n
if dummy_neighbor and dummy_neighbor.GetIdx() in protected_atoms:
continue
# Skip if the substituent being replaced contains any protected atom
sub_match = mol.GetSubstructMatch(substituent)
if sub_match and any(idx in protected_atoms for idx in sub_match
if mol.GetAtomWithIdx(idx).GetAtomicNum() != 0):
continue
pairs.append((substituent, scaffold))
sub_smis.append(sub_smi)
if not pairs:
return []
fe = [e for e in fragcorpus if e[2] == 1 and e[4] == "-"]
if not fe:
return []
new_mols = []
for i in range(n_replacements):
# Randomly pick one substituent site per attempt
pair_idx = random.randrange(len(pairs))
scaffold = pairs[pair_idx][1]
if conservative:
weights = _similarity_weights(sub_smis[pair_idx], fe)
else:
weights = [e[1] ** 0.5 for e in fe]
replacement = random.choices(fe, weights=weights, k=1)[0]
frag_mol = Chem.MolFromSmiles(replacement[0])
try:
new_mol = rxn1.RunReactants((frag_mol, scaffold))[0][0]
m = _restore_bond_protection(new_mol, mol)
new_mols.append(m)
except Exception:
pass
return _filter_and_dedup(new_mols, p)
[docs]
def replace_linker(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
conservative=True, protect_smarts=None):
"""Replace one non-ring linker connecting two ring systems.
A linker is a non-ring fragment with exactly two attachment points (degree 2)
that bridges two ring systems. The ``decompose_molecule`` utility identifies
them as SMARTS strings with two ``*`` dummies.
Algorithm
---------
1. Decompose the molecule to get linker SMARTS strings.
2. For each linker SMARTS, find all substructure matches in the original mol.
3. For each match, identify the two exo-bonds using ``_breakbond``: atoms in
the linker match set whose ``b[0]`` (the exo/linker atom) appears in the
match are the cut points. Exactly two such bonds are required.
4. Collect all valid sites (linker SMARTS + two ring fragments after cutting).
5. **Randomly pick one site** from all valid sites so that molecules with
multiple linkers sample them uniformly across calls.
6. Draw ``n_replacements`` replacement linkers from the corpus (degree 2,
type "Linker", bond string "``--``") and stitch the two ring fragments
back through each.
``_breakbond`` convention: ``b[0]`` = non-ring (linker) atom,
``b[1]`` = ring atom. Only ``b[0]`` is checked against the linker match
set to locate the two linker-to-ring attachment bonds.
Parameters
----------
mol : RDKit Mol
p : LACAN profile dict
score_threshold : minimum LACAN score for output molecules
fragcorpus : fragment corpus
n_replacements : number of replacement linkers to try at the chosen site
conservative : if True, similarity-bias the linker sampling
protect_smarts : SMARTS string; linker atoms or bond-endpoint atoms matching
it are skipped. ``None`` = no exclusion (default).
Returns
-------
list of RDKit Mol, or ``[]`` if no replaceable linker is found.
"""
if fragcorpus is None:
fragcorpus = load_corpus()
dcmol = decompose_module.decompose_molecule(Chem.MolToSmiles(mol))
linkers_smarts = dcmol[1]
if not linkers_smarts:
return []
protected_atoms = get_protected_atoms(mol, protect_smarts)
breakable = mol.GetSubstructMatches(_breakbond)
# _breakbond: b[0] = exo/linker atom, b[1] = ring atom
# Collect all valid linker sites across all linker types and matches
valid_sites = [] # list of (linker_smi, [frag1, frag2])
for linker_smi in linkers_smarts:
linker_pattern = Chem.MolFromSmarts(linker_smi)
if linker_pattern is None:
continue
for linker_match in mol.GetSubstructMatches(linker_pattern):
linker_set = set(linker_match)
# b[0] is the linker (non-ring) atom — find breakable bonds where
# the exo atom belongs to this linker
to_break = [b for b in breakable if b[0] in linker_set]
if len(to_break) != 2:
continue
if any(idx in protected_atoms
for pair in to_break for idx in pair):
continue
# Also skip if any atom inside the linker itself is protected
if any(idx in protected_atoms for idx in linker_set):
continue
bond_indices = [mol.GetBondBetweenAtoms(*b).GetIdx() for b in to_break]
cmol = Chem.FragmentOnBonds(mol, bond_indices)
# Keep the two ring-side fragments (each has exactly 1 dummy)
frags = [Chem.MolFromSmiles(f) for f in Chem.MolToSmiles(cmol).split(".")
if f.count("*") == 1]
if len(frags) != 2:
continue
valid_sites.append((linker_smi, frags))
if not valid_sites:
return []
# Randomly pick one linker site so multi-linker molecules sample uniformly
linker_smi, frags = random.choice(valid_sites)
fe = [e for e in fragcorpus if e[2] == 2 and e[3] == "Linker" and e[4] == "--"]
if not fe:
return []
if conservative:
weights = _similarity_weights(linker_smi, fe)
else:
weights = [e[1] ** 0.5 for e in fe]
fr = [Chem.MolFromSmiles(e[0]) for e in random.choices(fe, weights=weights, k=n_replacements)]
new_mols = []
for new_linker in fr:
try:
combined = random.choice(rxn1.RunReactants((new_linker, frags[0])))[0]
Chem.SanitizeMol(combined)
combined = random.choice(rxn1.RunReactants((combined, frags[1])))[0]
Chem.SanitizeMol(combined)
m = _restore_bond_protection(combined, mol)
new_mols.append(m)
except Exception:
pass
return _filter_and_dedup(new_mols, p)
[docs]
def replace_ring(mol, p, score_threshold, fragcorpus=None, n_replacements=100,
conservative=True, protect_smarts=None):
"""Replace one ring system with a different ring drawn from the corpus.
Algorithm
---------
1. Find fused ring systems via :func:`_get_ring_systems`.
2. For each ring system walk bonds to find exo-bonds (leaving the system,
not themselves in a ring).
3. Skip protected systems; randomly pick one.
4. Cut the exo-bonds with ``FragmentOnBonds``. Use ``GetMolFrags()``
(without ``asMols``) to get tuples of *original* atom indices for each
fragment — these are stable and comparable to ``ring_system``. The
fragment whose non-dummy atoms are all in ``ring_system`` is the old ring;
everything else is a side chain to reattach. Build side-chain Mol objects
with ``GetMolFrags(asMols=True)`` in the same order.
5. Sample corpus rings; stitch side chains back.
Parameters
----------
mol : RDKit Mol
p : LACAN profile dict
score_threshold : minimum LACAN score for output molecules
fragcorpus : fragment corpus (default: ChEMBL rls.csv)
n_replacements : number of replacement rings to sample
conservative : if True, bias sampling toward structurally similar rings
protect_smarts : SMARTS string; ring systems containing any matching atom are
skipped entirely. ``None`` = no exclusion (default).
Returns
-------
list of RDKit Mol
"""
if fragcorpus is None:
fragcorpus = load_corpus()
ring_systems = _get_ring_systems(mol)
if not ring_systems:
return []
protected_atoms = get_protected_atoms(mol, protect_smarts)
valid = []
for ring_system in ring_systems:
if any(i in protected_atoms for i in ring_system):
continue
seen = set()
exo_bonds = []
for idx in ring_system:
for bond in mol.GetAtomWithIdx(idx).GetBonds():
bid = bond.GetIdx()
if bid in seen:
continue
seen.add(bid)
if bond.IsInRing():
continue
other = bond.GetOtherAtomIdx(idx)
if other in ring_system:
continue
btype = "-" if bond.GetBondTypeAsDouble() == 1.0 else "="
exo_bonds.append((bid, btype))
if not exo_bonds:
continue
ring_smi = Chem.MolFragmentToSmiles(mol, list(ring_system))
valid.append((ring_system, exo_bonds, ring_smi))
if not valid:
return []
ring_system, exo_bonds, ring_smi = random.choice(valid)
degree = len(exo_bonds)
bond_idxs = [e[0] for e in exo_bonds]
bond_types = [e[1] for e in exo_bonds]
# FragmentOnBonds adds new dummy atoms at indices >= mol.GetNumAtoms().
# GetMolFrags() (no asMols) returns tuples of original-mol atom indices,
# so we can compare directly against ring_system.
cmol = Chem.FragmentOnBonds(mol, bond_idxs)
orig_n = mol.GetNumAtoms()
frag_idx_tuples = Chem.GetMolFrags(cmol) # tuples of atom indices in cmol
frag_mols = Chem.GetMolFrags(cmol, asMols=True, sanitizeFrags=False)
ring_frag_pos = None
for i, idx_tuple in enumerate(frag_idx_tuples):
# Non-dummy atoms in this fragment: those with index < orig_n
heavy = [idx for idx in idx_tuple if idx < orig_n]
if all(idx in ring_system for idx in heavy):
ring_frag_pos = i
break
if ring_frag_pos is None:
return []
side_chains = [m for i, m in enumerate(frag_mols) if i != ring_frag_pos]
if len(side_chains) != degree:
return []
sbc = bond_types.count("-")
dbc = bond_types.count("=")
fe = [e for e in fragcorpus
if e[2] == degree and e[3] == "Ring"
and e[4].count("-") == sbc and e[4].count("=") == dbc]
if not fe:
return []
weights = _similarity_weights(ring_smi, fe) if conservative else [e[1] ** 0.5 for e in fe]
new_mols = []
for entry in random.choices(fe, weights=weights, k=n_replacements):
new_ring = Chem.MolFromSmiles(entry[0])
if new_ring is None:
continue
try:
ok = True
for sc, btype in zip(side_chains, bond_types):
rxn = rxn1 if btype == "-" else rxn2
products = rxn.RunReactants((new_ring, sc))
if not products:
ok = False
break
new_ring = random.choice(products)[0]
Chem.SanitizeMol(new_ring)
if ok:
m = _restore_bond_protection(new_ring, mol)
new_mols.append(m)
except Exception:
pass
return _filter_and_dedup(new_mols, p)
if __name__ == "__main__":
random.seed(123)
decorate_scaffold(Chem.MolFromSmiles("c1c(*)c(*)co1"), mode="Dummy")