Skip to content

datamol.scaffold

Working with molecular scaffolds

fuzzy_scaffolding(mols, enforce_subs=None, n_atom_cuttoff=8, additional_templates=None, ignore_non_ring=False, mcs_params=None)

Generate fuzzy scaffold with enforceable group that needs to appear in the core, forcing to keep the full side chain if required.

NOTE(hadim): consider parallelize this (if possible). NOTE(hadim): consider refactoring this function in smaller reusable functions.

Parameters:

Name Type Description Default
mols List[rdkit.Chem.rdchem.Mol]

List of all molecules

required
enforce_subs List[str]

List of substructure to enforce on the scaffold.

None
n_atom_cuttoff int

Minimum number of atom a core should have.

8
additional_templates List[rdkit.Chem.rdchem.Mol]

Additional template to use to generate scaffolds.

None
ignore_non_ring bool

Whether to ignore atom no in murcko ring system, even if they are in the framework.

False
mcs_params Dict[Any, Any]

Arguments of MCS algorithm.

None

Returns:

Type Description
  • set - scaffolds - All found scaffolds in the molecules as valid smiles.
  • Dict[Dict] - scaffold_infos - Infos on the scaffold mapping, ignoring any side chain that had to be enforced. Key corresponds to generic scaffold smiles Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS) Values at ['mols'] corresponds to list of molecules matching the scaffold
  • Dict[List] - scaffold_to_group - Map between each generic scaffold and the R-groups decomposition row.
Source code in datamol/scaffold/_fuzzy.py
def fuzzy_scaffolding(
    mols: List[Chem.rdchem.Mol],
    enforce_subs: List[str] = None,
    n_atom_cuttoff: int = 8,
    additional_templates: List[Chem.rdchem.Mol] = None,
    ignore_non_ring: bool = False,
    mcs_params: Dict[Any, Any] = None,
):
    """Generate fuzzy scaffold with enforceable group that needs to appear
    in the core, forcing to keep the full side chain if required.

    NOTE(hadim): consider parallelize this (if possible).
    NOTE(hadim): consider refactoring this function in smaller reusable functions.

    Args:
        mols: List of all molecules
        enforce_subs: List of substructure to enforce on the scaffold.
        n_atom_cuttoff: Minimum number of atom a core should have.
        additional_templates: Additional template to use to generate scaffolds.
        ignore_non_ring: Whether to ignore atom no in murcko ring system, even if they are in the framework.
        mcs_params: Arguments of MCS algorithm.

    Returns:
        - `set` - `scaffolds` - All found scaffolds in the molecules as valid smiles.
        - `Dict[Dict]` - `scaffold_infos` - Infos on the scaffold mapping, ignoring any side chain that had
                to be enforced. Key corresponds to generic scaffold smiles
                Values at ['smarts'] corresponds to smarts representation of the true scaffold (from MCS)
                Values at ['mols'] corresponds to list of molecules matching the scaffold
        - `Dict[List]` - `scaffold_to_group` - Map between each generic scaffold and the R-groups decomposition row.
    """

    if enforce_subs is None:
        enforce_subs = []

    if additional_templates is None:
        additional_templates = []

    if mcs_params is None:
        mcs_params = {}

    rg_params = rdRGroupDecomposition.RGroupDecompositionParameters()
    rg_params.removeAllHydrogenRGroups = True
    rg_params.removeHydrogensPostMatch = True
    rg_params.alignment = rdRGroupDecomposition.RGroupCoreAlignment.MCS
    rg_params.matchingStrategy = rdRGroupDecomposition.RGroupMatching.Exhaustive
    rg_params.rgroupLabelling = rdRGroupDecomposition.RGroupLabelling.AtomMap
    rg_params.labels = rdRGroupDecomposition.RGroupLabels.AtomIndexLabels

    core_query_param = AdjustQueryParameters()
    core_query_param.makeDummiesQueries = True
    core_query_param.adjustDegree = False
    core_query_param.makeBondsGeneric = True

    # group molecules by they generic Murcko scaffold, allowing
    # side chain that contains cycle (might be a bad idea)
    scf2infos = collections.defaultdict(dict)
    scf2groups = {}
    all_scaffolds = set([])

    for m in mols:
        generic_m = MurckoScaffold.MakeScaffoldGeneric(m)
        scf = MurckoScaffold.GetScaffoldForMol(m)
        try:
            scf = MurckoScaffold.MakeScaffoldGeneric(scf)
        except:
            pass

        if ignore_non_ring:
            rw_scf = Chem.RWMol(scf)
            atms = [a.GetIdx() for a in rw_scf.GetAtoms() if not a.IsInRing()]
            atms.sort(reverse=True)
            for a in atms:
                rw_scf.RemoveAtom(a)
            scfs = list(rdmolops.GetMolFrags(rw_scf, asMols=False))
        else:
            scfs = [dm.to_smiles(scf)]

        # add templates mols if exists:
        for tmp in additional_templates:
            tmp = dm.to_mol(tmp)
            tmp_scf = MurckoScaffold.MakeScaffoldGeneric(tmp)
            if generic_m.HasSubstructMatch(tmp_scf):
                scfs.append(dm.to_smiles(tmp_scf))

        for scf in scfs:
            if scf2infos[scf].get("mols"):
                scf2infos[scf]["mols"].append(m)
            else:
                scf2infos[scf]["mols"] = [m]

    for scf in scf2infos:
        # cheat by adding murcko as last mol always
        popout = False
        mols = scf2infos[scf]["mols"]
        if len(mols) < 2:
            mols = mols + [MurckoScaffold.GetScaffoldForMol(mols[0])]
            popout = True

        # compute the MCS of the cluster
        mcs = rdFMCS.FindMCS(
            mols,
            atomCompare=rdFMCS.AtomCompare.CompareAny,
            bondCompare=rdFMCS.BondCompare.CompareAny,
            completeRingsOnly=True,
            **mcs_params,
        )

        mcsM = Chem.MolFromSmarts(mcs.smartsString)
        mcsM.UpdatePropertyCache(False)
        Chem.SetHybridization(mcsM)

        if mcsM.GetNumAtoms() < n_atom_cuttoff:
            continue

        scf2infos[scf]["smarts"] = dm.to_smarts(mcsM)
        if popout:
            mols = mols[:-1]

        core_groups = []
        # generate rgroups based on the mcs core
        success_mols = []
        try:
            rg = rdRGroupDecomposition.RGroupDecomposition(mcsM, rg_params)
            for i, analog in enumerate(mols):
                analog.RemoveAllConformers()
                res = rg.Add(analog)
                if not (res < 0):
                    success_mols.append(i)
            rg.Process()
            core_groups = rg.GetRGroupsAsRows()
        except Exception:
            pass

        mols = [mols[i] for i in success_mols]
        scf2groups[scf] = core_groups
        for mol, gp in zip(mols, core_groups):
            core = gp["Core"]
            acceptable_groups = [
                a.GetAtomMapNum()
                for a in core.GetAtoms()
                if (a.GetAtomMapNum() and not a.IsInRing())
            ]

            rgroups = [gp[f"R{k}"] for k in acceptable_groups if f"R{k}" in gp.keys()]
            if enforce_subs:
                rgroups = [
                    rgp
                    for rgp in rgroups
                    if not any([len(rgp.GetSubstructMatch(frag)) > 0 for frag in enforce_subs])
                ]
            try:
                scaff = trim_side_chain(mol, AdjustQueryProperties(core, core_query_param), rgroups)
            except:
                continue
            all_scaffolds.add(dm.to_smiles(scaff))

    return all_scaffolds, scf2infos, scf2groups

trim_side_chain(mol, core, unwanted_side_chains)

Trim list of side chain from a molecule.

Source code in datamol/scaffold/_fuzzy.py
def trim_side_chain(mol: Chem.rdchem.Mol, core, unwanted_side_chains):
    """Trim list of side chain from a molecule."""

    mol = Chem.AddHs(mol)

    match = mol.GetSubstructMatch(core)
    map2idx = {}
    map2nei = {}
    unwanted2map = {}
    for patt in unwanted_side_chains:
        unwanted2map[patt] = [a.GetAtomMapNum() for a in patt.GetAtoms() if a.GetAtomMapNum()]
    unwanted_mapping = list(itertools.chain.from_iterable(unwanted2map.values()))

    for atom in core.GetAtoms():
        num = atom.GetAtomMapNum()
        if num and num in unwanted_mapping:
            mol_atom_idx = match[atom.GetIdx()]
            map2idx[mol_atom_idx] = num
            nei_atoms = mol.GetAtomWithIdx(mol_atom_idx).GetNeighbors()
            map2nei[mol_atom_idx] = [n.GetIdx() for n in nei_atoms if n.GetIdx() in match]

    emol = Chem.EditableMol(mol)
    for atom_idx, atom_map in map2idx.items():
        dummy = Chem.rdchem.Atom("*")
        dummy.SetAtomMapNum(atom_map)
        nei_idx = map2nei.get(atom_idx, [None])[0]
        if nei_idx:
            bond = mol.GetBondBetweenAtoms(atom_idx, nei_idx)
            emol.RemoveBond(atom_idx, nei_idx)
            new_ind = emol.AddAtom(dummy)
            emol.AddBond(nei_idx, new_ind, bond.GetBondType())

    mol = emol.GetMol()
    mol = Chem.RemoveHs(mol)
    query_param = AdjustQueryParameters()
    query_param.makeDummiesQueries = False
    query_param.adjustDegree = False
    query_param.aromatizeIfPossible = True
    for patt, _ in unwanted2map.items():
        cur_frag = dm.fix_mol(patt)
        mol = Chem.DeleteSubstructs(mol, cur_frag, onlyFrags=True)

    return dm.keep_largest_fragment(mol)