Skip to content

datamol.fragment

Molecule fragmentation

anybreak(mol, remove_parent=False, sanitize=True, fix=True)

Fragment molecule by applying brics first, then fall back to frag.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Whether to sanitize the fragments.

True
fix bool

Whether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
def anybreak(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment molecule by applying brics first, then fall back to frag.

    `brics` is tried first. If it raises an exception or produces no
    fragment, the result of `frag` is returned instead.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The fragments produced by `brics`, or those produced by `frag` when
        `brics` failed or returned nothing.
    """
    frags = []
    try:
        frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
    except Exception:
        # Best-effort: any BRICS failure simply triggers the fallback below.
        # Catching `Exception` instead of using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate to the caller.
        pass

    if not frags:
        frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)

    return frags

brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)

Run BRICS on the molecules and potentially fix dummy atoms.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
singlepass bool

Single pass for BRICSDecompose.

True
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Whether to sanitize the fragments.

True
fix bool

Whether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
def brics(
    mol: Chem.rdchem.Mol,
    singlepass: bool = True,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Decompose a molecule with BRICS and optionally clean up the fragments.

    Args:
        mol: a molecule.
        singlepass: Single pass for `BRICSDecompose`.
        remove_parent: Remove parent from the fragments. This drops the first
            element of the decomposition, after fixing/sanitizing and before
            `None` entries are filtered out.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments, without any entry that ended up being `None`.
    """
    pieces = list(BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass))

    if fix:
        pieces = list(map(dm.fix_mol, pieces))
    if sanitize:
        pieces = list(map(dm.sanitize_mol, pieces))
    if remove_parent:
        # The first entry is treated as the parent molecule.
        pieces.pop(0)

    return [piece for piece in pieces if piece is not None]

frag(mol, remove_parent=False, sanitize=True, fix=True)

Generate all possible fragmentation of a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Whether to sanitize the fragments.

True
fix bool

Whether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
def frag(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Generate all possible fragmentation of a molecule.

    Each Fraggle fragmentation is split on "." into individual SMILES, which
    are de-duplicated and sorted in reverse order before being converted to
    molecules.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments (without `None` entries), preceded by `mol`
        itself unless `remove_parent` is set.
    """
    fragmentations = FraggleSim.generate_fraggle_fragmentation(mol)

    unique_smiles = set()
    for fragmentation in fragmentations:
        unique_smiles.update(part.strip() for part in fragmentation.split("."))

    ordered_smiles = sorted(unique_smiles, reverse=True)
    pieces = [dm.to_mol(smi) for smi in ordered_smiles]

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]

    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces

recap(mol, remove_parent=False, sanitize=True, fix=True)

Fragment the molecule using the recap algorithm.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Whether to sanitize the fragments.

True
fix bool

Whether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
def recap(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment the molecule using the recap algorithm.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The list of fragments built from the keys of the Recap hierarchy's
        children (without `None` entries), preceded by `mol` itself unless
        `remove_parent` is set.
    """
    hierarchy = Recap.RecapDecompose(mol)
    child_keys = hierarchy.GetAllChildren().keys()
    pieces = [dm.to_mol(key) for key in child_keys]

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]

    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces

MMPA fragmentation

mmpa_cut(mol, rdkit_pattern=False)

Cut molecules to perform mmpa analysis later

Parameters:

Name Type Description Default
mol Mol

Molecule to fragment.

required
rdkit_pattern bool

Whether to perform the fragmentation using the default rdkit pattern: `[#6+0;!$(*=,#[!#6])]!@!=!#[*]`

False

Returns:

Type Description
Optional[Set[Any]]

List of 'smiles,core,chains'

Source code in datamol/fragment/_fragment.py
def mmpa_cut(mol: Chem.rdchem.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut molecules to perform mmpa analysis later

    Two rounds of cuts are performed: one on the molecule as given, and one
    "hydrogen splitting" round on the molecule with explicit hydrogens. The
    second round only runs when the molecule has fewer than 60 heavy atoms.

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation
            using the default rdkit pattern: "[#6+0;!$(*=,#[!#6])]!@!=!#[*]".
            Otherwise a heavy-atom pattern is used, with 4 and then 3 cuts.

    Returns:
        Set of 'smiles,core,chains' lines (each terminated by a newline),
        or `None` when `mol` is `None`.
    """
    if mol is None:
        return mol

    heavy_atom_pattern = "[!#1]!@!=!#[!#1]"
    smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # heavy atoms
        frags = mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=4, max_bond_cut=30)
        frags.update(mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=3, max_bond_cut=30))

    outlines = {f"{smiles},{core},{chains}\n" for core, chains in frags}

    # hydrogen splitting: lines from this round carry the SMILES of the
    # molecule with explicit hydrogens
    mol = Chem.AddHs(mol)
    smiles = dm.to_smiles(mol)

    if mol.GetNumHeavyAtoms() < 60:
        h_frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
        outlines.update(f"{smiles},{core},{chains}\n" for core, chains in h_frags)

    return outlines

mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)

Fragment molecule on specific bonds suitable for a MMPA analysis.

Parameters:

Name Type Description Default
mol

Molecule to fragment.

required
pattern str

Bond pattern to split on. Will use default rdkit pattern `[#6+0;!$(*=,#[!#6])]!@!=!#[*]` if not provided.

None
max_cut int

Number of cuts.

1
max_bond_cut int

Maximum number of bonds to cut. Defaults to 20.

20
h_split bool

Whether to split at hydrogen position too. This is equivalent to enabling the addition of new fragments.

False

Returns:

Type Description
Optional[Set[rdkit.Chem.rdchem.Mol]]

List of fragments

Source code in datamol/fragment/_fragment.py
def mmpa_frag(
    mol,
    pattern: str = None,
    max_cut: int = 1,
    max_bond_cut: int = 20,
    h_split: bool = False,
) -> Optional[Set[Chem.Mol]]:
    """Fragment molecule on specific bonds suitable for a MMPA analysis.

    Args:
        mol: Molecule to fragment.
        pattern: Bond pattern to split on. Will use default rdkit pattern
            '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided. An empty string
            disables the pattern-based fragmentation entirely.
        max_cut: Number of cuts.
        max_bond_cut: Maximum number of bonds to cut. Defaults to 20.
        h_split: Whether to split at hydrogen position too.
            This is equivalent to enabling the addition of new fragments.
            The hydrogen split always uses a single cut.

    Returns:
        Set of the fragmentation results returned by `rdMMPA.FragmentMol`
        with `resultsAsMols=False`.
    """
    shared_kwargs = dict(maxCuts=max_cut, resultsAsMols=False, maxCutBonds=max_bond_cut)

    if pattern is None:
        # Let rdkit use its own default pattern.
        frags = rdMMPA.FragmentMol(mol, **shared_kwargs)
    elif pattern:
        frags = rdMMPA.FragmentMol(mol, pattern=pattern, **shared_kwargs)
    else:
        frags = []

    collected = set(frags)

    if h_split:
        with_hs = Chem.AddHs(mol)
        collected.update(
            rdMMPA.FragmentMol(
                with_hs,
                pattern="[#1]!@!=!#[!#1]",
                maxCuts=1,
                resultsAsMols=False,
                maxCutBonds=max_bond_cut,
            )
        )

    return collected

Molecule assembly

assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=inf, maxdepth=3, as_smiles=True, RXNS=None)

Perform an assembly from fragment given all potential RXNS transformation.

Source code in datamol/fragment/_assemble.py
def assemble_fragment_iter(
    fragmentlist,
    seens=None,
    scrambleReagents=False,
    max_n_mols=float("inf"),
    maxdepth=3,
    as_smiles=True,
    RXNS=None,
):
    """Perform an assembly from fragment given all potential RXNS transformation.

    Every seed molecule is combined with every fragment under every reaction.
    Products that can still react are used as seeds for a recursive call, up
    to `maxdepth` levels deep.

    Args:
        fragmentlist: Fragments to assemble.
        seens: Seed molecules to grow. When empty or `None`, a copy of
            `fragmentlist` is used.
        scrambleReagents: Whether to shuffle the seeds (on a copy) first.
        max_n_mols: Maximum number of unique products to yield.
        maxdepth: Maximum recursion depth.
        as_smiles: Yield SMILES strings instead of molecules.
        RXNS: Reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Unique products, as SMILES when `as_smiles` is set, as molecules
        otherwise.
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    if max_n_mols <= 0:
        return

    # SMILES of every product yielded so far. Kept under its own name: the
    # seed loop variable below must not shadow it.
    seen_smiles = set()

    if not seens:
        seens = list(fragmentlist)
    if scrambleReagents:
        seens = list(seens)
        # The `random=` keyword of `random.shuffle` was removed in Python 3.11;
        # the default source of randomness is the same `random.random`.
        random.shuffle(seens)

    for seed in seens:
        next_steps = []
        for rxn in RXNS:
            for fg in fragmentlist:
                for m, p_smi in _run_at_all_rct(rxn, fg, seed):
                    if p_smi not in seen_smiles:
                        seen_smiles.add(p_smi)
                        yield p_smi if as_smiles else m
                        if len(seen_smiles) >= max_n_mols:
                            return
                    if _can_continue_with(m, rxn):
                        next_steps.append(m)

        if next_steps and len(seen_smiles) < max_n_mols and maxdepth > 0:
            # Always recurse with molecules (as_smiles=False) so the products
            # can be canonicalized here, and forward the caller's reactions.
            for p in assemble_fragment_iter(
                fragmentlist,
                seens=next_steps,
                scrambleReagents=scrambleReagents,
                max_n_mols=(max_n_mols - len(seen_smiles)),
                maxdepth=maxdepth - 1,
                as_smiles=False,
                RXNS=RXNS,
            ):
                p_smi = Chem.MolToSmiles(p, True)
                if p_smi not in seen_smiles:
                    seen_smiles.add(p_smi)
                    yield p_smi if as_smiles else p
                    if len(seen_smiles) >= max_n_mols:
                        return

assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=inf, RXNS=None)

Assemble a list of fragment into a set of possible molecules under rules defined by the brics algorithm

Note: we assume that (1) the order of the fragments in fragmentlist matters, (2) none of the fragments has explicitly defined hydrogen atoms, and (3) only a list of unique molecules is maintained internally.

Parameters:

Name Type Description Default
fragmentlist

list of original fragments to grow

required
seen

original molecule used as base. If None, the first element of the fragment list will be popped out

None
allow_incomplete bool

Whether to accept assembled molecules with missing fragments

False
Source code in datamol/fragment/_assemble.py
def assemble_fragment_order(
    fragmentlist,
    seen=None,
    allow_incomplete: bool = False,
    max_n_mols: float = float("inf"),
    RXNS=None,
):
    """Assemble a list of fragment into a set of possible molecules under rules defined by the brics algorithm

    ..note ::
        We are of course assuming:
        1. that the order in the fragmentlist matters
        2. that none of the fragments has explicitly defined hydrogen atoms.
        3. only a list of unique molecules is internally maintained

    Args:
        fragmentlist: list of original fragments to grow
        seen: original molecule used as base. If None, the first element of
            the fragment list will be popped out
        allow_incomplete: Whether to accept assembled molecules with missing fragments
        max_n_mols: Maximum number of molecules to yield.
        RXNS: Reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Intermediate products when `allow_incomplete` is set, then the
        molecules of the last assembly level, up to `max_n_mols` in total.
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    remaining = list(fragmentlist)
    n_yielded = 0
    if seen is None:
        seen = remaining.pop(0)
    # only one molecule to assemble
    seen = [Chem.MolToSmiles(seen)]

    while n_yielded < max_n_mols and remaining:
        # find all the ways to add this fragment to the current level
        next_fragment = remaining.pop(0)
        current_level = [dm.to_mol(smi) for smi in seen]
        seen = set()
        for base in current_level:
            try:
                # there is no point in even trying something on molecules that cannot be kekulized
                for rxn in RXNS:
                    for product, product_smi in _run_at_all_rct(rxn, next_fragment, base):
                        if allow_incomplete and product_smi not in seen:
                            yield product
                            n_yielded += 1
                        seen.add(product_smi)
            except Exception as err:
                print(err)

    for smi in seen:
        if n_yielded < max_n_mols:
            yield dm.to_mol(smi)
            n_yielded += 1

break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)

Breaks a molecule into a list of fragments.

Source code in datamol/fragment/_assemble.py
def break_mol(
    mol: Chem.rdchem.Mol,
    minFragmentSize: int = 1,
    silent: bool = True,
    onlyUseReactions: list = [],
    randomize: bool = False,
    mode: str = "brics",
    returnTree: bool = False,
):
    """Break a molecule into a list of fragments.

    The molecule is repeatedly split by the first applicable reaction, and the
    resulting parent -> product relations are recorded in a directed graph.

    Args:
        mol: Molecule to break.
        minFragmentSize: Size threshold used to reject product sets, both on
            the explicit atom count and on the count of non-dummy atoms.
        silent: When False, print a line for each node that reacts.
        onlyUseReactions: Reaction type identifiers to restrict to. An empty
            list means every reaction is tried. Only read, never mutated.
        randomize: Whether to try the reactions in a random order.
        mode: "brics" uses the BRICS reactions, "rxn" uses the RXN reactions,
            any other value uses both.
        returnTree: Whether to also return the graph.

    Returns:
        `(leaves_smiles, allNodes)`, or `(leaves_smiles, allNodes, G)` when
        `returnTree` is set. `leaves_smiles` holds the SMILES of nodes with a
        parent and no child; `allNodes` is the set of SMILES seen.
    """

    # Pick the reactions and their type identifiers (parallel sequences).
    if mode.lower() == "brics":
        all_reactions = ALL_BRICS
        all_reactions_type = ALL_BRICS_TYPE
    elif mode.lower() == "rxn":
        all_reactions = ALL_RXNS
        all_reactions_type = ALL_RXNS_TYPE
    else:
        all_reactions = ALL_BRICS + ALL_RXNS
        all_reactions_type = ALL_BRICS_TYPE + ALL_RXNS_TYPE
    if randomize:
        # Same permutation for both sequences so they stay aligned.
        p = np.random.permutation(len(all_reactions))
        all_reactions = [all_reactions[ind] for ind in p]
        all_reactions_type = [all_reactions_type[ind] for ind in p]

    nx = dm.graph._get_networkx()
    mSmi = Chem.MolToSmiles(mol, isomericSmiles=True)
    G = nx.DiGraph()
    node_num = 0
    # Node 0 is the input molecule.
    G.add_node(node_num, smiles=mSmi, mol=mol)
    allNodes = set()
    # Maps SMILES -> node id for the nodes that still have to be processed.
    activePool = {mSmi: node_num}
    allNodes.add(mSmi)
    while activePool:
        nSmi = list(activePool.keys())[0]
        parent = activePool.pop(nSmi)
        node = G.nodes[parent]
        mol = node["mol"]
        for rxnIdx, reaction in zip(all_reactions_type, all_reactions):
            if onlyUseReactions and rxnIdx not in onlyUseReactions:
                continue
            ps = reaction.RunReactants((mol,))
            if ps:

                # For each product set: do all products have more than
                # minFragmentSize explicit atoms?
                all_pass = [
                    all([prod.GetNumAtoms(onlyExplicit=True) > minFragmentSize for prod in p_])
                    for p_ in ps
                ]
                # Index of the first product set passing the size check.
                nz_i = 0
                while nz_i < len(all_pass) and not all_pass[nz_i]:
                    nz_i += 1
                if not silent:
                    print(nSmi, "->", len(ps), "products and selected ", nz_i)
                    # display(MolsToGridImage(list(itertools.chain(*list(ps))), molsPerRow=2))
                # When no set passes, nz_i == len(all_pass) and the modulo
                # wraps around to the first product set.
                prodSeq = ps[nz_i % len(all_pass)]
                seqOk = True
                # we want to disqualify small fragments, so sort the product sequence by size
                prodSeq = [(prod.GetNumAtoms(onlyExplicit=True), prod) for prod in prodSeq]
                prodSeq.sort(key=lambda x: x[0])
                for _, prod in prodSeq:
                    # NOTE(review): `sanitized` is set to True here and never
                    # set to False anywhere in this function, so the
                    # `if not prod.sanitized` guard below never triggers.
                    prod.sanitized = True
                    try:
                        Chem.SanitizeMol(prod)
                    # NOTE(review): bare except also catches
                    # KeyboardInterrupt/SystemExit.
                    except:
                        if dm.sanitize_mol(prod) is None:
                            seqOk = False
                            break
                        # NOTE(review): this path skips the `prod.pSmi`
                        # assignment below, yet `prod.pSmi` is read in the
                        # `seqOk` loop; looks like it could raise
                        # AttributeError. Confirm the intended behavior.
                        continue
                    pSmi = Chem.MolToSmiles(prod, isomericSmiles=True)
                    seqOk = seqOk and (dm.to_mol(pSmi) is not None)

                    notDummies = sum([atm.GetSymbol() != "*" for atm in prod.GetAtoms()])
                    # nDummies = pSmi.count('*')
                    # if minFragmentSize > 0 and (nats - nDummies < minFragmentSize):
                    if minFragmentSize > 0 and notDummies < minFragmentSize:
                        seqOk = False
                        break
                    prod.pSmi = pSmi

                if seqOk:
                    for _, prod in prodSeq:
                        if not prod.sanitized:
                            continue
                        pSmi = prod.pSmi
                        node_num += 1
                        # The node (and allNodes) use the SMILES of the fixed
                        # product, while activePool is keyed by the raw pSmi.
                        usmi = Chem.MolToSmiles(dm.fix_mol(prod), isomericSmiles=True)
                        G.add_node(node_num, smiles=usmi, mol=prod)
                        G.add_edge(parent, node_num)
                        # Only products not seen before are processed further.
                        if usmi not in allNodes:
                            activePool[pSmi] = node_num
                            allNodes.add(usmi)
                    G.nodes[parent]["rxn"] = rxnIdx
                    break  # at least one reaction matches

    # Leaves are nodes that have a parent and no child; an input molecule
    # that never reacted is therefore not reported as a leaf.
    leaves_smiles = [
        G.nodes[n]["smiles"] for n in G.nodes() if G.in_degree(n) != 0 and G.out_degree(n) == 0
    ]
    if returnTree:
        return leaves_smiles, allNodes, G
    return leaves_smiles, allNodes

build(ll_mols, max_n_mols=inf, mode='brics', frag_rxn=None, ADD_RNXS=[])

Build a super molecule from a list of fragments

Source code in datamol/fragment/_assemble.py
def build(ll_mols, max_n_mols=float("inf"), mode="brics", frag_rxn=None, ADD_RNXS=[]):
    """Build a super molecule from a list of fragments

    Every combination taking one fragment from each pool of `ll_mols` is
    considered, and the first two fragments of each combination are reacted
    together under the selected reactions.

    Args:
        ll_mols: List of fragment pools.
        max_n_mols: Maximum number of unique molecules to yield.
        mode: "brics" uses the BRICS reactions, "rxn" uses the RXN reactions,
            any other non-None value uses both, and None uses only `ADD_RNXS`.
        frag_rxn: Optional reaction type. When it matches a known type (after
            stripping surrounding double quotes), only that reaction is used.
        ADD_RNXS: Additional reactions, either as a list (types are then
            named "RXN-<index>") or as a dict mapping type to reaction.
            Never mutated by this function.

    Yields:
        Unique assembled molecules, at most `max_n_mols` of them.
    """

    seen = set()

    # Copy the module-level reaction lists: the `+=` below extends them in
    # place, which would otherwise leak `ADD_RNXS` into every later call.
    if mode == "brics":
        CUR_RXNS = list(ALL_BRICS_RETRO)
        CUR_RXNS_TYPE = list(ALL_BRICS_TYPE)
    elif mode == "rxn":
        CUR_RXNS = list(ALL_RXNS_RETRO)
        CUR_RXNS_TYPE = list(ALL_RXNS_TYPE)
    elif mode is not None:
        CUR_RXNS = list(ALL_BRICS_RETRO) + list(ALL_RXNS_RETRO)
        CUR_RXNS_TYPE = list(ALL_BRICS_TYPE) + list(ALL_RXNS_TYPE)
    else:
        CUR_RXNS = []
        CUR_RXNS_TYPE = []

    if ADD_RNXS is not None:
        if isinstance(ADD_RNXS, dict):
            extra_types = list(ADD_RNXS.keys())
            extra_rxns = list(ADD_RNXS.values())
        else:
            extra_rxns = list(ADD_RNXS)
            extra_types = [f"RXN-{i}" for i in range(len(extra_rxns))]
        CUR_RXNS += extra_rxns
        CUR_RXNS_TYPE += extra_types

    if frag_rxn is not None:
        wanted_type = frag_rxn.strip('"')
        for i, rxn_type in enumerate(CUR_RXNS_TYPE):
            if rxn_type == wanted_type:
                CUR_RXNS = [CUR_RXNS[i]]
                break

    for fraglist in itertools.product(*ll_mols):
        fraglist = list(fraglist)
        for rxn in CUR_RXNS:  # should be size==1 if frag_rxn is provided
            ps = []
            try:
                ps = _run_at_all_rct(rxn, fraglist[0], fraglist[1])
            except Exception:
                # Best-effort: a reaction that cannot be applied to this pair
                # (or a combination with fewer than two fragments) is skipped.
                pass
            for m, mSmi in ps:
                if len(seen) >= max_n_mols:
                    # Budget exhausted: end the generator.
                    return
                if mSmi not in seen:
                    seen.add(mSmi)
                    yield m