`datamol.fragment`¶

`anybreak(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Fragment molecule by applying brics first, then fall back to frag.

Parameters:

Name	Type	Description	Default
`mol`	`Chem.rdchem.Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def anybreak(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment a molecule with `brics` first, falling back to `frag`.

    `brics` is tried first. If it raises or returns no fragment, `frag` is
    called with the same options and its result is returned instead.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The fragments from `brics`, or from `frag` when `brics` failed or
        produced nothing.
    """
    try:
        frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
    except Exception:
        # Deliberate best-effort: any BRICS failure falls through to `frag`.
        # Only `Exception` is caught so KeyboardInterrupt/SystemExit propagate.
        frags = []

    if not frags:
        frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)

    return frags

`brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)` ¶

Run BRICS on the molecules and potentially fix dummy atoms.

Parameters:

Name	Type	Description	Default
`mol`	`Chem.rdchem.Mol`	a molecule.	required
`singlepass`	`bool`	Single pass for `BRICSDecompose`.	`True`
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def brics(
    mol: Chem.rdchem.Mol,
    singlepass: bool = True,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Run BRICS on the molecule and optionally fix and sanitize the fragments.

    Args:
        mol: a molecule.
        singlepass: Single pass for `BRICSDecompose`.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        List of fragments. `None` entries left after fixing/sanitizing are
        dropped.
    """
    frags = list(BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass))

    if fix:
        frags = [dm.fix_mol(x) for x in frags]
    if sanitize:
        frags = [dm.sanitize_mol(x) for x in frags]
    if remove_parent and frags:
        # Guarded so an empty decomposition returns [] instead of raising IndexError.
        # NOTE(review): this drops the first entry, assuming it is the parent —
        # confirm against the ordering returned by BRICSDecompose.
        frags.pop(0)

    return [x for x in frags if x is not None]

`frag(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Generate all possible fragmentation of a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Chem.rdchem.Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def frag(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Generate all possible fragmentations of a molecule.

    Every fragmentation string produced by `FraggleSim` is split on "." and the
    unique pieces are converted to molecules, in reverse-sorted SMILES order.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        List of fragments with `None` entries dropped. Unless `remove_parent`
        is set, the input molecule is prepended to the list as-is.
    """
    fragmentations = FraggleSim.generate_fraggle_fragmentation(mol)

    # Collect each distinct piece once across all fragmentations.
    unique_smiles = set()
    for fragmentation in fragmentations:
        for piece in fragmentation.split("."):
            unique_smiles.add(piece.strip())

    pieces = [dm.to_mol(smi) for smi in sorted(unique_smiles, reverse=True)]

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]

    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces

`mmpa_cut(mol, rdkit_pattern=False)` ¶

Cut molecules to perform mmpa analysis later

Parameters:

Name	Type	Description	Default
`mol`	`dm.Mol`	Molecule to fragment.	required
`rdkit_pattern`	`bool`	Whether to perform the fragmentation using the default rdkit pattern: `[#6+0;!$(*=,#[!#6])]!@!=!#[*]`	`False`

Returns:

Type	Description
`Optional[Set[Any]]`	List of 'smiles,core,chains'

Source code in datamol/fragment/_fragment.py

def mmpa_cut(mol: dm.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut a molecule into core/chains records for a later MMPA analysis.

    Two passes are made: one on the molecule as given, then one on the
    molecule with explicit hydrogens (only when it has fewer than 60 heavy
    atoms), where bonds to hydrogens are cut too.

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation using the default
            rdkit pattern `[#6+0;!$(*=,#[!#6])]!@!=!#[*]` instead of cutting
            between any two non-hydrogen atoms.

    Returns:
        Set of newline-terminated 'smiles,core,chains' strings, or `None` when
        `mol` is `None`.
    """
    if mol is None:
        return mol

    records = set()
    parent_smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        cuts = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # Cut any acyclic single bond between two non-hydrogen atoms.
        cuts = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30)
        cuts.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30))

    records.update(f"{parent_smiles},{core},{chains}\n" for core, chains in set(cuts))

    # Second pass with explicit hydrogens; the records use the hydrogen-full SMILES.
    mol_with_hs = dm.add_hs(mol)
    parent_smiles_hs = dm.to_smiles(mol_with_hs)

    if mol_with_hs.GetNumHeavyAtoms() < 60:
        hydrogen_cuts = mmpa_frag(
            mol_with_hs, pattern=None, max_cut=1, max_bond_cut=100, h_split=True
        )
        for core, chains in hydrogen_cuts:
            records.add(f"{parent_smiles_hs},{core},{chains}\n")

    return records

`mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)` ¶

Fragment molecule on specific bonds suitable for a MMPA analysis.

Parameters:

Name	Type	Description	Default
`mol`	`dm.Mol`	Molecule to fragment.	required
`pattern`	`Optional[str]`	Bond pattern to split on. Will use default rdkit pattern `[#6+0;!$(*=,#[!#6])]!@!=!#[*]` if not provided.	`None`
`max_cut`	`int`	Number of cuts.	`1`
`max_bond_cut`	`int`	Maximum number of bond to cut. Default to 20.	`20`
`h_split`	`bool`	Whether to split at hydrogen position too. This is equivalent to enabling the addition of new fragments.	`False`

Returns:

Type	Description
`Optional[Set[dm.Mol]]`	List of fragments.

Source code in datamol/fragment/_fragment.py

def mmpa_frag(
    mol: dm.Mol,
    pattern: Optional[str] = None,
    max_cut: int = 1,
    max_bond_cut: int = 20,
    h_split: bool = False,
) -> Optional[Set[dm.Mol]]:
    """Fragment a molecule on specific bonds suitable for an MMPA analysis.

    Args:
        mol: Molecule to fragment.
        pattern: Bond pattern to split on. When `None`, no pattern is passed to
            `rdMMPA.FragmentMol`, so rdkit's default
            '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' applies. An empty string skips this
            fragmentation step entirely.
        max_cut: Number of cuts.
        max_bond_cut: Maximum number of bonds to cut. Default to 20.
        h_split: Whether to split at hydrogen positions too. The molecule gets
            explicit hydrogens and single cuts on bonds to hydrogens are added
            to the result.

    Returns:
        Set of the fragmentation results returned by `rdMMPA.FragmentMol`
        (requested with `resultsAsMols=False`).
    """
    # Options shared by both pattern variants of the main fragmentation.
    shared_options = dict(
        maxCuts=max_cut,
        resultsAsMols=False,
        maxCutBonds=max_bond_cut,
    )

    if pattern is None:
        main_cuts = rdMMPA.FragmentMol(mol, **shared_options)
    elif pattern:
        main_cuts = rdMMPA.FragmentMol(mol, pattern=pattern, **shared_options)
    else:
        main_cuts = []

    if not h_split:
        return set(main_cuts)

    combined = list(main_cuts)
    combined.extend(
        rdMMPA.FragmentMol(
            dm.add_hs(mol),
            pattern="[#1]!@!=!#[!#1]",
            maxCuts=1,
            resultsAsMols=False,
            maxCutBonds=max_bond_cut,
        )
    )
    return set(combined)

`recap(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Fragment the molecule using the recap algorithm.

Parameters:

Name	Type	Description	Default
`mol`	`Chem.rdchem.Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def recap(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment the molecule using the recap algorithm.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        List of fragments built from every child of the Recap decomposition,
        with `None` entries dropped. Unless `remove_parent` is set, the input
        molecule is prepended to the list as-is.
    """
    hierarchy = Recap.RecapDecompose(mol)
    children = [dm.to_mol(smi) for smi in hierarchy.GetAllChildren().keys()]

    if fix:
        children = list(map(dm.fix_mol, children))
    if sanitize:
        children = list(map(dm.sanitize_mol, children))

    children = [child for child in children if child is not None]

    if not remove_parent:
        children = [mol] + children
    return children

maclandrol: 22/07/19 This is an attempt to reverse engineer the BRICS (Breaking of Retrosynthetically Interesting Chemical Substructures) approach for molecule fragmentation and use it as a heuristic for assembling molecules. The original paper on BRICS can be found here: http://dacemirror.sci-hub.tw/journal-article/93060992e8d889318b77b562c0e5b75f/degen2008.pdf. This makes sense from a methodological point of view, but I can guarantee neither that it is working as expected nor that it is the best way to tackle this problem. The goal here is to reconstruct a set of original molecules which, if they were to be fragmented using BRICS, should yield the same fragment set as the input. Thus, in theory, fragments obtained using BRICS CAN be assembled into the original molecules with this method. This differs from the rdkit BRICSBuild implementation, which requires the presence of dummy indicator atoms added by a prior BRICS fragmentation.

`assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=float('inf'), maxdepth=3, as_smiles=True, RXNS=None)` ¶

Perform an assembly from fragments given all potential RXNS transformations.

Source code in datamol/fragment/_assemble.py

def assemble_fragment_iter(
    fragmentlist,
    seens=None,
    scrambleReagents=False,
    max_n_mols=float("inf"),
    maxdepth=3,
    as_smiles=True,
    RXNS=None,
):
    """Perform an assembly from fragments given all potential RXNS transformations.

    Each base molecule in `seens` is combined with every fragment of
    `fragmentlist` under every reaction of `RXNS`. Products that can keep
    reacting are used as the bases of the next recursion level.

    Args:
        fragmentlist: fragments to attach to the base molecules.
        seens: base molecules to grow. When empty or `None`, the fragments
            themselves are used as bases.
        scrambleReagents: Whether to shuffle the base molecules first.
        max_n_mols: Maximum number of unique products to yield.
        maxdepth: Maximum recursion depth.
        as_smiles: Whether to yield SMILES strings instead of molecules.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Unique assembled products, as SMILES when `as_smiles` is set and as
        molecules otherwise.
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    if max_n_mols <= 0:
        return

    seen = set()
    if not seens:
        seens = list(fragmentlist)
    if scrambleReagents:
        seens = list(seens)
        # The `random=` argument of `random.shuffle` was removed in Python 3.11.
        random.shuffle(seens)

    # The loop variable must not be named `seen`: that would replace the set of
    # already-produced SMILES with a molecule.
    for base in seens:
        nextSteps = []
        for rxn in RXNS:
            for fg in fragmentlist:
                for m, pSmi in _run_at_all_rct(rxn, fg, base):
                    if pSmi not in seen:
                        seen.add(pSmi)
                        yield pSmi if as_smiles else m
                        if len(seen) >= max_n_mols:
                            return
                    if _can_continue_with(m, rxn):
                        nextSteps.append(m)

        if nextSteps and len(seen) <= max_n_mols and maxdepth > 0:
            # Recurse on molecules (not SMILES) so they can be canonicalized
            # below, and keep using the caller's reaction set.
            for p in assemble_fragment_iter(
                fragmentlist,
                seens=nextSteps,
                scrambleReagents=scrambleReagents,
                max_n_mols=(max_n_mols - len(seen)),
                maxdepth=maxdepth - 1,
                as_smiles=False,
                RXNS=RXNS,
            ):
                pSmi = dm.to_smiles(p, True)
                if pSmi not in seen:
                    seen.add(pSmi)
                    yield pSmi if as_smiles else p
                    if len(seen) >= max_n_mols:
                        return

`assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=float('inf'), RXNS=None)` ¶

Assemble a list of fragment into a set of possible molecules under rules defined by the brics algorithm

We are of course assuming:

1. that the order in the fragmentlist matters;
2. that none of the fragments has explicitly defined hydrogen atoms;
3. that only a list of unique molecules is maintained internally.

Parameters:

Name	Type	Description	Default
`fragmentlist`	`list`	list of original fragments to grow	required
`seen`	`Optional[Mol]`	original molecule used as base. If none, the first element of the fragment list will be popped out	`None`
`allow_incomplete`	`bool`	Whether to accept assembled molecules with missing fragment	`False`

Source code in datamol/fragment/_assemble.py

def assemble_fragment_order(
    fragmentlist: list,
    seen: Optional[Mol] = None,
    allow_incomplete: bool = False,
    max_n_mols: float = float("inf"),
    RXNS=None,
):
    """Assemble an ordered list of fragments into possible molecules under the BRICS rules.

    Fragments are attached one at a time, in list order, to every molecule
    obtained at the previous step. Assumptions:

        1. the order of `fragmentlist` matters;
        2. none of the fragments has explicitly defined hydrogen atoms;
        3. only unique molecules (by SMILES) are kept between steps.

    Args:
        fragmentlist: list of original fragments to grow.
        seen: original molecule used as base. If `None`, the first element of
            `fragmentlist` is popped and used instead.
        allow_incomplete: Whether to also yield assembled molecules that are
            still missing fragments.
        max_n_mols: Maximum number of molecules yielded by the final pass; it
            also stops further growth steps once reached.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Assembled molecules.
    """

    reactions = ALL_BRICS_RETRO if RXNS is None else RXNS

    remaining = list(fragmentlist)
    base = remaining.pop(0) if seen is None else seen
    current_smiles = [dm.to_smiles(base)]  # only one molecule to assemble
    n_yielded = 0

    while remaining and n_yielded < max_n_mols:
        # find all the ways to add this fragment to the current molecules
        next_fragment = remaining.pop(0)
        previous_level = [dm.to_mol(smi) for smi in current_smiles]
        current_smiles = set()
        for partial in previous_level:
            try:
                # there is no point in even trying something on molecules that cannot be kekulized
                for rxn in reactions:
                    for product, product_smiles in _run_at_all_rct(rxn, next_fragment, partial):
                        is_new = product_smiles not in current_smiles
                        if allow_incomplete and is_new:
                            yield product
                            n_yielded += 1
                        current_smiles.add(product_smiles)
            except Exception as e:
                print(e)

    # NOTE(review): with allow_incomplete=True the last level looks like it is
    # yielded a second time here — confirm whether that is intended.
    for smi in current_smiles:
        if n_yielded >= max_n_mols:
            break
        yield dm.to_mol(smi)
        n_yielded += 1

`break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)` ¶

Breaks a molecule into a list of fragments.

Source code in datamol/fragment/_assemble.py

def break_mol(
    mol: Chem.rdchem.Mol,
    minFragmentSize: int = 1,
    silent: bool = True,
    onlyUseReactions: list = [],
    randomize: bool = False,
    mode: str = "brics",
    returnTree: bool = False,
):
    """Break a molecule into a list of fragments by repeatedly applying reactions.

    A directed graph is grown from the input molecule: each node is a
    (sub)molecule and each edge links a molecule to one product of the first
    reaction that successfully splits it.

    Args:
        mol: molecule to break.
        minFragmentSize: size threshold used to reject product sets with
            fragments that are too small.
        silent: when False, print a line for each reaction that gives products.
        onlyUseReactions: when non-empty, only reactions whose type is in this
            list are applied. The default list is never mutated here.
        randomize: whether to apply the reactions in a random order.
        mode: "brics" or "rxn" (case-insensitive) to select one reaction set;
            any other value uses both sets.
        returnTree: whether to also return the fragmentation graph.

    Returns:
        `(leaves_smiles, allNodes)`, or `(leaves_smiles, allNodes, G)` when
        `returnTree` is set. `leaves_smiles` holds the SMILES of the non-root
        nodes without children, `allNodes` the set of SMILES recorded during
        the fragmentation and `G` the networkx `DiGraph`.
    """

    # Select the reactions and their type labels (two parallel lists).
    if mode.lower() == "brics":
        all_reactions = ALL_BRICS
        all_reactions_type = ALL_BRICS_TYPE
    elif mode.lower() == "rxn":
        all_reactions = ALL_RXNS
        all_reactions_type = ALL_RXNS_TYPE
    else:
        all_reactions = ALL_BRICS + ALL_RXNS
        all_reactions_type = ALL_BRICS_TYPE + ALL_RXNS_TYPE
    if randomize:
        # The same permutation is applied to both lists to keep them aligned.
        p = np.random.permutation(len(all_reactions))
        all_reactions = [all_reactions[ind] for ind in p]
        all_reactions_type = [all_reactions_type[ind] for ind in p]

    nx = dm.graph._get_networkx()
    mSmi = dm.to_smiles(mol, isomeric=True)
    G = nx.DiGraph()
    node_num = 0
    # Node 0 is the root: the input molecule.
    G.add_node(node_num, smiles=mSmi, mol=mol)
    allNodes = set()
    # Molecules still to be broken: SMILES -> node id in G.
    activePool = {mSmi: node_num}
    allNodes.add(mSmi)
    while activePool:
        # Take the first pending molecule out of the pool.
        nSmi = list(activePool.keys())[0]
        parent = activePool.pop(nSmi)
        node = G.nodes[parent]
        mol = node["mol"]
        for rxnIdx, reaction in zip(all_reactions_type, all_reactions):
            if onlyUseReactions and rxnIdx not in onlyUseReactions:
                continue
            ps = reaction.RunReactants((mol,))
            if ps:
                # For each product set: do all products have more than
                # `minFragmentSize` explicit atoms?
                all_pass = [
                    all([prod.GetNumAtoms(onlyExplicit=True) > minFragmentSize for prod in p_])
                    for p_ in ps
                ]
                # Index of the first product set that passes (len(all_pass) if none does).
                nz_i = 0
                while nz_i < len(all_pass) and not all_pass[nz_i]:
                    nz_i += 1
                if not silent:
                    print(nSmi, "->", len(ps), "products and selected ", nz_i)
                    # display(MolsToGridImage(list(itertools.chain(*list(ps))), molsPerRow=2))
                # The modulo wraps back to the first product set when none passed.
                prodSeq = ps[nz_i % len(all_pass)]
                seqOk = True
                # we want to disqualify small fragments, so sort the product sequence by size
                prodSeq = [(prod.GetNumAtoms(onlyExplicit=True), prod) for prod in prodSeq]
                prodSeq.sort(key=lambda x: x[0])
                for _, prod in prodSeq:
                    # NOTE(review): `sanitized` is never set to False anywhere in
                    # this function, so the `if not prod.sanitized` check below
                    # never skips a product — confirm the intent.
                    prod.sanitized = True
                    try:
                        Chem.SanitizeMol(prod)
                    except:
                        # Fall back to datamol's sanitization; give up on the
                        # whole product set if that fails too.
                        if dm.sanitize_mol(prod) is None:
                            seqOk = False
                            break
                        # NOTE(review): this `continue` skips the `prod.pSmi`
                        # assignment below although `prod.pSmi` is read later
                        # when seqOk is True — looks like a latent
                        # AttributeError; verify.
                        continue
                    pSmi = dm.to_smiles(prod, isomeric=True)
                    # The product SMILES must be parsable back into a molecule.
                    seqOk = seqOk and (dm.to_mol(pSmi) is not None)

                    # Count the atoms that are not dummy ("*") atoms.
                    notDummies = sum([atm.GetSymbol() != "*" for atm in prod.GetAtoms()])
                    # nDummies = pSmi.count('*')
                    # if minFragmentSize > 0 and (nats - nDummies < minFragmentSize):
                    if minFragmentSize > 0 and notDummies < minFragmentSize:
                        seqOk = False
                        break
                    prod.pSmi = pSmi

                if seqOk:
                    for _, prod in prodSeq:
                        if not prod.sanitized:
                            continue
                        pSmi = prod.pSmi
                        node_num += 1
                        usmi = dm.to_smiles(dm.fix_mol(prod), isomeric=True)
                        G.add_node(node_num, smiles=usmi, mol=prod)
                        G.add_edge(parent, node_num)
                        # Only queue products whose fixed SMILES was not recorded yet.
                        if usmi not in allNodes:
                            activePool[pSmi] = node_num
                            allNodes.add(usmi)
                    G.nodes[parent]["rxn"] = rxnIdx
                    break  # at least one reaction matches

    # Leaves are nodes with a parent and no children; the root is never a leaf.
    leaves_smiles = [
        G.nodes[n]["smiles"] for n in G.nodes() if G.in_degree(n) != 0 and G.out_degree(n) == 0
    ]
    if returnTree:
        return leaves_smiles, allNodes, G
    return leaves_smiles, allNodes

`build(ll_mols, max_n_mols=float('inf'), mode='brics', frag_rxn=None, ADD_RNXS=[])` ¶

Build a super molecule from a list of fragments

Source code in datamol/fragment/_assemble.py

def build(ll_mols, max_n_mols=float("inf"), mode="brics", frag_rxn=None, ADD_RNXS=[]):
    """Build a super molecule from a list of fragments.

    Every combination taking one fragment from each list of `ll_mols` is
    formed, and the first two fragments of the combination are reacted together
    with each selected reaction.

    Args:
        ll_mols: list of lists of fragments.
        max_n_mols: Maximum number of unique molecules to yield.
        mode: "brics" or "rxn" to select one retro-reaction set, `None` for no
            built-in reaction, anything else for both sets.
        frag_rxn: optional reaction type name (surrounding double quotes are
            stripped). When it matches a known type, only that reaction is used.
        ADD_RNXS: additional reactions, either a list (named "RXN-<index>") or
            a dict mapping a type name to a reaction. It is never mutated here.

    Yields:
        Unique assembled molecules.
    """

    # Work on copies: the reaction tables are module-level lists and extending
    # them in place would leak `ADD_RNXS` into every later call.
    if mode == "brics":
        cur_rxns = list(ALL_BRICS_RETRO)
        cur_rxns_type = list(ALL_BRICS_TYPE)
    elif mode == "rxn":
        cur_rxns = list(ALL_RXNS_RETRO)
        cur_rxns_type = list(ALL_RXNS_TYPE)
    elif mode is not None:
        cur_rxns = list(ALL_BRICS_RETRO) + list(ALL_RXNS_RETRO)
        cur_rxns_type = list(ALL_BRICS_TYPE) + list(ALL_RXNS_TYPE)
    else:
        cur_rxns = []
        cur_rxns_type = []

    if ADD_RNXS is not None:
        if isinstance(ADD_RNXS, dict):
            extra_types = list(ADD_RNXS.keys())
            extra_rxns = list(ADD_RNXS.values())
        else:
            extra_rxns = list(ADD_RNXS)
            extra_types = [f"RXN-{i}" for i in range(len(extra_rxns))]
        cur_rxns.extend(extra_rxns)
        cur_rxns_type.extend(extra_types)

    # Restrict to a single reaction when `frag_rxn` names a known type.
    if frag_rxn is not None:
        for i, rxn_type in enumerate(cur_rxns_type):
            if frag_rxn.strip('"') == rxn_type:
                cur_rxns = [cur_rxns[i]]
                break

    seen = set()
    for fraglist in itertools.product(*ll_mols):
        fraglist = list(fraglist)
        for rxn in cur_rxns:  # should be size==1 if frag_rxn is provided
            try:
                ps = _run_at_all_rct(rxn, fraglist[0], fraglist[1])
            except Exception:
                # Best-effort: a reaction that cannot be applied gives no product.
                ps = []
            for m, mSmi in ps:
                if len(seen) >= max_n_mols:
                    return
                if mSmi not in seen:
                    seen.add(mSmi)
                    yield m

datamol.fragment¶

anybreak(mol, remove_parent=False, sanitize=True, fix=True) ¶

brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True) ¶

frag(mol, remove_parent=False, sanitize=True, fix=True) ¶

mmpa_cut(mol, rdkit_pattern=False) ¶

mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False) ¶

recap(mol, remove_parent=False, sanitize=True, fix=True) ¶

assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=float('inf'), maxdepth=3, as_smiles=True, RXNS=None) ¶

assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=float('inf'), RXNS=None) ¶

break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False) ¶

build(ll_mols, max_n_mols=float('inf'), mode='brics', frag_rxn=None, ADD_RNXS=[]) ¶

`datamol.fragment`¶

`anybreak(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)` ¶

`frag(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`mmpa_cut(mol, rdkit_pattern=False)` ¶

`mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)` ¶

`recap(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=float('inf'), maxdepth=3, as_smiles=True, RXNS=None)` ¶

`assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=float('inf'), RXNS=None)` ¶

`break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)` ¶

`build(ll_mols, max_n_mols=float('inf'), mode='brics', frag_rxn=None, ADD_RNXS=[])` ¶