`datamol.fragment`¶

Molecule fragmentation¶

`anybreak(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Fragment molecule by applying brics first, then fall back to frag.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def anybreak(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment molecule by applying brics first, then fall back to frag.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Whether to sanitize the fragments.
        fix: Whether to fix the fragments.

    Returns:
        The fragments returned by `brics`, or the ones returned by `frag`
        when `brics` raised an exception or returned nothing.
    """
    frags = []
    try:
        frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
    except Exception:
        # Best effort: any BRICS failure simply triggers the `frag` fallback
        # below. Catching `Exception` rather than using a bare `except` lets
        # KeyboardInterrupt and SystemExit propagate.
        pass

    if not frags:
        frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)

    return frags

`brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)` ¶

Run BRICS on the molecules and potentially fix dummy atoms.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`singlepass`	`bool`	Single pass for `BRICSDecompose`.	`True`
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def brics(
    mol: Chem.rdchem.Mol,
    singlepass: bool = True,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Decompose a molecule with BRICS and optionally post-process the fragments.

    Args:
        mol: a molecule.
        singlepass: Forwarded to `BRICSDecompose` as `singlePass`.
        remove_parent: Drop the first entry of the decomposition result.
        sanitize: Whether to run `dm.sanitize_mol` on every fragment.
        fix: Whether to run `dm.fix_mol` on every fragment.

    Returns:
        The list of fragments, without the entries that ended up as `None`.
    """
    pieces = list(BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass))

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]
    if remove_parent:
        # Removes index 0 before the `None` filtering below.
        # NOTE(review): assumes the parent is the first entry returned by
        # BRICSDecompose - confirm, in particular for singlepass=False.
        pieces.pop(0)

    return [piece for piece in pieces if piece is not None]

`frag(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Generate all possible fragmentation of a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def frag(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Generate all possible fragmentation of a molecule.

    Args:
        mol: a molecule.
        remove_parent: When False, the input molecule is prepended to the result.
        sanitize: Whether to run `dm.sanitize_mol` on every fragment.
        fix: Whether to run `dm.fix_mol` on every fragment.

    Returns:
        The list of fragments (entries that ended up as `None` are removed),
        preceded by `mol` itself unless `remove_parent` is set.
    """
    fragmentations = FraggleSim.generate_fraggle_fragmentation(mol)

    # Each fragmentation is a dot-separated string; collect the unique parts.
    unique_smiles = set()
    for entry in fragmentations:
        for part in entry.split("."):
            unique_smiles.add(part.strip())

    # Reverse lexicographic order of the SMILES strings.
    ordered = sorted(unique_smiles, reverse=True)
    pieces = [dm.to_mol(smi) for smi in ordered]

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]

    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces

`recap(mol, remove_parent=False, sanitize=True, fix=True)` ¶

Fragment the molecule using the recap algorithm.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`remove_parent`	`bool`	Remove parent from the fragments.	`False`
`sanitize`	`bool`	Whether to sanitize the fragments.	`True`
`fix`	`bool`	Whether to fix the fragments.	`True`

Source code in datamol/fragment/_fragment.py

def recap(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment the molecule using the recap algorithm.

    Args:
        mol: a molecule.
        remove_parent: When False, the input molecule is prepended to the result.
        sanitize: Whether to run `dm.sanitize_mol` on every fragment.
        fix: Whether to run `dm.fix_mol` on every fragment.

    Returns:
        The list of fragments (entries that ended up as `None` are removed),
        preceded by `mol` itself unless `remove_parent` is set.
    """
    decomposition = Recap.RecapDecompose(mol)
    children = decomposition.GetAllChildren()

    # The keys of the children mapping are converted to molecules.
    pieces = [dm.to_mol(key) for key in children.keys()]

    if fix:
        pieces = [dm.fix_mol(piece) for piece in pieces]
    if sanitize:
        pieces = [dm.sanitize_mol(piece) for piece in pieces]

    pieces = [piece for piece in pieces if piece is not None]

    return pieces if remove_parent else [mol] + pieces

MMPA fragmentation¶

`mmpa_cut(mol, rdkit_pattern=False)` ¶

Cut molecules to perform mmpa analysis later

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	Molecule to fragment.	required
`rdkit_pattern`	`bool`	Whether to perform the fragmentation using the default rdkit pattern: `[#6+0;!$(*=,#[!#6])]!@!=!#[*]`	`False`

Returns:

Type	Description
`Optional[Set[Any]]`	Set of 'smiles,core,chains' lines, or `None` when `mol` is `None`

Source code in datamol/fragment/_fragment.py

def mmpa_cut(mol: Chem.rdchem.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut a molecule into 'smiles,core,chains' records for a later MMPA analysis.

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation
            using the default rdkit pattern: '[#6+0;!$(*=,#[!#6])]!@!=!#[*]'

    Returns:
        A set of newline-terminated 'smiles,core,chains' strings, or `mol`
        itself (i.e. `None`) when `mol` is `None`.
    """

    if mol is None:
        return mol

    smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # Cut between any two non-hydrogen atoms, with 4 cuts then 3 cuts.
        heavy_atom_pattern = "[!#1]!@!=!#[!#1]"
        frags = mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=4, max_bond_cut=30)
        frags.update(mmpa_frag(mol, pattern=heavy_atom_pattern, max_cut=3, max_bond_cut=30))

    outlines = {f"{smiles},{core},{chains}\n" for core, chains in set(frags)}

    # Hydrogen splitting: the records below use the SMILES of the molecule
    # with explicit hydrogens added.
    mol = Chem.AddHs(mol)
    smiles = dm.to_smiles(mol)

    # Only performed for molecules with fewer than 60 heavy atoms.
    if mol.GetNumHeavyAtoms() < 60:
        h_frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
        outlines.update(f"{smiles},{core},{chains}\n" for core, chains in h_frags)

    return outlines

`mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)` ¶

Fragment molecule on specific bonds suitable for a MMPA analysis.

Parameters:

Name	Type	Description	Default
`mol`		Molecule to fragment.	required
`pattern`	`str`	Bond pattern to split on. Will use default rdkit pattern '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided.	`None`
`max_cut`	`int`	Number of cuts.	`1`
`max_bond_cut`	`int`	Maximum number of bonds to cut. Defaults to 20.	`20`
`h_split`	`bool`	Whether to split at hydrogen position too. This is equivalent to enabling the addition of new fragments.	`False`

Returns:

Type	Description
`Optional[Set[rdkit.Chem.rdchem.Mol]]`	List of fragments

Source code in datamol/fragment/_fragment.py

def mmpa_frag(
    mol,
    pattern: str = None,
    max_cut: int = 1,
    max_bond_cut: int = 20,
    h_split: bool = False,
) -> Optional[Set[Chem.Mol]]:
    """Fragment molecule on specific bonds suitable for a MMPA analysis.

    Args:
        mol: Molecule to fragment.
        pattern: Bond pattern to split on. Will use default rdkit pattern
            '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided. An empty pattern
            skips this fragmentation step entirely.
        max_cut: Number of cuts, forwarded as `maxCuts`.
        max_bond_cut: Maximum number of bonds to cut, forwarded as `maxCutBonds`.
            Defaults to 20.
        h_split: Whether to also split at hydrogen positions. The extra
            fragments are computed on a copy of the molecule with explicit
            hydrogens and are added to the result.

    Returns:
        Set of the entries returned by `rdMMPA.FragmentMol` (requested with
        `resultsAsMols=False`).
    """

    shared_options = dict(
        maxCuts=max_cut,
        resultsAsMols=False,
        maxCutBonds=max_bond_cut,
    )

    if pattern is None:
        # No pattern given: let rdkit use its own default.
        frags = rdMMPA.FragmentMol(mol, **shared_options)
    elif pattern:
        frags = rdMMPA.FragmentMol(mol, pattern=pattern, **shared_options)
    else:
        # Empty (falsy, non-None) pattern: nothing is fragmented here.
        frags = []

    if h_split:
        with_hydrogens = Chem.AddHs(mol)
        # Single cuts on bonds between a hydrogen and a non-hydrogen atom.
        frags += rdMMPA.FragmentMol(
            with_hydrogens,
            pattern="[#1]!@!=!#[!#1]",
            maxCuts=1,
            resultsAsMols=False,
            maxCutBonds=max_bond_cut,
        )

    return set(frags)

Molecule assembly¶

`assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=inf, maxdepth=3, as_smiles=True, RXNS=None)` ¶

Perform an assembly from fragments given all potential RXNS transformations.

Source code in datamol/fragment/_assemble.py

def assemble_fragment_iter(
    fragmentlist,
    seens=None,
    scrambleReagents=False,
    max_n_mols=float("inf"),
    maxdepth=3,
    as_smiles=True,
    RXNS=None,
):
    """Perform an assembly from fragments given all potential RXNS transformations.

    Args:
        fragmentlist: fragments that are combined with every seed molecule
            through every reaction.
        seens: seed molecules to grow. When empty or `None`, the fragments of
            `fragmentlist` are used as seeds.
        scrambleReagents: Whether to shuffle the seeds before assembling.
        max_n_mols: Nothing is generated when this is `<= 0`; it also bounds
            the recursive growth of the assembled molecules.
        maxdepth: Maximum number of recursive growth steps.
        as_smiles: Whether to yield SMILES strings instead of molecules.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        Each newly assembled molecule once (as a SMILES string when
        `as_smiles` is set).
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    if max_n_mols <= 0:
        return

    # SMILES already yielded by this call. The loop variable below must NOT
    # reuse this name, otherwise the set is replaced by a molecule.
    seen = set()

    if not seens:
        seens = list(fragmentlist)
    if scrambleReagents:
        seens = list(seens)
        # The `random=` argument of shuffle was removed in Python 3.11; the
        # default already draws from the module-level generator.
        random.shuffle(seens)

    for seed in seens:
        nextSteps = []
        for rxn in RXNS:
            for fg in fragmentlist:
                for m, pSmi in _run_at_all_rct(rxn, fg, seed):
                    if pSmi not in seen:
                        seen.add(pSmi)
                        yield pSmi if as_smiles else m
                    if _can_continue_with(m, rxn):
                        nextSteps.append(m)

        if nextSteps and len(seen) <= max_n_mols and maxdepth > 0:
            # Recurse on molecules (as_smiles=False) because the results are
            # converted with MolToSmiles below, and keep using the reactions
            # selected by the caller.
            for p in assemble_fragment_iter(
                fragmentlist,
                seens=nextSteps,
                scrambleReagents=scrambleReagents,
                max_n_mols=(max_n_mols - len(seen)),
                maxdepth=maxdepth - 1,
                as_smiles=False,
                RXNS=RXNS,
            ):
                pSmi = Chem.MolToSmiles(p, True)
                if pSmi not in seen:
                    seen.add(pSmi)
                    yield pSmi if as_smiles else p
                    if len(seen) >= max_n_mols:
                        return

`assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=inf, RXNS=None)` ¶

Assemble a list of fragments into a set of possible molecules under the rules defined by the BRICS algorithm.

.. note:: We are of course assuming: 1. that the order in the fragment list matters :D ! 2. that none of the fragments has explicitly defined hydrogen atoms. 3. that only a list of unique molecules is internally maintained.

Parameters:

Name	Type	Description	Default
`fragmentlist`		list of original fragments to grow	required
`seen`		original molecule used as base. If `None`, the first element of the fragment list will be popped out	`None`
`allow_incomplete`	`bool`	Whether to accept assembled molecules with missing fragments	`False`

Source code in datamol/fragment/_assemble.py

def assemble_fragment_order(
    fragmentlist,
    seen=None,
    allow_incomplete: bool = False,
    max_n_mols: float = float("inf"),
    RXNS=None,
):
    """Assemble a list of fragments, in order, into a set of possible molecules.

    The assembly follows the reactions in `RXNS` (the BRICS retro reactions
    by default).

    ..note ::
        We are of course assuming:
        1. that the order in the fragment list matters.
        2. that none of the fragments has explicitly defined hydrogen atoms.
        3. that only unique molecules (by SMILES) are kept at each step.

    Args:
        fragmentlist: list of original fragments to grow
        seen: original molecule used as base. If None, the first element of
            the fragment list is popped out and used instead.
        allow_incomplete: Whether to also yield the intermediate molecules
            assembled before every fragment has been attached.
        max_n_mols: Maximum number of molecules to yield.
        RXNS: reactions to apply. Defaults to `ALL_BRICS_RETRO`.

    Yields:
        The assembled molecules.
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    fragmentlist = list(fragmentlist)
    n_yielded = 0

    if seen is None:
        seen = fragmentlist.pop(0)
    # Only one base molecule to grow, tracked through its SMILES.
    seen = [Chem.MolToSmiles(seen)]

    while n_yielded < max_n_mols and fragmentlist:
        # Attach the next fragment in every possible way to the current level.
        piece = fragmentlist.pop(0)
        bases = [dm.to_mol(smi) for smi in seen]
        seen = set()
        for base in bases:
            try:
                # there is no point in even trying something on molecules that cannot be kekulized
                for rxn in RXNS:
                    for m, mSmi in _run_at_all_rct(rxn, piece, base):
                        if allow_incomplete and mSmi not in seen:
                            yield m
                            n_yielded += 1
                        seen.add(mSmi)
            except Exception as e:
                print(e)

    # Emit the molecules of the last level while the budget allows it.
    for smi in seen:
        if n_yielded >= max_n_mols:
            break
        yield dm.to_mol(smi)
        n_yielded += 1

`break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)` ¶

Break a molecule into a list of fragments.

Source code in datamol/fragment/_assemble.py

def break_mol(
    mol: Chem.rdchem.Mol,
    minFragmentSize: int = 1,
    silent: bool = True,
    onlyUseReactions: list = [],
    randomize: bool = False,
    mode: str = "brics",
    returnTree: bool = False,
):
    """Break a molecule into a list of fragments.

    The molecule is repeatedly split with the selected reactions; every
    product becomes a node of a directed graph whose edges go from a molecule
    to the products obtained from it.

    Args:
        mol: molecule to break.
        minFragmentSize: size threshold used to reject product sets, both on
            the explicit atom count and on the number of non-dummy atoms.
        silent: When False, print the number of products found for each
            molecule that reacts.
        onlyUseReactions: When non-empty, only the reactions whose type is in
            this list are applied. The default list is never mutated here.
        randomize: Whether to apply the reactions in a random order.
        mode: "brics" uses `ALL_BRICS`, "rxn" uses `ALL_RXNS` (both
            case-insensitive), anything else uses both sets.
        returnTree: Whether to also return the fragmentation graph.

    Returns:
        `(leaves_smiles, allNodes)`, followed by the graph when `returnTree`
        is set. `leaves_smiles` lists the SMILES of the nodes that have a
        parent and no children; `allNodes` is the set of SMILES collected
        during the fragmentation, including the input molecule.
    """

    # Select the reactions and their matching type labels.
    if mode.lower() == "brics":
        all_reactions = ALL_BRICS
        all_reactions_type = ALL_BRICS_TYPE
    elif mode.lower() == "rxn":
        all_reactions = ALL_RXNS
        all_reactions_type = ALL_RXNS_TYPE
    else:
        all_reactions = ALL_BRICS + ALL_RXNS
        all_reactions_type = ALL_BRICS_TYPE + ALL_RXNS_TYPE
    if randomize:
        # Same permutation for both lists so reactions and types stay aligned.
        p = np.random.permutation(len(all_reactions))
        all_reactions = [all_reactions[ind] for ind in p]
        all_reactions_type = [all_reactions_type[ind] for ind in p]

    nx = dm.graph._get_networkx()
    mSmi = Chem.MolToSmiles(mol, isomericSmiles=True)
    G = nx.DiGraph()
    node_num = 0
    # Node 0 is the input molecule.
    G.add_node(node_num, smiles=mSmi, mol=mol)
    allNodes = set()
    # Molecules still to be processed: SMILES -> node id in G.
    activePool = {mSmi: node_num}
    allNodes.add(mSmi)
    while activePool:
        nSmi = list(activePool.keys())[0]
        parent = activePool.pop(nSmi)
        node = G.nodes[parent]
        mol = node["mol"]
        # NOTE: `rxnIdx` holds the reaction type label, not a position.
        for rxnIdx, reaction in zip(all_reactions_type, all_reactions):
            if onlyUseReactions and rxnIdx not in onlyUseReactions:
                continue
            ps = reaction.RunReactants((mol,))
            if ps:

                # For each product set: are all products above the size threshold?
                all_pass = [
                    all([prod.GetNumAtoms(onlyExplicit=True) > minFragmentSize for prod in p_])
                    for p_ in ps
                ]
                # Index of the first product set that passes the size check.
                nz_i = 0
                while nz_i < len(all_pass) and not all_pass[nz_i]:
                    nz_i += 1
                if not silent:
                    print(nSmi, "->", len(ps), "products and selected ", nz_i)
                    # display(MolsToGridImage(list(itertools.chain(*list(ps))), molsPerRow=2))
                # When no product set passes, nz_i == len(all_pass) and the
                # modulo wraps the selection back to the first product set.
                prodSeq = ps[nz_i % len(all_pass)]
                seqOk = True
                # we want to disqualify small fragments, so sort the product sequence by size
                prodSeq = [(prod.GetNumAtoms(onlyExplicit=True), prod) for prod in prodSeq]
                prodSeq.sort(key=lambda x: x[0])
                for _, prod in prodSeq:
                    prod.sanitized = True
                    try:
                        Chem.SanitizeMol(prod)
                    except:
                        if dm.sanitize_mol(prod) is None:
                            seqOk = False
                            break
                        # NOTE(review): this `continue` skips the `prod.pSmi`
                        # assignment below while `prod.sanitized` stays True,
                        # so the second loop would read a missing `pSmi` for
                        # this product - looks unintended, confirm.
                        continue
                    pSmi = Chem.MolToSmiles(prod, isomericSmiles=True)
                    # The product SMILES must be parsable again.
                    seqOk = seqOk and (dm.to_mol(pSmi) is not None)

                    notDummies = sum([atm.GetSymbol() != "*" for atm in prod.GetAtoms()])
                    # nDummies = pSmi.count('*')
                    # if minFragmentSize > 0 and (nats - nDummies < minFragmentSize):
                    if minFragmentSize > 0 and notDummies < minFragmentSize:
                        seqOk = False
                        break
                    prod.pSmi = pSmi

                if seqOk:
                    # Register every product as a child of the current node.
                    for _, prod in prodSeq:
                        if not prod.sanitized:
                            continue
                        pSmi = prod.pSmi
                        node_num += 1
                        usmi = Chem.MolToSmiles(dm.fix_mol(prod), isomericSmiles=True)
                        G.add_node(node_num, smiles=usmi, mol=prod)
                        G.add_edge(parent, node_num)
                        # Only products not seen before are fragmented further.
                        if usmi not in allNodes:
                            activePool[pSmi] = node_num
                            allNodes.add(usmi)
                    G.nodes[parent]["rxn"] = rxnIdx
                    break  # at least one reaction matches

    # Leaves: nodes that have a parent and no children.
    leaves_smiles = [
        G.nodes[n]["smiles"] for n in G.nodes() if G.in_degree(n) != 0 and G.out_degree(n) == 0
    ]
    if returnTree:
        return leaves_smiles, allNodes, G
    return leaves_smiles, allNodes

`build(ll_mols, max_n_mols=inf, mode='brics', frag_rxn=None, ADD_RNXS=[])` ¶

Build a super molecule from a list of fragments

Source code in datamol/fragment/_assemble.py

def build(ll_mols, max_n_mols=float("inf"), mode="brics", frag_rxn=None, ADD_RNXS=[]):
    """Build a super molecule from a list of fragments

    Args:
        ll_mols: list of lists of fragments. Every combination taking one
            fragment from each list is tried; only the first two fragments of
            a combination are reacted together.
        max_n_mols: Maximum number of unique molecules to yield.
        mode: "brics" uses the BRICS retro reactions, "rxn" uses the RXNS
            retro reactions, any other non-None value uses both, and `None`
            uses only the reactions given in `ADD_RNXS`.
        frag_rxn: When it matches one of the reaction types (surrounding
            double quotes are ignored), only that reaction is applied.
        ADD_RNXS: Additional reactions, either a list (types are then named
            "RXN-<index>") or a dict mapping a type to a reaction. The
            default list is never mutated by this function.

    Yields:
        The assembled molecules, each unique SMILES being yielded once.
    """

    seen = set()
    stop = False

    # Work on copies: the lists are extended in place with `+=` below, which
    # would otherwise permanently add the extra reactions to the module-level
    # reaction lists.
    if mode == "brics":
        CUR_RXNS = list(ALL_BRICS_RETRO)
        CUR_RXNS_TYPE = list(ALL_BRICS_TYPE)
    elif mode == "rxn":
        CUR_RXNS = list(ALL_RXNS_RETRO)
        CUR_RXNS_TYPE = list(ALL_RXNS_TYPE)
    elif mode is not None:
        CUR_RXNS = ALL_BRICS_RETRO + ALL_RXNS_RETRO
        CUR_RXNS_TYPE = ALL_BRICS_TYPE + ALL_RXNS_TYPE
    else:
        CUR_RXNS = []
        CUR_RXNS_TYPE = []

    if ADD_RNXS is not None:
        if isinstance(ADD_RNXS, dict):
            extra_types = list(ADD_RNXS.keys())
            extra_rxns = list(ADD_RNXS.values())
        else:
            extra_rxns = list(ADD_RNXS)
            extra_types = [f"RXN-{i}" for i in range(len(extra_rxns))]
        CUR_RXNS += extra_rxns
        CUR_RXNS_TYPE += extra_types

    # Restrict to a single reaction when `frag_rxn` names one of the types.
    for i, rxn_type in enumerate(CUR_RXNS_TYPE):
        if (frag_rxn is not None) and (frag_rxn.strip('"') == rxn_type):
            CUR_RXNS = [CUR_RXNS[i]]
            break

    for fraglist in itertools.product(*ll_mols):
        if stop:
            break

        fraglist = list(fraglist)
        for rxn in CUR_RXNS:  # should be size==1 if frag_rxn is provided
            ps = []
            try:
                ps = _run_at_all_rct(rxn, fraglist[0], fraglist[1])
            except Exception:
                # Best effort: a combination that cannot be reacted is skipped.
                pass
            for m, mSmi in ps:
                if len(seen) >= max_n_mols:
                    stop = True
                    break
                if mSmi not in seen:
                    seen.add(mSmi)
                    yield m

datamol.fragment¶

Molecule fragmentation¶

anybreak(mol, remove_parent=False, sanitize=True, fix=True) ¶

brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True) ¶

frag(mol, remove_parent=False, sanitize=True, fix=True) ¶

recap(mol, remove_parent=False, sanitize=True, fix=True) ¶

MMPA fragmentation¶

mmpa_cut(mol, rdkit_pattern=False) ¶

mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False) ¶

Molecule assembly¶

assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=inf, maxdepth=3, as_smiles=True, RXNS=None) ¶

assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=inf, RXNS=None) ¶

break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False) ¶

build(ll_mols, max_n_mols=inf, mode='brics', frag_rxn=None, ADD_RNXS=[]) ¶

`datamol.fragment`¶

`anybreak(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)` ¶

`frag(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`recap(mol, remove_parent=False, sanitize=True, fix=True)` ¶

`mmpa_cut(mol, rdkit_pattern=False)` ¶

`mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)` ¶

`assemble_fragment_iter(fragmentlist, seens=None, scrambleReagents=False, max_n_mols=inf, maxdepth=3, as_smiles=True, RXNS=None)` ¶

`assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=inf, RXNS=None)` ¶

`break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)` ¶

`build(ll_mols, max_n_mols=inf, mode='brics', frag_rxn=None, ADD_RNXS=[])` ¶