Skip to content

datamol.fragment

anybreak(mol, remove_parent=False, sanitize=True, fix=True)

Fragment molecule by applying brics first, then fall back to frag.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Wether to sanitize the fragments.

True
fix bool

Wether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
def anybreak(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment molecule by applying brics first, then fall back to frag.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Wether to sanitize the fragments.
        fix: Wether to fix the fragments.
    """
    frags = []
    try:
        frags = brics(mol, fix=fix, remove_parent=remove_parent, sanitize=sanitize)
    except Exception:
        pass

    if len(frags) == 0:
        frags = frag(mol, remove_parent=remove_parent, sanitize=sanitize, fix=fix)

    return frags

brics(mol, singlepass=True, remove_parent=False, sanitize=True, fix=True)

Run BRICS on the molecules and potentially fix dummy atoms.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
singlepass bool

Single pass for BRICSDecompose.

True
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Wether to sanitize the fragments.

True
fix bool

Wether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
def brics(
    mol: Chem.rdchem.Mol,
    singlepass: bool = True,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Run BRICS on the molecules and potentially fix dummy atoms.

    Args:
        mol: a molecule.
        singlepass: Single pass for `BRICSDecompose`.
        remove_parent: Remove parent from the fragments.
        sanitize: Wether to sanitize the fragments.
        fix: Wether to fix the fragments.
    """
    frags = BRICS.BRICSDecompose(mol, returnMols=True, singlePass=singlepass)
    frags = list(frags)

    if fix:
        frags = [dm.fix_mol(x) for x in frags]
    if sanitize:
        frags = [dm.sanitize_mol(x) for x in frags]
    if remove_parent:
        frags.pop(0)

    frags = [x for x in frags if x is not None]

    return frags

frag(mol, remove_parent=False, sanitize=True, fix=True)

Generate all possible fragmentation of a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Wether to sanitize the fragments.

True
fix bool

Wether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
def frag(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Generate all possible fragmentation of a molecule.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Wether to sanitize the fragments.
        fix: Wether to fix the fragments.
    """
    frags = FraggleSim.generate_fraggle_fragmentation(mol)

    smiles = set([])
    for seq in frags:
        smiles |= {s.strip() for s in seq.split(".")}

    smiles = list(sorted(smiles, reverse=True))
    frags = [dm.to_mol(s) for s in smiles]

    if fix:
        frags = [dm.fix_mol(x) for x in frags]
    if sanitize:
        frags = [dm.sanitize_mol(x) for x in frags]

    frags = [x for x in frags if x is not None]

    if remove_parent:
        return frags
    return [mol] + frags

mmpa_cut(mol, rdkit_pattern=False)

Cut molecules to perform mmpa analysis later

Parameters:

Name Type Description Default
mol Mol

Molecule to fragment.

required
rdkit_pattern bool

Whether to perform the fragmentation using the default rdkit pattern: [#6+0;!$(=, #[!#6])]!@!=!#[]"

False

Returns:

Type Description
Optional[Set[Any]]

List of 'smiles,core,chains'

Source code in datamol/fragment/_fragment.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
def mmpa_cut(mol: dm.Mol, rdkit_pattern: bool = False) -> Optional[Set[Any]]:
    """Cut molecules to perform mmpa analysis later

    Args:
        mol: Molecule to fragment.
        rdkit_pattern: Whether to perform the fragmentation
            using the default rdkit pattern: [#6+0;!$(*=, #[!#6])]!@!=!#[*]"

    Returns:
        List of 'smiles,core,chains'
    """

    if mol is None:
        return mol

    outlines = set()

    smiles = dm.to_smiles(mol)

    if rdkit_pattern:
        frags = mmpa_frag(mol, max_cut=3, max_bond_cut=30)
    else:
        # heavy atoms
        frags = mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=4, max_bond_cut=30)
        frags.update(mmpa_frag(mol, pattern="[!#1]!@!=!#[!#1]", max_cut=3, max_bond_cut=30))

    frags = set(frags)
    for core, chains in frags:
        output = f"{smiles},{core},{chains}\n"
        outlines.add(output)

    # hydrogen splitting
    mol = dm.add_hs(mol)
    smiles = dm.to_smiles(mol)

    n = mol.GetNumHeavyAtoms()
    if n < 60:
        frags = mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=100, h_split=True)
        for core, chains in frags:
            output = f"{smiles},{core},{chains}\n"
            outlines.add(output)

    return outlines

mmpa_frag(mol, pattern=None, max_cut=1, max_bond_cut=20, h_split=False)

Fragment molecule on specific bonds suitable for a MMPA analysis.

Parameters:

Name Type Description Default
mol Mol

Molecule to fragment.

required
pattern Optional[str]

Bond pattern to split on. Will use default rdkit pattern '[#6+0;!$(=,#[!#6])]!@!=!#[]' if not provided.

None
max_cut int

Number of cuts.

1
max_bond_cut int

Maximum number of bond to cut. Default to 20.

20
h_split bool

Whether to split at hydrogen position too. This is equivalent to enabling the addition of new fragments.

False

Returns:

Type Description
Optional[Set[Mol]]

List of fragments.

Source code in datamol/fragment/_fragment.py
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
def mmpa_frag(
    mol: dm.Mol,
    pattern: Optional[str] = None,
    max_cut: int = 1,
    max_bond_cut: int = 20,
    h_split: bool = False,
) -> Optional[Set[dm.Mol]]:
    """Fragment molecule on specific bonds suitable for a MMPA analysis.

    Args:
        mol: Molecule to fragment.
        pattern: Bond pattern to split on. Will use default rdkit pattern
            '[#6+0;!$(*=,#[!#6])]!@!=!#[*]' if not provided.
        max_cut: Number of cuts.
        max_bond_cut: Maximum number of bond to cut. Default to 20.
        h_split:  Whether to split at hydrogen position too.
            This is equivalent to enabling the addition of new fragments.

    Returns:
        List of fragments.
    """

    frags = []
    if pattern is None:
        frags = rdMMPA.FragmentMol(
            mol,
            maxCuts=max_cut,
            resultsAsMols=False,
            maxCutBonds=max_bond_cut,
        )
    elif pattern:
        frags = rdMMPA.FragmentMol(
            mol,
            pattern=pattern,
            maxCuts=max_cut,
            resultsAsMols=False,
            maxCutBonds=max_bond_cut,
        )

    if h_split:
        mol = dm.add_hs(mol)
        frags += rdMMPA.FragmentMol(
            mol,
            pattern="[#1]!@!=!#[!#1]",
            maxCuts=1,
            resultsAsMols=False,
            maxCutBonds=max_bond_cut,
        )
    return set(frags)

recap(mol, remove_parent=False, sanitize=True, fix=True)

Fragment the molecule using the recap algorithm.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
remove_parent bool

Remove parent from the fragments.

False
sanitize bool

Wether to sanitize the fragments.

True
fix bool

Wether to fix the fragments.

True
Source code in datamol/fragment/_fragment.py
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
def recap(
    mol: Chem.rdchem.Mol,
    remove_parent: bool = False,
    sanitize: bool = True,
    fix: bool = True,
):
    """Fragment the molecule using the recap algorithm.

    Args:
        mol: a molecule.
        remove_parent: Remove parent from the fragments.
        sanitize: Wether to sanitize the fragments.
        fix: Wether to fix the fragments.
    """
    res = Recap.RecapDecompose(mol)
    frags = [dm.to_mol(x) for x in res.GetAllChildren().keys()]

    if fix:
        frags = [dm.fix_mol(x) for x in frags]
    if sanitize:
        frags = [dm.sanitize_mol(x) for x in frags]

    frags = [x for x in frags if x is not None]

    if remove_parent:
        return frags
    return [mol] + frags

maclandrol: 22/07/19 This is an attempt to reverse engineer the BRICS (Breaking of Retrosynthetically Interesting Chemical Substructures) approach for molecule fragmentation and use it as an heuristic for assembling molecules. The original paper on BRICS can be found here: http://dacemirror.sci-hub.tw/journal-article/93060992e8d889318b77b562c0e5b75f/degen2008.pdf. This makes senses from a methodological point of view, but I can't either guarantee that its is working as expected of if it's the best way to tackle this problem. The goal here is to reconstruct a set of original molecules, which if they were to be fragmented using BRICS, should yield the same fragment set in input. Thus, in theory fragments obtained using BRICS CAN be assembled into the original molecules with this method. This differs from rdkit BRICSBuild implementation that requires the presence of dummy indicator atoms added by a prior BRICS fragmentation.

assemble_fragment_order(fragmentlist, seen=None, allow_incomplete=False, max_n_mols=float('inf'), RXNS=None)

Assemble a list of fragment into a set of possible molecules under rules defined by the brics algorithm

We are of course assuming:

1. that the order in the fragmentlist matter :D !
2. that none of the fragment has explicitly defined hydrogen atoms.
3. only a list of unique molecule is internally maintained

Parameters:

Name Type Description Default
fragmentlist list

list of original fragments to grow

required
seen Optional[Mol]

original molecules used as base. If none, the first element of fragment list will be poped out

None
allow_incomplete bool

Whether to accept assembled molecules with missing fragment

False
Source code in datamol/fragment/_assemble.py
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
def assemble_fragment_order(
    fragmentlist: list,
    seen: Optional[Mol] = None,
    allow_incomplete: bool = False,
    max_n_mols: float = float("inf"),
    RXNS=None,
):
    """Assemble a list of fragment into a set of possible molecules under rules defined by the brics algorithm

    We are of course assuming:

        1. that the order in the fragmentlist matter :D !
        2. that none of the fragment has explicitly defined hydrogen atoms.
        3. only a list of unique molecule is internally maintained

    Args:
        fragmentlist: list of original fragments to grow
        seen: original molecules used as base. If none, the first element of fragment list will be poped out
        allow_incomplete: Whether to accept assembled molecules with missing fragment
    """

    if RXNS is None:
        RXNS = ALL_BRICS_RETRO

    fragmentlist = list(fragmentlist)
    yield_counter = 0
    if seen is None:
        seen = fragmentlist.pop(0)
    seen = [dm.to_smiles(seen)]  # only one molecule to assemble
    while yield_counter < max_n_mols and len(fragmentlist) > 0:
        # find all the way to add this fragment to seen
        frag = fragmentlist.pop(0)
        level_set = [dm.to_mol(x) for x in seen]
        seen = set()
        for sm in level_set:
            try:
                # there is no point in even trying something on molecules that cannot be kekulized
                for rxn in RXNS:
                    for m, mSmi in _run_at_all_rct(rxn, frag, sm):
                        if allow_incomplete and mSmi not in seen:
                            yield m
                            yield_counter += 1
                        seen.add(mSmi)
            except Exception as e:
                logger.error(e)
                pass

    for m in seen:
        if yield_counter < max_n_mols:
            yield dm.to_mol(m)
            yield_counter += 1

break_mol(mol, minFragmentSize=1, silent=True, onlyUseReactions=[], randomize=False, mode='brics', returnTree=False)

Breaks a molecules into a list of fragment.

Source code in datamol/fragment/_assemble.py
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
def break_mol(
    mol: Chem.rdchem.Mol,
    minFragmentSize: int = 1,
    silent: bool = True,
    onlyUseReactions: list = [],
    randomize: bool = False,
    mode: str = "brics",
    returnTree: bool = False,
):
    """Breaks a molecules into a list of fragment."""

    if mode.lower() == "brics":
        all_reactions = ALL_BRICS
        all_reactions_type = ALL_BRICS_TYPE
    elif mode.lower() == "rxn":
        all_reactions = ALL_RXNS
        all_reactions_type = ALL_RXNS_TYPE
    else:
        all_reactions = ALL_BRICS + ALL_RXNS
        all_reactions_type = ALL_BRICS_TYPE + ALL_RXNS_TYPE
    if randomize:
        p = np.random.permutation(len(all_reactions))
        all_reactions = [all_reactions[ind] for ind in p]
        all_reactions_type = [all_reactions_type[ind] for ind in p]

    nx = dm.graph._get_networkx()
    mSmi = dm.to_smiles(mol, isomeric=True)
    G = nx.DiGraph()
    node_num = 0
    G.add_node(node_num, smiles=mSmi, mol=mol)
    allNodes = set()
    activePool = {mSmi: node_num}
    allNodes.add(mSmi)
    while activePool:
        nSmi = list(activePool.keys())[0]
        parent = activePool.pop(nSmi)
        node = G.nodes[parent]
        mol = node["mol"]
        for rxnIdx, reaction in zip(all_reactions_type, all_reactions):
            if onlyUseReactions and rxnIdx not in onlyUseReactions:
                continue
            ps = reaction.RunReactants((mol,))
            if ps:
                all_pass = [
                    all([prod.GetNumAtoms(onlyExplicit=True) > minFragmentSize for prod in p_])
                    for p_ in ps
                ]
                nz_i = 0
                while nz_i < len(all_pass) and not all_pass[nz_i]:
                    nz_i += 1
                if not silent:
                    print(nSmi, "->", len(ps), "products and selected ", nz_i)
                    # display(MolsToGridImage(list(itertools.chain(*list(ps))), molsPerRow=2))
                prodSeq = ps[nz_i % len(all_pass)]
                seqOk = True
                # we want to disqualify small fragments, so sort the product sequence by size
                prodSeq = [(prod.GetNumAtoms(onlyExplicit=True), prod) for prod in prodSeq]
                prodSeq.sort(key=lambda x: x[0])
                for _, prod in prodSeq:
                    prod.sanitized = True
                    try:
                        Chem.SanitizeMol(prod)
                    except Exception:
                        if dm.sanitize_mol(prod) is None:
                            seqOk = False
                            break
                        continue
                    pSmi = dm.to_smiles(prod, isomeric=True)
                    seqOk = seqOk and (dm.to_mol(pSmi) is not None)

                    notDummies = sum([atm.GetSymbol() != "*" for atm in prod.GetAtoms()])
                    # nDummies = pSmi.count('*')
                    # if minFragmentSize > 0 and (nats - nDummies < minFragmentSize):
                    if minFragmentSize > 0 and notDummies < minFragmentSize:
                        seqOk = False
                        break
                    prod.pSmi = pSmi

                if seqOk:
                    for _, prod in prodSeq:
                        if not prod.sanitized:
                            continue
                        pSmi = prod.pSmi
                        node_num += 1
                        usmi = dm.to_smiles(dm.fix_mol(prod), isomeric=True)
                        G.add_node(node_num, smiles=usmi, mol=prod)
                        G.add_edge(parent, node_num)
                        if usmi not in allNodes:
                            activePool[pSmi] = node_num
                            allNodes.add(usmi)
                    G.nodes[parent]["rxn"] = rxnIdx
                    break  # at least one reaction matches

    leaves_smiles = [
        G.nodes[n]["smiles"] for n in G.nodes() if G.in_degree(n) != 0 and G.out_degree(n) == 0
    ]
    if returnTree:
        return leaves_smiles, allNodes, G
    return leaves_smiles, allNodes

build(ll_mols, max_n_mols=float('inf'), mode='brics', frag_rxn=None, ADD_RNXS=[])

Build a super molecule from a list of fragments

Source code in datamol/fragment/_assemble.py
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
def build(ll_mols, max_n_mols=float("inf"), mode="brics", frag_rxn=None, ADD_RNXS=[]):
    """Build a super molecule from a list of fragments"""

    seen = set()
    stop = False
    CUR_RXNS = []
    CUR_RXNS_TYPE = []

    if mode == "brics":
        CUR_RXNS = ALL_BRICS_RETRO
        CUR_RXNS_TYPE = ALL_BRICS_TYPE
    elif mode == "rxn":
        CUR_RXNS = ALL_RXNS_RETRO
        CUR_RXNS_TYPE = ALL_RXNS_TYPE
    elif mode is not None:
        CUR_RXNS = ALL_BRICS_RETRO + ALL_RXNS_RETRO
        CUR_RXNS_TYPE = ALL_BRICS_TYPE + ALL_RXNS_TYPE

    if ADD_RNXS is not None:
        ADD_RNXS_TYPE = [f"RXN-{i}" for i in range(len(ADD_RNXS))]
        if isinstance(ADD_RNXS, dict):
            ADD_RNXS_TYPE = ADD_RNXS.keys()
            ADD_RNXS = ADD_RNXS.values()
        CUR_RXNS += list(ADD_RNXS)
        CUR_RXNS_TYPE += list(ADD_RNXS_TYPE)

    for i, rxn_type in enumerate(CUR_RXNS_TYPE):
        if (frag_rxn is not None) and (frag_rxn.strip('"') == rxn_type):
            CUR_RXNS = [CUR_RXNS[i]]
            break

    for fraglist in itertools.product(*ll_mols):
        if stop:
            break

        fraglist = list(fraglist)
        for rxn in CUR_RXNS:  # should be size==1 if frag_rxn is provided
            ps = []
            try:
                ps = _run_at_all_rct(rxn, fraglist[0], fraglist[1])
            except Exception:
                pass
            for m, mSmi in ps:
                if len(seen) >= max_n_mols:
                    stop = True
                    break
                if mSmi not in seen:
                    seen.add(mSmi)
                    yield m