Skip to content

datamol.isomers

canonical_tautomer(mol)

Get the canonical tautomer of the current molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
Source code in datamol/isomers/_enumerate.py
257
258
259
260
261
262
263
264
def canonical_tautomer(mol: dm.Mol):
    """Get the canonical tautomer of the current molecule.

    Args:
        mol: A molecule.
    """
    enumerator = rdMolStandardize.TautomerEnumerator()
    return enumerator.Canonicalize(mol)

count_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True, timeout_seconds=None, clean_it=True, precise=False)

Get the number of possible stereoisomers for a molecule.

Warning: By default, this function compute an estimtion number based on the stereo bonds which gives an upper bound of possible stereoisomers. By setting precise=True, the number is computed by enumrerating the stereoisomers. However, it can be computationnaly intensive.

Parameters:

Name Type Description Default
mol Mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
undefined_only bool

If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry.

False
rationalise bool

If we should try to build and rationalise the molecule to ensure it can exist.

True
timeout_seconds int

The maximum amount of time to spend on enumeration. None will disable the timeout. Note that the timeout might be inaccurate as a running single variant computation is not stopped when the duration is reached.

None
clean_it bool

A flag for assigning stereochemistry. If True, it will remove previous stereochemistry markings on the bonds.

True
precise bool

Whether compute counts by enumerate the stereoisomers using enumerate_stereoisomers.

False
Source code in datamol/isomers/_enumerate.py
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
def count_stereoisomers(
    mol: dm.Mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
    timeout_seconds: int = None,
    clean_it: bool = True,
    precise: bool = False,
):
    """Get the number of possible stereoisomers for a molecule.

    Warning: By default, this function compute an estimtion number based on the stereo bonds which
    gives an upper bound of possible stereoisomers. By setting `precise=True`, the number is computed
    by enumrerating the stereoisomers. However, it can be computationnaly intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.
        timeout_seconds: The maximum amount of time to spend on enumeration. None
            will disable the timeout. Note that the timeout might be inaccurate as a running single variant
            computation is not stopped when the duration is reached.
        clean_it: A flag for assigning stereochemistry. If True, it will remove previous stereochemistry
            markings on the bonds.
        precise: Whether compute counts by enumerate the stereoisomers using `enumerate_stereoisomers`.

    """
    if precise:
        num_variants = len(
            enumerate_stereoisomers(
                mol=mol,
                n_variants=n_variants,
                undefined_only=undefined_only,
                rationalise=rationalise,
                timeout_seconds=timeout_seconds,
                clean_it=clean_it,
            )
        )
    else:
        # safety first
        mol = dm.copy_mol(mol)

        # in case any bonds/centers are missing stereo chem flag it here
        Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=clean_it)  # type: ignore
        # lu: do not clean (overwrite bond stereo information) when set `undefined_only=Ture`
        Chem.FindPotentialStereoBonds(mol, cleanIt=not undefined_only and clean_it)

        # set up the options
        stereo_opts = StereoEnumerationOptions(
            tryEmbedding=rationalise,
            onlyUnassigned=undefined_only,
            unique=True,
        )

        num_variants = GetStereoisomerCount(mol, options=stereo_opts)

    return num_variants

enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True, timeout_seconds=None, clean_it=True)

Enumerate the stereocenters and bonds of the current molecule.

Original source: the openff-toolkit lib.

Warning: this function can be computationnaly intensive.

Parameters:

Name Type Description Default
mol Mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
undefined_only bool

If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry.

False
rationalise bool

If we should try to build and rationalise the molecule to ensure it can exist.

True
timeout_seconds int

The maximum amount of time to spend on enumeration. None will disable the timeout. Note that the timeout might be inaccurate as a running single variant computation is not stopped when the duration is reached.

None
clean_it bool

A flag for assigning stereochemistry. If True, it will remove previous stereochemistry markings on the bonds.

True
Source code in datamol/isomers/_enumerate.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def enumerate_stereoisomers(
    mol: dm.Mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
    timeout_seconds: int = None,
    clean_it: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationnaly intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.
        timeout_seconds: The maximum amount of time to spend on enumeration. None
            will disable the timeout. Note that the timeout might be inaccurate as a running single variant
            computation is not stopped when the duration is reached.
        clean_it: A flag for assigning stereochemistry. If True, it will remove previous stereochemistry
            markings on the bonds.
    """

    # safety first
    mol = dm.copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=clean_it)  # type: ignore
    # lu: do not clean (overwrite bond stereo information) when set `undefined_only=Ture`
    Chem.FindPotentialStereoBonds(mol, cleanIt=not undefined_only and clean_it)

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
        unique=True,
    )

    isomers_iterator = EnumerateStereoisomers(mol, options=stereo_opts)

    start = time.time()
    duration = 0

    isomers = []
    while timeout_seconds is None or duration < timeout_seconds:
        try:
            isomer = next(isomers_iterator)
            isomers.append(isomer)
        except StopIteration:
            break

        duration = time.time() - start

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=clean_it)  # type: ignore
        variants.append(isomer)

    return variants

enumerate_structisomers(mol, n_variants=20, allow_cycle=False, allow_double_bond=False, allow_triple_bond=False, depth=None, timeout_seconds=None)

Enumerate the structural isomers of the input molecule

Warning: this function can be computationnaly intensive.

Parameters:

Name Type Description Default
mol Mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
allow_cycle bool

whether to allow transformation involving cycle creation.

False
allow_double_bond bool

whether to allow transformation involving at least one double bond.

False
allow_triple_bond bool

whether to allow transformation involving at least one triple bond.

False
depth int

depth of the search, use a sensible value to decrease search time. Will fall back to number of atoms.

None
timeout_seconds int

The maximum amount of time to spend on enumeration. None will disable the timeout. Note that the timeout might be inaccurate as a running single variant computation is not stopped when the duration is reached.

None
Source code in datamol/isomers/_enumerate.py
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
def enumerate_structisomers(
    mol: dm.Mol,
    n_variants: int = 20,
    allow_cycle: bool = False,
    allow_double_bond: bool = False,
    allow_triple_bond: bool = False,
    depth: int = None,
    timeout_seconds: int = None,
):
    """Enumerate the structural isomers of the input molecule

    Warning: this function can be computationnaly intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        allow_cycle: whether to allow transformation involving cycle creation.
        allow_double_bond: whether to allow transformation involving at least one double bond.
        allow_triple_bond: whether to allow transformation involving at least one triple bond.
        depth: depth of the search, use a sensible value to decrease search time. Will fall back to number of atoms.
        timeout_seconds: The maximum amount of time to spend on enumeration. None
            will disable the timeout. Note that the timeout might be inaccurate as a running single variant
            computation is not stopped when the duration is reached.
    """

    mol = dm.copy_mol(mol)

    enumerator = IsomerEnumerator(
        allow_cycle=allow_cycle,
        allow_double_bond=allow_double_bond,
        allow_triple_bond=allow_triple_bond,
    )

    if depth is None:
        depth = mol.GetNumAtoms()

    isomers_iterator = enumerator(
        mol,
        max_mols=n_variants,
        include_input=False,
        depth=depth,
    )

    start = time.time()
    duration = 0

    isomers = []
    while timeout_seconds is None or duration < timeout_seconds:
        try:
            isomer = next(isomers_iterator)
            isomers.append(dm.to_mol(isomer))
        except StopIteration:
            break

        duration = time.time() - start

    return isomers

enumerate_tautomers(mol, n_variants=20, max_transforms=1000, reassign_stereo=True, remove_bond_stereo=True, remove_sp3_stereo=True)

Enumerate the possible tautomers of the current molecule.

Parameters:

Name Type Description Default
mol Mol

The molecule whose state we should enumerate.

required
n_variants int

The maximum amount of molecules that should be returned.

20
max_transforms int

Set the maximum number of transformations to be applied. This limit is usually hit earlier than the n_variants limit and leads to a more linear scaling of CPU time with increasing number of tautomeric centers (see Sitzmann et al.).

1000
reassign_stereo bool

Whether to reassign stereo centers.

True
remove_bond_stereo bool

Whether stereochemistry information is removed from double bonds involved in tautomerism. This means that enols will lose their E/Z stereochemistry after going through tautomer enumeration because of the keto-enolic tautomerism.

True
remove_sp3_stereo bool

Whether stereochemistry information is removed from sp3 atoms involved in tautomerism. This means that S-aminoacids will lose their stereochemistry after going through tautomer enumeration because of the amido-imidol tautomerism.

True
Source code in datamol/isomers/_enumerate.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
def enumerate_tautomers(
    mol: dm.Mol,
    n_variants: int = 20,
    max_transforms: int = 1000,
    reassign_stereo: bool = True,
    remove_bond_stereo: bool = True,
    remove_sp3_stereo: bool = True,
):
    """Enumerate the possible tautomers of the current molecule.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        max_transforms: Set the maximum number of transformations to be applied. This limit is usually
            hit earlier than the n_variants limit and leads to a more linear scaling
            of CPU time with increasing number of tautomeric centers (see Sitzmann et al.).
        reassign_stereo: Whether to reassign stereo centers.
        remove_bond_stereo: Whether stereochemistry information is removed from double bonds involved in tautomerism.
            This means that enols will lose their E/Z stereochemistry after going through tautomer enumeration because
            of the keto-enolic tautomerism.
        remove_sp3_stereo: Whether stereochemistry information is removed from sp3 atoms involved in tautomerism. This
            means that S-aminoacids will lose their stereochemistry after going through tautomer enumeration because
            of the amido-imidol tautomerism.
    """
    enumerator = rdMolStandardize.TautomerEnumerator()

    # Configure the enumeration
    enumerator.SetMaxTautomers(n_variants)
    enumerator.SetMaxTransforms(max_transforms)
    enumerator.SetReassignStereo(reassign_stereo)
    enumerator.SetRemoveBondStereo(remove_bond_stereo)
    enumerator.SetRemoveSp3Stereo(remove_sp3_stereo)

    tautomers = enumerator.Enumerate(mol)
    return list(tautomers)

remove_stereochemistry(mol, copy=True)

Removes all stereochemistry info from the molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule

required
copy bool

Whether to copy the molecule.

True
Source code in datamol/isomers/_enumerate.py
243
244
245
246
247
248
249
250
251
252
253
254
def remove_stereochemistry(mol: dm.Mol, copy: bool = True):
    """Removes all stereochemistry info from the molecule.

    Args:
        mol: a molecule
        copy: Whether to copy the molecule.
    """

    if copy is True:
        mol = dm.copy_mol(mol)
    rdmolops.RemoveStereochemistry(mol)
    return mol

IsomerEnumerator

Implementation of the isomer enumeration algorithm described in https://doi.org/10.1186/s13321-017-0252-9

..note:: Due to the chemical reaction used, atom mapping will not be preserved !

Source code in datamol/isomers/_structural.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
class IsomerEnumerator:
    """
    Implementation of the isomer enumeration algorithm described in  https://doi.org/10.1186/s13321-017-0252-9

    ..note::
        Due to the chemical reaction used, atom mapping will not be preserved !

    """

    def __init__(
        self,
        allow_cycle: bool = True,
        allow_double_bond: bool = False,
        allow_triple_bond: bool = False,
        rxn_list: Optional[list] = None,
    ):
        """Structural isomer enumeration

        Args:
            allow_cycle (bool, optional): whether to allow transformation involving cycle creation
            allow_double_bond (bool, optional): whether to allow transformation involving at least one double bond
            allow_triple_bond (bool, optional): whether to allow transformation involving at least one triple bond
            rxn_list (list, optional): List of str to specifiy the reactions that should be used
        """
        self.allow_cycle = allow_cycle
        self.allow_triple_bond = allow_triple_bond
        self.allow_double_bond = allow_double_bond
        self.rxn_cache = self._resolve(rxn_list)
        self._ring_smarts = dm.from_smarts("[R]")
        # there should be a lot more of this
        self._impossible_chemistry = [dm.from_smarts("[$([*;r3,r4](=*)=*)]")]
        if not self.rxn_cache:
            raise ValueError("No isomeric transformation matches your")

    def _resolve(self, rxn_list: Optional[list]):
        """Resolve map of rxn to use

        Args:
            rxn_list: list of reaction to use
        """
        if rxn_list is not None and len(rxn_list) > 0:
            rxn_list = [_rxn for _rxn in rxn_list if _rxn in ISOMERIZERS]
        else:
            rxn_list = list([x for x, val in ISOMERIZERS.items() if val.use])

        final_rxns = dict()

        for x in rxn_list:
            rxn_trans = ISOMERIZERS[x]
            # check for triple bonds
            can_add = (not rxn_trans.triplebond) or self.allow_triple_bond
            # check for double bonds
            can_add &= not (rxn_trans.doublebond) or self.allow_double_bond
            # check for cycles
            can_add &= rxn_trans.acyclic or self.allow_cycle
            if can_add:
                as_rxn = Chem.rdChemReactions.ReactionFromSmarts(rxn_trans.smarts)
                final_rxns[x] = as_rxn
        return final_rxns

    def _clean(self, mol: dm.Mol):
        """Clean and sanitize a molecule during enumeration

        Args:
            mol: molecule to clean
        """
        clean_mol = None
        clean_smiles = ""
        try:
            mol = dm.sanitize_mol(mol)
            Chem.rdmolops.AssignRadicals(mol)
            for a in mol.GetAtoms():
                a.SetNoImplicit(True)
                a.SetNumExplicitHs(a.GetTotalNumHs() + a.GetNumRadicalElectrons())
                a.SetNumRadicalElectrons(0)
            mol.UpdatePropertyCache()
            clean_smiles = dm.to_smiles(mol)
            clean_mol = dm.to_mol(clean_smiles)
        except Exception:
            pass

        return clean_mol, clean_smiles

    def _is_valid(self, new_mol, new_smiles, need_substruct: Optional[dm.Mol] = None):
        """Check whether the mol or smiles is valid"""
        # not disconnected
        is_ok = new_smiles.count(".") == 0
        if not self.allow_cycle:
            is_ok &= not new_mol.HasSubstructMatch(self._ring_smarts)
        is_ok &= not (any(new_mol.HasSubstructMatch(x) for x in self._impossible_chemistry))
        if need_substruct is not None:
            is_ok &= new_mol.HasSubstructMatch(need_substruct)
        return is_ok

    def _has_correct_size(self, new_mol, expected_size):
        """Check whether the molecule has the correct size"""
        return new_mol.GetNumHeavyAtoms() == expected_size

    def __call__(self, *args, **kwargs):
        return self.enumerate(*args, **kwargs)

    def enumerate(
        self,
        mol: dm.Mol,
        depth: Optional[int] = None,
        include_input: bool = True,
        protect_substruct: Optional[Chem.rdchem.Mol] = None,
        max_mols: Optional[int] = None,
    ):
        """Enumerate the list of isomers of the current mol

        Args:
            mol: input molecule or smiles
            depth: optional maximum depth of the breadth search (default=None)
            include_input: whether to include the input molecule (default=True)
            protect_substruct: Optional substruct to protect
            max_mols: maximum number of molecule to sample
        """

        mol = dm.to_mol(mol)
        if protect_substruct is not None:
            mol = dm.protect_atoms(mol, substruct=protect_substruct, in_place=False)
        smiles = dm.to_smiles(mol, isomeric=True, canonical=True)
        mol_size = mol.GetNumHeavyAtoms()
        source_mols = [(mol, 0)]
        seen = set([smiles])
        seen_inchi_key = set([dm.to_inchikey(smiles)])

        _depth = depth or float("inf")
        _max_mols = max_mols or float("inf")

        if include_input:
            yield smiles
        while len(source_mols) > 0 and _max_mols > 0:
            curmol, curdepth = source_mols.pop(0)
            try:
                # we first try to kekulize the molecule
                Chem.Kekulize(curmol)
                curmol = Chem.AddHs(curmol, explicitOnly=False)
            except Exception:
                pass

            if curdepth >= _depth:
                return
            with dm.without_rdkit_log():
                for k, rxn in self.rxn_cache.items():
                    if _max_mols < 1:
                        break
                    for new_mols in rxn.RunReactants((curmol,)):
                        if _max_mols < 1:
                            break
                        new_mol = new_mols[0]
                        # sanitize mol
                        new_mol, new_smiles = self._clean(new_mol)
                        # need to clean twice, not sure why
                        _, new_smiles = self._clean(new_mol)
                        new_smiles_inchikey = dm.to_inchikey(new_mol)

                        if (
                            new_mol is None
                            or new_smiles_inchikey in seen_inchi_key
                            or not self._has_correct_size(new_mol, mol_size)
                        ):
                            continue
                        if protect_substruct is not None:
                            new_mol = dm.protect_atoms(
                                new_mol,
                                substruct=protect_substruct,
                                in_place=False,
                            )
                        seen.add(new_smiles)
                        seen_inchi_key.add(new_smiles_inchikey)
                        source_mols.append((new_mol, curdepth + 1))
                        if self._is_valid(new_mol, new_smiles, protect_substruct):
                            yield new_smiles
                            _max_mols -= 1

__init__(allow_cycle=True, allow_double_bond=False, allow_triple_bond=False, rxn_list=None)

Structural isomer enumeration

Parameters:

Name Type Description Default
allow_cycle bool

whether to allow transformation involving cycle creation

True
allow_double_bond bool

whether to allow transformation involving at least one double bond

False
allow_triple_bond bool

whether to allow transformation involving at least one triple bond

False
rxn_list list

List of str to specifiy the reactions that should be used

None
Source code in datamol/isomers/_structural.py
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
def __init__(
    self,
    allow_cycle: bool = True,
    allow_double_bond: bool = False,
    allow_triple_bond: bool = False,
    rxn_list: Optional[list] = None,
):
    """Structural isomer enumeration

    Args:
        allow_cycle (bool, optional): whether to allow transformation involving cycle creation
        allow_double_bond (bool, optional): whether to allow transformation involving at least one double bond
        allow_triple_bond (bool, optional): whether to allow transformation involving at least one triple bond
        rxn_list (list, optional): List of str to specifiy the reactions that should be used
    """
    self.allow_cycle = allow_cycle
    self.allow_triple_bond = allow_triple_bond
    self.allow_double_bond = allow_double_bond
    self.rxn_cache = self._resolve(rxn_list)
    self._ring_smarts = dm.from_smarts("[R]")
    # there should be a lot more of this
    self._impossible_chemistry = [dm.from_smarts("[$([*;r3,r4](=*)=*)]")]
    if not self.rxn_cache:
        raise ValueError("No isomeric transformation matches your")

enumerate(mol, depth=None, include_input=True, protect_substruct=None, max_mols=None)

Enumerate the list of isomers of the current mol

Parameters:

Name Type Description Default
mol Mol

input molecule or smiles

required
depth Optional[int]

optional maximum depth of the breadth search (default=None)

None
include_input bool

whether to include the input molecule (default=True)

True
protect_substruct Optional[Mol]

Optional substruct to protect

None
max_mols Optional[int]

maximum number of molecule to sample

None
Source code in datamol/isomers/_structural.py
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
def enumerate(
    self,
    mol: dm.Mol,
    depth: Optional[int] = None,
    include_input: bool = True,
    protect_substruct: Optional[Chem.rdchem.Mol] = None,
    max_mols: Optional[int] = None,
):
    """Enumerate the list of isomers of the current mol

    Args:
        mol: input molecule or smiles
        depth: optional maximum depth of the breadth search (default=None)
        include_input: whether to include the input molecule (default=True)
        protect_substruct: Optional substruct to protect
        max_mols: maximum number of molecule to sample
    """

    mol = dm.to_mol(mol)
    if protect_substruct is not None:
        mol = dm.protect_atoms(mol, substruct=protect_substruct, in_place=False)
    smiles = dm.to_smiles(mol, isomeric=True, canonical=True)
    mol_size = mol.GetNumHeavyAtoms()
    source_mols = [(mol, 0)]
    seen = set([smiles])
    seen_inchi_key = set([dm.to_inchikey(smiles)])

    _depth = depth or float("inf")
    _max_mols = max_mols or float("inf")

    if include_input:
        yield smiles
    while len(source_mols) > 0 and _max_mols > 0:
        curmol, curdepth = source_mols.pop(0)
        try:
            # we first try to kekulize the molecule
            Chem.Kekulize(curmol)
            curmol = Chem.AddHs(curmol, explicitOnly=False)
        except Exception:
            pass

        if curdepth >= _depth:
            return
        with dm.without_rdkit_log():
            for k, rxn in self.rxn_cache.items():
                if _max_mols < 1:
                    break
                for new_mols in rxn.RunReactants((curmol,)):
                    if _max_mols < 1:
                        break
                    new_mol = new_mols[0]
                    # sanitize mol
                    new_mol, new_smiles = self._clean(new_mol)
                    # need to clean twice, not sure why
                    _, new_smiles = self._clean(new_mol)
                    new_smiles_inchikey = dm.to_inchikey(new_mol)

                    if (
                        new_mol is None
                        or new_smiles_inchikey in seen_inchi_key
                        or not self._has_correct_size(new_mol, mol_size)
                    ):
                        continue
                    if protect_substruct is not None:
                        new_mol = dm.protect_atoms(
                            new_mol,
                            substruct=protect_substruct,
                            in_place=False,
                        )
                    seen.add(new_smiles)
                    seen_inchi_key.add(new_smiles_inchikey)
                    source_mols.append((new_mol, curdepth + 1))
                    if self._is_valid(new_mol, new_smiles, protect_substruct):
                        yield new_smiles
                        _max_mols -= 1