Skip to content

datamol.convert

from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True)

Convert a dataframe to a list of mols.

For the reverse operation, you might want to check dm.to_df().

Note

If smiles_column is used to build the molecules, this property is removed from the molecules' properties. You can decide to conserve the SMILES column by setting conserve_smiles to True.

Parameters:

Name Type Description Default
df DataFrame

a dataframe.

required
smiles_column Optional[str]

Column name to extract the molecule.

'smiles'
mol_column Optional[str]

Column name to extract the molecule. It takes precedence over smiles_column.

None
conserve_smiles bool

Whether to conserve the SMILES in the mols' props.

False
sanitize bool

Whether to sanitize if smiles_column is not None.

True
Source code in datamol/convert.py
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
def from_df(
    df: pd.DataFrame,
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    conserve_smiles: bool = False,
    sanitize: bool = True,
) -> List[Mol]:
    """Build a list of molecules from the rows of a dataframe.

    Each row yields one entry and the remaining values of the row are attached
    to the molecule as properties. This is the counterpart of `dm.to_df()`.

    Note:
        When a molecule is built from `smiles_column`, the SMILES value is
        dropped from the attached properties unless `conserve_smiles` is True.

    Args:
        df: a dataframe.
        smiles_column: Name of the column holding the SMILES.
        mol_column: Name of the column holding the molecules. It takes
            precedence over `smiles_column`. When None, the columns are scanned
            and the last one whose first value is a `Mol` is used, if any.
        conserve_smiles: Whether to keep the SMILES in the mols' props.
        sanitize: Passed to `dm.to_mol()` when molecules are built from SMILES.

    Returns:
        One entry per row: the molecule, or None when no molecule was obtained
        for that row. An empty dataframe gives an empty list.

    Raises:
        ValueError: if both `smiles_column` and `mol_column` are None.
    """

    if mol_column is None and smiles_column is None:
        raise ValueError("Either `smiles_column` or `mol_column` must be not None.")

    if not len(df):
        return []

    if mol_column is None:
        # Auto-detection: look at the first value of every column and keep the
        # last column that holds a molecule.
        detected = [name for name in df.columns if isinstance(df[name].iloc[0], Mol)]
        if detected:
            mol_column = cast(str, detected[-1])

    def _record_to_mol(record) -> Optional[Mol]:
        fields = record.to_dict()

        if mol_column is None:
            # The SMILES only stays in the properties when explicitly requested.
            if conserve_smiles:
                raw_smiles = fields[smiles_column]
            else:
                raw_smiles = fields.pop(smiles_column)
            built = dm.to_mol(raw_smiles, sanitize=sanitize)
        else:
            built = fields.pop(mol_column)

        if built is not None:
            dm.set_mol_props(built, fields)
        return built

    converted = df.apply(_record_to_mol, axis=1)
    return converted.tolist()  # type: ignore

from_inchi(inchi, sanitize=True, remove_hs=True)

Convert an InChi to a mol.

Parameters:

Name Type Description Default
inchi Optional[str]

an inchi string.

required
sanitize bool

do sanitize.

True
remove_hs bool

do remove hs.

True

Returns:

Type Description
Optional[Mol]

mol

Source code in datamol/convert.py
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
def from_inchi(
    inchi: Optional[str],
    sanitize: bool = True,
    remove_hs: bool = True,
) -> Optional[Mol]:
    """Parse an InChI string into a mol.

    Args:
        inchi: the InChI string. None is passed through as None.
        sanitize: forwarded to `Chem.MolFromInchi` as `sanitize`.
        remove_hs: forwarded to `Chem.MolFromInchi` as `removeHs`.

    Returns:
        The value returned by `Chem.MolFromInchi`, or None when `inchi` is None.
    """
    if inchi is not None:
        return Chem.MolFromInchi(inchi, sanitize=sanitize, removeHs=remove_hs)

    return None

from_selfies(selfies, as_mol=False)

Convert a SELFIES to a smiles or a mol.

Parameters:

Name Type Description Default
selfies str

a selfies.

required
as_mol bool

whether to return a mol or a smiles.

False

Returns:

Type Description
Optional[Union[str, Mol]]

smiles or mol.

Source code in datamol/convert.py
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
def from_selfies(selfies: Optional[str], as_mol: bool = False) -> Optional[Union[str, Mol]]:
    """Convert a SELFIES to a smiles or a mol.

    Args:
        selfies: a SELFIES string. None is accepted and yields None.
        as_mol: whether to return a mol instead of a smiles.

    Returns:
        None when `selfies` is None. Otherwise the smiles produced by
        `sf.decoder`, or, when `as_mol` is set and a smiles was produced,
        the result of `dm.to_mol()` on that smiles.
    """
    if selfies is None:
        return None

    smiles = sf.decoder(selfies)

    # Only build a molecule when the decoder actually returned something.
    if as_mol and smiles is not None:
        return dm.to_mol(smiles)

    return smiles

from_smarts(smarts)

Convert a SMARTS string to a molecule

Parameters:

Name Type Description Default
smarts Optional[str]

a smarts string

required
Source code in datamol/convert.py
352
353
354
355
356
357
358
359
360
361
def from_smarts(smarts: Optional[str]) -> Optional[Mol]:
    """Build a molecule from a SMARTS string.

    Args:
        smarts: a smarts string. None is passed through as None.

    Returns:
        The value returned by `Chem.MolFromSmarts`, or None when `smarts` is None.
    """

    return None if smarts is None else Chem.MolFromSmarts(smarts)  # type: ignore

render_mol_df(df)

Render the molecules column in a dataframe. The rendering is performed in-place only. So nothing is returned.

Parameters:

Name Type Description Default
df DataFrame

a dataframe.

required
Source code in datamol/convert.py
502
503
504
505
506
507
508
509
510
511
512
513
514
def render_mol_df(df: pd.DataFrame):
    """Enable rendering of the molecule columns of a dataframe.

    The dataframe is handled in-place, so nothing is returned.

    Args:
        df: a dataframe.
    """

    # NOTE(hadim): _ChangeMoleculeRendering is not relevant anymore with rdkit>=2022.09
    needs_patch = dm.is_lower_than_current_rdkit_version("2022.09")
    if not needs_patch:
        return

    # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
    # https://github.com/rdkit/rdkit/issues/3563 is fixed.
    _ChangeMoleculeRendering(df)

smiles_as_smarts(mol, keep_hs=True)

Convert a smiles to a smarts if possible

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule.

required
keep_hs bool

Whether to keep hydrogen. This will increase the count of H atoms for atoms with attached hydrogens to create a valid smarts without further substitution allowed e.g. [H]-[CH]-[*] -> [H]-[CH2]-[*]

True

Returns:

Type Description
Optional[str]

smarts of the molecule

Source code in datamol/convert.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def smiles_as_smarts(mol: Union[str, Mol], keep_hs: bool = True) -> Optional[str]:
    """Convert a smiles to a smarts if possible

    The molecule passed by the caller is left untouched: the atom edits
    needed to produce the smarts are applied to a copy.

    Args:
        mol: a molecule or a SMILES string.
        keep_hs: Whether to keep hydrogen. This will increase the count of H atoms
            for atoms with attached hydrogens to create a valid smarts without further substitution allowed
            e.g. [H]-[CH]-[*] -> [H]-[CH2]-[*]

    Returns:
        smarts of the molecule, or None if the input is None, cannot be
        converted to a molecule, or cannot be written out as a SMILES.
    """

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
    elif mol is not None:
        # The loop below rewrites isotopes and explicit H counts; work on a
        # copy so the caller's molecule is not modified as a side effect.
        mol = Chem.Mol(mol)

    if mol is None:
        return None

    # Tag every atom with isotope 99 so that each atom is written out in
    # bracket form with all of its attributes.
    for atom in mol.GetAtoms():  # type: ignore
        if keep_hs:
            # Number of neighbors that are hydrogen atoms.
            s = sum(na.GetAtomicNum() == 1 for na in atom.GetNeighbors())
            if s:
                atom.SetNumExplicitHs(atom.GetTotalNumHs() + s)
        atom.SetIsotope(99)

    # Print out the smiles, all the atom attributes will be fully specified
    smarts = to_smiles(mol, isomeric=True, explicit_bonds=True)

    if smarts is None:
        return None

    # Remove the 99 isotope labels
    smarts = re.sub(r"\[99", "[", smarts)
    return smarts

to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False, n_jobs=1)

Convert a list of mols to a dataframe using each mol properties as a column.

For the reverse operation, you might want to check dm.from_df().

Parameters:

Name Type Description Default
mols Sequence[Mol]

a list of molecules.

required
smiles_column Optional[str]

name of the SMILES column.

'smiles'
mol_column Optional[str]

Name of the column. If not None, rdkit.Chem.PandasTools is used to add a molecule column.

None
include_private bool

Include private properties in the columns.

False
include_computed bool

Include computed properties in the columns.

False
render_df_mol bool

whether to render the molecule in the dataframe to images. If called once, it will be applied for the newly created dataframe with mol in it.

True
render_all_df_mol bool

Whether to render all pandas dataframe mol column as images.

False
n_jobs Optional[int]

Number of jobs for parallelization. Leave to 1 for no parallelization. Set to -1 to use all available cores.

1
Source code in datamol/convert.py
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
def to_df(
    mols: Sequence[Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
    n_jobs: Optional[int] = 1,
) -> pd.DataFrame:
    """Convert a list of mols to a dataframe using each mol properties
    as a column.

    For the reverse operation, you might want to check `dm.from_df()`.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. If None, no SMILES column
            is added.
        mol_column: name of the column holding the molecule objects. If None,
            no molecule column is added.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to render the molecule in the dataframe to images.
            Only applies when `mol_column` is not None.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.
            Only applies when the rendering above is performed.
        n_jobs: Number of jobs for parallelization. Leave to 1 for no
            parallelization. Set to -1 to use all available cores.

    Returns:
        A dataframe with one row per molecule: the optional SMILES and
        molecule columns followed by one column per molecule property.
    """

    # Init a dataframe
    df = pd.DataFrame()

    # Feed it with smiles
    if smiles_column is not None:
        smiles = dm.parallelized(to_smiles, mols, n_jobs=n_jobs)
        df[smiles_column] = smiles

    # Add a mol column
    if mol_column is not None:
        df[mol_column] = mols

    # Add any other properties present in the molecule
    def _mol_to_prop_dict(mol):
        if mol is not None:
            return mol.GetPropsAsDict(
                includePrivate=include_private,
                includeComputed=include_computed,
            )
        else:
            return {}

    # EN: You cannot use `processes` here because all properties will be lost
    # An alternative would be https://www.rdkit.org/docs/source/rdkit.Chem.PropertyMol.html
    # But this has less overhead
    props = dm.parallelized(_mol_to_prop_dict, mols, n_jobs=n_jobs, scheduler="threads")
    props_df = pd.DataFrame(props)

    # The properties are concatenated as-is, so a property named like the
    # SMILES column ends up as a second column with the same name.
    if smiles_column is not None and smiles_column in props_df.columns:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. The returned dataframe will have two columns with the same name."
        )

    # Concat the df with the properties df
    df = pd.concat([df, props_df], axis=1)

    # Render mol column to images
    if render_df_mol is True and mol_column is not None:
        render_mol_df(df)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return df

to_inchi(mol)

Convert a mol to a standard Inchi.

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule.

required
Source code in datamol/convert.py
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
def to_inchi(mol: Union[str, Mol]) -> Optional[str]:
    """Compute the standard InChI of a molecule.

    Args:
        mol: a molecule. A string is first converted with `dm.to_mol()`.

    Returns:
        The InChI string, or None when no molecule is available or the
        generated value is empty.
    """

    molecule = dm.to_mol(mol) if isinstance(mol, str) else mol
    if molecule is None:
        return None

    # An empty result is reported as None.
    return Chem.MolToInchi(molecule) or None

to_inchi_non_standard(mol, fixed_hydrogen_layer=True, undefined_stereocenter=True, reconnected_metal_layer=True, tautomerism_keto_enol=True, tautomerism_15=True, options=None)

Convert a mol to a non-standard Inchi.

Note that turning all the flags to False will result in the standard Inchi.

Warning: this function will return a non-standard Inchi. See https://www.inchi-trust.org/technical-faq-2 for details.

It's important to not mix standard and non-standard InChi. If you don't know much about non-standard InChi, we highly recommend you to use the standard InChi with dm.to_inchi().

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule.

required
fixed_hydrogen_layer bool

whether to include a fixed hydrogen layer (/FixedH).

True
undefined_stereocenter bool

whether to include an undefined stereocenter layer (/SUU).

True
reconnected_metal_layer bool

whether to include reconnected metals (/RecMet).

True
tautomerism_keto_enol bool

whether to account tautomerism keto-enol (/KET).

True
tautomerism_15 bool

whether to account 1,5-tautomerism (/15T).

True
options Optional[List[str]]

More InchI options in a form of a list of string. Example: ["/SRel", "/AuxNone"].

None
Source code in datamol/convert.py
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
def to_inchi_non_standard(
    mol: Union[str, Mol],
    fixed_hydrogen_layer: bool = True,
    undefined_stereocenter: bool = True,
    reconnected_metal_layer: bool = True,
    tautomerism_keto_enol: bool = True,
    tautomerism_15: bool = True,
    options: Optional[List[str]] = None,
) -> Optional[str]:
    """Compute a non-standard InChI of a molecule.

    Setting every flag to `False` gives back the standard InChI.

    **Warning**: the value returned here is a **non-standard** InChI. See
    https://www.inchi-trust.org/technical-faq-2 for details.

    Standard and non-standard InChI must not be mixed. Unless you know
    that you need a non-standard InChI, prefer the standard one from
    `dm.to_inchi()`.

    Args:
        mol: a molecule. A string is first converted with `dm.to_mol()`.
        fixed_hydrogen_layer: whether to include a fixed hydrogen layer (`/FixedH`).
        undefined_stereocenter: whether to include an undefined stereocenter layer (`/SUU`).
        reconnected_metal_layer: whether to include reconnected metals (`/RecMet`).
        tautomerism_keto_enol: whether to account tautomerism keto-enol (`/KET`).
        tautomerism_15: whether to account 1,5-tautomerism (`/15T`).
        options: More InchI options in a form of a list of string. Example:
            `["/SRel", "/AuxNone"]`.

    Returns:
        The InChI string, or None when no molecule is available or the
        generated value is empty.
    """

    molecule = dm.to_mol(mol) if isinstance(mol, str) else mol
    if molecule is None:
        return None

    flags = _process_inchi_options(
        fixed_hydrogen_layer=fixed_hydrogen_layer,
        undefined_stereocenter=undefined_stereocenter,
        reconnected_metal_layer=reconnected_metal_layer,
        tautomerism_keto_enol=tautomerism_keto_enol,
        tautomerism_15=tautomerism_15,
        options=options,
    )

    # An empty result is reported as None.
    return Chem.MolToInchi(molecule, options=flags) or None

to_inchikey(mol)

Convert a mol to a standard InchiKey.

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule

required
Source code in datamol/convert.py
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
def to_inchikey(mol: Union[str, Mol]) -> Optional[str]:
    """Compute the standard InChIKey of a molecule.

    Args:
        mol: a molecule. A string is first converted with `dm.to_mol()`.

    Returns:
        The InChIKey string, or None when no molecule is available or the
        generated value is empty.
    """
    molecule = dm.to_mol(mol) if isinstance(mol, str) else mol
    if molecule is None:
        return None

    # An empty result is reported as None.
    return Chem.MolToInchiKey(molecule) or None

to_inchikey_non_standard(mol, fixed_hydrogen_layer=True, undefined_stereocenter=True, reconnected_metal_layer=True, tautomerism_keto_enol=True, tautomerism_15=True, options=None)

Convert a mol to a non-standard InchiKey.

Note that turning all the flags to False will result in the standard InchiKey.

Warning: this function will return a non-standard InchiKey. See https://www.inchi-trust.org/technical-faq-2 for details.

It's important to not mix standard and non-standard InChiKey. If you don't know much about non-standard InchiKey, we highly recommend you to use the standard InchiKey with dm.to_inchikey().

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule

required
fixed_hydrogen_layer bool

whether to include a fixed hydrogen layer (/FixedH).

True
undefined_stereocenter bool

whether to include an undefined stereocenter layer (/SUU).

True
reconnected_metal_layer bool

whether to include reconnected metals (/RecMet).

True
tautomerism_keto_enol bool

whether to account tautomerism keto-enol (/KET).

True
tautomerism_15 bool

whether to account 1,5-tautomerism (/15T).

True
options Optional[List[str]]

More InchI options in a form of a list of string. Example: ["/SRel", "/AuxNone"].

None
Source code in datamol/convert.py
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
def to_inchikey_non_standard(
    mol: Union[str, Mol],
    fixed_hydrogen_layer: bool = True,
    undefined_stereocenter: bool = True,
    reconnected_metal_layer: bool = True,
    tautomerism_keto_enol: bool = True,
    tautomerism_15: bool = True,
    options: Optional[List[str]] = None,
) -> Optional[str]:
    """Compute a non-standard InChIKey of a molecule.

    Setting every flag to `False` gives back the standard InChIKey.

    **Warning**: the value returned here is a **non-standard** InChIKey. See
    https://www.inchi-trust.org/technical-faq-2 for details.

    Standard and non-standard InChIKey must not be mixed. Unless you know
    that you need a non-standard InChIKey, prefer the standard one from
    `dm.to_inchikey()`.

    Args:
        mol: a molecule. A string is first converted with `dm.to_mol()`.
        fixed_hydrogen_layer: whether to include a fixed hydrogen layer (`/FixedH`).
        undefined_stereocenter: whether to include an undefined stereocenter layer (`/SUU`).
        reconnected_metal_layer: whether to include reconnected metals (`/RecMet`).
        tautomerism_keto_enol: whether to account tautomerism keto-enol (`/KET`).
        tautomerism_15: whether to account 1,5-tautomerism (`/15T`).
        options: More InchI options in a form of a list of string. Example:
            `["/SRel", "/AuxNone"]`.

    Returns:
        The InChIKey string, or None when no molecule is available or the
        generated value is empty.
    """

    molecule = dm.to_mol(mol) if isinstance(mol, str) else mol
    if molecule is None:
        return None

    flags = _process_inchi_options(
        fixed_hydrogen_layer=fixed_hydrogen_layer,
        undefined_stereocenter=undefined_stereocenter,
        reconnected_metal_layer=reconnected_metal_layer,
        tautomerism_keto_enol=tautomerism_keto_enol,
        tautomerism_15=tautomerism_15,
        options=options,
    )

    # An empty result is reported as None.
    return Chem.MolToInchiKey(molecule, options=flags) or None

to_selfies(mol)

Convert a mol to SELFIES.

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule or a SMILES.

required

Returns:

Name Type Description
selfies Optional[str]

SELFIES string.

Source code in datamol/convert.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
def to_selfies(mol: Union[str, Mol]) -> Optional[str]:
    """Encode a molecule as SELFIES.

    Args:
        mol: a molecule or a SMILES. A molecule is first written out with
            `to_smiles()`.

    Returns:
        selfies: the value produced by `sf.encoder`, or None when there is
            no SMILES to encode or the encoder returned `-1`.
    """

    smiles = to_smiles(mol) if isinstance(mol, Mol) else mol
    if smiles is None:
        return None

    encoded = sf.encoder(smiles)

    # `-1` from the encoder is reported as None.
    return None if encoded == -1 else encoded

to_smarts(mol)

Convert a mol to SMARTS format

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
Source code in datamol/convert.py
248
249
250
251
252
253
254
255
256
257
258
def to_smarts(mol: Mol) -> Optional[str]:
    """Write a molecule out as SMARTS.

    Args:
        mol: a molecule. None is passed through as None.

    Returns:
        The value returned by `Chem.MolToSmarts`, or None when `mol` is None.
    """

    return None if mol is None else Chem.MolToSmarts(mol)  # type: ignore

to_smiles(mol, canonical=True, isomeric=True, kekulize=False, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False, with_atom_indices=False)

Convert a mol to a SMILES.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
canonical bool

if false no attempt will be made to canonicalize the molecule.

True
isomeric bool

whether to include information about stereochemistry in the SMILES.

True
kekulize bool

whether to return the kekule version of the SMILES.

False
ordered bool

whether to force reordering of the atoms first.

False
explicit_bonds bool

if true, all bond orders will be explicitly indicated in the output SMILES.

False
explicit_hs bool

if true, all H counts will be explicitly indicated in the output SMILES.

False
randomize bool

whether to randomize the generated smiles. Override canonical.

False
cxsmiles bool

Whether to return a CXSMILES instead of a SMILES.

False
allow_to_fail bool

If True, raise an error when the conversion to SMILES fails. If False, return None instead.

False
with_atom_indices bool

Whether to add atom indices to the SMILES.

False
Source code in datamol/convert.py
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
def to_smiles(
    mol: Mol,
    canonical: bool = True,
    isomeric: bool = True,
    kekulize: bool = False,
    ordered: bool = False,
    explicit_bonds: bool = False,
    explicit_hs: bool = False,
    randomize: bool = False,
    cxsmiles: bool = False,
    allow_to_fail: bool = False,
    with_atom_indices: bool = False,
) -> Optional[str]:
    """Write a molecule out as a SMILES string.

    Args:
        mol: a molecule.
        canonical: if false no attempt will be made to canonicalize the molecule.
        isomeric: whether to include information about stereochemistry in the SMILES.
        kekulize: whether to return the kekule version of the SMILES.
        ordered: whether to force reordering of the atoms first. Only applied
            when `canonical` is False.
        explicit_bonds: if true, all bond orders will be explicitly indicated in the output SMILES.
        explicit_hs: if true, all H counts will be explicitly indicated in the output SMILES.
        randomize: whether to randomize the generated smiles. Override `canonical`.
        cxsmiles: Whether to return a CXSMILES instead of a SMILES.
        allow_to_fail: If True, an exception raised during the conversion is
            propagated. If False, None is returned instead.
        with_atom_indices: Whether to add atom indices to the SMILES.

    Returns:
        The SMILES (or CXSMILES) string, or None when the conversion raised
        and `allow_to_fail` is False.
    """

    if ordered and canonical is False:
        mol = dm.reorder_atoms(mol)

    if randomize:
        # A randomized atom order is meaningless if the writer canonicalizes.
        mol = dm.randomize_atoms(mol)
        canonical = False

    if with_atom_indices:
        mol = dm.atom_indices_to_mol(mol, copy=True)

    # Both writers take the same keyword arguments.
    writer_options = dict(
        isomericSmiles=isomeric,
        canonical=canonical,
        allBondsExplicit=explicit_bonds,
        allHsExplicit=explicit_hs,
        kekuleSmiles=kekulize,
    )

    try:
        if cxsmiles:
            return rdmolfiles.MolToCXSmiles(mol, **writer_options)
        return rdmolfiles.MolToSmiles(mol, **writer_options)
    except Exception as err:
        if allow_to_fail:
            raise err
        return None