datamol
¶
All the functions below are accessible under datamol.FUNCTION_NAME.
datamol._viz
¶
datamol.viz._viz.to_image(mols, legends=None, n_cols=4, use_svg=False, mol_size=(200, 200), highlight_atom=None, highlight_bond=None, outfile=None, max_mols=32, copy=False, indices=False)
¶
Generate an image out of a molecule or a list of molecules.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
Union[List[rdkit.Chem.rdchem.Mol], rdkit.Chem.rdchem.Mol] |
one or a list of molecules. |
required |
legends |
Union[List[Optional[str]], str] |
a string or a list of string as legend for every molecules. |
None |
n_cols |
int |
number of molecules per row (i.e. the number of columns of the grid). |
4 |
use_svg |
bool |
whether to output an SVG (or a PNG). |
False |
mol_size |
Union[Tuple[int, int], int] |
a int or a tuple of int defining the size per molecule. |
(200, 200) |
highlight_atom |
List[List[int]] |
atom to highlight. |
None |
highlight_bond |
List[List[int]] |
bonds to highlight. |
None |
outfile |
str |
path where to save the image (local or remote path). |
None |
max_mols |
int |
the maximum number of molecules to display. |
32 |
copy |
bool |
whether to copy the molecules or not. |
False |
indices |
bool |
Whether to draw the atom indices. |
False |
Source code in datamol/viz/_viz.py
def to_image(
    mols: Union[List[Chem.rdchem.Mol], Chem.rdchem.Mol],
    legends: Union[List[Union[str, None]], str, None] = None,
    n_cols: int = 4,
    use_svg: bool = False,
    mol_size: Union[Tuple[int, int], int] = (200, 200),
    highlight_atom: List[List[int]] = None,
    highlight_bond: List[List[int]] = None,
    outfile: str = None,
    max_mols: int = 32,
    copy: bool = False,
    indices: bool = False,
):
    """Generate an image out of a molecule or a list of molecules.

    Args:
        mols: one or a list of molecules.
        legends: a string or a list of strings used as legend for the molecules.
        n_cols: number of molecules per row (forwarded as `molsPerRow`). It is
            reduced to `len(mols)` when fewer molecules are drawn.
        use_svg: whether to output an SVG (or a PNG).
        mol_size: an int or a tuple of int defining the size per molecule.
        highlight_atom: atoms to highlight. Either a list of atom indices
            (wrapped into a single-element list) or one list per molecule.
        highlight_bond: bonds to highlight. Same conventions as `highlight_atom`.
        outfile: path where to save the image (local or remote path, opened
            with `fsspec`).
        max_mols: the maximum number of molecules to display. `None` disables
            the truncation.
        copy: whether to copy the molecules or not.
        indices: whether to draw the atom indices.

    Returns:
        The object returned by `Draw.MolsToGridImage`.
    """

    if isinstance(mol_size, int):
        mol_size = (mol_size, mol_size)

    if isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    if isinstance(legends, str):
        legends = [legends]

    if copy:
        mols = [dm.copy_mol(mol) for mol in mols]

    if max_mols is not None:
        mols = mols[:max_mols]

        if legends is not None:
            legends = legends[:max_mols]

    if indices is True:
        # Called for its side effect on each molecule: use a plain loop rather
        # than building a throwaway list.
        for mol in mols:
            dm.atom_indices_to_mol(mol)

    # A flat list of indices is wrapped into a one-element list of lists. The
    # length check avoids an IndexError when an empty list is given.
    _highlight_atom = highlight_atom
    if (
        highlight_atom is not None
        and len(highlight_atom) > 0
        and isinstance(highlight_atom[0], int)
    ):
        _highlight_atom = [highlight_atom]

    _highlight_bond = highlight_bond
    if (
        highlight_bond is not None
        and len(highlight_bond) > 0
        and isinstance(highlight_bond[0], int)
    ):
        _highlight_bond = [highlight_bond]

    # Don't make the image wider than the number of molecules requires
    if len(mols) < n_cols:
        n_cols = len(mols)

    image = Draw.MolsToGridImage(
        mols,
        legends=legends,
        molsPerRow=n_cols,
        useSVG=use_svg,
        subImgSize=mol_size,
        highlightAtomLists=_highlight_atom,
        highlightBondLists=_highlight_bond,
    )

    if outfile is not None:
        with fsspec.open(outfile, "wb") as f:
            if use_svg:
                if isinstance(image, str):
                    # in a terminal process
                    f.write(image.encode())
                else:
                    # in a jupyter kernel process
                    f.write(image.data.encode())  # type: ignore
            else:
                if isinstance(image, PIL.PngImagePlugin.PngImageFile):  # type: ignore
                    # in a terminal process
                    image.save(f)
                else:
                    # in a jupyter kernel process
                    f.write(image.data)  # type: ignore

    return image
datamol.cluster
¶
datamol.cluster.assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)
¶
Assign molecules to centroids. Each molecule will be assigned to the closest centroid.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules to assign to centroids |
required |
centroids |
List[rdkit.Chem.rdchem.Mol] |
list of molecules to use as centroid |
required |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Default to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None. |
None |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Let to 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
clusters_map |
dict of index mapping each centroid index to the molecule index in the cluster clusters_list: list of all molecules in each cluster. The cluster index follows the index of the centroid. Note that the centroid molecule is not added to the cluster. |
Source code in datamol/cluster.py
def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids
        centroids: list of molecules to use as centroid
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            It is used as the `metric` of `distance.cdist`, so each index is
            received as a row of a single-column index array: molecules are
            numbered `0..len(mols)-1` and centroid `k` is numbered `len(mols) + k`.
            By default, the Tanimoto distance (1 - Tanimoto similarity) is used.
            Default to None.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict of index mapping each centroid index to the molecule index in the cluster
        clusters_list: list of all molecules in each cluster. The cluster index follows the index of the centroid.
            Note that the centroid molecule is not added to the cluster.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Molecules first, centroids last: this ordering defines the index
    # convention used by the distance function below.
    all_mols = list(mols) + list(centroids)
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        # The indices are cast back with `int` before indexing the features.
        return 1.0 - DataStructs.TanimotoSimilarity(features[int(i)], features[int(j)])

    if dist_fn is None:
        dist_fn = distij

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]

    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + len(mols)

    # Use the resolved `dist_fn` so that a user-provided distance is honoured
    # (previously the default `distij` was always used).
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=dist_fn)
    closest = np.argmin(dist_mat, axis=1)

    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])

    return clusters_map, clusters_list
datamol.cluster.cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)
¶
Cluster a set of molecules using the butina clustering algorithm and a given threshold.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
cutoff |
float |
Cutoff for the clustering. Default to 0.2. |
0.2 |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Default to None. |
None |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Let to 1 for no parallelization. Set to None to use all available cores. |
1 |
Source code in datamol/cluster.py
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the butina clustering algorithm and a given threshold.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Default to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        cluster_indices: the clusters as returned by `Butina.ClusterData`
            (groups of indices into `mols`).
        cluster_mols: the molecules of each cluster, in the same order as
            `cluster_indices`.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)
    n_mols = len(mols)

    # Flat list of distances: for every molecule `i`, its distances to all
    # the molecules that come before it.
    dists = []
    for i in range(1, n_mols):
        row = DataStructs.BulkTanimotoSimilarity(features[i], features[:i], returnDistance=True)
        dists.extend(row)

    # now cluster the data
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    grouped_mols = []
    for cluster in cluster_indices:
        members = operator.itemgetter(*cluster)(mols)
        # `itemgetter` returns the bare molecule for a single index: wrap it
        # so that a single mol cluster is a list too.
        if isinstance(members, Chem.rdchem.Mol):
            members = [members]
        grouped_mols.append(members)

    return cluster_indices, grouped_mols
datamol.cluster.pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)
¶
Pick a set of `npick` centroids from a list of molecules.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
npick |
int |
Number of element to pick from mols, including the preselection. |
0 |
threshold |
float |
Minimum distance between centroids for the `maxmin` and sphere exclusion (`sphere`) methods. |
0.5 |
initial_picks |
List[int] |
Starting list of index for molecules that should be in the set of picked molecules. Default to None. |
None |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Default to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None. |
None |
seed |
int |
seed for reproducibility |
42 |
method |
str |
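Picking method to use. One of `sphere`, `maxmin` or any supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`. |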
'sphere' |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Let to 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
picked_inds |
index of the molecule that have been selected as centroids mols: molecules that have been picked |
Source code in datamol/cluster.py
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of element to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of index for molecules that should be in the
            set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance (1 - Tanimoto similarity) is used.
            Default to None.
        seed: seed for reproducibility (only forwarded to the `maxmin` picker).
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: index of the molecule that have been selected as centroids
        mols: molecules that have been picked

    Raises:
        ValueError: if `method` is not supported, or if a hierarchical method
            is requested with `npick` equal to 0.
    """

    n_mols = len(mols)

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn, n_mols, threshold=threshold, pickSize=npick, firstPicks=initial_picks
        )

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Lower triangle of the pairwise distance matrix, computed with the
        # resolved `dist_fn` (a user-provided distance used to be ignored in
        # this branch) and the requested number of jobs.
        dist_mat = dm.parallelized(
            dist_fn,
            list(zip(*np.tril_indices(len(mols), k=-1))),
            arg_type="args",
            n_jobs=n_jobs,
        )
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)

    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
datamol.cluster.pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)
¶
Pick a set of diverse molecules based on their fingerprints.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
npick |
int |
Number of element to pick from mols, including the preselection. |
required |
initial_picks |
List[int] |
Starting list of index for molecules that should be in the set of picked molecules. Default to None. |
None |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Default to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i,j) and return the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto similarity will be used. Default to None. |
None |
seed |
int |
seed for reproducibility |
42 |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Let to 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
picked_inds |
index of the molecule that have been picked mols: molecules that have been picked |
Source code in datamol/cluster.py
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on their fingerprints.

    Args:
        mols: a list of molecules.
        npick: Number of element to pick from mols, including the preselection.
        initial_picks: Starting list of index for molecules that should be in the
            set of picked molecules. Default to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Default to None.
        dist_fn: A function that takes two indexes (i,j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance (1 - Tanimoto similarity) is used.
            Default to None.
        seed: seed for reproducibility
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: index of the molecule that have been picked
        mols: molecules that have been picked
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Features are always computed, even when a custom `dist_fn` is given.
    fps = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def _tanimoto_distance(a, b, fps=fps):
        return 1.0 - DataStructs.TanimotoSimilarity(fps[a], fps[b])

    if dist_fn is None:
        dist_fn = _tanimoto_distance

    first_picks = initial_picks if initial_picks is not None else []

    selection = MaxMinPicker().LazyPick(
        dist_fn, len(mols), npick, firstPicks=first_picks, seed=seed
    )
    selection = np.array(selection)

    return selection, [mols[idx] for idx in selection]
datamol.convert
¶
datamol.convert.from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False)
¶
Convert a dataframe to a list of mols.
Note
If smiles_column
is used to build the molecules, this property
is removed from the molecules' properties. You can decide to conserve
the SMILES column by setting conserve_smiles
to True.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
a dataframe. |
required |
smiles_column |
Optional[str] |
Column name to extract the molecule. |
'smiles' |
mol_column |
str |
Column name to extract the molecule. It takes
precedence over `smiles_column`. |
None |
conserve_smiles |
bool |
Whether to conserve the SMILES in the mols' props. |
False |
Source code in datamol/convert.py
def from_df(
    df: pd.DataFrame,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    conserve_smiles: bool = False,
) -> List[Chem.rdchem.Mol]:
    """Convert a dataframe to a list of mols.

    Every other value of a row is set as a property of the row's molecule.

    Note:
        If `smiles_column` is used to build the molecules, this property
        is removed from the molecules' properties. You can decide to conserve
        the SMILES column by setting `conserve_smiles` to True.

    Args:
        df: a dataframe.
        smiles_column: Column name holding the SMILES used to build the molecule.
        mol_column: Column name holding the molecule objects. It takes
            precedence over `smiles_column`.
        conserve_smiles: Whether to conserve the SMILES in the mols' props.

    Returns:
        One entry per row; `None` for a row whose SMILES could not be
        converted to a molecule.

    Raises:
        ValueError: if both `smiles_column` and `mol_column` are None.
    """

    if smiles_column is None and mol_column is None:
        raise ValueError("Either `smiles_column` or `mol_column` must be not None.")

    if len(df) == 0:
        return []

    def _row_to_mol(row):

        props = row.to_dict()

        if mol_column is not None:
            # Remove the molecule from the properties: it must not be written
            # back onto itself as one of its own properties.
            mol = props.pop(mol_column)
        else:

            if conserve_smiles:
                smiles = props[smiles_column]
            else:
                # If a SMILES column is used to create the molecule then it is removed from the
                # properties.
                smiles = props.pop(smiles_column)

            mol = dm.to_mol(smiles)

        if mol is None:
            return None

        dm.set_mol_props(mol, props)
        return mol

    return df.apply(_row_to_mol, axis=1).tolist()
datamol.convert.from_inchi(inchi, sanitize=True, remove_hs=True)
¶
Convert an InChi to a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
inchi |
Optional[str] |
an inchi string. |
required |
sanitize |
bool |
do sanitize. |
True |
remove_hs |
bool |
do remove hs. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol |
Source code in datamol/convert.py
def from_inchi(
    inchi: Optional[str],
    sanitize: bool = True,
    remove_hs: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Convert an InChI string to a mol.

    Args:
        inchi: an InChI string. `None` is passed through.
        sanitize: forwarded to `Chem.MolFromInchi` as `sanitize`.
        remove_hs: forwarded to `Chem.MolFromInchi` as `removeHs`.

    Returns:
        mol: the result of `Chem.MolFromInchi`, or `None` when `inchi` is `None`.
    """
    return (
        None
        if inchi is None
        else Chem.MolFromInchi(inchi, sanitize=sanitize, removeHs=remove_hs)
    )
datamol.convert.from_selfies(selfies, as_mol=False)
¶
Convert a SELFIES to a smiles or a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
selfies |
str |
a selfies. |
required |
as_mol |
bool |
whether to return a mol or a smiles. |
False |
Returns:
Type | Description |
---|---|
Union[str, rdkit.Chem.rdchem.Mol] |
smiles or mol. |
Source code in datamol/convert.py
def from_selfies(selfies: str, as_mol: bool = False) -> Optional[Union[str, Chem.rdchem.Mol]]:
    """Convert a SELFIES to a smiles or a mol.

    Args:
        selfies: a SELFIES string. `None` is passed through.
        as_mol: whether to return a mol (True) or a smiles (False).

    Returns:
        smiles or mol, or `None` when `selfies` is `None`.
    """
    if selfies is None:
        return None

    decoded = sf.decoder(selfies)

    # Nothing to convert, or the caller wants the SMILES itself.
    if decoded is None or not as_mol:
        return decoded

    return dm.to_mol(decoded)
datamol.convert.to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False)
¶
Convert a list of mols to a dataframe using each mol properties as a column.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
smiles_column |
Optional[str] |
name of the SMILES column. |
'smiles' |
mol_column |
str |
Name of the column. If not None, a column holding the molecule objects is added to the dataframe (rendered with rdkit.Chem.PandasTools). |
None |
include_private |
bool |
Include private properties in the columns. |
False |
include_computed |
bool |
Include computed properties in the columns. |
False |
render_df_mol |
bool |
whether to render the molecule in the dataframe to images. If called once, it will be applied for the newly created dataframe with mol in it. |
True |
render_all_df_mol |
bool |
Whether to render all pandas dataframe mol column as images. |
False |
Source code in datamol/convert.py
def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Convert a list of mols to a dataframe using each mol properties
    as a column.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. No SMILES column is added
            when None.
        mol_column: Name of the molecule column. If not None, a column holding
            the molecule objects is added.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to render the molecule in the dataframe to images.
            Only applies when `mol_column` is not None.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.

    Returns:
        A dataframe with the optional SMILES and mol columns first, followed
        by one column per molecule property.
    """

    frame = pd.DataFrame()

    # Optional leading columns: SMILES first, then the molecule objects.
    if smiles_column is not None:
        frame[smiles_column] = [dm.to_smiles(mol) for mol in mols]

    if mol_column is not None:
        frame[mol_column] = mols

    # One row of properties per molecule.
    props_frame = pd.DataFrame(
        [
            mol.GetPropsAsDict(
                includePrivate=include_private,
                includeComputed=include_computed,
            )
            for mol in mols
        ]
    )

    has_name_clash = smiles_column is not None and smiles_column in props_frame.columns
    if has_name_clash:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. THe returned dataframe will two columns with the same name."
        )

    # Put the property columns next to the SMILES / mol columns.
    frame = pd.concat([frame, props_frame], axis=1)

    if render_df_mol is True and mol_column is not None:
        # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(frame)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return frame
datamol.convert.to_inchi(mol)
¶
Convert a mol to Inchi.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule. |
required |
Source code in datamol/convert.py
def to_inchi(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        The InChI string, or `None` when `mol` is `None` or is a SMILES that
        cannot be converted to a molecule.
    """

    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

        # The SMILES could not be parsed: follow the "None in, None out"
        # convention instead of handing `None` to RDKit.
        if mol is None:
            return None

    return Chem.MolToInchi(mol)
datamol.convert.to_inchikey(mol)
¶
Convert a mol to Inchi key.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule |
required |
Source code in datamol/convert.py
def to_inchikey(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi key.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        The InChI key, or `None` when `mol` is `None` or is a SMILES that
        cannot be converted to a molecule.
    """

    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

        # The SMILES could not be parsed: follow the "None in, None out"
        # convention instead of handing `None` to RDKit.
        if mol is None:
            return None

    return Chem.MolToInchiKey(mol)
datamol.convert.to_selfies(mol)
¶
Convert a mol to SELFIES.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule or a SMILES. |
required |
Returns:
Type | Description |
---|---|
Optional[str] |
selfies: SELFIES string. |
Source code in datamol/convert.py
def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to SELFIES.

    Args:
        mol: a molecule or a SMILES. A molecule is first converted to SMILES.

    Returns:
        selfies: SELFIES string, or `None` when `mol` is `None` or when the
            encoder returns `-1`.
    """
    if mol is None:
        return None

    # The encoder works on SMILES strings.
    smiles = to_smiles(mol) if isinstance(mol, Chem.rdchem.Mol) else mol

    encoded = sf.encoder(smiles)  # type: ignore

    return None if encoded == -1 else encoded
datamol.convert.to_smarts(mol, keep_hs=True)
¶
Convert a molecule to a smarts.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule. |
required |
keep_hs |
bool |
Whether to keep hydrogen. This will increase the count of H atoms for atoms with attached hydrogens to create a valid smarts. e.g. `[H]-[CH2]-[*]` -> `[H]-[CH3]-[*]` |
True |
Returns:
Type | Description |
---|---|
Optional[str] |
smarts of the molecule |
Source code in datamol/convert.py
def to_smarts(mol: Union[str, Chem.rdchem.Mol], keep_hs: bool = True) -> Optional[str]:
    """Convert a molecule to a smarts.

    The input molecule is left untouched: the atom edits needed to build the
    smarts are applied to a copy.

    Args:
        mol: a molecule or a SMILES.
        keep_hs: Whether to keep hydrogen. This will increase the count of H atoms
            for atoms with attached hydrogens to create a valid smarts.
            e.g. [H]-[CH2]-[*] -> [H]-[CH3]-[*]

    Returns:
        smarts of the molecule, or `None` when `mol` is `None`, is a SMILES
        that cannot be converted to a molecule, or cannot be written out.
    """

    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        if mol is None:
            # The SMILES could not be parsed.
            return None
    else:
        # The loop below edits the atoms in place (isotope and explicit H
        # count): work on a copy so the caller's molecule is not modified.
        mol = dm.copy_mol(mol)

    # Change the isotope to 42
    for atom in mol.GetAtoms():  # type: ignore
        if keep_hs:
            s = sum(na.GetAtomicNum() == 1 for na in atom.GetNeighbors())
            if s:
                atom.SetNumExplicitHs(atom.GetTotalNumHs() + s)
        atom.SetIsotope(42)

    # Print out the smiles, all the atom attributes will be fully specified
    smarts = to_smiles(mol, isomeric=True, explicit_bonds=True)

    if smarts is None:
        return None

    # Remove the 42 isotope labels
    smarts = re.sub(r"\[42", "[", smarts)
    return smarts
datamol.convert.to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False)
¶
Convert a mol to a SMILES.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
canonical |
bool |
if false no attempt will be made to canonicalize the molecule. |
True |
isomeric |
bool |
whether to include information about stereochemistry in the SMILES. |
True |
ordered |
bool |
whether to force reordering of the atoms first. |
False |
explicit_bonds |
bool |
if true, all bond orders will be explicitly indicated in the output SMILES. |
False |
explicit_hs |
bool |
if true, all H counts will be explicitly indicated in the output SMILES. |
False |
randomize |
bool |
whether to randomize the generated smiles. Overrides `canonical`. |
False |
cxsmiles |
bool |
Whether to return a CXSMILES instead of a SMILES. |
False |
allow_to_fail |
bool |
Raise an error if the conversion to SMILES fails. Return None otherwise. |
False |
Source code in datamol/convert.py
def to_smiles(
    mol: Chem.rdchem.Mol,
    canonical: bool = True,
    isomeric: bool = True,
    ordered: bool = False,
    explicit_bonds: bool = False,
    explicit_hs: bool = False,
    randomize: bool = False,
    cxsmiles: bool = False,
    allow_to_fail: bool = False,
) -> Optional[str]:
    """Convert a mol to a SMILES.

    Args:
        mol: a molecule.
        canonical: if false no attempt will be made to canonicalize the molecule.
        isomeric: whether to include information about stereochemistry in the SMILES.
        ordered: whether to force reordering of the atoms first. Only applied
            when `canonical` is False.
        explicit_bonds: if true, all bond orders will be explicitly indicated in the output SMILES.
        explicit_hs: if true, all H counts will be explicitly indicated in the output SMILES.
        randomize: whether to randomize the generated smiles. Overrides `canonical`.
        cxsmiles: Whether to return a CXSMILES instead of a SMILES.
        allow_to_fail: Raise an error if the conversion to SMILES fails. Return None otherwise.

    Returns:
        The SMILES (or CXSMILES) string, or `None` when the conversion fails
        and `allow_to_fail` is False.
    """
    if ordered and canonical is False:
        mol = dm.reorder_atoms(mol)

    if randomize:
        # A randomized atom order only makes sense without canonicalization.
        mol = dm.randomize_atoms(mol)
        canonical = False

    try:
        # Both writers take the same keyword arguments; pick the one to use.
        writer = Chem.MolToCXSmiles if cxsmiles else Chem.MolToSmiles  # type: ignore
        return writer(
            mol,
            isomericSmiles=isomeric,
            canonical=canonical,
            allBondsExplicit=explicit_bonds,
            allHsExplicit=explicit_hs,
        )
    except Exception as e:
        if allow_to_fail:
            raise e
        return None
datamol.data
¶
datamol.data.freesolv()
¶
Source code in datamol/data.py
def freesolv():
    """Load the `data/freesolv.csv` file bundled in the `datamol` package.

    Returns:
        The content of the CSV file as read by `pd.read_csv`.
    """
    with pkg_resources.resource_stream("datamol", "data/freesolv.csv") as stream:
        return pd.read_csv(stream)
datamol.fp
¶
datamol.fp.fp_to_array(fp, dtype=<class 'int'>)
¶
Convert rdkit fingerprint to numpy array.
Note
This implementation has shown to be faster than using DataStructs.ConvertToNumpyArray
by a factor of ~4.
Source code in datamol/fp.py
def fp_to_array(fp: DataStructs.ExplicitBitVect, dtype: type = int) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    The fingerprint's bit string is read as raw bytes and the character code
    of `"0"` is subtracted, which yields one 0/1 entry per bit. A numpy array
    given as input is returned as is.

    Note:
        This implementation has shown to be faster than using `DataStructs.ConvertToNumpyArray`
        by a factor of ~4.

    Args:
        fp: a fingerprint exposing `ToBitString()`, or a numpy array.
        dtype: NOTE(review): accepted but not applied by the current
            implementation -- confirm whether it should be.
    """
    if isinstance(fp, np.ndarray):
        return fp

    bit_chars = np.frombuffer(fp.ToBitString().encode(), "u1")
    return bit_chars - ord("0")
datamol.fp.to_fp(mol, fp_size=2048, radius=3, use_features=True, as_array=True)
¶
Transform a molecule from smiles to morgan fingerprint.
Note
That function should be expanded to compute more type of fingerprints.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule or a SMILES. |
required |
fp_size |
int |
Size of morgan fingerprint. Default to 2048. |
2048 |
radius |
int |
Radius of the morgan fingerprints. Default to 3. |
3 |
use_features |
bool |
Whether to use atom features. Default to True. |
True |
as_array |
bool |
Whether to return a numpy array or an RDKit vec. Default to True. |
True |
Returns:
Type | Description |
---|---|
Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.ExplicitBitVect] |
A fingerprint vector or None |
Source code in datamol/fp.py
def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    fp_size: int = 2048,
    radius: int = 3,
    use_features: bool = True,
    as_array: bool = True,
) -> Optional[Union[np.ndarray, DataStructs.ExplicitBitVect]]:
    """Transform a molecule from smiles to morgan fingerprint.

    Note:
        That function should be expanded to compute more type of fingerprints.

    Args:
        mol: a molecule or a SMILES.
        fp_size: Size of morgan fingerprint. Default to 2048.
        radius: Radius of the morgan fingerprints. Default to 3.
        use_features: Whether to use atom features. Default to True.
        as_array: Whether to return a numpy array or an RDKit vec. Default to True.

    Returns:
        A fingerprint vector or None. `None` is returned when `mol` is `None`
        or is a SMILES that cannot be converted to a molecule.
    """

    if isinstance(mol, str):
        mol = dm.to_mol(mol)

    # Honour the documented "or None" contract instead of handing an invalid
    # molecule to RDKit.
    if mol is None:
        return None

    fp = rdMolDescriptors.GetMorganFingerprintAsBitVect(
        mol,
        radius,
        nBits=fp_size,
        useFeatures=use_features,
    )

    if as_array:
        return fp_to_array(fp)

    return fp
datamol.graph
¶
datamol.graph.get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False)
¶
Get all simple path between two atoms of a molecule
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule |
required |
atom_idx_1 |
int |
Atom index 1. |
required |
atom_idx_2 |
int |
Atom index 2. |
required |
ignore_cycle_basis |
bool |
Whether to ignore cycle basis. Defaults to False. |
False |
Returns:
Type | Description |
---|---|
list |
list of paths, each path being a list of atom indices going from `atom_idx_1` to `atom_idx_2`. |
Source code in datamol/graph.py
def get_all_path_between(
    mol: Chem.Mol,
    atom_idx_1: int,
    atom_idx_2: int,
    ignore_cycle_basis: bool = False,
):
    """Get all simple paths between two atoms of a molecule.

    Args:
        mol: a molecule
        atom_idx_1: Atom index 1.
        atom_idx_2: Atom index 2.
        ignore_cycle_basis: Whether to drop the paths that contain every atom
            of at least one ring of the molecule. Defaults to False.

    Returns:
        A list of paths, each path being a list of atom indices going from
        `atom_idx_1` to `atom_idx_2`.
    """
    nx = _get_networkx()

    # Graph whose nodes are the atom indices, built from the adjacency matrix.
    graph = nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol))
    paths = nx.all_simple_paths(graph, source=atom_idx_1, target=atom_idx_2)

    if ignore_cycle_basis:
        ring_atoms = [set(ring) for ring in mol.GetRingInfo().AtomRings()]
        # Keep only the paths that do not go through a whole ring.
        paths = [
            candidate
            for candidate in paths
            if not any(ring.issubset(candidate) for ring in ring_atoms)
        ]

    return list(paths)
datamol.graph.to_graph(mol)
¶
Convert a molecule to a network x graph. A list of properties are added to every nodes and edges.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
mol_graph (networkx.Graph) |
a graph representing the molecule. |
Source code in datamol/graph.py
def to_graph(mol: Chem.Mol):
    """Convert a molecule to a networkx graph.

    One node is created per atom (keyed by the atom index) and one edge per
    bond. Atom and bond properties are stored as node and edge attributes.

    Args:
        mol (Chem.Mol): a molecule.

    Returns:
        mol_graph (networkx.Graph): a graph representing the molecule.
    """
    nx = _get_networkx()
    graph = nx.Graph()

    for atom in mol.GetAtoms():
        node_attrs = dict(
            atomic_num=atom.GetAtomicNum(),
            formal_charge=atom.GetFormalCharge(),
            chiral_tag=atom.GetChiralTag(),
            hybridization=atom.GetHybridization(),
            num_explicit_hs=atom.GetNumExplicitHs(),
            implicit_valence=atom.GetImplicitValence(),
            degree=atom.GetDegree(),
            symbol=atom.GetSymbol(),
            ring_atom=atom.IsInRing(),
            is_aromatic=atom.GetIsAromatic(),
        )
        graph.add_node(atom.GetIdx(), **node_attrs)

    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        graph.add_edge(begin_idx, end_idx, bond_type=bond.GetBondType())

    return graph
datamol.io
¶
datamol.io.read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)
¶
Read a CSV file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
smiles_column |
str |
Use this column to build a mol column. |
None |
mol_column |
str |
Name to give to the mol column. If not None a mol column will be build. Avoid when loading a very large file. |
'mol' |
kwargs |
|
Arguments to pass to `pd.read_csv()`. |
{} |
Returns:
Type | Description |
---|---|
DataFrame |
df: a `pandas.DataFrame` |
Source code in datamol/io.py
def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read a CSV file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column holding the SMILES used to build the mol column.
            When None (default) no mol column is added.
        mol_column: Name given to the mol column built from `smiles_column`.
            Avoid building it when loading a very large file.
        kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """
    frame = pd.read_csv(urlpath, **kwargs)

    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame
datamol.io.read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)
¶
Read an excel file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
sheet_name |
Union[str, int, list] |
see `pandas.read_excel()` doc. |
0 |
smiles_column |
str |
Use this column to build a mol column. |
None |
mol_column |
str |
name to give to the mol column. |
'mol' |
kwargs |
|
Arguments to pass to `pd.read_excel()`. |
{} |
Returns:
Type | Description |
---|---|
DataFrame |
df: a `pandas.DataFrame` |
Source code in datamol/io.py
def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read an excel file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Column holding the SMILES used to build the mol column.
            When None (default) no mol column is added.
        mol_column: Name given to the mol column built from `smiles_column`.
            Avoid building it when loading a very large file.
        kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`
    """
    frame = pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs)  # type: ignore

    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame
datamol.io.read_sdf(urlpath, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False)
¶
Read an SDF file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
as_df |
bool |
Whether to return a list mol or a pandas DataFrame. |
False |
smiles_column |
Optional[str] |
Name of the SMILES column. Only relevant if `as_df` is True. |
'smiles' |
mol_column |
str |
Name of the mol column. Only relevant if `as_df` is True. |
None |
include_private |
bool |
Include private properties in the columns. Only relevant if `as_df` is True. |
False |
include_computed |
bool |
Include computed properties in the columns. Only relevant if `as_df` is True. |
False |
Source code in datamol/io.py
def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Read an SDF file.

    Records that RDKit fails to parse (yielded as None by the supplier) are
    silently dropped. Paths ending with `.gz` or `.gzip` are decompressed on
    the fly.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        as_df: Whether to return a list mol or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns. Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns. Only relevant if
            `as_df` is True.

    Returns:
        A list of molecules, or the dataframe built by `dm.to_df` when `as_df` is True.
    """

    # File-like object
    if isinstance(urlpath, io.IOBase):
        mols = [mol for mol in Chem.ForwardSDMolSupplier(urlpath) if mol is not None]

    # Regular local or remote paths
    else:
        with fsspec.open(urlpath) as f:
            if str(urlpath).endswith((".gz", ".gzip")):
                # Close the decompression wrapper once every record has been
                # read; previously the gzip handle was left open.
                with gzip.open(f) as gz:
                    mols = [mol for mol in Chem.ForwardSDMolSupplier(gz) if mol is not None]
            else:
                mols = [mol for mol in Chem.ForwardSDMolSupplier(f) if mol is not None]

    if as_df:
        return dm.to_df(
            mols,
            smiles_column=smiles_column,
            mol_column=mol_column,
            include_private=include_private,
            include_computed=include_computed,
        )  # type: ignore

    return mols
datamol.io.read_smi(urlpath)
¶
Read a list of SMILES from an `.smi` file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike] |
Path to a file or a file-like object. Path can be remote or local. Note: file-like object are not supported yet. |
required |
Source code in datamol/io.py
def read_smi(
    urlpath: Union[str, os.PathLike],
) -> Sequence[Chem.rdchem.Mol]:
    """Read a list of smiles from an `.smi` file.

    Lines that RDKit fails to parse (yielded as None by the supplier) are dropped.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
            Note: file-like object are not supported yet.

    Returns:
        The list of molecules that were successfully parsed.
    """

    # NOTE(hadim): the temporary local file copy
    # is because `SmilesMolSupplier` does not support
    # using file-like object, only path.

    if fsspec.utils.can_be_local(str(urlpath)):
        supplier = Chem.SmilesMolSupplier(str(urlpath), titleLine=0)
        return [mol for mol in supplier if mol is not None]

    # Remote path: copy to a local temporary file first. `mkstemp` hands back
    # an open OS-level descriptor that must be closed explicitly, otherwise
    # one descriptor leaks per call.
    fd, tmp_name = tempfile.mkstemp()
    os.close(fd)
    tmp_path = pathlib.Path(tmp_name)

    try:
        dm.utils.fs.copy_file(urlpath, tmp_path)
        supplier = Chem.SmilesMolSupplier(str(tmp_path), titleLine=0)
        return [mol for mol in supplier if mol is not None]
    finally:
        # Always delete the temporary copy, even when copying or parsing fails.
        tmp_path.unlink()
datamol.io.to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)
¶
Write molecules to a file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
Union[Sequence[rdkit.Chem.rdchem.Mol], pandas.core.frame.DataFrame] |
a dataframe or a list of molecule. |
required |
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
smiles_column |
Optional[str] |
Column name to extract the molecule. |
'smiles' |
mol_column |
str |
Column name to extract the molecule. It takes precedence over `smiles_column`. |
None |
Source code in datamol/io.py
def to_sdf(
    mols: Union[Sequence[Chem.rdchem.Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
):
    """Write molecules to an SDF file.

    None entries are skipped. A dataframe input is first converted to
    molecules with `dm.from_df`.

    Args:
        mols: a dataframe or a list of molecule.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name to extract the molecule.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`.
    """

    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)

    # Filter out None values
    valid_mols = [mol for mol in mols if mol is not None]

    def _dump(stream):
        # Write every molecule to `stream`, then close the RDKit writer.
        sd_writer = Chem.SDWriter(stream)
        for mol in valid_mols:
            sd_writer.write(mol)
        sd_writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _dump(urlpath)
        return

    # Regular local or remote paths
    with fsspec.open(urlpath, mode="w") as handle:
        _dump(handle)
datamol.io.to_smi(mols, urlpath, error_if_empty=False)
¶
Save a list of molecules in an .smi
file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
Sequence[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
error_if_empty |
bool |
whether to raise an error if the input list is empty. |
False |
Source code in datamol/io.py
def to_smi(
    mols: Sequence[Chem.rdchem.Mol],
    urlpath: Union[str, os.PathLike, TextIO],
    error_if_empty: bool = False,
):
    """Save a list of molecules in an `.smi` file.

    The file is written without a header line. None entries are skipped;
    note that the emptiness check runs before that filtering.

    Args:
        mols: a list of molecules.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        error_if_empty: whether to raise an error if the input list is empty.

    Raises:
        ValueError: if `mols` is empty and `error_if_empty` is True.
    """

    if error_if_empty and len(mols) == 0:
        raise ValueError("The list of mols/smiles provided is empty.")

    # Filter out None values
    valid_mols = [mol for mol in mols if mol is not None]

    def _dump(stream):
        # Write every molecule to `stream`, then close the RDKit writer.
        smi_writer = Chem.SmilesWriter(stream, includeHeader=False, nameHeader="")
        for mol in valid_mols:
            smi_writer.write(mol)
        smi_writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _dump(urlpath)
        return

    # Regular local or remote paths
    with fsspec.open(urlpath, "w") as handle:
        _dump(handle)
datamol.jobs
¶
datamol.utils.jobs.JobRunner
¶
is_sequential
property
readonly
¶
Check whether the job is sequential or parallel
__call__(self, *args, **kwargs)
special
¶
Run job using the n_jobs attribute to determine regime
Source code in datamol/utils/jobs.py
def __call__(self, *args, **kwargs):
    """
    Run the job, dispatching to `sequential` when `is_sequential` is true
    and to `parallel` otherwise. All arguments are forwarded unchanged.
    """
    run = self.sequential if self.is_sequential else self.parallel
    return run(*args, **kwargs)
__init__(self, n_jobs=-1, prefer=None, progress=False, **job_kwargs)
special
¶
JobRunner with sequential/parallel regimes. The multiprocessing backend use joblib which allows taking advantage of its features, while the progress bar use tqdm
Parameters:
Name | Type | Description | Default |
---|---|---|---|
n_jobs |
Optional[int] |
Number of process. Use 0 or None to force sequential. Use -1 to use all the available processors. For details see https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation |
-1 |
prefer |
str |
Choose from ['processes', 'threads'] or None. Default to None.
Soft hint to choose the default backend if no specific backend
was selected with the parallel_backend context manager. The
default process-based backend is 'loky' and the default
thread-based backend is 'threading'. Ignored if the ``backend`` parameter is specified. |
None |
progress |
bool |
whether to display progress bar |
False |
job_kwargs |
|
Any additional keyword argument supported by joblib.Parallel. |
{} |
Examples:
import datamol as dm
runner = dm.JobRunner(n_jobs=4, progress=True, prefer="threads")
results = runner(lambda x: x**2, [1, 2, 3, 4])
Source code in datamol/utils/jobs.py
def __init__(
    self,
    n_jobs: Optional[int] = -1,
    prefer: str = None,
    progress: bool = False,
    **job_kwargs,
):
    """
    JobRunner with sequential/parallel regimes. The multiprocessing backend use joblib which
    allows taking advantage of its features, while the progress bar use tqdm

    Args:
        n_jobs: Number of process. Use 0 or None to force sequential.
            Use -1 to use all the available processors. For details see
            https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation
        prefer: Choose from ['processes', 'threads'] or None. Default to None.
            Soft hint to choose the default backend if no specific backend
            was selected with the parallel_backend context manager. The
            default process-based backend is 'loky' and the default
            thread-based backend is 'threading'. Ignored if the ``backend``
            parameter is specified.
        progress: whether to display progress bar
        job_kwargs: Any additional keyword argument supported by joblib.Parallel.
            `n_jobs` and `prefer` are stored in it as well.

    Example:

    ```python
    import datamol as dm
    runner = dm.JobRunner(n_jobs=4, progress=True, prefer="threads")
    results = runner(lambda x: x**2, [1, 2, 3, 4])
    ```
    """
    self.n_jobs = n_jobs
    self.prefer = prefer
    # The flag is stored inverted so it can be handed over as `disable=`.
    self.no_progress = not progress

    job_kwargs.update(n_jobs=n_jobs, prefer=prefer)
    self.job_kwargs = job_kwargs
get_iterator_length(data)
staticmethod
¶
Attempt to get the length of an iterator
Source code in datamol/utils/jobs.py
@staticmethod
def get_iterator_length(data):
"""Attempt to get the length of an iterator"""
total_length = None
try:
total_length = len(data)
except TypeError:
# most likely a generator, ignore
pass
return total_length
parallel(self, callable_fn, data, arg_type=None, **fn_kwargs)
¶
Run job in parallel
Parameters:
Name | Type | Description | Default |
---|---|---|---|
callable_fn |
Callable |
function to call |
required |
data |
Iterable[Any] |
input data |
required |
arg_type |
Optional[str] |
function argument type ('arg'/None or 'args' or 'kwargs') |
None |
fn_kwargs |
dict |
optional keyword argument to pass to the callable function |
{} |
Source code in datamol/utils/jobs.py
def parallel(
    self,
    callable_fn: Callable,
    data: Iterable[Any],
    arg_type: Optional[str] = None,
    **fn_kwargs,
):
    r"""
    Run job in parallel

    The executor is built by `JobRunner._parallel_helper` from `self.job_kwargs`;
    every element of `data` is wrapped with `wrap_fn` and submitted through `delayed`.

    Args:
        callable_fn (callable): function to call
        data (iterable): input data
        arg_type (str, optional): function argument type ('arg'/None or 'args' or 'kwargs')
        fn_kwargs (dict, optional): optional keyword argument to pass to the callable function

    Returns:
        Whatever the executor returns for the submitted tasks.
    """
    executor_factory = JobRunner._parallel_helper(**self.job_kwargs)
    n_items = JobRunner.get_iterator_length(data)

    executor = executor_factory(total=n_items, disable=self.no_progress)
    tasks = (
        delayed(JobRunner.wrap_fn(callable_fn, arg_type, **fn_kwargs))(item) for item in data
    )
    return executor(tasks)
sequential(self, callable_fn, data, arg_type=None, **fn_kwargs)
¶
Run job in sequential version
Parameters:
Name | Type | Description | Default |
---|---|---|---|
callable_fn |
Callable |
function to call |
required |
data |
Iterable[Any] |
input data |
required |
arg_type |
Optional[str] |
function argument type ('arg'/None or 'args' or 'kwargs') |
None |
fn_kwargs |
dict |
optional keyword argument to pass to the callable function |
{} |
Source code in datamol/utils/jobs.py
def sequential(
    self,
    callable_fn: Callable,
    data: Iterable[Any],
    arg_type: Optional[str] = None,
    **fn_kwargs,
):
    r"""
    Run job in sequential version

    Elements of `data` are processed one after the other, in order, behind a
    tqdm progress bar that is disabled when `self.no_progress` is true.

    Args:
        callable_fn (callable): function to call
        data (iterable): input data
        arg_type (str, optional): function argument type ('arg'/None or 'args' or 'kwargs')
        fn_kwargs (dict, optional): optional keyword argument to pass to the callable function

    Returns:
        The list of results, one per element of `data`.
    """
    n_items = JobRunner.get_iterator_length(data)
    progress_iter = tqdm(data, total=n_items, disable=self.no_progress)

    outputs = []
    for item in progress_iter:
        # A fresh wrapper per item, so each call starts from the caller's fn_kwargs.
        wrapped = JobRunner.wrap_fn(callable_fn, arg_type, **fn_kwargs)
        outputs.append(wrapped(item))
    return outputs
wrap_fn(fn, arg_type=None, **fn_kwargs)
staticmethod
¶
Small wrapper around a callable to properly format it's argument
Source code in datamol/utils/jobs.py
@staticmethod
def wrap_fn(fn: Callable, arg_type: Optional[str] = None, **fn_kwargs):
"""Small wrapper around a callable to properly format it's argument"""
# EN probably use something like (moms.utils.commons.is_callable) ?
def _run(args: Any):
if arg_type == "kwargs":
fn_kwargs.update(**args)
return fn(**fn_kwargs)
elif arg_type == "args":
return fn(*args, **fn_kwargs)
return fn(args, **fn_kwargs)
return _run
datamol.utils.jobs.parallelized(fn, inputs_list, scheduler='processes', n_jobs=-1, progress=False, arg_type='arg')
¶
Run a function in parallel.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fn |
Callable |
The function to run in parallel. |
required |
inputs_list |
Iterable[Any] |
List of inputs to pass to `fn`. |
required |
scheduler |
str |
Choose between ["processes", "threads"]. Defaults to "processes". Passing None uses the default joblib "loky" scheduler. |
'processes' |
n_jobs |
Optional[int] |
Number of process. Use 0 or None to force sequential. Use -1 to use all the available processors. For details see https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation |
-1 |
progress |
bool |
Display a progress bar. Defaults to False. |
False |
arg_type |
str |
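One of ["arg", "args", "kwargs"]:
- "arg": the input is passed as an argument: `fn(arg)` (default).
- "args": the input is passed as a list: `fn(*args)`.
- "kwargs": the input is passed as a map: `fn(**kwargs)`. |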
'arg' |
Returns:
Type | Description |
---|---|
Optional[List[Any]] |
The results of the execution as a list. |
Source code in datamol/utils/jobs.py
def parallelized(
    fn: Callable,
    inputs_list: Iterable[Any],
    scheduler: str = "processes",
    n_jobs: Optional[int] = -1,
    progress: bool = False,
    arg_type: str = "arg",
) -> Optional[List[Any]]:
    """Run a function in parallel.

    Thin convenience wrapper that builds a `JobRunner` and calls it.

    Args:
        fn: The function to run in parallel.
        inputs_list: List of inputs to pass to `fn`.
        scheduler: Choose between ["processes", "threads"]. Defaults
            to "processes". It is forwarded as the `prefer` hint of `JobRunner`.
        n_jobs: Number of process. Use 0 or None to force sequential.
            Use -1 to use all the available processors. For details see
            https://joblib.readthedocs.io/en/latest/parallel.html#parallel-reference-documentation
        progress: Display a progress bar. Defaults to False.
        arg_type: One of ["arg", "args", "kwargs"]:
            - "arg": the input is passed as an argument: `fn(arg)` (default).
            - "args": the input is passed as a list: `fn(*args)`.
            - "kwargs": the input is passed as a map: `fn(**kwargs)`.

    Returns:
        The results of the execution as a list.
    """
    job_runner = JobRunner(n_jobs=n_jobs, progress=progress, prefer=scheduler)
    results = job_runner(fn, inputs_list, arg_type=arg_type)
    return results
datamol.log
¶
datamol.log.disable_rdkit_log()
¶
Disable all rdkit logs.
Source code in datamol/log.py
def disable_rdkit_log():
    """Disable all rdkit logs by turning off every level listed in `RDLogger._levels`."""
    for level_name in RDLogger._levels:
        rdBase.DisableLog(level_name)
datamol.log.enable_rdkit_log()
¶
Enable all rdkit logs.
Source code in datamol/log.py
def enable_rdkit_log():
    """Enable all rdkit logs by turning on every level listed in `RDLogger._levels`."""
    for level_name in RDLogger._levels:
        rdBase.EnableLog(level_name)
datamol.log.without_rdkit_log
¶
Context manager to disable RDKit logs. By default all logs are disabled.
datamol.mol
¶
datamol.mol.adjust_singleton(mol)
¶
Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and methane fragment will be removed in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Source code in datamol/mol.py
def adjust_singleton(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Remove all atoms that are essentially disconnected singleton nodes in the molecular graph.

    An atom is considered a singleton when its explicit valence is 0.
    For example, the chlorine atom and methane fragment will be removed in
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the singleton atoms.
    """
    editable = Chem.RWMol(mol)
    singleton_indices = [
        atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0
    ]

    # Remove from the highest index down so the remaining indices stay valid.
    for atom_idx in sorted(singleton_indices, reverse=True):
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()
datamol.mol.atom_indices_to_mol(mol, copy=False)
¶
Add the molAtomMapNumber
property to each atoms.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule |
required |
copy |
bool |
Whether to copy the molecule. |
False |
Source code in datamol/mol.py
def atom_indices_to_mol(mol: Chem.rdchem.Mol, copy: bool = False):
    """Add the `molAtomMapNumber` property to each atoms.

    The property value is the atom index, stored as a string.

    Args:
        mol: a molecule
        copy: Whether to copy the molecule. Only the literal `True` triggers a copy.

    Returns:
        The annotated molecule (the input itself unless a copy was requested).
    """
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        atom_idx = atom.GetIdx()
        atom.SetProp("molAtomMapNumber", str(atom_idx))

    return target
datamol.mol.copy_mol(mol)
¶
Copy a molecule and return a new one.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule to copy. |
required |
Source code in datamol/mol.py
def copy_mol(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
    """Return a deep copy of a molecule.

    Args:
        mol: a molecule to copy.

    Returns:
        The new, independent molecule produced by `copy.deepcopy`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate
datamol.mol.copy_mol_props(source, destination)
¶
Copy properties from one source molecule to another destination molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
source |
Mol |
a molecule to copy from. |
required |
destination |
Mol |
a molecule to copy to. |
required |
Source code in datamol/mol.py
def copy_mol_props(source: Chem.rdchem.Mol, destination: Chem.rdchem.Mol):
    """Copy the properties of a source molecule onto a destination molecule.

    The properties are read with `GetPropsAsDict()` and applied with
    `dm.set_mol_props`.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
    """
    dm.set_mol_props(destination, source.GetPropsAsDict())
datamol.mol.decrease_bond(bond)
¶
Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
bond |
Bond |
a bond. |
required |
Source code in datamol/mol.py
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Remove one single bond from the input bond. Note that you should
    first kekulize your molecules and remove non-standard bond.

    Args:
        bond: a bond.

    Returns:
        The bond type one order lower (triple -> double, double -> single),
        None for a single bond, or the input bond itself for any other bond type.
    """
    # (current type, type after removing one bond order)
    downgrades = (
        (TRIPLE_BOND, DOUBLE_BOND),
        (DOUBLE_BOND, SINGLE_BOND),
        (SINGLE_BOND, None),
    )

    bond_type = bond.GetBondType()
    for current, lowered in downgrades:
        if bond_type == current:
            return lowered
    return bond
datamol.mol.enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True)
¶
Enumerate the stereocenters and bonds of the current molecule.
Original source: the openff-toolkit
lib.
Warning: this function can be computationally intensive.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
|
The molecule whose state we should enumerate. |
required |
n_variants |
int |
The maximum amount of molecules that should be returned. |
20 |
undefined_only |
bool |
If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry. |
False |
rationalise |
bool |
If we should try to build and rationalise the molecule to ensure it can exist. |
True |
Source code in datamol/mol.py
def enumerate_stereoisomers(
    mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationally intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.

    Returns:
        The list of enumerated stereoisomers. The list is empty when RDKit
        fails to enumerate them.
    """
    from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
    from rdkit.Chem.EnumerateStereoisomers import StereoEnumerationOptions

    # safety first
    mol = copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=True)  # type: ignore
    Chem.FindPotentialStereoBonds(mol, cleanIt=True)  # type: ignore

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
    )

    try:
        isomers = tuple(EnumerateStereoisomers(mol, options=stereo_opts))
    except Exception:
        # Catch `Exception` rather than everything, so that KeyboardInterrupt
        # and SystemExit are no longer swallowed during a long enumeration.
        # NOTE(hadim): often got "Stereo atoms should be specified before specifying CIS/TRANS bond stereochemistry"
        # for the ligand of reference (coming from the PDB). Not sure how to handle that.
        isomers = []

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=True)  # type: ignore
        variants.append(isomer)

    return variants
datamol.mol.enumerate_tautomers(mol, n_variants=20)
¶
Enumerate the possible tautomers of the current molecule.
Original source: the openff-toolkit
lib.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
The molecule whose state we should enumerate. |
required |
n_variants |
int |
The maximum amount of molecules that should be returned. |
20 |
Source code in datamol/mol.py
def enumerate_tautomers(mol: Chem.rdchem.Mol, n_variants: int = 20):
    """Enumerate the possible tautomers of the current molecule.

    Original source: the `openff-toolkit` lib.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.

    Returns:
        The list of tautomers produced by RDKit's `TautomerEnumerator`.
    """
    # Enumerate on a copy so the caller's molecule is left untouched.
    working_mol = copy_mol(mol)

    tautomer_enumerator = rdMolStandardize.TautomerEnumerator()
    tautomer_enumerator.SetMaxTautomers(n_variants)

    return list(tautomer_enumerator.Enumerate(working_mol))
datamol.mol.fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)
¶
Fix error in molecule using a greedy approach.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
input molecule to fix |
required |
n_iter |
int |
Number of valence fix iteration to apply |
1 |
remove_singleton |
bool |
Whether `adjust_singleton` should be applied |
False |
largest_only |
bool |
Whether only the largest fragment should be kept |
False |
inplace |
bool |
Whether to return a copy of the mol or perform in place operation |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
Fixed molecule. |
Source code in datamol/mol.py
def fix_mol(
    mol: Chem.rdchem.Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Fix error in molecule using a greedy approach.

    The molecule is sanitized, its dummy atoms are removed, then `fix_valence`
    is applied `n_iter` times before the optional clean-up steps.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule.
    """
    if not inplace:
        mol = copy.copy(mol)

    # fail back to mol when the fixer fail
    fixed = sanitize_mol(mol) or mol
    if fixed is None:
        return fixed

    fixed = remove_dummies(fixed)

    for _ in range(n_iter):
        fixed = fix_valence(fixed)

    if remove_singleton:
        fixed = adjust_singleton(fixed)

    if largest_only:
        # m = max(Chem.rdmolops.GetMolFrags(m, asMols=True, sanitizeFrags=False), key=lambda m: m.GetNumAtoms())
        fixed = rdMolStandardize.FragmentParent(fixed, skipStandardize=True)

    return fixed
datamol.mol.fix_valence(mol, inplace=False, allow_ring_break=False)
¶
Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
|
input molecule with incorrect valence for some atoms |
required |
inplace |
bool |
Whether to modify in place or make a copy |
False |
allow_ring_break |
bool |
Whether bond removal involving ring is allowed. |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
Fixed potential valence issue in molecule or original molecule when nothing is broken or if failed. |
Source code in datamol/mol.py
def fix_valence(
    mol, inplace: bool = False, allow_ring_break: bool = False
) -> Optional[Chem.rdchem.Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    Two passes are applied: hydrogens (and formal charge) are first reduced on
    atoms flagged by `incorrect_valence`, then the bonds touching a flagged
    atom are lowered by one order or removed.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The input molecule (or its copy) when the RDKit validation reports no
        issue, the modified molecule otherwise, or None if an exception was
        raised while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        m = Chem.RemoveHs(
            mol,
            implicitOnly=False,
            updateExplicitCount=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # first pass using explicit false count
        # Each iteration drops one hydrogen and lowers the formal charge by
        # one (both clamped at 0) until the atom is no longer flagged or has
        # no hydrogen left.
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
                atom.UpdatePropertyCache(False)

        em = Chem.RWMol(m)
        bonds = em.GetBonds()
        # Second pass: only consider bonds with at least one flagged end atom.
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-check: an earlier edit in this loop may already have fixed these atoms.
            if incorrect_valence(a1) or incorrect_valence(a2):
                # `decrease_bond` gives None for a single bond, i.e. nothing to add back.
                mbond = decrease_bond(bond)
                # Edit the bond when ring breaking is allowed, when a
                # replacement bond type exists, or when the bond is not in a ring.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Best effort: any failure during the fix is reported as None.
        return None

    return m
datamol.mol.fix_valence_charge(mol, inplace=False)
¶
Fix valence issues that are due to incorrect charges.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
Input molecule with incorrect valence for some atoms |
required |
inplace |
bool |
Whether to modify in place or make a copy. |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
Fixed molecule via charge correction or original molecule if failed. |
Source code in datamol/mol.py
def fix_valence_charge(mol: Chem.rdchem.Mol, inplace: bool = False) -> Optional[Chem.rdchem.Mol]:
    """Fix valence issues that are due to incorrect charges.

    When the RDKit validation reports an issue, the formal charge of every
    atom is set to its implicit plus explicit valence minus the default
    valence of its element.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The input molecule untouched when the validation reports nothing,
        otherwise the molecule (or its copy) with the updated formal charges.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)

    for atom in mol.GetAtoms():
        current_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = dm.PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(current_valence - default_valence)

    return mol
datamol.mol.incorrect_valence(a, update=False)
¶
Check whether the valence of an atom, or of any atom of a molecule, is invalid.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
a |
Union[rdkit.Chem.rdchem.Mol, rdkit.Chem.rdchem.Atom] |
atom or molecule to check for valence issue. |
required |
update |
bool |
Update owning molecule property cache first. |
False |
Returns:
Type | Description |
---|---|
bool |
Whether the input atom (or molecule) valence is incorrect. |
Source code in datamol/mol.py
def incorrect_valence(a: Union[Chem.rdchem.Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Check whether the valence of an atom, or of a whole molecule, is invalid.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first. Only used when
            `a` is an atom.

    Returns:
        True when a valence issue is found. A molecule is flagged when the
        RDKit validation reports at least one issue; an atom is flagged when it
        has no implicit valence and its explicit valence exceeds the largest
        valence listed for its element.
    """
    if isinstance(a, Chem.rdchem.Mol):
        a.UpdatePropertyCache(False)
        validator = rdMolStandardize.RDKitValidation()
        issues = validator.validate(a)
        return len(issues) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    if a.GetImplicitValence() != 0:
        return False

    return a.GetExplicitValence() > max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))
datamol.mol.is_transition_metal(at)
¶
Check if atom is a transition metal.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
at |
Atom |
an atom. |
required |
Source code in datamol/mol.py
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Check if atom is a transition metal.

    The atom is reported as a transition metal when its atomic number falls in
    one of the inclusive ranges 22-29, 40-47 or 72-79.

    Args:
        at: an atom.
    """
    atomic_num = at.GetAtomicNum()
    metal_ranges = ((22, 29), (40, 47), (72, 79))
    return any(low <= atomic_num <= high for low, high in metal_ranges)
datamol.mol.keep_largest_fragment(mol)
¶
Only keep largest fragment of each molecule.
Source code in datamol/mol.py
def keep_largest_fragment(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Return the fragment of `mol` that has the most atoms.

    Args:
        mol: a molecule, possibly made of several disconnected fragments.

    Returns:
        The fragment with the highest atom count, or `mol` itself when no
        fragment is returned by RDKit.
    """
    fragments = rdmolops.GetMolFrags(mol, asMols=True)
    if not fragments:
        return mol
    return max(fragments, key=lambda fragment: fragment.GetNumAtoms())
datamol.mol.randomize_atoms(mol)
¶
Randomize the order (indices) of the atoms in a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def randomize_atoms(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Shuffle the atom ordering of a mol.

    Args:
        mol: a molecule.

    Returns:
        mol: the input itself when it has no atom, otherwise the molecule
            returned by `Chem.RenumberAtoms` for a random atom order.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    shuffled_order = list(range(n_atoms))
    random.shuffle(shuffled_order)
    return Chem.RenumberAtoms(mol, shuffled_order)
datamol.mol.remove_dummies(mol, dummy='*')
¶
Remove dummy atoms from molecules.
Source code in datamol/mol.py
def remove_dummies(mol: Chem.rdchem.Mol, dummy: str = "*") -> Optional[Chem.rdchem.Mol]:
    """Remove dummy atoms from molecules.

    The dummies are first swapped for hydrogens which are then stripped with
    `Chem.RemoveHs`. If that raises, the dummies are deleted directly instead.

    Args:
        mol: a molecule.
        dummy: representation of the dummy atom, parsed with `dm.to_mol`.

    Returns:
        The molecule without its dummy atoms.
    """
    dummy_mol = dm.to_mol(dummy)
    try:
        with_hydrogens = Chem.ReplaceSubstructs(mol, dummy_mol, dm.to_mol("[H]"), True)[0]
        return Chem.RemoveHs(with_hydrogens)
    except Exception:
        # Fallback: drop the dummy atoms without going through hydrogens.
        return Chem.DeleteSubstructs(mol, dummy_mol)
datamol.mol.reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)
¶
Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
break_ties |
bool |
Force breaking of ranked ties. |
True |
include_chirality |
bool |
Use chiral information when computing rank. |
True |
include_isotopes |
bool |
Use isotope information when computing rank. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def reorder_atoms(
    mol: Chem.rdchem.Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a mol following their canonical ranks, so that the
    same molecule gets a single atom order whatever its original representation.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the input itself when it has no atom, otherwise the renumbered
            molecule.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        Chem.CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        )
    )

    # Atom indices sorted by rank; the sort is stable so atoms sharing a rank
    # keep their original relative order.
    atom_order = sorted(range(len(ranks)), key=lambda atom_idx: ranks[atom_idx])
    return Chem.RenumberAtoms(mol, atom_order)
datamol.mol.replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)
¶
Replace dummy atoms in a molecule with another atom.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
molecule with dummies |
required |
atom |
str |
replacement atom, default is carbon |
'C' |
dummy |
str |
dummy atom representation |
'*' |
replace_all |
bool |
Whether to replace all dummies |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: Molecule with dummy replaced |
Source code in datamol/mol.py
def replace_dummies_atoms(
    mol: Chem.rdchem.Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Replace dummy atoms in a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom (as SMILES), default is carbon
        dummy: dummy atom representation (as SMILES)
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced (first result returned by
            `Chem.ReplaceSubstructs`).
    """
    dummy_query = Chem.MolFromSmiles(dummy)
    substitute = Chem.MolFromSmiles(atom)
    candidates = Chem.ReplaceSubstructs(mol, dummy_query, substitute, replaceAll=replace_all)
    return candidates[0]
datamol.mol.sanitize_first(mols, charge_neutral=False, sanifix=True)
¶
Sanitize a list of molecules and return the first valid molecule seen in the list.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
charge_neutral |
bool |
whether charge neutralization should be applied. |
False |
sanifix |
bool |
whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens. |
True |
Returns:
Type | Description |
---|---|
mol |
a molecule. |
Source code in datamol/mol.py
def sanitize_first(mols: List[Chem.rdchem.Mol], charge_neutral: bool = False, sanifix: bool = True):
    """Sanitize the molecules one after the other and return the first one
    for which sanitization succeeds.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        mol: the first successfully sanitized molecule, or None when there is
            none.
    """
    # Lazy pipeline: molecules after the first valid one are never sanitized.
    sanitized = (
        sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        for candidate in mols
    )
    return next((result for result in sanitized if result), None)
datamol.mol.sanitize_mol(mol, charge_neutral=False, sanifix=True)
¶
Sanitize molecule and fix common errors.
Warning
The procedure includes a SMILES conversion to avoid occasional aromaticity errors. As a consequence, all the properties and the conformers will be lost.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
charge_neutral |
bool |
whether charge neutralization should be applied. |
False |
sanifix |
bool |
whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def sanitize_mol(
    mol: Chem.rdchem.Mol, charge_neutral: bool = False, sanifix: bool = True
) -> Optional[Chem.rdchem.Mol]:
    """Sanitize molecule and fix common errors.

    Warning:
        The procedure includes a SMILES conversion to avoid occasional aromaticity
        errors. As a consequence, all the properties and the conformers will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        mol: the sanitized molecule, or None when the input is None or the
            sanitization fails.
    """
    if mol is None:
        return mol

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    # Nothing left to sanitize (e.g. sanifix did not return a molecule).
    if not mol:
        return mol

    try:
        # Round trip through SMILES to avoid occasional aromaticity errors.
        smiles = dm.to_smiles(mol)
        return to_mol(smiles, sanitize=True)  # type: ignore
    except Exception:
        return None
datamol.mol.sanitize_smiles(smiles, isomeric=True)
¶
Takes SMILES string and returns its sanitized version.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
str |
smiles to be sanitized. |
required |
isomeric |
bool |
Whether to include information about stereochemistry in the SMILES. |
True |
Returns:
Type | Description |
---|---|
Optional[str] |
sanitized smiles. |
Source code in datamol/mol.py
def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Takes SMILES string and returns its sanitized version.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        The sanitized smiles, or None when the input cannot be parsed,
        sanitized or converted back to SMILES.
    """
    try:
        # Parse without RDKit sanitization; `sanitize_mol` does the cleaning.
        mol = dm.to_mol(smiles, sanitize=False)
        mol = dm.sanitize_mol(mol, charge_neutral=False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        return dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # `Exception` rather than a bare `except`, so that KeyboardInterrupt
        # and SystemExit are not silently swallowed.
        return None
datamol.mol.set_dative_bonds(mol, from_atoms=(7, 8))
¶
Replaces some single bonds between metals and atoms listed in `from_atoms` with dative bonds. The replacement is only done if the atom has "too many" bonds.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
molecule with bond to modify |
required |
from_atoms |
Tuple[int, int] |
List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered. |
(7, 8) |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
The modified molecule. |
Source code in datamol/mol.py
def set_dative_bonds(
    mol: Chem.rdchem.Mol, from_atoms: Tuple[int, int] = (7, 8)
) -> Optional[Chem.rdchem.Mol]:
    """Turn some single bonds between a transition metal and one of its
    neighbors into dative bonds. Only neighbors listed in `from_atoms` that
    have "too many" bonds are considered.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule (an editable copy; the input is left untouched).
    """
    rwmol = Chem.RWMol(mol)
    rwmol.UpdatePropertyCache(strict=False)

    transition_metals = [atom for atom in rwmol.GetAtoms() if is_transition_metal(atom)]
    for metal in transition_metals:
        for neighbor in metal.GetNeighbors():
            # The neighbor must be listed either by atomic number or by symbol.
            listed = neighbor.GetAtomicNum() in from_atoms or neighbor.GetSymbol() in from_atoms
            if not listed:
                continue

            # "Too many" bonds: explicit valence above the element default.
            default_valence = PERIODIC_TABLE.GetDefaultValence(neighbor.GetAtomicNum())
            if neighbor.GetExplicitValence() <= default_valence:
                continue

            bond = rwmol.GetBondBetweenAtoms(neighbor.GetIdx(), metal.GetIdx())
            if bond.GetBondType() != SINGLE_BOND:
                continue

            # Swap the single bond for a dative bond going neighbor -> metal.
            rwmol.RemoveBond(neighbor.GetIdx(), metal.GetIdx())
            rwmol.AddBond(neighbor.GetIdx(), metal.GetIdx(), DATIVE_BOND)

    return rwmol
datamol.mol.set_mol_props(mol, props, copy=False)
¶
Set properties to a mol from a dict.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
the mol where to copy the props. |
required |
props |
Dict[str, Any] |
the props to copy. |
required |
copy |
bool |
whether to copy the provided mol |
False |
Source code in datamol/mol.py
def set_mol_props(
    mol: Chem.rdchem.Mol, props: Dict[str, Any], copy: bool = False
) -> Chem.rdchem.Mol:
    """Store the content of a dict as properties of a mol.

    Booleans, integers and floats are stored with their typed setter; any
    other value is stored as its `str()` representation.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to copy the provided mol

    Returns:
        The molecule carrying the properties (a copy when `copy` is True,
        otherwise the input molecule, modified in place).
    """
    if copy is True:
        mol = dm.copy_mol(mol)

    for name, value in props.items():
        # `bool` must be tested before `int`: a bool is also an int.
        if isinstance(value, bool):
            mol.SetBoolProp(name, value)
            continue
        if isinstance(value, int):
            mol.SetIntProp(name, value)
            continue
        if isinstance(value, float):
            mol.SetDoubleProp(name, value)
            continue
        mol.SetProp(name, str(value))

    return mol
datamol.mol.standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)
¶
This function returns a standardized version of the given molecule, with or without disconnecting the metals. The steps are applied in the order of the arguments.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
The molecule to standardize. |
required |
disconnect_metals |
bool |
Whether to disconnect the metallic atoms from non-metals |
False |
normalize |
bool |
Whether to apply normalization (correct functional groups and recombine charges). |
True |
reionize |
bool |
Whether to apply molecule reionization |
True |
uncharge |
bool |
Whether to remove all charge from molecule |
False |
stereo |
bool |
Whether to attempt to assign stereochemistry |
True |
Returns:
Type | Description |
---|---|
mol |
The standardized molecule. |
Source code in datamol/mol.py
def standardize_mol(
    mol: Chem.rdchem.Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
):
    r"""
    Return a standardized version of the given molecule. The enabled steps are
    applied in the order of the arguments, on a copy of the input.

    Arguments:
        mol: The molecule to standardize.
        disconnect_metals: Whether to disconnect the metallic atoms from non-metals
        normalize: Whether to apply normalization (correct functional groups and recombine charges).
        reionize: Whether to apply molecule reionization
        uncharge: Whether to remove all charge from molecule
        stereo: Whether to attempt to assign stereochemistry

    Returns:
        mol: The standardized molecule.
    """
    standardized = copy.copy(mol)

    if disconnect_metals:
        standardized = rdMolStandardize.MetalDisconnector().Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        # Works in place on the molecule.
        Chem.AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized
datamol.mol.standardize_smiles(smiles, tautomer=False)
¶
Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
str |
Smiles to standardize |
required |
tautomer |
bool |
Whether to canonicalize tautomers |
False |
Returns:
Type | Description |
---|---|
standard_smiles |
the standardized smiles |
Source code in datamol/mol.py
def standardize_smiles(smiles: str, tautomer: bool = False):
    r"""
    Apply the SMILES standardization procedure. This is a convenience function wrapped
    around the RDKit SMILES standardizer and tautomeric canonicalization.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """
    standard_smiles = rdMolStandardize.StandardizeSmiles(smiles)
    if not tautomer:
        return standard_smiles
    return canonicalize_tautomer_smiles(standard_smiles)
datamol.mol.to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)
¶
Convert an input molecule (smiles representation) into a Chem.rdchem.Mol
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
str |
SMILES of a molecule or a molecule. |
required |
add_hs |
bool |
Whether hydrogens should be added the molecule. |
False |
explicit_only |
bool |
Whether to only add explicit hydrogen or both (implicit and explicit) when `add_hs` is set to True. |
False |
ordered |
bool |
Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation. |
False |
kekulize |
bool |
Whether to perform kekulization of the input molecules. |
False |
sanitize |
bool |
Whether to apply rdkit sanitization when input is a SMILES. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: the molecule if some conversion have been made. If the conversion fails None is returned so make sure that you handle this case on your own. |
Source code in datamol/mol.py
def to_mol(
    mol: str,
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Convert an input molecule (smiles representation) into a `Chem.rdchem.Mol`.

    Args:
        mol: SMILES of a molecule or a molecule. A molecule is used as is,
            without being copied.
        add_hs: Whether hydrogens should be added the molecule.
        explicit_only: Whether to only add explicit hydrogen or both
            (implicit and explicit) when `add_hs` is set to True.
        ordered: Whether the atom should be ordered. This option is
            important if you want to ensure that the features returned will always maintain
            a single atom order for the same molecule, regardless of its original SMILES representation.
        kekulize: Whether to perform kekulization of the input molecules.
        sanitize: Whether to apply rdkit sanitization when input is a SMILES.

    Returns:
        mol: the molecule if some conversion have been made. If the conversion fails
            None is returned so make sure that you handle this case on your own.

    Raises:
        ValueError: when the input is neither a string nor a `Chem.rdchem.Mol`.
    """
    if isinstance(mol, str):
        parsed = Chem.MolFromSmiles(mol, sanitize=sanitize)
        # Without sanitization, still compute the property cache (non-strict).
        if parsed is not None and not sanitize:
            parsed.UpdatePropertyCache(False)
    elif isinstance(mol, Chem.rdchem.Mol):
        parsed = mol
    else:
        raise ValueError(f"Input should be a Chem.rdchem.Mol or a string instead of '{type(mol)}'")

    # Failed conversion: none of the optional steps apply.
    if parsed is None:
        return None

    if add_hs:
        parsed = Chem.AddHs(parsed, explicitOnly=explicit_only)

    if ordered:
        parsed = reorder_atoms(parsed)

    if kekulize:
        # In place; the aromatic flags are kept.
        Chem.Kekulize(parsed, clearAromaticFlags=False)

    return parsed
datamol.mol.to_neutral(mol)
¶
Neutralize the charge of a molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Neutralize the charge of a molecule, in place.

    The formal charge of an atom is reset to zero when it is negative, or when
    it is positive and the explicit valence of the atom is at least the default
    valence of its element.

    Args:
        mol: a molecule.

    Returns:
        mol: the same molecule object (None when the input is None).
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()
        if charge >= 0:
            # Non-negative charge: only a positive charge on an atom whose
            # explicit valence reaches the element default gets reset.
            valence_reached = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (valence_reached and charge > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol
datamol.similarity
¶
datamol.similarity.cdist(mols1, mols2, n_jobs=1, **fp_args)
¶
Compute the pairwise tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols1 |
List[rdkit.Chem.rdchem.Mol] |
list of molecules. |
required |
mols2 |
List[rdkit.Chem.rdchem.Mol] |
list of molecules. |
required |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
**fp_args |
|
list of args to pass to `to_fp()`. |
{} |
Returns:
Type | Description |
---|---|
ndarray |
distmat |
Source code in datamol/similarity.py
def cdist(
    mols1: List[Chem.rdchem.Mol],
    mols2: List[Chem.rdchem.Mol],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Compute the tanimoto distance between the fingerprints of every molecule
    of the first collection and every molecule of the second collection.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat: the matrix returned by scipy `cdist` with the "jaccard"
            metric, rows following `mols1` and columns following `mols2`.
    """
    # Same fingerprint function for both collections.
    featurize = functools.partial(dm.to_fp, as_array=True, **fp_args)

    first_fps = dm.parallelized(featurize, mols1, n_jobs=n_jobs)
    second_fps = dm.parallelized(featurize, mols2, n_jobs=n_jobs)

    return distance.cdist(np.array(first_fps), np.array(second_fps), metric="jaccard")
datamol.similarity.pdist(mols, n_jobs=1, **fp_args)
¶
Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
list of molecules |
required |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
**fp_args |
|
list of args to pass to `to_fp()`. |
{} |
Returns:
Type | Description |
---|---|
Tuple[numpy.ndarray, numpy.ndarray] |
distmat, valid_idx: Distance matrix, and valid index that have passed the conversion to fingerprint. |
Source code in datamol/similarity.py
def pdist(
    mols: List[Chem.rdchem.Mol], n_jobs: Optional[int] = 1, **fp_args
) -> Tuple[np.ndarray, np.ndarray]:
    """Compute the pairwise tanimoto distance between the fingerprints of all the
    molecules in the input set.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat, valid_idx: Symmetric distance matrix, and valid index that have
            passed the conversion to fingerprint. Row/column `k` of the matrix
            corresponds to `mols[valid_idx[k]]`. Both are empty when no
            molecule could be converted.
    """
    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=False, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    # Keep only the molecules whose fingerprint could be computed.
    valid = [(i, fp) for i, fp in enumerate(fps) if fp is not None]
    if not valid:
        # Empty input or no valid fingerprint: nothing to compare.
        return np.zeros((0, 0)), np.array([], dtype=int)

    valid_idx = [i for i, _ in valid]
    valid_fps = [fp for _, fp in valid]

    # Flat lower triangle (diagonal excluded), stored row by row:
    # (1, 0), (2, 0), (2, 1), (3, 0), ...
    dist = GetTanimotoDistMat(valid_fps)

    n_valid = len(valid_fps)
    dist_mat = np.zeros((n_valid, n_valid))

    # `np.tril_indices_from` walks the lower triangle in that same row-major
    # order. The upper-triangle indices enumerate pairs in a different order
    # and would misplace distances as soon as there are 4 molecules or more.
    dist_mat[np.tril_indices_from(dist_mat, -1)] = dist

    # Mirror the lower triangle to get the full symmetric matrix.
    dist_mat += dist_mat.T

    return dist_mat, np.array(valid_idx)