The Basics

In [11]:

import matplotlib.pyplot as plt

import datamol as dm

Common functions

In [2]:

# Parse a SMILES string (aspirin) into an RDKit molecule, sanitizing it on load.
aspirin_smiles = "O=C(C)Oc1ccccc1C(=O)O"
mol = dm.to_mol(aspirin_smiles, sanitize=True)
mol

Out[2]:

In [3]:

# Compute the Morgan fingerprint of the molecule.
# Evaluating `fp` on the last line displays the resulting array.
fp = dm.to_fp(mol)
fp

Out[3]:

array([1, 1, 1, ..., 0, 0, 0])

In [4]:

# Encode the molecule as a SELFIES string.
selfies = dm.to_selfies(mol)
selfies

Out[4]:

'[C][C][Branch1_2][C][=O][O][C][=C][C][=C][C][=C][Ring1][Branch1_2][C][Branch1_2][C][=O][O]'

In [5]:

# Convert the molecule to its InChI identifier string.
inchi = dm.to_inchi(mol)
inchi

Out[5]:

'InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)'

Dataframe and list of molecules

In [6]:

# Fetch the dataset bundled with datamol (for demonstration purposes only).
data = dm.data.freesolv()

# Parse every SMILES string into a molecule and store the result in a new `mol` column.
data["mol"] = data["smiles"].apply(dm.to_mol)

# Preview the first rows.
data.head()

Out[6]:

	iupac	smiles	expt	calc	mol
0	4-methoxy-N,N-dimethyl-benzamide	CN(C)C(=O)c1ccc(cc1)OC	-11.01	-9.625	<rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0ac0>
1	methanesulfonyl chloride	CS(=O)(=O)Cl	-4.87	-6.219	<rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0a60>
2	3-methylbut-1-ene	CC(C)C=C	1.83	2.452	<rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0a00>
3	2-ethylpyrazine	CCc1cnccn1	-5.45	-5.809	<rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f09a0>
4	heptan-1-ol	CCCCCCCO	-4.21	-2.917	<rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0940>

In [7]:

# Convert a dataframe to a list of molecules.
# The dataset is reloaded first, so this cell does not depend on the `mol`
# column added earlier; `smiles_column` names the column holding the structures.
data = dm.data.freesolv()
mols = dm.from_df(data, smiles_column="smiles")
mols[:5]  # preview the first five molecules

Out[7]:

[<rdkit.Chem.rdchem.Mol at 0x7fc7bd677100>,
 <rdkit.Chem.rdchem.Mol at 0x7fc7bd6770a0>,
 <rdkit.Chem.rdchem.Mol at 0x7fc7bd677160>,
 <rdkit.Chem.rdchem.Mol at 0x7fc7bd6771c0>,
 <rdkit.Chem.rdchem.Mol at 0x7fc7bd677220>]

In [8]:

# Convert a list of molecules back to a dataframe.
# NOTE(review): the displayed frame carries `iupac`, `expt` and `calc` next to
# `smiles`, so `from_df` presumably attached those columns to each molecule — confirm.
df = dm.to_df(mols)
df.head()

Out[8]:

	smiles	iupac	expt	calc
0	COc1ccc(C(=O)N(C)C)cc1	4-methoxy-N,N-dimethyl-benzamide	-11.01	-9.625
1	CS(=O)(=O)Cl	methanesulfonyl chloride	-4.87	-6.219
2	C=CC(C)C	3-methylbut-1-ene	1.83	2.452
3	CCc1cnccn1	2-ethylpyrazine	-5.45	-5.809
4	CCCCCCCO	heptan-1-ol	-4.21	-2.917

Visualization

In [9]:

# Keep only the first eight molecules of the demo dataset.
data = dm.data.freesolv()
mols = dm.from_df(data)[:8]

# Render the molecules as a grid image, four per row, each captioned with its SMILES.
legends = list(map(dm.to_smiles, mols))
dm.viz.to_image(
    mols,
    legends=legends,
    n_cols=4,
    mol_size=(200, 200),
)

Out[9]:

Conformers

In [10]:

# SMILES of the molecule whose conformers we want.
smiles = "O=C(C)Oc1ccccc1C(=O)O"

# Build the molecule and generate its conformers in a single expression.
# With `n_confs=None`, an appropriate number of conformers is picked
# according to the size of the molecule.
mol = dm.conformers.generate(
    dm.to_mol(smiles),
    n_confs=None,
    rms_cutoff=None,
    minimize_energy=False,
)
mol.GetNumConformers()

Out[10]:

In [13]:

# Compute the solvent-accessible surface area (SASA) from the molecule's conformers.
# Not supported on Windows.
sasa = dm.conformers.sasa(mol)
sasa[:10]  # first ten values only

[366.91634791 365.57081308 364.79047526 359.93368508 366.4153889
 360.38477247 365.91939996 366.89598476 362.14762374 366.5461946 ]

In [13]:

# Compute the RMSD between the molecule's conformers.
rmsd = dm.conformers.rmsd(mol)
rmsd[:4, :4]  # top-left 4x4 corner of the matrix

Out[13]:

array([[5.20271993e-08, 1.17444351e+00, 1.02526309e+00, 1.24307499e+00],
       [1.17444351e+00, 5.20271993e-08, 5.88242112e-01, 5.58334770e-01],
       [1.02526309e+00, 5.88242112e-01, 7.35775709e-08, 7.84115932e-01],
       [1.24307499e+00, 5.58334770e-01, 7.84115932e-01, 0.00000000e+00]])

In [15]:

# Visualize the conformers in 3D.
# NOTE(review): `n_confs=3` presumably limits the view to three conformers — confirm against the datamol docs.
dm.viz.conformers(mol, n_confs=3, width="auto")

More advanced computation

In [15]:

# Compute the pairwise Tanimoto distance matrix using Morgan fingerprints.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)

mols = data["mol"].to_list()
dist_mat = dm.pdist(mols, n_jobs=None)

# Draw on an explicit Axes and label everything so the heatmap can be read on its own.
fig, ax = plt.subplots()
im = ax.imshow(dist_mat)
ax.set(
    title="Pairwise Tanimoto distance",
    xlabel="Molecule index",
    ylabel="Molecule index",
)
# The colorbar call stays last, so the cell still evaluates to the Colorbar object.
fig.colorbar(im, ax=ax, label="Tanimoto distance")

Out[15]:

<matplotlib.colorbar.Colorbar at 0x7f24bcac3650>

Parallelize anything

In [16]:

# Easy parallelization
def compute_something(mol):
    """Toy workload used to demonstrate parallel execution.

    Args:
        mol: Molecule-like object exposing ``GetNumAtoms()``.

    Returns:
        A ``(mol, value)`` tuple, where ``value`` is the atom count raised
        to the power 0.5.
    """
    atom_count = mol.GetNumAtoms()
    return mol, atom_count ** 0.5
    
# Rebuild the demo dataframe with a parsed `mol` column.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)

# Run `compute_something` over every molecule through datamol's parallel helper.
results = dm.parallelized(
    compute_something,
    data["mol"].tolist(),
    n_jobs=None,
)
results[:5]

Out[16]:

[(<rdkit.Chem.rdchem.Mol at 0x7f24c0430580>, 3.605551275463989),
 (<rdkit.Chem.rdchem.Mol at 0x7f24c0430030>, 2.23606797749979),
 (<rdkit.Chem.rdchem.Mol at 0x7f24c052bda0>, 2.23606797749979),
 (<rdkit.Chem.rdchem.Mol at 0x7f24c052bd00>, 2.8284271247461903),
 (<rdkit.Chem.rdchem.Mol at 0x7f24c052b350>, 2.8284271247461903)]

In [ ]: