The Basics

In [1]:

            
                Copied!
                
import matplotlib.pyplot as plt

import datamol as dm
import matplotlib.pyplot as plt

import datamol as dm

Common functions

In [2]:

            
                Copied!
                
# Build an RDKit molecule from a SMILES string, sanitizing it on creation
mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True)
mol

Out[2]:

In [3]:

            
                Copied!
                
# Compute the Morgan fingerprint of the molecule
fp = dm.to_fp(mol)
fp

Out[3]:

array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)

In [4]:

            
                Copied!
                
# Encode the molecule as a SELFIES string
selfies = dm.to_selfies(mol)
selfies

Out[4]:

'[C][C][=Branch1][C][=O][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=O][O]'

In [5]:

            
                Copied!
                
# Get the InChI identifier of the molecule
inchi = dm.to_inchi(mol)
inchi

Out[5]:

'InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)'

Dataframe and list of molecules

In [6]:

            
                Copied!
                
# Load a built-in dataset (only for demonstration purposes)
data = dm.data.freesolv()

# Make a molecule column from the SMILES column. `dm.to_mol` is passed
# directly; wrapping it in a lambda adds nothing.
data["mol"] = data["smiles"].apply(dm.to_mol)

data.head()

Out[6]:

	iupac	smiles	expt	calc	mol
0	4-methoxy-N,N-dimethyl-benzamide	CN(C)C(=O)c1ccc(cc1)OC	-11.01	-9.625	<img data-content="rdkit/molecule" src="data:i...
1	methanesulfonyl chloride	CS(=O)(=O)Cl	-4.87	-6.219	<img data-content="rdkit/molecule" src="data:i...
2	3-methylbut-1-ene	CC(C)C=C	1.83	2.452	<img data-content="rdkit/molecule" src="data:i...
3	2-ethylpyrazine	CCc1cnccn1	-5.45	-5.809	<img data-content="rdkit/molecule" src="data:i...
4	heptan-1-ol	CCCCCCCO	-4.21	-2.917	<img data-content="rdkit/molecule" src="data:i...

In [7]:

            
                Copied!
                
# Convert a dataframe to a list of mols, reading the SMILES from the "smiles" column
data = dm.data.freesolv()
mols = dm.from_df(data, smiles_column="smiles")
mols[:5]

Out[7]:

[<rdkit.Chem.rdchem.Mol at 0x7f1f54b41720>,
 <rdkit.Chem.rdchem.Mol at 0x7f1f54b426e0>,
 <rdkit.Chem.rdchem.Mol at 0x7f1f54b408e0>,
 <rdkit.Chem.rdchem.Mol at 0x7f1f54b427a0>,
 <rdkit.Chem.rdchem.Mol at 0x7f1f54b42c80>]

In [8]:

            
                Copied!
                
# Convert a list of mols back to a dataframe
df = dm.to_df(mols)
df.head()

Out[8]:

	smiles	iupac	expt	calc
0	COc1ccc(C(=O)N(C)C)cc1	4-methoxy-N,N-dimethyl-benzamide	-11.01	-9.625
1	CS(=O)(=O)Cl	methanesulfonyl chloride	-4.87	-6.219
2	C=CC(C)C	3-methylbut-1-ene	1.83	2.452
3	CCc1cnccn1	2-ethylpyrazine	-5.45	-5.809
4	CCCCCCCO	heptan-1-ol	-4.21	-2.917

Visualization

In [10]:

            
                Copied!
                
                    
                    
                
                

        
# Take the first 8 molecules of the demo dataset
data = dm.data.freesolv()
mols = dm.from_df(data)
mols = mols[:8]

# Render the molecules as a 4-column image grid, using each molecule's
# SMILES as its legend.
legends = [dm.to_smiles(mol) for mol in mols]
dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200), use_svg=False)

Out[10]:

Conformers

In [11]:

            
                Copied!
                
                    
                    
                
                

        
smiles = "O=C(C)Oc1ccccc1C(=O)O"
mol = dm.to_mol(smiles)

# Generate conformers.
# With `n_confs=None`, an appropriate number of conformers is picked
# according to the size of the molecule.
mol = dm.conformers.generate(mol, n_confs=None, rms_cutoff=None, minimize_energy=False)

# Display how many conformers were generated
mol.GetNumConformers()

Out[11]:

In [12]:

            
                Copied!
                
# Compute SASA from the conformers (not available on Windows)
sasa = dm.conformers.sasa(mol)

# Show only the first 10 values
sasa[:10]

Out[12]:

array([335.54161916, 328.89583715, 331.70118493, 335.84175163,
       333.35663075, 335.69047204, 333.240087  , 337.22086805,
       336.34704967, 332.86752587])

In [13]:

            
                Copied!
                
# Compute the RMSD matrix between conformers
rmsd = dm.conformers.rmsd(mol)

# Show only the top-left 4x4 corner of the matrix
rmsd[:4, :4]

Out[13]:

array([[6.61254163e-08, 1.01515980e+00, 1.01196417e+00, 3.80744856e-02],
       [1.01515980e+00, 4.67577303e-08, 3.61762165e-02, 1.02185385e+00],
       [1.01196417e+00, 3.61762165e-02, 4.67577303e-08, 1.01825112e+00],
       [3.80744856e-02, 1.02185385e+00, 1.01825112e+00, 0.00000000e+00]])

In [14]:

            
                Copied!
                
# Visualize the conformers in 3D (`n_confs=3`)
dm.viz.conformers(mol, n_confs=3, width="auto")

More advanced computation

In [15]:

            
                Copied!
                
                    
                    
                
                

        
# Compute the pairwise Tanimoto distance matrix using Morgan fingerprints.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)

mols = data["mol"].to_list()
dist_mat = dm.pdist(mols, n_jobs=None)

# Show the distance matrix as an image with a colour scale.
plt.imshow(dist_mat)
plt.colorbar()

Out[15]:

<matplotlib.colorbar.Colorbar at 0x7f1fe83d0bb0>

Parallelize anything

In [16]:

            
                Copied!
                
# Easy parallelization
def compute_something(mol):
    """Toy per-molecule workload used to demonstrate parallel mapping.

    Args:
        mol: Any object exposing ``GetNumAtoms()``.

    Returns:
        A ``(mol, value)`` tuple where ``value`` is the atom count raised
        to the power 0.5.
    """
    n_atoms = mol.GetNumAtoms()
    # Placeholder arithmetic standing in for a real computation.
    return (mol, n_atoms ** 0.5)

# Load the demo dataset and build one molecule per SMILES string.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)

# Apply `compute_something` to every molecule through `dm.parallelized`,
# then show the first five (mol, result) pairs.
results = dm.parallelized(compute_something, data["mol"].tolist(), n_jobs=None)
results[:5]

Out[16]:

[(<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a140>, 3.605551275463989),
 (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff29e40>, 2.23606797749979),
 (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a620>, 2.23606797749979),
 (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a6e0>, 2.8284271247461903),
 (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a800>, 2.8284271247461903)]

In [ ]: