The Basics
In [1]:
Copied!
import matplotlib.pyplot as plt
import datamol as dm
import matplotlib.pyplot as plt
import datamol as dm
Common functions
In [2]:
Copied!
# Build a molecule from a SMILES string (aspirin), sanitizing it on creation.
mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True)
mol
Out[2]:
In [3]:
Copied!
# Morgan fingerprint of the molecule (displayed below as a uint8 array)
fp = dm.to_fp(mol)
fp
Out[3]:
array([0, 0, 0, ..., 0, 0, 0], dtype=uint8)
In [4]:
Copied!
# Generate the SELFIES string of the molecule
selfies = dm.to_selfies(mol)
selfies
Out[4]:
'[C][C][=Branch1][C][=O][O][C][=C][C][=C][C][=C][Ring1][=Branch1][C][=Branch1][C][=O][O]'
In [5]:
Copied!
# Get the InChI string of the molecule
inchi = dm.to_inchi(mol)
inchi
Out[5]:
'InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)'
Dataframe and list of molecules
In [6]:
Copied!
# Load a built-in dataset (only for demonstration purposes)
data = dm.data.freesolv()

# Make a molecule column from the smiles column.
# `dm.to_mol` is passed directly: wrapping it in a lambda adds nothing.
data["mol"] = data["smiles"].apply(dm.to_mol)
data.head()
Out[6]:
| | iupac | smiles | expt | calc | mol |
---|---|---|---|---|---|
0 | 4-methoxy-N,N-dimethyl-benzamide | CN(C)C(=O)c1ccc(cc1)OC | -11.01 | -9.625 | <img data-content="rdkit/molecule" src="data:i... |
1 | methanesulfonyl chloride | CS(=O)(=O)Cl | -4.87 | -6.219 | <img data-content="rdkit/molecule" src="data:i... |
2 | 3-methylbut-1-ene | CC(C)C=C | 1.83 | 2.452 | <img data-content="rdkit/molecule" src="data:i... |
3 | 2-ethylpyrazine | CCc1cnccn1 | -5.45 | -5.809 | <img data-content="rdkit/molecule" src="data:i... |
4 | heptan-1-ol | CCCCCCCO | -4.21 | -2.917 | <img data-content="rdkit/molecule" src="data:i... |
In [7]:
Copied!
# Convert a dataframe to a list of mols.
# The dataset is reloaded so the input frame does not carry the "mol" column
# added in the previous cell.
data = dm.data.freesolv()
mols = dm.from_df(data, smiles_column="smiles")
mols[:5]
Out[7]:
[<rdkit.Chem.rdchem.Mol at 0x7f1f54b41720>, <rdkit.Chem.rdchem.Mol at 0x7f1f54b426e0>, <rdkit.Chem.rdchem.Mol at 0x7f1f54b408e0>, <rdkit.Chem.rdchem.Mol at 0x7f1f54b427a0>, <rdkit.Chem.rdchem.Mol at 0x7f1f54b42c80>]
In [8]:
Copied!
# Convert a list of mols back to a dataframe
df = dm.to_df(mols)
df.head()
Out[8]:
| | smiles | iupac | expt | calc |
---|---|---|---|---|
0 | COc1ccc(C(=O)N(C)C)cc1 | 4-methoxy-N,N-dimethyl-benzamide | -11.01 | -9.625 |
1 | CS(=O)(=O)Cl | methanesulfonyl chloride | -4.87 | -6.219 |
2 | C=CC(C)C | 3-methylbut-1-ene | 1.83 | 2.452 |
3 | CCc1cnccn1 | 2-ethylpyrazine | -5.45 | -5.809 |
4 | CCCCCCCO | heptan-1-ol | -4.21 | -2.917 |
Visualization
In [10]:
Copied!
# Get a list of molecules and keep only the first 8 for display
data = dm.data.freesolv()
mols = dm.from_df(data)
mols = mols[:8]

# Make an image from the molecules list with their SMILES as legend.
legends = [dm.to_smiles(m) for m in mols]
dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200))
Out[10]:
Conformers
In [11]:
Copied!
smiles = "O=C(C)Oc1ccccc1C(=O)O"
mol = dm.to_mol(smiles)

# Generate conformers.
# If `n_confs` is None, an appropriate number of conformers is picked
# according to the size of the molecule.
mol = dm.conformers.generate(mol, n_confs=None, rms_cutoff=None, minimize_energy=False)
mol.GetNumConformers()
Out[11]:
50
In [12]:
Copied!
# Compute SASA from the conformers (not supported on Windows)
# and show the first 10 values.
sasa = dm.conformers.sasa(mol)
sasa[:10]
Out[12]:
array([335.54161916, 328.89583715, 331.70118493, 335.84175163, 333.35663075, 335.69047204, 333.240087 , 337.22086805, 336.34704967, 332.86752587])
In [13]:
Copied!
# Compute the RMSD between conformers and show the top-left 4x4 corner
rmsd = dm.conformers.rmsd(mol)
rmsd[:4, :4]
Out[13]:
array([[6.61254163e-08, 1.01515980e+00, 1.01196417e+00, 3.80744856e-02], [1.01515980e+00, 4.67577303e-08, 3.61762165e-02, 1.02185385e+00], [1.01196417e+00, 3.61762165e-02, 4.67577303e-08, 1.01825112e+00], [3.80744856e-02, 1.02185385e+00, 1.01825112e+00, 0.00000000e+00]])
In [14]:
Copied!
# Visualize the first 3 conformers in 3D
dm.viz.conformers(mol, n_confs=3, width="auto")
More advanced computation
In [15]:
Copied!
# Compute the pairwise Tanimoto distance using Morgan fingerprints.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)
mols = data["mol"].to_list()
dist_mat = dm.pdist(mols, n_jobs=None)

# Show the distance matrix as a heatmap with its colour scale.
plt.imshow(dist_mat)
plt.colorbar()
Out[15]:
<matplotlib.colorbar.Colorbar at 0x7f1fe83d0bb0>
Parallelize anything
In [16]:
Copied!
# Easy parallelization
def compute_something(mol):
    """Return the molecule paired with a placeholder score.

    The score only stands in for a real computation: it is the square root
    of the atom count reported by ``mol.GetNumAtoms()``.

    Args:
        mol: Any object exposing a ``GetNumAtoms()`` method (in this notebook,
            the molecules built with ``dm.to_mol``).

    Returns:
        A ``(mol, result)`` tuple, where ``mol`` is the input object itself
        and ``result`` is ``mol.GetNumAtoms() ** 0.5``.
    """
    # just a fake computation
    result = mol.GetNumAtoms() ** 0.5
    return mol, result
# Build the molecules, then map `compute_something` over them in parallel.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)
results = dm.parallelized(compute_something, data["mol"].tolist(), n_jobs=None)
results[:5]
Out[16]:
[(<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a140>, 3.605551275463989), (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff29e40>, 2.23606797749979), (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a620>, 2.23606797749979), (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a6e0>, 2.8284271247461903), (<rdkit.Chem.rdchem.Mol at 0x7f1f4ff2a800>, 2.8284271247461903)]
In [ ]:
Copied!