The Basics
In [11]:
import matplotlib.pyplot as plt
import datamol as dm
Common functions
In [2]:
# Build a molecule object from a SMILES string, with sanitization explicitly enabled.
# The bare `mol` on the last line makes the notebook display the molecule.
mol = dm.to_mol("O=C(C)Oc1ccccc1C(=O)O", sanitize=True)
mol
Out[2]:
In [3]:
# Compute a fingerprint for the molecule (a Morgan fingerprint, per the original note).
# The recorded output is an array of 0s and 1s.
fp = dm.to_fp(mol)
fp
Out[3]:
array([1, 1, 1, ..., 0, 0, 0])
In [4]:
# Encode the molecule as a SELFIES string (bracketed tokens, see the output below).
selfies = dm.to_selfies(mol)
selfies
Out[4]:
'[C][C][Branch1_2][C][=O][O][C][=C][C][=C][C][=C][Ring1][Branch1_2][C][Branch1_2][C][=O][O]'
In [5]:
# Get the InChI identifier string of the molecule.
inchi = dm.to_inchi(mol)
inchi
Out[5]:
'InChI=1S/C9H8O4/c1-6(10)13-8-5-3-2-4-7(8)9(11)12/h2-5H,1H3,(H,11,12)'
Dataframe and list of molecules
In [6]:
# FreeSolv ships with datamol; it is loaded here for demonstration purposes only.
data = dm.data.freesolv()

# Add a "mol" column by converting every entry of the "smiles" column.
# `dm.to_mol` is passed directly: it already takes the SMILES as its sole argument.
data["mol"] = data["smiles"].apply(dm.to_mol)

data.head()
Out[6]:
iupac | smiles | expt | calc | mol | |
---|---|---|---|---|---|
0 | 4-methoxy-N,N-dimethyl-benzamide | CN(C)C(=O)c1ccc(cc1)OC | -11.01 | -9.625 | <rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0ac0> |
1 | methanesulfonyl chloride | CS(=O)(=O)Cl | -4.87 | -6.219 | <rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0a60> |
2 | 3-methylbut-1-ene | CC(C)C=C | 1.83 | 2.452 | <rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0a00> |
3 | 2-ethylpyrazine | CCc1cnccn1 | -5.45 | -5.809 | <rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f09a0> |
4 | heptan-1-ol | CCCCCCCO | -4.21 | -2.917 | <rdkit.Chem.rdchem.Mol object at 0x7fc7bd6f0940> |
In [7]:
# Convert a dataframe to a list of mols, reading the SMILES from the "smiles" column.
# The dataset is reloaded so this cell does not depend on the "mol" column added earlier.
# Only the first five molecules are displayed.
data = dm.data.freesolv()
mols = dm.from_df(data, smiles_column="smiles")
mols[:5]
Out[7]:
[<rdkit.Chem.rdchem.Mol at 0x7fc7bd677100>, <rdkit.Chem.rdchem.Mol at 0x7fc7bd6770a0>, <rdkit.Chem.rdchem.Mol at 0x7fc7bd677160>, <rdkit.Chem.rdchem.Mol at 0x7fc7bd6771c0>, <rdkit.Chem.rdchem.Mol at 0x7fc7bd677220>]
In [8]:
# Convert a list of mols back to a dataframe.
# As the output shows, the result has a "smiles" column plus the dataset's other
# columns (iupac, expt, calc); some SMILES are written differently from the input
# (e.g. row 0), so do not expect the original strings back verbatim.
df = dm.to_df(mols)
df.head()
Out[8]:
smiles | iupac | expt | calc | |
---|---|---|---|---|
0 | COc1ccc(C(=O)N(C)C)cc1 | 4-methoxy-N,N-dimethyl-benzamide | -11.01 | -9.625 |
1 | CS(=O)(=O)Cl | methanesulfonyl chloride | -4.87 | -6.219 |
2 | C=CC(C)C | 3-methylbut-1-ene | 1.83 | 2.452 |
3 | CCc1cnccn1 | 2-ethylpyrazine | -5.45 | -5.809 |
4 | CCCCCCCO | heptan-1-ol | -4.21 | -2.917 |
Visualization
In [9]:
# Load the demo dataset and keep only the first eight molecules for the grid.
data = dm.data.freesolv()
mols = dm.from_df(data, smiles_column="smiles")[:8]

# Draw the molecules in a 4-column grid, each captioned with its SMILES.
legends = [dm.to_smiles(m) for m in mols]
dm.viz.to_image(mols, legends=legends, n_cols=4, mol_size=(200, 200))
Out[9]:
Conformers
In [10]:
smiles = "O=C(C)Oc1ccccc1C(=O)O"
mol = dm.to_mol(smiles)

# Generate conformers on the molecule. Per the original note, leaving `n_confs`
# as None lets datamol pick a conformer count suited to the molecule's size.
mol = dm.conformers.generate(
    mol,
    n_confs=None,
    rms_cutoff=None,
    minimize_energy=False,
)

# Number of conformers now attached to the molecule.
mol.GetNumConformers()
Out[10]:
50
In [13]:
# Compute the SASA (solvent accessible surface area) from the conformers of `mol`
# generated in the previous cell. The original note warns this does not work on Windows.
# Only the first ten values are displayed.
sasa = dm.conformers.sasa(mol)
sasa[:10]
[366.91634791 365.57081308 364.79047526 359.93368508 366.4153889 360.38477247 365.91939996 366.89598476 362.14762374 366.5461946 ]
In [13]:
# Compute the RMSD between the conformers of `mol` and display the top-left
# 4x4 corner of the resulting matrix (the diagonal is ~0 in the recorded output).
rmsd = dm.conformers.rmsd(mol)
rmsd[:4, :4]
Out[13]:
array([[5.20271993e-08, 1.17444351e+00, 1.02526309e+00, 1.24307499e+00], [1.17444351e+00, 5.20271993e-08, 5.88242112e-01, 5.58334770e-01], [1.02526309e+00, 5.88242112e-01, 7.35775709e-08, 7.84115932e-01], [1.24307499e+00, 5.58334770e-01, 7.84115932e-01, 0.00000000e+00]])
In [15]:
# Visualize the conformers of `mol` in 3D; `n_confs=3` presumably limits the
# view to three conformers (confirm against the datamol documentation).
dm.viz.conformers(mol, n_confs=3, width="auto")
More advanced computation
In [15]:
# Compute the pairwise Tanimoto distance matrix between the molecules
# (computed on Morgan fingerprints, per the original note).
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)
mols = data["mol"].to_list()
dist_mat = dm.pdist(mols, n_jobs=None)

# Plot with the explicit figure/axes interface and label everything so the
# figure can be understood on its own when the notebook is skimmed.
fig, ax = plt.subplots()
im = ax.imshow(dist_mat)
ax.set_title("Pairwise Tanimoto distance (FreeSolv)")
ax.set_xlabel("Molecule index")
ax.set_ylabel("Molecule index")
fig.colorbar(im, ax=ax, label="Tanimoto distance")
Out[15]:
<matplotlib.colorbar.Colorbar at 0x7f24bcac3650>
Parallelize anything
In [16]:
# Easy parallelization
def compute_something(mol):
    """Return the molecule together with the square root of its atom count.

    This is only a placeholder workload used to demonstrate parallelization.

    Args:
        mol: Object exposing ``GetNumAtoms()`` (an RDKit molecule in this notebook).

    Returns:
        A ``(mol, result)`` tuple, where ``result`` is ``mol.GetNumAtoms() ** 0.5``.
    """
    # just a fake computation
    result = mol.GetNumAtoms() ** 0.5
    return mol, result
# Load the demo dataset and build one molecule per SMILES string.
data = dm.data.freesolv()
data["mol"] = data["smiles"].apply(dm.to_mol)

# Run `compute_something` on every molecule and preview the first five results.
results = dm.parallelized(compute_something, data["mol"].tolist(), n_jobs=None)
results[:5]
Out[16]:
[(<rdkit.Chem.rdchem.Mol at 0x7f24c0430580>, 3.605551275463989), (<rdkit.Chem.rdchem.Mol at 0x7f24c0430030>, 2.23606797749979), (<rdkit.Chem.rdchem.Mol at 0x7f24c052bda0>, 2.23606797749979), (<rdkit.Chem.rdchem.Mol at 0x7f24c052bd00>, 2.8284271247461903), (<rdkit.Chem.rdchem.Mol at 0x7f24c052b350>, 2.8284271247461903)]
In [ ]: