Skip to content

datamol.data

The data module aims to provide fast and convenient access to various molecular datasets.


cdk2(as_df=True, mol_column='mol')

Return the RDKit CDK2 dataset from RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
119
120
121
122
123
124
125
126
127
128
129
def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Load the RDKit CDK2 dataset (`RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`).

    The bundled `cdk2.sdf` file is opened in binary mode and handed to
    `read_sdf`, which receives both arguments unchanged.

    Args:
        as_df: Whether to return a pandas DataFrame (True) or a list of mols (False).
        mol_column: Name of the mol column. Only relevant if `as_df` is True.

    Returns:
        Whatever `read_sdf` produces for the requested `as_df` / `mol_column`.
    """

    with open_datamol_data_file("cdk2.sdf", open_binary=True) as sdf_file:
        return read_sdf(sdf_file, as_df=as_df, mol_column=mol_column)

chembl_drugs(as_df=True)

A list of ~2k molecules from ChEMBL (all drugs).

Originally proposed by Patrick Walters at https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill.

Source code in datamol/data/__init__.py
196
197
198
199
200
201
202
203
204
205
206
207
208
def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load a set of ~2k molecules from ChEMBL (all drugs).

    Originally proposed by Patrick Walters at
    <https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill>.

    Args:
        as_df: Whether to return the raw pandas DataFrame (True) or to convert
            it with `from_df` (False).

    Returns:
        The content of `chembl_drugs.csv` as a DataFrame, or the result of
        `from_df` applied to it when `as_df` is False.
    """

    with open_datamol_data_file("chembl_drugs.csv") as csv_file:
        table = pd.read_csv(csv_file)

    # Guard clause: the dataframe is returned untouched unless molecules are requested.
    if as_df:
        return table

    return from_df(table)

chembl_samples(as_df=True)

A list of ~2k molecules from ChEMBL.

Originally proposed by Patrick Walters at https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill.

Source code in datamol/data/__init__.py
221
222
223
224
225
226
227
228
229
230
231
232
233
def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load a sample of ~2k molecules from ChEMBL.

    Originally proposed by Patrick Walters at
    <https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill>.

    Args:
        as_df: Whether to return the raw pandas DataFrame (True) or to convert
            it with `from_df` (False).

    Returns:
        The content of `chembl_samples.csv` as a DataFrame, or the result of
        `from_df` applied to it when `as_df` is False.
    """

    with open_datamol_data_file("chembl_samples.csv") as csv_file:
        table = pd.read_csv(csv_file)

    # Guard clause: the dataframe is returned untouched unless molecules are requested.
    if as_df:
        return table

    return from_df(table)

freesolv(as_df=True)

Return the FreeSolv dataset as a dataframe.

The dataset contains 642 molecules and the following columns: ['iupac', 'smiles', 'expt', 'calc'].

Warning

This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.

Source code in datamol/data/__init__.py
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load the FreeSolv dataset.

    The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.

    Args:
        as_df: Whether to return the raw pandas DataFrame (True) or to convert
            it with `from_df` (False).

    Returns:
        The content of `freesolv.csv` as a DataFrame, or the result of
        `from_df` applied to it when `as_df` is False.
    """

    with open_datamol_data_file("freesolv.csv") as csv_file:
        table = pd.read_csv(csv_file)

    # Guard clause: the dataframe is returned untouched unless molecules are requested.
    if as_df:
        return table

    return from_df(table)

solubility(as_df=True, mol_column='mol')

Return the RDKit solubility dataset from RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'.

The dataframe or the list of molecules will contain a split column, either train or test.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`.

    The train and test SDF files are read separately, tagged with a `split`
    column (either `train` or `test`) and concatenated into a single table.
    The dataframe or the list of molecules will contain that `split` column.

    Args:
        as_df: Whether to return a list of mols or a pandas DataFrame.
        mol_column: Name of the mol column. When `as_df` is True and this is
            None, the mol column is dropped from the returned dataframe. When
            `as_df` is False, the value is forwarded to `from_df`.

    Returns:
        The concatenated dataframe when `as_df` is True, otherwise the result
        of `from_df` applied to it.
    """

    # Both files are always loaded as dataframes under a fixed "mol" column so
    # they can be concatenated; the caller's `mol_column` is applied afterwards.
    with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f:
        train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f:
        test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    train = cast(pd.DataFrame, train)
    test = cast(pd.DataFrame, test)

    train["split"] = "train"
    test["split"] = "test"

    # The test file exposes its SMILES column as `SMILES`; normalize it to
    # `smiles` (presumably the spelling used by the train file) so that the
    # concatenation below does not produce two separate columns.
    test = test.rename(columns={"SMILES": "smiles"})

    data = pd.concat([train, test], ignore_index=True)

    # Honor a custom mol column name. Previously the column always stayed
    # "mol", so the requested name was ignored for dataframes and pointed
    # `from_df` at a non-existent column for lists.
    if mol_column is not None and mol_column != "mol":
        data = data.rename(columns={"mol": mol_column})

    if as_df:
        if mol_column is None:
            data = data.drop(columns=["mol"])

        render_mol_df(data)
        return data

    return from_df(data, mol_column=mol_column)