Skip to content

datamol.data

The data module aims to provide fast and convenient access to various molecular datasets.


cdk2(as_df=True, mol_column='mol')

Return the RDKit CDK2 dataset from RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
115
116
117
118
119
120
121
122
123
124
125
def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Load the RDKit CDK2 dataset (`RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`).

    Args:
        as_df: Whether to return a list of mols or a pandas DataFrame.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.

    Returns:
        The output of `read_sdf` for the `cdk2.sdf` data file, called with the
        given `as_df` and `mol_column`.
    """
    # The SDF is opened in binary mode and parsed while the handle is still open.
    with open_datamol_data_file("cdk2.sdf", open_binary=True) as sdf_file:
        return read_sdf(sdf_file, as_df=as_df, mol_column=mol_column)

chembl_drugs(as_df=True)

A list of ~2.5k molecules from ChEMBL (all approved drugs) in SMILES format. Includes metadata indicating year of first approval, molecule chembl id, molecule type and pref_name.

List was generated with 'Get_ChEMBL_Approved_Drugs.ipynb' on 2023-10-18. The notebook works with the chembl_webresource_client api to collect chembl IDs and metadata, then focuses on small molecules with valid SMILES and first approval date.

Source code in datamol/data/__init__.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
def chembl_drugs(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load ~2.5k approved-drug molecules from ChEMBL, stored in SMILES format.

    The metadata indicates the year of first approval, the molecule chembl id,
    the molecule type and the pref_name.

    The list was generated on 2023-10-18 with the notebook
    `notebooks/Get_ChEMBL_Approved_Drugs.ipynb` of the
    [datamol repository](https://github.com/datamol-io/datamol). The notebook works
    with the chembl_webresource_client api to collect chembl IDs and metadata, then
    focuses on small molecules with valid SMILES and first approval date.

    Args:
        as_df: If True, return the table read from `chembl_approved_drugs.parquet`
            as a pandas DataFrame. Otherwise return the result of `from_df` applied
            to that table.
    """
    with open_datamol_data_file("chembl_approved_drugs.parquet", open_binary=True) as parquet_file:
        drugs = pd.read_parquet(parquet_file)

    return drugs if as_df else from_df(drugs)

chembl_samples(as_df=True)

A list of ~2k molecules from ChEMBL.

Originally proposed by Patrick Walters at https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill.

Source code in datamol/data/__init__.py
211
212
213
214
215
216
217
218
219
220
221
222
223
def chembl_samples(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load a sample of ~2k molecules from ChEMBL.

    Originally proposed by Patrick Walters at <https://github.com/PatWalters/practical_cheminformatics_posts/tree/b4dae239a8b942dab3a41e637ac55d4491aee96f/molskill>.

    Args:
        as_df: If True, return the content of `chembl_samples.csv` as a pandas
            DataFrame. Otherwise return the result of `from_df` applied to it.
    """
    with open_datamol_data_file("chembl_samples.csv") as csv_file:
        samples = pd.read_csv(csv_file)

    # Dataframe requested: hand the table back untouched.
    if as_df:
        return samples

    return from_df(samples)

freesolv(as_df=True)

Return the FreeSolv dataset as a dataframe.

The dataset contains 642 molecules and the following columns: ['iupac', 'smiles', 'expt', 'calc'].

Warning

This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.

Source code in datamol/data/__init__.py
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load the FreeSolv dataset.

    The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.

    Args:
        as_df: If True, return the content of `freesolv.csv` as a pandas DataFrame.
            Otherwise return the result of `from_df` applied to it.
    """
    with open_datamol_data_file("freesolv.csv") as csv_file:
        table = pd.read_csv(csv_file)

    return table if as_df else from_df(table)

solubility(as_df=True, mol_column='mol')

Return the RDKit solubility dataset from RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'.

The dataframe or the list of molecules will contain a split column, either train or test.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`.

    The train and test files are loaded separately, tagged with a `split` column
    (either `train` or `test`) and concatenated into a single table.

    Args:
        as_df: Whether to return a list of mols or a pandas DataFrame.
        mol_column: Name of the mol column. If `None` and `as_df` is True, the mol
            column is dropped from the returned dataframe. When `as_df` is False the
            value is forwarded to `from_df`.

    Returns:
        The concatenated dataframe if `as_df` is True, otherwise the result of
        `from_df` applied to it.
    """

    # Both files are always parsed into dataframes with a fixed "mol" column so the
    # two splits can be tagged and concatenated regardless of the caller's options.
    with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f:
        train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f:
        test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    train = cast(pd.DataFrame, train)
    test = cast(pd.DataFrame, test)

    train["split"] = "train"
    test["split"] = "test"

    # The test file exposes its SMILES under an uppercase "SMILES" key; normalize it
    # to lowercase (presumably matching the train file) before concatenating.
    test = test.rename(columns={"SMILES": "smiles"})

    data = pd.concat([train, test], ignore_index=True)

    # Honor a custom mol column name. Previously the column stayed named "mol"
    # whatever `mol_column` was, and `from_df` below was pointed at a column that
    # did not exist.
    if mol_column is not None and mol_column != "mol":
        data = data.rename(columns={"mol": mol_column})

    if as_df:
        if mol_column is None:
            data = data.drop(columns=["mol"])

        render_mol_df(data)
        return data

    return from_df(data, mol_column=mol_column)