Skip to content

datamol.data

The data module aims to provide fast and convenient access to various molecular datasets.


cdk2(as_df=True, mol_column='mol')

Return the RDKit CDK2 dataset from RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
108
109
110
111
112
113
114
115
116
117
118
def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Return the RDKit CDK2 dataset from `RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`.

    The bundled `cdk2.sdf` file is opened in binary mode and parsed by
    `read_sdf`; both arguments are forwarded to it unchanged.

    Args:
        as_df: Whether to return a list of mols or a pandas DataFrame.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.

    Returns:
        Whatever `read_sdf` produces for the given arguments.
    """

    with open_datamol_data_file("cdk2.sdf", open_binary=True) as sdf_stream:
        # Parsing happens while the stream is still open; the result is
        # handed back once the context manager has closed the file.
        return read_sdf(sdf_stream, as_df=as_df, mol_column=mol_column)

freesolv(as_df=True)

Return the FreeSolv dataset as a dataframe.

The dataset contains 642 molecules and the following columns: ['iupac', 'smiles', 'expt', 'calc'].

Warning

This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.

Source code in datamol/data/__init__.py
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Return the FreeSolv dataset, as a dataframe by default.

    The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.

    Args:
        as_df: If True, return the parsed CSV as a pandas DataFrame. Otherwise
            the dataframe is passed through `from_df` and that result is returned.

    Returns:
        The dataframe read from `freesolv.csv`, or the output of `from_df` on it.
    """

    with open_datamol_data_file("freesolv.csv") as csv_stream:
        table = pd.read_csv(csv_stream)

    return table if as_df else from_df(table)

solubility(as_df=True, mol_column='mol')

Return the RDKit solubility dataset from RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'.

The dataframe or the list of molecules will contain a split column, either train or test.

Parameters:

Name Type Description Default
as_df bool

Whether to return a list of mols or a pandas DataFrame.

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data/__init__.py
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`.

    The train and test files are loaded separately and concatenated (train rows
    first, index reset). The dataframe or the list of molecules will contain a
    `split` column, either `train` or `test`.

    Args:
        as_df: Whether to return a list of mols or a pandas DataFrame.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
            If None, the mol column is dropped from the returned dataframe.

    Returns:
        The concatenated dataframe when `as_df` is True, otherwise the result
        of `from_df` applied to it.
    """

    # Both files are always read under a fixed internal column name so the two
    # frames line up for the concat; the caller's `mol_column` is applied last.
    internal_mol_column = "mol"

    with open_datamol_data_file("solubility.train.sdf", open_binary=True) as f:
        train = read_sdf(f, as_df=True, mol_column=internal_mol_column, smiles_column=None)

    with open_datamol_data_file("solubility.test.sdf", open_binary=True) as f:
        test = read_sdf(f, as_df=True, mol_column=internal_mol_column, smiles_column=None)

    # `read_sdf` is called with `as_df=True`, so narrow the type for checkers.
    train = cast(pd.DataFrame, train)
    test = cast(pd.DataFrame, test)

    train["split"] = "train"
    test["split"] = "test"

    # The test frame exposes its SMILES under `SMILES`; normalise it to
    # `smiles` (presumably to match the train frame before concatenating).
    test = test.rename(columns={"SMILES": "smiles"})

    data = pd.concat([train, test], ignore_index=True)

    if not as_df:
        # `None` is forwarded unchanged, exactly as before. Any other name is
        # mapped to the internal column, because that is the only mol column
        # `data` actually has; forwarding a custom name would reference a
        # column that does not exist.
        source_column = None if mol_column is None else internal_mol_column
        return from_df(data, mol_column=source_column)

    if mol_column is None:
        data = data.drop(columns=[internal_mol_column])
    elif mol_column != internal_mol_column:
        # Honour a custom column name instead of silently returning `mol`.
        data = data.rename(columns={internal_mol_column: mol_column})

    render_mol_df(data)
    return data