Skip to content

datamol.data

The data module provides fast and convenient access to various molecular datasets.


cdk2(as_df=True, mol_column='mol')

Return the RDKit CDK2 dataset from RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'.

Parameters:

Name Type Description Default
as_df bool

Whether to return a pandas DataFrame (True) or a list of molecules (False).

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data.py
75
76
77
78
79
80
81
82
83
84
85
def cdk2(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Load the RDKit CDK2 dataset (`RDConfig.RDDocsDir, 'Book/data/cdk2.sdf'`).

    The SDF file is read from the `data/cdk2.sdf` resource packaged with
    `datamol` and parsed with `read_sdf`.

    Args:
        as_df: Whether to return a pandas DataFrame (True) or a list of
            molecules (False). Forwarded as-is to `read_sdf`.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
            Forwarded as-is to `read_sdf`.

    Returns:
        The result of `read_sdf` for the given arguments.
    """

    # The resource stream is closed when the `with` block exits, including on return.
    with pkg_resources.resource_stream("datamol", "data/cdk2.sdf") as sdf_stream:
        return read_sdf(sdf_stream, as_df=as_df, mol_column=mol_column)

freesolv(as_df=True)

Return the FreeSolv dataset as a dataframe.

The dataset contains 642 molecules and the following columns: ['iupac', 'smiles', 'expt', 'calc'].

Warning

This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.

Source code in datamol/data.py
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
def freesolv(as_df: bool = True) -> Union[List[Mol], pd.DataFrame]:
    """Load the FreeSolv dataset.

    The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.

    Args:
        as_df: When True, return the CSV content as a pandas DataFrame. When
            False, pass that DataFrame through `from_df` and return its result.

    Returns:
        A pandas DataFrame if `as_df` is True, otherwise the output of `from_df`.
    """

    # The CSV is read from the resource packaged with datamol.
    with pkg_resources.resource_stream("datamol", "data/freesolv.csv") as csv_stream:
        table = pd.read_csv(csv_stream)

    if as_df:
        return table

    return from_df(table)

solubility(as_df=True, mol_column='mol')

Return the RDKit solubility dataset from RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'.

The dataframe or the list of molecules will contain a split column, either train or test.

Parameters:

Name Type Description Default
as_df bool

Whether to return a pandas DataFrame (True) or a list of molecules (False).

True
mol_column Optional[str]

Name of the mol column. Only relevant if as_df is True.

'mol'
Source code in datamol/data.py
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
def solubility(as_df: bool = True, mol_column: Optional[str] = "mol"):
    """Return the RDKit solubility dataset from `RDConfig.RDDocsDir, 'Book/data/solubility.{train|test}.sdf'`.

    The dataframe or the list of molecules will contain a `split` column, either `train` or `test`.

    Args:
        as_df: Whether to return a pandas DataFrame (True) or a list of
            molecules (False).
        mol_column: Name of the mol column. If None and `as_df` is True, the
            mol column is dropped from the returned DataFrame.

    Returns:
        A pandas DataFrame with the train and test rows concatenated if
        `as_df` is True, otherwise the output of `from_df` on that DataFrame.
    """

    # Both splits are always read as dataframes with a fixed "mol" column so
    # they can be tagged and concatenated; `mol_column` is applied afterwards.
    with pkg_resources.resource_stream("datamol", "data/solubility.train.sdf") as f:
        train = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    with pkg_resources.resource_stream("datamol", "data/solubility.test.sdf") as f:
        test = read_sdf(f, as_df=True, mol_column="mol", smiles_column=None)

    train = cast(pd.DataFrame, train)
    test = cast(pd.DataFrame, test)

    train["split"] = "train"
    test["split"] = "test"

    # The test file exposes the SMILES property as "SMILES"; rename it to
    # "smiles" (presumably the name used by the train file) so the two frames
    # share one column after concatenation.
    test = test.rename(columns={"SMILES": "smiles"})

    data = pd.concat([train, test], ignore_index=True)

    # Honor the requested mol column name. Without this, a custom name was
    # ignored when `as_df` is True and pointed `from_df` at a column that does
    # not exist when `as_df` is False.
    if mol_column is not None and mol_column != "mol":
        data = data.rename(columns={"mol": mol_column})

    if as_df:
        if mol_column is None:
            data = data.drop(columns=["mol"])

        render_mol_df(data)
        return data

    return from_df(data, mol_column=mol_column)