Skip to content

datamol.cluster

assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)

Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules to assign to centroids

required
centroids List[Mol]

list of molecules to use as centroid

required
feature_fn Optional[Callable]

A feature function that takes a Mol object and returns molecular features. By default, dm.to_fp() is used. Defaults to None.

None
dist_fn Optional[Callable]

A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.

None
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to -1 to use all available cores.

1

Returns:

Name Type Description
clusters_map dict

dict mapping each centroid index to the list of indices of the molecules assigned to its cluster

clusters_list list

list of all molecules in each cluster. The cluster index follows the index of the centroid. Note that the centroid molecule is not added to the cluster.

Source code in datamol/cluster.py
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
def assign_to_centroids(
    mols: List[Mol],
    centroids: List[Mol],
    feature_fn: Optional[Callable] = None,
    dist_fn: Optional[Callable] = None,
    n_jobs: Optional[int] = 1,
) -> Tuple[dict, list]:
    r"""Assign molecules to centroids. Each molecule is assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids.
        centroids: list of molecules to use as centroids.
        feature_fn: A feature function that takes a Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two integer indexes (i, j) and returns the
            distance between them. Indexes refer to the concatenation
            `mols + centroids`, i.e. centroid `k` has index `len(mols) + k`.
            You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance (1 - Tanimoto similarity) computed
            on the output of `feature_fn` is used. Defaults to None.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to -1 to use all available cores.

    Returns:
        clusters_map: dict mapping each centroid index to the list of indices
            (in `mols`) of the molecules assigned to it. Centroids that received
            no molecule have no entry.
        clusters_list: list of all molecules in each cluster. The cluster index follows
            the index of the centroid. Note that the centroid molecule is not added
            to the cluster.

    Raises:
        ValueError: if `centroids` is empty.
    """

    if len(centroids) == 0:
        # Without this guard the failure surfaces later as an opaque numpy
        # "argmin of an empty sequence" error.
        raise ValueError("At least one centroid is required to assign molecules.")

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Featurize queries and centroids together so both share one index space.
    all_mols = list(mols) + list(centroids)
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.cDataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    def index_metric(u, v):
        # `cdist` hands the metric one-element float rows; turn them back into
        # the integer indexes that `dist_fn` is documented to receive.
        return dist_fn(int(u.item()), int(v.item()))

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]
    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + len(mols)
    # Use the resolved `dist_fn` (previously the default metric was always used,
    # silently ignoring a user-provided distance).
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=index_metric)
    closest = np.argmin(dist_mat, axis=1)
    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])
    return clusters_map, clusters_list

cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)

Cluster a set of molecules using the butina clustering algorithm and a given threshold.

Parameters:

Name Type Description Default
mols Union[Sequence[Mol], Series]

a list of molecules.

required
cutoff float

Cutoff for the clustering. Defaults to 0.2.

0.2
feature_fn Optional[Callable]

A feature function that takes a Mol object and returns molecular features. By default, dm.to_fp() is used. Defaults to None.

None
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to -1 to use all available cores.

1
Source code in datamol/cluster.py
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
def cluster_mols(
    mols: Union[Sequence[Mol], pd.Series],
    cutoff: float = 0.2,
    feature_fn: Optional[Callable] = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the butina clustering algorithm and a given threshold.

    Args:
        mols: a list (or pandas Series) of molecules.
        cutoff: Cutoff for the clustering. Defaults to 0.2.
        feature_fn: A feature function that takes a Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to -1 to use all available cores.

    Returns:
        cluster_indices: the clusters as returned by `Butina.ClusterData`, each one
            a collection of positional indices into `mols`.
        cluster_mols: the molecules of each cluster, in the same order as
            `cluster_indices`. A single-member cluster is a one-element list;
            a multi-member cluster is a tuple of molecules.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Materialize once so that the cluster indices are always positional.
    # A pandas Series with a non-default index would otherwise be looked up
    # by label and fail (or return the wrong molecules).
    mols = list(mols)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    n_mols = len(mols)

    # Flattened lower triangle of the distance matrix, row by row.
    dists = []
    for i in range(1, n_mols):
        dists.extend(
            DataStructs.cDataStructs.BulkTanimotoSimilarity(
                features[i], features[:i], returnDistance=True
            )
        )

    # now cluster the data
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    # Decide on the cluster size rather than on the element type, so singleton
    # clusters are wrapped in a list whatever the molecule objects are.
    clustered = [
        [mols[cluster[0]]] if len(cluster) == 1 else tuple(mols[idx] for idx in cluster)
        for cluster in cluster_indices
    ]

    return cluster_indices, clustered

pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)

Pick a set of npick centroids from a list of molecules.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
npick int

Number of elements to pick from mols, including the preselection.

0
threshold float

Minimum distance between centroids for maxmin and sphere exclusion (sphere) methods.

0.5
initial_picks Optional[List[int]]

Starting list of indices of molecules that should be in the set of picked molecules. Defaults to None.

None
feature_fn callable

A feature function that takes a Mol object and returns molecular features. By default, dm.to_fp() is used. Defaults to None.

None
dist_fn Optional[Callable]

A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.

None
seed int

seed for reproducibility

42
method str

Picking method to use. One of sphere, maxmin or any supported rdkit hierarchical clustering method such as centroid, clink, upgma

'sphere'
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to -1 to use all available cores.

1

Returns:

Name Type Description
picked_inds int

indices of the molecules that have been selected as centroids

mols list

molecules that have been picked

Source code in datamol/cluster.py
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
def pick_centroids(
    mols: List[Mol],
    npick: int = 0,
    initial_picks: Optional[List[int]] = None,
    threshold: float = 0.5,
    feature_fn: Optional[Callable] = None,
    dist_fn: Optional[Callable] = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
) -> Tuple[np.ndarray, list]:
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of indices of molecules that should be in the
            set of picked molecules. Defaults to None.
        feature_fn: A feature function that takes a Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indexes (i, j) and returns the
            distance between them. You might use partial to set the fingerprints as input.
            By default, the Tanimoto distance (1 - Tanimoto similarity) computed on the
            output of `feature_fn` is used. Defaults to None.
        seed: seed for reproducibility (only forwarded to the `maxmin` picker).
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`.
            Hierarchical methods require a non-zero `npick`.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to -1 to use all available cores.

    Returns:
        picked_inds: numpy array with the indices of the molecules that have been
            selected as centroids.
        mols: molecules that have been picked.

    Raises:
        ValueError: if `method` is not supported, or if a hierarchical method is
            requested with `npick` equal to 0.
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.cDataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn, n_mols, threshold=threshold, pickSize=npick, firstPicks=initial_picks
        )

    elif method.upper() in ClusterMethod.names and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Strict lower triangle in row-major order: (1, 0), (2, 0), (2, 1), ...
        # Plain ints are passed so that a user `dist_fn` sees regular indexes.
        index_pairs = [(int(i), int(j)) for i, j in zip(*np.tril_indices(n_mols, k=-1))]

        # Honor the resolved `dist_fn` and the requested `n_jobs`; previously the
        # default Tanimoto distance was always used here and `n_jobs` was ignored.
        dist_mat = dm.parallelized(dist_fn, index_pairs, arg_type="args", n_jobs=n_jobs)
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols

pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)

Pick a set of diverse molecules based on their fingerprints.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
npick int

Number of elements to pick from mols, including the preselection.

required
initial_picks Optional[List[int]]

Starting list of indices of molecules that should be in the set of picked molecules. Defaults to None.

None
feature_fn Optional[Callable]

A feature function that takes a Mol object and returns molecular features. By default, dm.to_fp() is used. Defaults to None.

None
dist_fn Optional[Callable]

A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.

None
seed int

seed for reproducibility

42
n_jobs Optional[int]

Number of jobs for parallelization. Leave at 1 for no parallelization. Set to -1 to use all available cores.

1

Returns:

Name Type Description
picked_inds int

indices of the molecules that have been picked

mols list

molecules that have been picked

Source code in datamol/cluster.py
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def pick_diverse(
    mols: List[Mol],
    npick: int,
    initial_picks: Optional[List[int]] = None,
    feature_fn: Optional[Callable] = None,
    dist_fn: Optional[Callable] = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
) -> Tuple[int, list]:
    r"""Select `npick` mutually diverse molecules with RDKit's `MaxMinPicker`.

    Args:
        mols: the molecules to pick from.
        npick: how many molecules to pick, counting the ones in `initial_picks`.
        initial_picks: indices of molecules that must be part of the selection.
            `None` means no preselection.
        feature_fn: callable turning a Mol into the features used by the default
            distance. When `None`, `dm.to_fp(..., as_array=False)` is used.
        dist_fn: callable taking two indexes (i, j) and returning the distance
            between the corresponding molecules. When `None`, one minus the
            Tanimoto similarity of the computed features is used.
        seed: seed forwarded to the picker for reproducibility.
        n_jobs: number of jobs used to compute the features. 1 disables
            parallelization, -1 uses all available cores.

    Returns:
        picked_inds: numpy array holding the indices of the picked molecules.
        mols: the picked molecules, in the same order as `picked_inds`.
    """

    featurizer = feature_fn
    if featurizer is None:
        featurizer = functools.partial(dm.to_fp, as_array=False)

    fingerprints = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    def tanimoto_distance(i, j, features=fingerprints):
        similarity = DataStructs.cDataStructs.TanimotoSimilarity(features[i], features[j])
        return 1.0 - similarity

    distance_fn = tanimoto_distance if dist_fn is None else dist_fn

    picker = MaxMinPicker()
    preselected = initial_picks if initial_picks is not None else []
    selection = np.array(
        picker.LazyPick(distance_fn, len(mols), npick, firstPicks=preselected, seed=seed)
    )

    return selection, [mols[idx] for idx in selection]