Skip to content

datamol.fp

fold_count_fp(fp, dim=1024, binary=False)

Fast folding of a count fingerprint to the specified dimension.

Parameters:

Name Type Description Default
fp Union[ndarray, SparseBitVect, ExplicitBitVect]

A fingerprint.

required
dim int

The dimension of the folded array.

1024
binary bool

Whether to fold into a binary array or take use a count vector.

False

Returns:

Name Type Description
folded ndarray

returns folded array to the provided dimension.

Source code in datamol/fp.py
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
def fold_count_fp(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect],
    dim: int = 1024,
    binary: bool = False,
) -> np.ndarray:
    """Fast folding of a count fingerprint to the specified dimension.

    Every non-zero element found at index `k` of the input fingerprint is accumulated
    at index `k % dim` of the output.

    Args:
        fp: A fingerprint. Supported inputs are 1-D numpy arrays, RDKit bit vectors
            (`SparseBitVect`, `ExplicitBitVect`) and RDKit sparse count vectors
            (`UIntSparseIntVect`, `IntSparseIntVect`, `LongSparseIntVect`,
            `ULongSparseIntVect`). Values of a numpy array are cast to integers.
        dim: The dimension of the folded array. Must be strictly positive.
        binary: Whether to fold into a binary array or to use a count vector.

    Returns:
        folded: An integer array of shape `(dim,)` holding the folded fingerprint.

    Raises:
        ValueError: If `dim` is not strictly positive, if `fp` is a numpy array that
            is not 1-D, or if `fp` is of an unsupported type.
    """
    if dim <= 0:
        raise ValueError(f"The folding dimension must be strictly positive, got {dim}.")

    if isinstance(fp, np.ndarray):
        if fp.ndim != 1:
            raise ValueError(f"Only 1-D arrays can be folded, got {fp.ndim} dimensions.")
        positions = np.flatnonzero(fp)
        folded_idx = positions % dim
        counts = fp[positions]

    elif isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        nonzero = fp.GetNonzeroElements()
        # Fold with Python integers so that very large sparse keys never have to fit
        # into a fixed-width numpy dtype before the modulo is applied.
        folded_idx = [int(key) % dim for key in nonzero]
        counts = list(nonzero.values())

    elif isinstance(fp, (SparseBitVect, ExplicitBitVect)):
        # A bit vector is a count vector in which every ON bit counts once.
        folded_idx = [int(bit) % dim for bit in fp.GetOnBits()]
        counts = [1] * len(folded_idx)

    else:
        raise ValueError(f"The fingerprint is of wrong type: {type(fp)}")

    # Create the folded fp
    folded = np.zeros(dim, dtype="int")

    # Explicit integer dtypes keep the arrays usable as indices/addends even when
    # the fingerprint has no ON bit at all (an untyped empty array would be float64).
    folded_idx = np.asarray(folded_idx, dtype=np.intp)
    counts = np.asarray(counts, dtype=folded.dtype)

    # `np.add.at` is unbuffered: indices that collide after folding are all summed.
    np.add.at(folded, folded_idx, counts)

    if binary:
        folded = np.clip(folded, 0, 1)

    return folded

fp_to_array(fp)

Convert rdkit fingerprint to numpy array.

Note

This implementation has shown to be faster than using DataStructs.ConvertToNumpyArray by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.

Parameters:

Name Type Description Default
fp Union[ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]

The fingerprint.

required
Source code in datamol/fp.py
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def fp_to_array(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]
) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    Note:
        This implementation has been shown to be faster than using
        `DataStructs.ConvertToNumpyArray` by a factor of ~4.
        See https://github.com/rdkit/rdkit/discussions/3863.

    Args:
        fp: The fingerprint. A numpy array is returned as is (no copy is made).

    Returns:
        A 1-D numpy array holding the dense fingerprint.

    Raises:
        ValueError: If the type of `fp` is not supported.
    """

    if isinstance(fp, np.ndarray):
        fp_out = fp

    elif isinstance(fp, SparseBitVect):
        tmp = np.zeros(fp.GetNumBits(), dtype=int)
        # An explicit integer dtype keeps the index array valid even when no bit is
        # ON (an untyped empty array would be float64 and rejected as an index).
        on_bits = np.array(list(fp.GetOnBits()), dtype=np.intp)
        tmp[on_bits] = 1
        fp_out = tmp

    elif isinstance(fp, ExplicitBitVect):
        # The bit string is made of ASCII "0"/"1"; subtracting ord("0") maps the raw
        # bytes to 0/1 values.
        fp_out = np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")

    elif isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        tmp = np.zeros(fp.GetLength(), dtype=int)
        nonzero = fp.GetNonzeroElements()
        # Indices and counts are built separately so that a fingerprint without any
        # non-zero element yields two empty arrays instead of failing to unpack.
        bit_idx = np.array(list(nonzero.keys()), dtype=np.intp)
        values = np.array(list(nonzero.values()), dtype=tmp.dtype)
        tmp[bit_idx] = values
        fp_out = tmp

    else:
        raise ValueError(
            f"The fingerprint of type '{type(fp)}' is not supported. "
            "Please open a ticket at https://github.com/datamol-io/datamol/issues."
        )

    return fp_out

list_supported_fingerprints()

Return the supported fingerprints in datamol.

Source code in datamol/fp.py
295
296
297
298
def list_supported_fingerprints():
    """List the fingerprints that datamol is able to compute.

    Returns:
        The module-level `_FP_FUNCS` registry itself (not a copy). Its keys are the
        values accepted as `fp_type` by `to_fp`.
    """
    supported = _FP_FUNCS
    return supported

to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)

Compute the molecular fingerprint given a molecule or a SMILES.

Parameters:

Name Type Description Default
mol Union[str, Mol]

a molecule or a SMILES.

required
as_array bool

Whether to return a numpy array of an RDKit vec. Default to True.

True
fp_type str

The type of the fingerprint. See dm.list_supported_fingerprints() for a complete list.

'ecfp'
fold_size Optional[int]

If set, fold the fingerprint to the fold_size. If set, returned array is always a numpy array.

None
**fp_args Any

Arguments to build the fingerprint. Refer to the official RDKit documentation.

{}

Returns:

Type Description
Optional[Union[ndarray, SparseBitVect, ExplicitBitVect]]

A fingerprint vector or None

Source code in datamol/fp.py
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    as_array: bool = True,
    fp_type: str = "ecfp",
    fold_size: Optional[int] = None,
    **fp_args: Any,
) -> Optional[Union[np.ndarray, SparseBitVect, ExplicitBitVect]]:
    """Compute the molecular fingerprint given a molecule or a SMILES.

    Args:
        mol: a molecule or a SMILES.
        as_array: Whether to return a numpy array or an RDKit vec. Defaults to True.
        fp_type: The type of the fingerprint. See `dm.list_supported_fingerprints()` for a
            complete list.
        fold_size: If set, fold the fingerprint to the `fold_size`. If set, the returned
            fingerprint is always a numpy array.
        **fp_args: Arguments to build the fingerprint. Refer to the official RDKit
            documentation. Arguments given here take precedence over the defaults
            registered for `fp_type`.

    Returns:
        The fingerprint, either as a numpy array or as an RDKit vector.

    Raises:
        ValueError: If `fp_type` is not a supported fingerprint or if the input
            molecule is invalid.
    """

    # Look up the function building the requested fingerprint.
    builder = _FP_FUNCS.get(fp_type)
    if builder is None:
        raise ValueError(
            f"The fingerprint '{fp_type}' is not available. Use `dm.list_supported_fingerprints()` to "
            "get a complete list of the available fingerprints."
        )

    # A string input is parsed into a molecule; anything else is used as is.
    parsed = dm.to_mol(mol) if isinstance(mol, str) else mol
    if parsed is None:
        raise ValueError(f"It seems like the input molecule '{mol}' is invalid.")

    # Complete the user arguments with the defaults of this fingerprint type,
    # never overriding a value given by the caller.
    defaults = _FP_DEFAULT_ARGS[fp_type]
    for name, default in defaults.items():
        if name not in fp_args:
            fp_args[name] = default

    fingerprint = builder(parsed, **fp_args)

    # Folding already yields a numpy array, so no further conversion is needed.
    if fold_size is not None:
        return fold_count_fp(fingerprint, dim=fold_size)

    if as_array:
        return fp_to_array(fingerprint)

    return fingerprint