`datamol.fp`¶

`fold_count_fp(fp, dim=1024, binary=False)` ¶

Fast folding of a count fingerprint to the specified dimension.

Parameters:

Name	Type	Description	Default
`fp`	`Union[np.ndarray, SparseBitVect, ExplicitBitVect]`	A fingerprint.	required
`dim`	`int`	The dimension of the folded array.	`1024`
`binary`	`bool`	Whether to fold into a binary array or use a count vector.	`False`

Returns:

Name	Type	Description
`folded`	`np.ndarray`	returns folded array to the provided dimension.

Source code in datamol/fp.py

def fold_count_fp(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect],
    dim: int = 1024,
    binary: bool = False,
) -> np.ndarray:
    """Fast folding of a count fingerprint to the specified dimension.

    Each non-zero entry stored at index `k` of the input is accumulated at
    position `k % dim` of the output, so entries that collide after folding
    have their values summed.

    Args:
        fp: A fingerprint. Only the RDKit sparse integer vectors
            (`UIntSparseIntVect`, `IntSparseIntVect`, `LongSparseIntVect`,
            `ULongSparseIntVect`) and `SparseBitVect` are handled; every on bit
            of a `SparseBitVect` counts as 1.
        dim: The dimension of the folded array. Must be a positive integer.
        binary: Whether to fold into a binary array or use a count vector.

    Returns:
        folded: returns folded array to the provided dimension.

    Raises:
        ValueError: If `fp` is not one of the handled types, or if `dim` is
            not a positive integer.
    """
    if isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        counts = fp.GetNonzeroElements()

    elif isinstance(fp, SparseBitVect):
        # Integer ones (rather than float `np.ones`) so that the accumulation
        # below never depends on a float -> int cast.
        counts = dict.fromkeys(fp.GetOnBits(), 1)

    else:
        raise ValueError(f"The fingerprint is of wrong type: {type(fp)}")

    if dim < 1:
        raise ValueError(f"The folding dimension must be a positive integer, got {dim}.")

    folded = np.zeros(dim, dtype="int")
    n_entries = len(counts)

    # Fold the indices with Python integers before handing them to numpy: this
    # keeps arbitrarily large sparse keys exact, and the explicit dtypes keep
    # the arrays integer-typed even when the fingerprint is empty
    # (`np.array([])` defaults to float64, which is not a valid index dtype).
    folded_idx = np.fromiter((key % dim for key in counts), dtype=np.intp, count=n_entries)
    values = np.fromiter(counts.values(), dtype=folded.dtype, count=n_entries)

    # `np.add.at` is unbuffered, so indices that collide after folding are summed.
    np.add.at(folded, folded_idx, values)

    if binary:
        folded = np.clip(folded, a_min=0, a_max=1)

    return folded

`fp_to_array(fp)` ¶

Convert rdkit fingerprint to numpy array.

Note

This implementation has shown to be faster than using DataStructs.ConvertToNumpyArray by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.

Parameters:

Name	Type	Description	Default
`fp`	`Union[np.ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]`	The fingerprint.	required

Source code in datamol/fp.py

def fp_to_array(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]
) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    Note:
        This implementation has shown to be faster than using `DataStructs.ConvertToNumpyArray`
        by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.

    Args:
        fp: The fingerprint. A numpy array is returned unchanged.

    Returns:
        A dense numpy array whose length is the size reported by the fingerprint
        (`GetNumBits()` for `SparseBitVect`, `GetLength()` for sparse integer
        vectors, the bit string length for `ExplicitBitVect`).

    Raises:
        ValueError: If the type of `fp` is not supported.
    """

    if isinstance(fp, np.ndarray):
        fp_out = fp

    elif isinstance(fp, SparseBitVect):
        fp_out = np.zeros(fp.GetNumBits(), dtype=int)
        # Explicit integer dtype: with no on bits the index array would
        # otherwise default to float64, which cannot be used for indexing.
        on_bits = np.array(fp.GetOnBits(), dtype=np.intp)
        fp_out[on_bits] = 1

    elif isinstance(fp, ExplicitBitVect):
        # The bit string is made of ASCII "0"/"1"; subtracting ord("0") maps
        # the raw bytes to 0/1 values.
        fp_out = np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")

    elif isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        fp_out = np.zeros(fp.GetLength(), dtype=int)
        nonzero = fp.GetNonzeroElements()
        n_entries = len(nonzero)

        # Build indices and values separately with explicit dtypes so that a
        # fingerprint without any non-zero element yields two empty integer
        # arrays instead of failing to unpack an empty 2D array.
        bit_idx = np.fromiter(nonzero.keys(), dtype=np.intp, count=n_entries)
        values = np.fromiter(nonzero.values(), dtype=fp_out.dtype, count=n_entries)
        fp_out[bit_idx] = values

    else:
        raise ValueError(
            f"The fingerprint of type '{type(fp)}' is not supported. "
            "Please open a ticket at https://github.com/datamol-io/datamol/issues."
        )

    return fp_out

`list_supported_fingerprints()` ¶

Return the supported fingerprints in datamol.

Source code in datamol/fp.py

def list_supported_fingerprints():
    """List the fingerprints datamol can compute.

    Returns:
        The module-level `_FP_FUNCS` registry itself (not a copy).
    """
    supported = _FP_FUNCS
    return supported

`to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)` ¶

Compute the molecular fingerprint given a molecule or a SMILES.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, Chem.rdchem.Mol]`	a molecule or a SMILES.	required
`as_array`	`bool`	Whether to return a numpy array or an RDKit vec. Default to True.	`True`
`fp_type`	`str`	The type of the fingerprint. See `dm.list_supported_fingerprints()` for a complete list.	`'ecfp'`
`fold_size`	`Optional[int]`	If set, fold the fingerprint to the `fold_size`. If set, returned array is always a numpy array.	`None`
`**fp_args`	`Any`	Arguments to build the fingerprint. Refer to the official RDKit documentation.	`{}`

Returns:

Type	Description
`Optional[Union[np.ndarray, SparseBitVect, ExplicitBitVect]]`	A fingerprint vector or None

Source code in datamol/fp.py

def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    as_array: bool = True,
    fp_type: str = "ecfp",
    fold_size: Optional[int] = None,
    **fp_args: Any,
) -> Optional[Union[np.ndarray, SparseBitVect, ExplicitBitVect]]:
    """Build the fingerprint of a molecule given as a mol object or a SMILES string.

    Args:
        mol: The molecule, or a SMILES string that is parsed with `dm.to_mol`.
        as_array: Whether to return a numpy array or an RDKit vec. Default to True.
        fp_type: Name of the fingerprint to compute. See
            `dm.list_supported_fingerprints()` for a complete list.
        fold_size: When set, the fingerprint is folded to this size with
            `fold_count_fp`, and the result is then always a numpy array.
        **fp_args: Extra arguments forwarded to the fingerprint function; they take
            precedence over the defaults registered for `fp_type`. Refer to the
            official RDKit documentation.

    Returns:
        The fingerprint, either as a numpy array or as the RDKit vector produced by
        the fingerprint function.

    Raises:
        ValueError: If `fp_type` is not a registered fingerprint, or if the input
            molecule resolves to `None`.
    """

    # Look up the builder first so that an unknown type fails before any parsing.
    builder = _FP_FUNCS.get(fp_type)
    if builder is None:
        raise ValueError(
            f"The fingerprint '{fp_type}' is not available. Use `dm.list_supported_fingerprints()` to "
            "get a complete list of the available fingerprints."
        )

    # Strings are treated as SMILES; anything else is used as given.
    parsed = dm.to_mol(mol) if isinstance(mol, str) else mol
    if parsed is None:
        raise ValueError(f"It seems like the input molecule '{mol}' is invalid.")
    mol = parsed

    # Fill in the per-type defaults without overriding what the caller passed.
    for name, default in _FP_DEFAULT_ARGS[fp_type].items():
        if name not in fp_args:
            fp_args[name] = default

    fp = builder(mol, **fp_args)

    # Optional folding; the folded result is already a numpy array.
    if fold_size is not None:
        fp = fold_count_fp(fp, dim=fold_size)

    # Only the unfolded fingerprint still needs the conversion.
    if as_array and not fold_size:
        fp = fp_to_array(fp)

    return fp

datamol.fp¶

fold_count_fp(fp, dim=1024, binary=False) ¶

fp_to_array(fp) ¶

list_supported_fingerprints() ¶

to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args) ¶

`datamol.fp`¶

`fold_count_fp(fp, dim=1024, binary=False)` ¶

`fp_to_array(fp)` ¶

`list_supported_fingerprints()` ¶

`to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)` ¶