Skip to content

datamol.similarity

cdist(mols1, mols2, n_jobs=1, distances_chunk=False, distances_chunk_memory=1024, distances_n_jobs=-1, **fp_args)

Compute the tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.

Parameters:

Name Type Description Default
mols1 List[Union[str, dm.Mol]]

list of molecules.

required
mols2 List[Union[str, dm.Mol]]

list of molecules.

required
n_jobs Optional[int]

Number of jobs for fingerprint computation. Set to 1 for no parallelization. Set to -1 to use all available cores.

1
distances_chunk bool

Whether to use chunked computation.

False
distances_chunk_memory int

Memory size in MB to use for chunked computation.

1024
distances_n_jobs int

Number of jobs for the chunked distance computation. Only used when distances_chunk is True.

-1
**fp_args Any

Keyword arguments to pass to to_fp().

{}

Returns:

Type Description
np.ndarray

distmat: the matrix of Tanimoto distances between the molecules of mols1 and the molecules of mols2.

Source code in datamol/similarity.py
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
def cdist(
    mols1: List[Union[str, dm.Mol]],
    mols2: List[Union[str, dm.Mol]],
    n_jobs: Optional[int] = 1,
    distances_chunk: bool = False,
    distances_chunk_memory: int = 1024,
    distances_n_jobs: int = -1,
    **fp_args: Any,
) -> np.ndarray:
    """Compute the Tanimoto (Jaccard) distance between the fingerprints of
    every molecule in `mols1` and every molecule in `mols2`.

    Args:
        mols1: First list of molecules.
        mols2: Second list of molecules.
        n_jobs: Number of jobs for fingerprint computation. Set to 1 for no
            parallelization. Set to -1 to use all available cores.
        distances_chunk: Whether to compute the distances chunk by chunk
            (`pairwise_distances_chunked`) instead of with a single
            `distance.cdist` call.
        distances_chunk_memory: Memory size in MB to use for chunked computation.
        distances_n_jobs: Number of jobs for the chunked distance computation.
            Only used when `distances_chunk` is True.
        **fp_args: Keyword arguments to pass to `to_fp()`.

    Returns:
        distmat: the matrix of distances between `mols1` and `mols2`.
    """

    # A single featurizer is shared by both collections so they are
    # fingerprinted with exactly the same settings.
    featurize = functools.partial(dm.to_fp, as_array=True, **fp_args)

    raw1 = dm.parallelized(featurize, mols1, n_jobs=n_jobs)
    raw2 = dm.parallelized(featurize, mols2, n_jobs=n_jobs)

    # The Jaccard metric is evaluated on boolean (bit present / absent) vectors.
    bits1 = np.array(raw1).astype(bool)
    bits2 = np.array(raw2).astype(bool)

    if not distances_chunk:
        return distance.cdist(bits1, bits2, metric="jaccard")

    # The chunked variant yields blocks of rows; stack them back into one matrix.
    row_blocks = pairwise_distances_chunked(
        bits1,
        bits2,
        metric="jaccard",
        n_jobs=distances_n_jobs,
        working_memory=distances_chunk_memory,
    )
    return np.vstack(list(row_blocks))

pdist(mols, n_jobs=1, squareform=True, **fp_args)

Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.

Parameters:

Name Type Description Default
mols List[Union[str, dm.Mol]]

list of molecules

required
n_jobs Optional[int]

Number of jobs for parallelization. Set to 1 for no parallelization. Set to -1 to use all available cores.

1
squareform bool

Whether to return in square form (matrix) or in a condensed form (1D vector).

True
**fp_args Any

Keyword arguments to pass to to_fp().

{}

Returns:

Type Description
np.ndarray

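dist_mat: the pairwise Tanimoto distances, as a square matrix if squareform is True, otherwise as a condensed 1D vector.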

Source code in datamol/similarity.py
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def pdist(
    mols: List[Union[str, dm.Mol]],
    n_jobs: Optional[int] = 1,
    squareform: bool = True,
    **fp_args: Any,
) -> np.ndarray:
    """Compute the pairwise Tanimoto (Jaccard) distance between the fingerprints
    of all the molecules in the input set.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to -1 to use all available cores.
        squareform: Whether to return in square form (matrix) or in a condensed
            form (1D vector).
        **fp_args: Keyword arguments to pass to `to_fp()`.

    Returns:
        dist_mat: the pairwise distances, as a square matrix when `squareform`
            is True, otherwise as a condensed 1D vector.
    """

    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    # Cast to bool, exactly as `cdist` does, so that both functions evaluate
    # the Jaccard metric on bit presence/absence and agree on the same pair of
    # molecules whatever fingerprint type is requested through `fp_args`.
    fps = np.array(fps).astype(bool)

    dist_mat = distance.pdist(fps, metric="jaccard")

    if squareform:
        # Expand the condensed vector into the full symmetric matrix.
        dist_mat = distance.squareform(dist_mat, force="tomatrix")

    return dist_mat