`datamol.similarity`¶

`cdist(mols1, mols2, n_jobs=1, distances_chunk=False, distances_chunk_memory=1024, distances_n_jobs=-1, **fp_args)` ¶

Compute the tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.

Parameters:

Name	Type	Description	Default
`mols1`	`List[Union[str, Mol]]`	list of molecules.	required
`mols2`	`List[Union[str, Mol]]`	list of molecules.	required
`n_jobs`	`Optional[int]`	Number of jobs for fingerprint computation. Set to 1 for no parallelization. Set to -1 to use all available cores.	`1`
`distances_chunk`	`bool`	Whether to use chunked computation.	`False`
`distances_chunk_memory`	`int`	Memory size in MB to use for chunked computation.	`1024`
`distances_n_jobs`	`int`	Number of jobs for the chunked distance computation. Only used when `distances_chunk` is `True`.	`-1`
`**fp_args`	`Any`	list of args to pass to `to_fp()`.	`{}`

Returns:

Type	Description
`ndarray`	The distance matrix, with one row per molecule in `mols1` and one column per molecule in `mols2`.

Source code in datamol/similarity.py

def cdist(
    mols1: List[Union[str, dm.Mol]],
    mols2: List[Union[str, dm.Mol]],
    n_jobs: Optional[int] = 1,
    distances_chunk: bool = False,
    distances_chunk_memory: int = 1024,
    distances_n_jobs: int = -1,
    **fp_args: Any,
) -> np.ndarray:
    """Return the Jaccard (tanimoto) distance matrix between two molecule collections.

    Every molecule of both collections is turned into a fingerprint with
    `to_fp()`, the fingerprints are cast to booleans, and the distance is
    computed for each (mols1, mols2) pair.

    Args:
        mols1: list of molecules (rows of the result).
        mols2: list of molecules (columns of the result).
        n_jobs: Number of jobs for fingerprint computation. Set to 1 for no
            parallelization. Set to -1 to use all available cores.
        distances_chunk: Whether to compute the distances chunk by chunk with
            `pairwise_distances_chunked` instead of a single `cdist` call.
        distances_chunk_memory: Memory size in MB to use for chunked computation.
        distances_n_jobs: Number of jobs for the chunked distance computation.
            Ignored when `distances_chunk` is False.
        **fp_args: keyword arguments forwarded to `to_fp()`.

    Returns:
        The distance matrix, one row per molecule of `mols1` and one column
        per molecule of `mols2`.
    """

    # A single fingerprint function is shared by both collections.
    compute_fp = functools.partial(dm.to_fp, as_array=True, **fp_args)

    raw_left = dm.parallelized(compute_fp, mols1, n_jobs=n_jobs)
    raw_right = dm.parallelized(compute_fp, mols2, n_jobs=n_jobs)

    # The fingerprints are handed to the distance routines as boolean vectors.
    left = np.array(raw_left).astype(bool)
    right = np.array(raw_right).astype(bool)

    if not distances_chunk:
        return distance.cdist(left, right, metric="jaccard")

    # Chunked path: the generator yields row slices that are stacked back
    # into the full matrix.
    row_chunks = pairwise_distances_chunked(
        left,
        right,
        metric="jaccard",
        n_jobs=distances_n_jobs,
        working_memory=distances_chunk_memory,
    )
    return np.vstack(list(row_chunks))

`pdist(mols, n_jobs=1, squareform=True, **fp_args)` ¶

Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.

Parameters:

Name	Type	Description	Default
`mols`	`List[Union[str, Mol]]`	list of molecules	required
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Set to 1 for no parallelization. Set to -1 to use all available cores.	`1`
`squareform`	`bool`	Whether to return in square form (matrix) or in a condensed form (1D vector).	`True`
`**fp_args`	`Any`	list of args to pass to `to_fp()`.	`{}`

Returns:

Type	Description
`ndarray`	The pairwise distances, as a square matrix if `squareform` is `True`, otherwise as a condensed 1D vector.

Source code in datamol/similarity.py

def pdist(
    mols: List[Union[str, dm.Mol]],
    n_jobs: Optional[int] = 1,
    squareform: bool = True,
    **fp_args: Any,
) -> np.ndarray:
    """Compute the pairwise tanimoto distance between the fingerprints of all the
    molecules in the input set.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to -1 to use all available cores.
        squareform: Whether to return in square form (matrix) or in a condensed
            form (1D vector).
        **fp_args: keyword arguments forwarded to `to_fp()`.

    Returns:
        The pairwise distances, as a square matrix when `squareform` is True
        and as a condensed 1D vector otherwise.
    """

    fps = dm.parallelized(
        functools.partial(dm.to_fp, as_array=True, **fp_args),
        mols,
        n_jobs=n_jobs,
    )

    # Cast to bool, exactly as `cdist` does, so both functions compute the
    # same set-based Jaccard distance. Without the cast, scipy compares the
    # raw numeric values, which gives different results for non-binary
    # (e.g. count) fingerprints.
    fps_array = np.array(fps).astype(bool)

    dist_mat = distance.pdist(fps_array, metric="jaccard")

    if squareform:
        # Expand the condensed vector into the full symmetric matrix.
        dist_mat = distance.squareform(dist_mat, force="tomatrix")

    return dist_mat

datamol.similarity¶

cdist(mols1, mols2, n_jobs=1, distances_chunk=False, distances_chunk_memory=1024, distances_n_jobs=-1, **fp_args) ¶

pdist(mols, n_jobs=1, squareform=True, **fp_args) ¶

`datamol.similarity`¶

`cdist(mols1, mols2, n_jobs=1, distances_chunk=False, distances_chunk_memory=1024, distances_n_jobs=-1, **fp_args)` ¶

`pdist(mols, n_jobs=1, squareform=True, **fp_args)` ¶