`datamol`¶

Datamol is designed to be used with a single import (import datamol as dm). Most of the functions are available in datamol.*. The other ones are available through their specific modules.

The sections below show the Datamol functions that are directly available. For other modules, please browse the API using the left menu.

Working with molecules¶

The basics¶

`to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)` ¶

Convert an input molecule (smiles representation) into a Chem.rdchem.Mol.

Parameters:

Name	Type	Description	Default
`mol`	`str`	SMILES of a molecule or a molecule.	required
`add_hs`	`bool`	Whether hydrogens should be added to the molecule.	`False`
`explicit_only`	`bool`	Whether to only add explicit hydrogens or both (implicit and explicit) when `add_hs` is set to True.	`False`
`ordered`	`bool`	Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation.	`False`
`kekulize`	`bool`	Whether to perform kekulization of the input molecules.	`False`
`sanitize`	`bool`	Whether to apply rdkit sanitization when input is a SMILES.	`True`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: the molecule if the conversion succeeded. If the conversion fails, None is returned, so make sure that you handle this case on your own.

Source code in datamol/mol.py

def to_mol(
    mol: Union[str, Chem.rdchem.Mol],
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Convert a SMILES string (or an existing molecule) into a `Chem.rdchem.Mol`.

    Args:
        mol: SMILES of a molecule, or an already built molecule. A molecule is
            used as is (it is not parsed again and not copied by this function).
        add_hs: Whether hydrogens should be added to the molecule.
        explicit_only: Whether to only add explicit hydrogens or both
            (implicit and explicit) when `add_hs` is set to True.
        ordered: Whether the atoms should be reordered with `reorder_atoms`.
            This option is important if you want to ensure that the features
            returned will always maintain a single atom order for the same
            molecule, regardless of its original SMILES representation.
        kekulize: Whether to perform kekulization of the molecule.
        sanitize: Whether to apply rdkit sanitization when the input is a SMILES.
            It has no effect when the input is already a molecule.

    Returns:
        mol: the resulting molecule. If the conversion fails, None is returned,
        so make sure that you handle this case on your own.

    Raises:
        ValueError: if `mol` is neither a string nor a `Chem.rdchem.Mol`.
    """

    if not isinstance(mol, (str, Chem.rdchem.Mol)):
        raise ValueError(f"Input should be a Chem.rdchem.Mol or a string instead of '{type(mol)}'")

    if isinstance(mol, str):
        _mol = Chem.MolFromSmiles(mol, sanitize=sanitize)

        # Without sanitization the property cache is not populated; compute it
        # non-strictly so that later valence queries do not fail.
        if not sanitize and _mol is not None:
            _mol.UpdatePropertyCache(False)
    else:
        _mol = mol

    # Nothing else can be done with a molecule that failed to parse.
    if _mol is None:
        return None

    # Add hydrogens
    if add_hs:
        _mol = Chem.AddHs(_mol, explicitOnly=explicit_only, addCoords=True)

    # Reorder atoms
    if ordered:
        _mol = reorder_atoms(_mol)

    # NOTE: kekulization is applied to `_mol` directly; when the caller passed
    # a molecule and no previous step replaced it, that is the caller's object.
    if _mol is not None and kekulize:
        Chem.Kekulize(_mol, clearAromaticFlags=False)
    return _mol

`copy_mol(mol)` ¶

Copy a molecule and return a new one.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule to copy.	required

Source code in datamol/mol.py

def copy_mol(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
    """Return an independent deep copy of a molecule.

    Args:
        mol: the molecule to duplicate.

    Returns:
        A new object produced by `copy.deepcopy`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate

`reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)` ¶

Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`break_ties`	`bool`	Force breaking of ranked ties.	`True`
`include_chirality`	`bool`	Use chiral information when computing rank.	`True`
`include_isotopes`	`bool`	Use isotope information when computing rank.	`True`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: a molecule.

Source code in datamol/mol.py

def reorder_atoms(
    mol: Chem.rdchem.Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule following their canonical rank.

    This gives a single atom order for the same molecule, regardless of the
    representation it was built from.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the renumbered molecule. A molecule without atoms is returned as is.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = Chem.CanonicalRankAtoms(
        mol,
        breakTies=break_ties,
        includeChirality=include_chirality,
        includeIsotopes=include_isotopes,
    )

    # Sort the (rank, atom index) pairs: atoms come out by increasing rank,
    # and by increasing original index when two ranks are equal.
    ranked_atoms = sorted((rank, atom_idx) for atom_idx, rank in enumerate(ranks))
    ordering = [atom_idx for _, atom_idx in ranked_atoms]
    return Chem.RenumberAtoms(mol, ordering)

`randomize_atoms(mol)` ¶

Randomize the position of the atoms in a mol.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: a molecule.

Source code in datamol/mol.py

def randomize_atoms(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule in a random order.

    Args:
        mol: a molecule.

    Returns:
        mol: the renumbered molecule. A molecule without atoms is returned as is.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    # Shuffle the identity ordering in place and use it as the new numbering.
    shuffled = list(range(n_atoms))
    random.shuffle(shuffled)
    return Chem.RenumberAtoms(mol, shuffled)

`to_neutral(mol)` ¶

Neutralize the charge of a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: a molecule.

Source code in datamol/mol.py

def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Reset the formal charge of the charged atoms of a molecule.

    An atom is reset to a formal charge of 0 when it is negatively charged, or
    when it is positively charged and its explicit valence is at least the
    default valence of its element. The molecule is modified in place.

    Args:
        mol: a molecule.

    Returns:
        mol: the same molecule object, or None when None was given.
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()

        if charge >= 0:
            # Positive charges are only removed from atoms whose explicit
            # valence reaches the default valence of the element.
            saturated = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (saturated and charge > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol

`set_mol_props(mol, props, copy=False)` ¶

Set properties to a mol from a dict.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	the mol where to copy the props.	required
`props`	`Dict[str, Any]`	the props to copy.	required
`copy`	`bool`	whether to copy the provided mol	`False`

Source code in datamol/mol.py

def set_mol_props(
    mol: Chem.rdchem.Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Chem.rdchem.Mol:
    """Store the entries of a dict as properties of a molecule.

    Booleans, integers and floats are stored with the matching typed setter;
    any other value is stored as its `str()` representation.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to work on a copy of the provided mol (only the value
            `True` itself triggers the copy).

    Returns:
        The molecule carrying the properties: the copy when `copy` is True,
        otherwise the molecule that was passed in.
    """
    target = dm.copy_mol(mol) if copy is True else mol

    for name, value in props.items():
        # `bool` has to be tested before `int` since it is a subclass of it.
        if isinstance(value, bool):
            target.SetBoolProp(name, value)
            continue
        if isinstance(value, int):
            target.SetIntProp(name, value)
            continue
        if isinstance(value, float):
            target.SetDoubleProp(name, value)
            continue
        target.SetProp(name, str(value))

    return target

`copy_mol_props(source, destination)` ¶

Copy properties from one source molecule to another destination molecule.

Parameters:

Name	Type	Description	Default
`source`	`Mol`	a molecule to copy from.	required
`destination`	`Mol`	a molecule to copy to.	required

Source code in datamol/mol.py

def copy_mol_props(source: Chem.rdchem.Mol, destination: Chem.rdchem.Mol):
    """Transfer the properties of one molecule onto another one.

    The properties are read with `GetPropsAsDict()` and written onto
    `destination` in place with `dm.set_mol_props`. Nothing is returned.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
    """
    dm.set_mol_props(destination, source.GetPropsAsDict())

`atom_indices_to_mol(mol, copy=False)` ¶

Add the molAtomMapNumber property to each atom.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule	required
`copy`	`bool`	Whether to copy the molecule.	`False`

Source code in datamol/mol.py

def atom_indices_to_mol(mol: Chem.rdchem.Mol, copy: bool = False):
    """Set the `molAtomMapNumber` property of every atom to its atom index.

    Args:
        mol: a molecule
        copy: Whether to work on a copy of the molecule (only the value `True`
            itself triggers the copy).

    Returns:
        The annotated molecule: the copy when `copy` is True, otherwise the
        molecule that was passed in.
    """
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        index = atom.GetIdx()
        atom.SetProp("molAtomMapNumber", str(index))

    return target

`same_mol(mol1, mol2)` ¶

Check whether two molecules are the same by comparing their InChiKey.

Invalid molecules (None) are always considered as not the same.

Parameters:

Name	Type	Description	Default
`mol1`	`Optional[rdkit.Chem.rdchem.Mol]`	A molecule.	required
`mol2`	`Optional[rdkit.Chem.rdchem.Mol]`	A molecule.	required

Source code in datamol/mol.py

def same_mol(mol1: Optional[Chem.rdchem.Mol], mol2: Optional[Chem.rdchem.Mol]):
    """Tell whether two molecules share the same InChiKey.

    Invalid molecules (None) are always considered as not the same, including
    when both inputs are None.

    Args:
        mol1: A molecule.
        mol2: A molecule.

    Returns:
        True when both molecules are valid and their InChiKeys are equal,
        False otherwise.
    """
    both_valid = mol1 is not None and mol2 is not None
    if not both_valid:
        return False

    first_key = dm.to_inchikey(mol1)
    second_key = dm.to_inchikey(mol2)
    return first_key == second_key

Fix, sanitize and standardize¶

`sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)` ¶

An augmented version of RDKit sanitize=True. It uses a mol-SMILES-mol conversion to catch potential aromaticity errors and try to fix aromatic nitrogen (using the popular sanifix4 script). Optionally, it can neutralize the charge of the molecule.

Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.

Note #2: The molecule's properties will be preserved but the atom's properties will be lost.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`charge_neutral`	`bool`	whether charge neutralization should be applied.	`False`
`sanifix`	`bool`	whether to run the sanifix from James Davidson (sanifix4.py) that tries to adjust aromatic nitrogens.	`True`
`verbose`	`bool`	Whether to display a warning about multiple conformers.	`True`
`add_hs`	`bool`	Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens.	`False`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: a molecule.

Source code in datamol/mol.py

def sanitize_mol(
    mol: Chem.rdchem.Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """An augmented version of RDKit `sanitize=True`.

    The molecule goes through a mol-SMILES-mol round trip to catch potential
    aromaticity errors, after optionally neutralizing its charges and trying
    to fix aromatic nitrogens (using the popular sanifix4 script).

    Note #1: Only the first conformer (if present) will be preserved and
    a warning will be displayed if more than one conformer is detected.

    Note #2: The molecule's properties will be preserved but the atom's
    properties will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.
        verbose: Whether to display a warning about multiple conformers.
        add_hs: Add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        mol: the sanitized molecule, or None when the input is None or when
        one of the steps fails.
    """
    if mol is None:
        return mol

    # Remember the molecule-level properties: they do not survive the SMILES
    # round trip below and are put back at the end.
    saved_props = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    if mol is None:
        return None

    # Detect multiple conformers
    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            "The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # Guard the round trip against occasional aromaticity errors.
    try:
        # `cxsmiles` is used here to preserve the first conformer.
        round_tripped = to_mol(dm.to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs)  # type: ignore
    except Exception:
        return None

    if round_tripped is None:
        return None

    # Insert back properties.
    return dm.set_mol_props(round_tripped, saved_props)

`sanitize_first(mols, charge_neutral=False, sanifix=True)` ¶

Sanitize a list of molecules and return the first valid molecule seen in the list.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`charge_neutral`	`bool`	whether charge neutralization should be applied.	`False`
`sanifix`	`bool`	whether to run the sanifix from James Davidson (sanifix4.py) that tries to adjust aromatic nitrogens.	`True`

Returns:

Type	Description
`mol`	a molecule.

Source code in datamol/mol.py

def sanitize_first(mols: List[Chem.rdchem.Mol], charge_neutral: bool = False, sanifix: bool = True):
    """Return the first molecule of a list that survives sanitization.

    Molecules are sanitized one after the other with `sanitize_mol`, stopping
    at the first truthy result.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        mol: the first sanitized molecule, or None when no molecule could be
        sanitized (or the list is empty).
    """
    # Lazy: a molecule is only sanitized if all the previous ones failed.
    sanitized = (
        sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        for candidate in mols
    )
    return next((fixed for fixed in sanitized if fixed), None)

`sanitize_smiles(smiles, isomeric=True)` ¶

Takes SMILES string and returns its sanitized version.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	smiles to be sanitized.	required
`isomeric`	`bool`	Whether to include information about stereochemistry in the SMILES.	`True`

Returns:

Type	Description
`Optional[str]`	sanitized smiles.

Source code in datamol/mol.py

def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Takes SMILES string and returns its sanitized version.

    The SMILES is parsed without RDKit sanitization, passed through
    `dm.sanitize_mol` (without charge neutralization) and written back as
    SMILES.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        The sanitized smiles, or None when the input cannot be parsed,
        sanitized or converted back to SMILES.
    """
    try:
        mol = dm.to_mol(smiles, sanitize=False)
        mol = dm.sanitize_mol(mol, charge_neutral=False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        return dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # Only regular errors are turned into None; KeyboardInterrupt and
        # SystemExit are deliberately left to propagate.
        return None

`standardize_smiles(smiles, tautomer=False)` ¶

Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.

Parameters:

Name	Type	Description	Default
`smiles`	`str`	Smiles to standardize	required
`tautomer`	`bool`	Whether to canonicalize tautomers	`False`

Returns:

Type	Description
`standard_smiles`	the standardized smiles

Source code in datamol/mol.py

def standardize_smiles(smiles: str, tautomer: bool = False):
    """Standardize a SMILES string.

    This is a convenience function wrapped around the RDKit SMILES
    standardizer, optionally followed by tautomeric canonicalization.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """
    standardized = rdMolStandardize.StandardizeSmiles(smiles)
    if not tautomer:
        return standardized
    return canonicalize_tautomer_smiles(standardized)

`standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)` ¶

This function returns a standardized version of the given molecule, with or without disconnecting the metals. The steps are applied in the order of the arguments.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	The molecule to standardize.	required
`disconnect_metals`	`bool`	Whether to disconnect the metallic atoms from non-metals	`False`
`normalize`	`bool`	Whether to apply normalization (correct functional groups and recombine charges).	`True`
`reionize`	`bool`	Whether to apply molecule reionization	`True`
`uncharge`	`bool`	Whether to remove all charge from molecule	`False`
`stereo`	`bool`	Whether to attempt to assign stereochemistry	`True`

Returns:

Type	Description
`mol`	The standardized molecule.

Source code in datamol/mol.py

def standardize_mol(
    mol: Chem.rdchem.Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
):
    """Return a standardized copy of a molecule.

    The input is copied first, then the enabled steps are applied in the
    order of the arguments: metal disconnection, normalization, reionization,
    uncharging and stereochemistry assignment.

    Args:
        mol: The molecule to standardize.
        disconnect_metals: Whether to disconnect the metallic atoms from non-metals
        normalize: Whether to apply normalization (correct functional groups and recombine charges).
        reionize: Whether to apply molecule reionization
        uncharge: Whether to remove all charge from molecule
        stereo: Whether to attempt to assign stereochemistry

    Returns:
        mol: The standardized molecule.
    """
    standardized = copy_mol(mol)

    if disconnect_metals:
        standardized = rdMolStandardize.MetalDisconnector().Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        # Works in place on the copy.
        Chem.AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized

`fix_valence_charge(mol, inplace=False)` ¶

Fix valence issues that are due to incorrect charges.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	Input molecule with incorrect valence for some atoms	required
`inplace`	`bool`	Whether to modify in place or make a copy.	`False`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	Fixed molecule via charge correction or original molecule if failed.

Source code in datamol/mol.py

def fix_valence_charge(mol: Chem.rdchem.Mol, inplace: bool = False) -> Optional[Chem.rdchem.Mol]:
    """Fix valence issues that are due to incorrect charges.

    When the RDKit validation reports at least one problem, the formal charge
    of every atom is set to its total valence (implicit + explicit) minus the
    default valence of its element.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The molecule with corrected charges, or the input molecule itself
        when the validation reports no problem.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)
    for atom in mol.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = dm.PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return mol

`incorrect_valence(a, update=False)` ¶

Check whether an atom, or a molecule as a whole, has a valence problem.

Parameters:

Name	Type	Description	Default
`a`	`Union[rdkit.Chem.rdchem.Mol, rdkit.Chem.rdchem.Atom]`	atom or molecule to check for valence issue.	required
`update`	`bool`	Update owning molecule property cache first.	`False`

Returns:

Type	Description
`bool`	Whether the input atom (or molecule) has an incorrect valence.

Source code in datamol/mol.py

def incorrect_valence(a: Union[Chem.rdchem.Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Check whether an atom, or a molecule as a whole, has a valence problem.

    A molecule is checked with the RDKit validation. An atom is reported as
    incorrect when it has no implicit valence left and its explicit valence
    is above the largest valence listed for its element.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first (only used when
            `a` is an atom).

    Returns:
        True when the valence is INCORRECT, False otherwise.
    """
    if isinstance(a, Chem.rdchem.Mol):
        a.UpdatePropertyCache(False)
        problems = rdMolStandardize.RDKitValidation().validate(a)
        return len(problems) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    if a.GetImplicitValence() != 0:
        return False

    explicit_valence = a.GetExplicitValence()
    highest_allowed = max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))
    return explicit_valence > highest_allowed

`decrease_bond(bond)` ¶

Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.

Parameters:

Name	Type	Description	Default
`bond`	`Bond`	a bond.	required

Source code in datamol/mol.py

def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Give the bond type obtained by taking one bond order away from a bond.

    Note that you should first kekulize your molecules and remove
    non-standard bonds.

    Args:
        bond: a bond.

    Returns:
        `DOUBLE_BOND` for a triple bond, `SINGLE_BOND` for a double bond and
        None for a single bond. For any other bond type the bond object itself
        is returned unchanged.
    """
    current = bond.GetBondType()

    # (current type, type after removing one bond order)
    lowered_types = (
        (TRIPLE_BOND, DOUBLE_BOND),
        (DOUBLE_BOND, SINGLE_BOND),
        (SINGLE_BOND, None),
    )
    for bond_type, lowered in lowered_types:
        if current == bond_type:
            return lowered

    return bond

`fix_valence(mol, inplace=False, allow_ring_break=False)` ¶

Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.

Parameters:

Name	Type	Description	Default
`mol`		input molecule with incorrect valence for some atoms	required
`inplace`	`bool`	Whether to modify in place or make a copy	`False`
`allow_ring_break`	`bool`	Whether bond removal involving ring is allowed.	`False`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	The molecule with potential valence issues fixed, the (possibly copied) input molecule when nothing is broken, or None if the fix failed.

Source code in datamol/mol.py

def fix_valence(
    mol, inplace: bool = False, allow_ring_break: bool = False
) -> Optional[Chem.rdchem.Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    The fix is greedy and runs in two passes: hydrogens (and positive formal
    charge) are first taken away from the atoms flagged by `incorrect_valence`,
    then the bonds that still touch a flagged atom are lowered by one order
    (see `decrease_bond`) or removed.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The molecule with potential valence issues fixed, the (possibly copied)
        input molecule when the validation reports nothing, or None when an
        exception is raised while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        # NOTE(review): the repaired molecule is built from the result of
        # `Chem.RemoveHs`, so with `inplace=True` the caller's object is
        # presumably not the one being fixed here -- confirm.
        m = Chem.RemoveHs(
            mol,
            implicitOnly=False,
            updateExplicitCount=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # first pass using explicit false count
        # While an atom is flagged and still carries hydrogens, set its explicit
        # hydrogen count to one less than its total and lower its formal charge
        # by one (never below 0).
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
            atom.UpdatePropertyCache(False)

        # Second pass: work on an editable copy and only look at the bonds
        # that have at least one flagged end atom.
        em = Chem.RWMol(m)
        bonds = em.GetBonds()
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-check: an earlier edit in this loop may already have fixed
            # the atoms of this bond.
            if incorrect_valence(a1) or incorrect_valence(a2):
                mbond = decrease_bond(bond)
                # The bond is edited when ring breaking is allowed, when
                # `decrease_bond` returned something other than None (the bond
                # is lowered, not removed), or when the bond is not in a ring.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    # None means a single bond: it is simply removed.
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Any failure during the repair is reported as None.
        return None

    return m

`adjust_singleton(mol)` ¶

Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and methane fragment will be removed in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required

Source code in datamol/mol.py

def adjust_singleton(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Remove the atoms that are disconnected singleton nodes in the molecular graph.

    For example, the chlorine atom and methane fragment will be removed in
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment. An atom is
    treated as a singleton when its explicit valence is 0.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the singleton atoms.
    """
    editable = Chem.RWMol(mol)

    singleton_indices = [
        atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0
    ]

    # Remove from the highest index down so that the indices still to be
    # processed are not shifted by earlier removals.
    for atom_idx in sorted(singleton_indices, reverse=True):
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()

`remove_dummies(mol, dummy='*')` ¶

Remove dummy atoms from molecules.

Source code in datamol/mol.py

def remove_dummies(mol: Chem.rdchem.Mol, dummy: str = "*") -> Optional[Chem.rdchem.Mol]:
    """Remove dummy atoms from molecules.

    The dummies are first replaced by hydrogens that are then removed. If
    that raises, the dummies are deleted from the molecule instead.

    Args:
        mol: a molecule.
        dummy: SMILES of the dummy atom to remove.

    Returns:
        The molecule without the dummy atoms.
    """
    dummy_query = dm.to_mol(dummy)

    try:
        with_hydrogens = Chem.ReplaceSubstructs(mol, dummy_query, dm.to_mol("[H]"), True)[0]
        return Chem.RemoveHs(with_hydrogens)
    except Exception:
        # Fallback: drop the dummy atoms without replacing them.
        return Chem.DeleteSubstructs(mol, dummy_query)

`fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)` ¶

Fix error in molecule using a greedy approach.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	input molecule to fix	required
`n_iter`	`int`	Number of valence fix iteration to apply	`1`
`remove_singleton`	`bool`	Whether `adjust_singleton` should be applied	`False`
`largest_only`	`bool`	Whether only the largest fragment should be kept	`False`
`inplace`	`bool`	Whether to return a copy of the mol or perform in place operation	`False`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	Fixed molecule.

Source code in datamol/mol.py

def fix_mol(
    mol: Chem.rdchem.Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Fix error in molecule using a greedy approach.

    The molecule is sanitized (falling back to the unsanitized molecule when
    that fails), its dummy atoms are removed, then `fix_valence` is applied
    `n_iter` times before the optional clean-up steps.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule, or None when the input is None or when the valence
        fix fails.
    """

    if not inplace:
        mol = copy.copy(mol)

    m = sanitize_mol(mol) or mol  # fail back to mol when the fixer fail

    if m is None:
        return None

    m = remove_dummies(m)
    for _ in range(n_iter):
        m = fix_valence(m)
        if m is None:
            # `fix_valence` reports a failed repair with None. Stop here
            # instead of passing None to the following steps, which expect
            # a molecule.
            return None

    if remove_singleton:
        m = adjust_singleton(m)

    if largest_only:
        m = rdMolStandardize.FragmentParent(m, skipStandardize=True)

    return m

`replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)` ¶

Replace dummy atoms in a molecule with another atom.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	molecule with dummies	required
`atom`	`str`	replacement atom, default is carbon	`'C'`
`dummy`	`str`	dummy atom representation	`'*'`
`replace_all`	`bool`	Whether to replace all dummies	`True`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol: Molecule with dummy replaced

Source code in datamol/mol.py

def replace_dummies_atoms(
    mol: Chem.rdchem.Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Replace dummy atoms in a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom (as SMILES), default is carbon
        dummy: dummy atom representation (as SMILES)
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced. Only the first result given by
        `Chem.ReplaceSubstructs` is returned.
    """
    dummy_query = Chem.MolFromSmiles(dummy)
    substitute = Chem.MolFromSmiles(atom)
    candidates = Chem.ReplaceSubstructs(mol, dummy_query, substitute, replaceAll=replace_all)
    return candidates[0]

`keep_largest_fragment(mol)` ¶

Only keep largest fragment of each molecule.

Source code in datamol/mol.py

def keep_largest_fragment(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Keep only the fragment of a molecule that has the most atoms.

    Args:
        mol: a molecule.

    Returns:
        The fragment with the highest atom count, or the input molecule when
        no fragment is found.
    """

    def _atom_count(fragment):
        return fragment.GetNumAtoms()

    fragments = rdmolops.GetMolFrags(mol, asMols=True)
    return max(fragments, key=_atom_count, default=mol)

`is_transition_metal(at)` ¶

Check if atom is a transition metal.

Parameters:

Name	Type	Description	Default
`at`	`Atom`	an atom.	required

Source code in datamol/mol.py

def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Check if atom is a transition metal.

    Only the atomic numbers 22-29, 40-47 and 72-79 are accepted, so for
    instance 21, 30, 39, 48 and 80 are not reported as transition metals.

    Args:
        at: an atom.

    Returns:
        True when the atomic number of the atom is in one of the ranges above.
    """
    atomic_num = at.GetAtomicNum()
    # Inclusive (first, last) atomic numbers of the accepted ranges.
    accepted_ranges = ((22, 29), (40, 47), (72, 79))
    return any(first <= atomic_num <= last for first, last in accepted_ranges)

`set_dative_bonds(mol, from_atoms=(7, 8))` ¶

Replaces some single bonds between metals and atoms with atomic numbers in fromAtoms with dative bonds. The replacement is only done if the atom has "too many" bonds.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	molecule with bond to modify	required
`from_atoms`	`Tuple[int, int]`	List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered.	`(7, 8)`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	The modified molecule.

Source code in datamol/mol.py

def set_dative_bonds(
    mol: Chem.rdchem.Mol, from_atoms: Tuple[int, int] = (7, 8)
) -> Optional[Chem.rdchem.Mol]:
    """Replace some single bonds between metals and their neighbors with dative bonds.

    Only the neighbors whose atomic number (or symbol) is listed in
    `from_atoms` are considered, and the replacement is only done if the
    neighbor has "too many" bonds, i.e. its explicit valence is above the
    default valence of its element.

    Args:
        mol: molecule with bond to modify
        from_atoms: List of atoms (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule (an editable copy of the input).
    """
    editable = Chem.RWMol(mol)
    editable.UpdatePropertyCache(strict=False)

    metal_atoms = [atom for atom in editable.GetAtoms() if is_transition_metal(atom)]
    for metal in metal_atoms:
        for neighbor in metal.GetNeighbors():
            is_donor = (
                neighbor.GetAtomicNum() in from_atoms or neighbor.GetSymbol() in from_atoms
            )
            if not is_donor:
                continue

            # "Too many" bonds: explicit valence above the element default.
            over_valence = neighbor.GetExplicitValence() > PERIODIC_TABLE.GetDefaultValence(
                neighbor.GetAtomicNum()
            )
            if not over_valence:
                continue

            link = editable.GetBondBetweenAtoms(neighbor.GetIdx(), metal.GetIdx())
            if link.GetBondType() == SINGLE_BOND:
                # The dative bond goes from the neighbor to the metal.
                editable.RemoveBond(neighbor.GetIdx(), metal.GetIdx())
                editable.AddBond(neighbor.GetIdx(), metal.GetIdx(), DATIVE_BOND)

    return editable

Enumerate¶

`enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True)` ¶

Enumerate the stereocenters and bonds of the current molecule.

Original source: the openff-toolkit lib.

Warning: this function can be computationally intensive.

Parameters:

Name	Type	Description	Default
`mol`		The molecule whose state we should enumerate.	required
`n_variants`	`int`	The maximum amount of molecules that should be returned.	`20`
`undefined_only`	`bool`	If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry.	`False`
`rationalise`	`bool`	If we should try to build and rationalise the molecule to ensure it can exist.	`True`

Source code in datamol/mol.py

def enumerate_stereoisomers(
    mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationally intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.

    Returns:
        A list of stereoisomers. The list is empty when the RDKit enumeration raises.
    """
    from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
    from rdkit.Chem.EnumerateStereoisomers import StereoEnumerationOptions

    # safety first: operate on the result of `copy_mol`, not on the argument itself
    mol = copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=True)  # type: ignore
    Chem.FindPotentialStereoBonds(mol, cleanIt=True)  # type: ignore

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
    )

    try:
        isomers = tuple(EnumerateStereoisomers(mol, options=stereo_opts))
    except Exception:
        # NOTE(hadim): often got "Stereo atoms should be specified before specifying CIS/TRANS bond stereochemistry"
        # for the ligand of reference (coming from the PDB). Not sure how to handle that.
        # `Exception` is caught instead of using a bare `except` so that
        # KeyboardInterrupt and SystemExit are not silently swallowed.
        isomers = ()

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=True)  # type: ignore
        variants.append(isomer)

    return variants

`enumerate_tautomers(mol, n_variants=20)` ¶

Enumerate the possible tautomers of the current molecule.

Original source: the openff-toolkit lib.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	The molecule whose state we should enumerate.	required
`n_variants`	`int`	The maximum amount of molecules that should be returned.	`20`

Source code in datamol/mol.py

def enumerate_tautomers(mol: Chem.rdchem.Mol, n_variants: int = 20):
    """List tautomers of a molecule.

    Original source: the `openff-toolkit` lib.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.

    Returns:
        A list built from the output of RDKit's `TautomerEnumerator.Enumerate`.
    """
    # Enumerate on the result of `copy_mol` rather than on the argument itself.
    working_mol = copy_mol(mol)

    tautomer_enumerator = rdMolStandardize.TautomerEnumerator()
    tautomer_enumerator.SetMaxTautomers(n_variants)

    return list(tautomer_enumerator.Enumerate(working_mol))

Convert molecule(s)¶

`to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False)` ¶

Convert a mol to a SMILES.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required
`canonical`	`bool`	if false no attempt will be made to canonicalize the molecule.	`True`
`isomeric`	`bool`	whether to include information about stereochemistry in the SMILES.	`True`
`ordered`	`bool`	whether to force reordering of the atoms first.	`False`
`explicit_bonds`	`bool`	if true, all bond orders will be explicitly indicated in the output SMILES.	`False`
`explicit_hs`	`bool`	if true, all H counts will be explicitly indicated in the output SMILES.	`False`
`randomize`	`bool`	whether to randomize the generated smiles. Override `canonical`.	`False`
`cxsmiles`	`bool`	Whether to return a CXSMILES instead of a SMILES.	`False`
`allow_to_fail`	`bool`	Raise an error if the conversion to SMILES fails. Return None otherwise.	`False`

Source code in datamol/convert.py

def to_smiles(
    mol: Chem.rdchem.Mol,
    canonical: bool = True,
    isomeric: bool = True,
    ordered: bool = False,
    explicit_bonds: bool = False,
    explicit_hs: bool = False,
    randomize: bool = False,
    cxsmiles: bool = False,
    allow_to_fail: bool = False,
) -> Optional[str]:
    """Serialise a molecule to a SMILES (or CXSMILES) string.

    Args:
        mol: a molecule.
        canonical: if false no attempt will be made to canonicalize the molecule.
        isomeric: whether to include information about stereochemistry in the SMILES.
        ordered: whether to force reordering of the atoms first. Only applied when
            `canonical` is False.
        explicit_bonds: if true, all bond orders will be explicitly indicated in the output SMILES.
        explicit_hs: if true, all H counts will be explicitly indicated in the output SMILES.
        randomize: whether to randomize the generated smiles. Override `canonical`.
        cxsmiles: Whether to return a CXSMILES instead of a SMILES.
        allow_to_fail: If True, an error raised during the conversion is re-raised.
            If False, None is returned instead.

    Returns:
        The SMILES string, or None when the conversion failed and `allow_to_fail` is False.
    """
    if ordered and canonical is False:
        mol = dm.reorder_atoms(mol)

    if randomize:
        # A randomized atom order would be undone by canonicalization.
        mol = dm.randomize_atoms(mol)
        canonical = False

    try:
        # Both RDKit writers receive the same keyword arguments.
        write_smiles = Chem.MolToCXSmiles if cxsmiles else Chem.MolToSmiles  # type: ignore
        return write_smiles(
            mol,
            isomericSmiles=isomeric,
            canonical=canonical,
            allBondsExplicit=explicit_bonds,
            allHsExplicit=explicit_hs,
        )
    except Exception as error:
        if not allow_to_fail:
            return None
        raise error

`to_selfies(mol)` ¶

Convert a mol to SELFIES.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, rdkit.Chem.rdchem.Mol]`	a molecule or a SMILES.	required

Returns:

Type	Description
`Optional[str]`	selfies: SELFIES string.

Source code in datamol/convert.py

def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Encode a molecule as a SELFIES string.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        selfies: SELFIES string. None is returned when the input is None or when the
            encoder returns -1.
    """
    if mol is None:
        return None

    # Molecule objects are converted to SMILES before being handed to the encoder.
    smiles = to_smiles(mol) if isinstance(mol, Chem.rdchem.Mol) else mol

    encoded = sf.encoder(smiles)  # type: ignore

    return None if encoded == -1 else encoded

`from_selfies(selfies, as_mol=False)` ¶

Convert a SELFIES to a smiles or a mol.

Parameters:

Name	Type	Description	Default
`selfies`	`str`	a selfies.	required
`as_mol`	`bool`	whether to return a mol or a smiles.	`False`

Returns:

Type	Description
`Union[str, rdkit.Chem.rdchem.Mol]`	smiles or mol.

Source code in datamol/convert.py

def from_selfies(selfies: str, as_mol: bool = False) -> Optional[Union[str, Chem.rdchem.Mol]]:
    """Decode a SELFIES string into a SMILES or a molecule.

    Args:
        selfies: a selfies.
        as_mol: whether to return a mol (True) or a smiles (False).

    Returns:
        smiles or mol. None is returned when `selfies` is None or when the decoder
        yields None.
    """
    if selfies is None:
        return None

    decoded = sf.decoder(selfies)

    # Without a decoded SMILES, or when a SMILES is what the caller wants, stop here.
    if decoded is None or not as_mol:
        return decoded

    return dm.to_mol(decoded)

`to_smarts(mol, keep_hs=True)` ¶

Convert a molecule to a smarts.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, rdkit.Chem.rdchem.Mol]`	a molecule.	required
`keep_hs`	`bool`	Whether to keep hydrogen. This will increase the count of H atoms for atoms with attached hydrogens to create a valid smarts. e.g. `[H]-[CH2]-[*]` -> `[H]-[CH3]-[*]`	`True`

Returns:

Type	Description
`Optional[str]`	smarts of the molecule

Source code in datamol/convert.py

def to_smarts(mol: Union[str, Chem.rdchem.Mol], keep_hs: bool = True) -> Optional[str]:
    """Convert a molecule to a smarts.

    The isotope and hydrogen bookkeeping below is done on a private copy, so a
    molecule passed by the caller is left unmodified.

    Args:
        mol: a molecule or a SMILES.
        keep_hs: Whether to keep hydrogen. This will increase the count of H atoms
            for atoms with attached hydrogens to create a valid smarts.
            e.g. [H]-[CH2]-[*] -> [H]-[CH3]-[*]

    Returns:
        smarts of the molecule. None is returned when the input is None, when a SMILES
        input cannot be converted to a molecule, or when the SMILES generation fails.
    """

    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        # `to_mol` returns None when the conversion fails.
        if mol is None:
            return None
    else:
        # Copy the molecule: the loop below rewrites isotopes and explicit H counts
        # and must not leak those edits into the caller's object.
        mol = Chem.Mol(mol)

    # Change the isotope to 42
    for atom in mol.GetAtoms():  # type: ignore
        if keep_hs:
            n_h_neighbors = sum(na.GetAtomicNum() == 1 for na in atom.GetNeighbors())
            if n_h_neighbors:
                atom.SetNumExplicitHs(atom.GetTotalNumHs() + n_h_neighbors)
        atom.SetIsotope(42)

    # Print out the smiles, all the atom attributes will be fully specified
    smarts = to_smiles(mol, isomeric=True, explicit_bonds=True)

    if smarts is None:
        return None

    # Remove the 42 isotope labels
    smarts = re.sub(r"\[42", "[", smarts)
    return smarts

`to_inchi(mol)` ¶

Convert a mol to Inchi.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, rdkit.Chem.rdchem.Mol]`	a molecule.	required

Source code in datamol/convert.py

def to_inchi(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Compute the InChI of a molecule.

    Args:
        mol: a molecule. A string input is first converted with `dm.to_mol`.

    Returns:
        The value returned by `Chem.MolToInchi`, or None when `mol` is None.
    """
    if mol is None:
        return None

    mol_obj = dm.to_mol(mol) if isinstance(mol, str) else mol
    return Chem.MolToInchi(mol_obj)

`to_inchikey(mol)` ¶

Convert a mol to Inchi key.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, rdkit.Chem.rdchem.Mol]`	a molecule	required

Source code in datamol/convert.py

def to_inchikey(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Compute the InChI key of a molecule.

    Args:
        mol: a molecule. A string input is first converted with `dm.to_mol`.

    Returns:
        The value returned by `Chem.MolToInchiKey`, or None when `mol` is None.
    """
    if mol is None:
        return None

    mol_obj = dm.to_mol(mol) if isinstance(mol, str) else mol
    return Chem.MolToInchiKey(mol_obj)

`from_inchi(inchi, sanitize=True, remove_hs=True)` ¶

Convert an InChi to a mol.

Parameters:

Name	Type	Description	Default
`inchi`	`Optional[str]`	an inchi string.	required
`sanitize`	`bool`	do sanitize.	`True`
`remove_hs`	`bool`	do remove hs.	`True`

Returns:

Type	Description
`Optional[rdkit.Chem.rdchem.Mol]`	mol

Source code in datamol/convert.py

def from_inchi(
    inchi: Optional[str],
    sanitize: bool = True,
    remove_hs: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Build a molecule from an InChI string.

    Args:
        inchi: an inchi string.
        sanitize: forwarded to `Chem.MolFromInchi` as `sanitize`.
        remove_hs: forwarded to `Chem.MolFromInchi` as `removeHs`.

    Returns:
        mol: the result of `Chem.MolFromInchi`, or None when `inchi` is None.
    """
    return (
        None
        if inchi is None
        else Chem.MolFromInchi(inchi, sanitize=sanitize, removeHs=remove_hs)
    )

`to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False)` ¶

Convert a list of mols to a dataframe using each mol properties as a column.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`smiles_column`	`Optional[str]`	name of the SMILES column.	`'smiles'`
`mol_column`	`str`	Name of the column. If not None, rdkit.Chem.PandasTools is used to add a molecule column.	`None`
`include_private`	`bool`	Include private properties in the columns.	`False`
`include_computed`	`bool`	Include computed properties in the columns.	`False`
`render_df_mol`	`bool`	whether to render the molecule in the dataframe to images. If called once, it will be applied for the newly created dataframe with mol in it.	`True`
`render_all_df_mol`	`bool`	Whether to render all pandas dataframe mol column as images.	`False`

Source code in datamol/convert.py

def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Build a dataframe from a list of mols, with one column per molecule property.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. No SMILES column is added when None.
        mol_column: Name of the column holding the molecule objects. No such column
            is added when None.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to render the molecule in the dataframe to images.
            Only has an effect when `mol_column` is not None.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.
            Only evaluated when the rendering above is performed.

    Returns:
        The SMILES/mol columns concatenated column-wise with the properties columns.
    """

    # Leading columns: SMILES first, then the molecule objects.
    leading = pd.DataFrame()

    if smiles_column is not None:
        leading[smiles_column] = [dm.to_smiles(mol) for mol in mols]

    if mol_column is not None:
        leading[mol_column] = mols

    # One dict of properties per molecule becomes one row of the properties frame.
    props_df = pd.DataFrame(
        [
            mol.GetPropsAsDict(includePrivate=include_private, includeComputed=include_computed)
            for mol in mols
        ]
    )

    name_clash = smiles_column is not None and smiles_column in props_df.columns
    if name_clash:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. THe returned dataframe will two columns with the same name."
        )

    result = pd.concat([leading, props_df], axis=1)

    # Render mol column to images
    if render_df_mol is True and mol_column is not None:
        # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(result)

        if render_all_df_mol:
            PandasTools.RenderImagesInAllDataFrames()

    return result

`from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True)` ¶

Convert a dataframe to a list of mols.

Note

If smiles_column is used to build the molecules, this property is removed from the molecules' properties. You can decide to conserve the SMILES column by setting conserve_smiles to True.

Parameters:

Name	Type	Description	Default
`df`	`DataFrame`	a dataframe.	required
`smiles_column`	`Optional[str]`	Column name to extract the molecule.	`'smiles'`
`mol_column`	`str`	Column name to extract the molecule. It takes precedence over `smiles_column`.	`None`
`conserve_smiles`	`bool`	Whether to conserve the SMILES in the mols' props.	`False`
`sanitize`	`bool`	Whether to sanitize if `smiles_column` is not None.	`True`

Source code in datamol/convert.py

def from_df(
    df: pd.DataFrame,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    conserve_smiles: bool = False,
    sanitize: bool = True,
) -> List[Chem.rdchem.Mol]:
    """Turn each row of a dataframe into a molecule carrying the row values as props.

    Note:
        If `smiles_column` is used to build the molecules, this property
        is removed from the molecules' properties. You can decide to conserve
        the SMILES column by setting `conserve_smiles` to True.

    Args:
        df: a dataframe.
        smiles_column: Column name to extract the molecule.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. When None, the columns are scanned and
            the last one whose first value is a molecule is used, if any.
        conserve_smiles: Whether to conserve the SMILES in the mols' props.
        sanitize: Whether to sanitize if `smiles_column` is not None.

    Returns:
        One entry per row; an entry is None when no molecule could be obtained for it.

    Raises:
        ValueError: if both `smiles_column` and `mol_column` are None.
    """

    if smiles_column is None and mol_column is None:
        raise ValueError("Either `smiles_column` or `mol_column` must be not None.")

    if len(df) == 0:
        return []

    # Auto-detect a molecule column by inspecting the first row of every column.
    if mol_column is None:
        detected = [
            name for name in df.columns if isinstance(df[name].iloc[0], Chem.rdchem.Mol)
        ]
        if detected:
            mol_column = detected[-1]

    def _row_to_mol(row):
        props = row.to_dict()

        if mol_column is not None:
            mol = props.pop(mol_column)
        else:
            # The SMILES used to build the molecule is dropped from the properties
            # unless the caller asked to conserve it.
            if conserve_smiles:
                smiles = props[smiles_column]
            else:
                smiles = props.pop(smiles_column)
            mol = dm.to_mol(smiles, sanitize=sanitize)

        if mol is not None:
            dm.set_mol_props(mol, props)

        return mol

    return df.apply(_row_to_mol, axis=1).tolist()

Input/Output¶

`read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)` ¶

Read a CSV file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, os.PathLike, TextIO]`	Path to a file or a file-like object. Path can be remote or local.	required
`smiles_column`	`str`	Use this column to build a mol column.	`None`
`mol_column`	`str`	Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file.	`'mol'`
`kwargs`		Arguments to pass to `pd.read_csv()`.	`{}`

Returns:

Type	Description
`DataFrame`	df: a `pandas.DataFrame`

Source code in datamol/io.py

def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file into a dataframe, optionally adding a mol column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Use this column to build a mol column. No mol column is
            built when None.
        mol_column: Name to give to the mol column built from `smiles_column`.
            Avoid when loading a very large file.
        kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """

    frame: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore

    # Nothing more to do when no SMILES column was designated.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame

`read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)` ¶

Read an excel file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, os.PathLike, TextIO]`	Path to a file or a file-like object. Path can be remote or local.	required
`sheet_name`	`Union[str, int, list]`	see `pandas.read_excel()` doc.	`0`
`smiles_column`	`str`	Use this column to build a mol column.	`None`
`mol_column`	`str`	Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file.	`'mol'`
`kwargs`		Arguments to pass to `pd.read_excel()`.	`{}`

Returns:

Type	Description
`DataFrame`	df: a `pandas.DataFrame`

Source code in datamol/io.py

def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read an excel file.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc. With a list or None, pandas returns
            a dict of dataframes (one per sheet) instead of a single dataframe.
        smiles_column: Use this column to build a mol column. No mol column is
            built when None.
        mol_column: Name to give to the mol column built from `smiles_column`.
            Avoid when loading a very large file.
        kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`, or the dict of dataframes returned by pandas when
            several sheets are requested.
    """

    df = pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs)  # type: ignore

    if smiles_column is not None:
        # A multi-sheet read yields a dict of frames; add the mol column to each
        # frame instead of passing the dict itself to RDKit.
        frames = df.values() if isinstance(df, dict) else [df]
        for frame in frames:
            PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)

    return df

`read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True)` ¶

Read an SDF file.

Note: This function is meant to be used with datasets that fit in memory. For more advanced usage we suggest using Chem.ForwardSDMolSupplier directly.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, os.PathLike, TextIO]`	Path to a file or a file-like object. Path can be remote or local.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`as_df`	`bool`	Whether to return a list of mols or a pandas DataFrame.	`False`
`smiles_column`	`Optional[str]`	Name of the SMILES column. Only relevant if `as_df` is True.	`'smiles'`
`mol_column`	`str`	Name of the mol column. Only relevant if `as_df` is True.	`None`
`include_private`	`bool`	Include private properties in the columns. Only relevant if `as_df` is True.	`False`
`include_computed`	`bool`	Include computed properties in the columns. Only relevant if `as_df` is True.	`False`
`strict_parsing`	`bool`	If set to false, the parser is more lax about correctness of the contents.	`True`

Source code in datamol/io.py

def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    sanitize: bool = True,
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    strict_parsing: bool = True,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Load the molecules of an SDF file.

    Note: This function is meant to be used with datasets that fit _in-memory_.
    For more advanced usage we suggest using `Chem.ForwardSDMolSupplier` directly.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
            Paths ending with `.gz` or `.gzip` are read through `gzip`.
        sanitize: Whether to sanitize the molecules.
        as_df: Whether to return a list of mols or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns.  Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns.  Only relevant if
            `as_df` is True.
        strict_parsing: If set to false, the parser is more lax about correctness of the contents.

    Returns:
        The molecules read from the file with None entries discarded, either as a
        list or, when `as_df` is True, as the dataframe produced by `dm.to_df`.
    """

    # Options shared by both ways of building the supplier.
    supplier_options = dict(sanitize=sanitize, strictParsing=strict_parsing)

    if isinstance(urlpath, io.IOBase):
        # File-like object: hand it to RDKit as is.
        loaded = list(Chem.ForwardSDMolSupplier(urlpath, **supplier_options))
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath) as handle:

            # Handle gzip file if needed
            if str(urlpath).endswith((".gz", ".gzip")):
                handle = gzip.open(handle)

            loaded = list(Chem.ForwardSDMolSupplier(handle, **supplier_options))

    # Discard None values
    valid_mols = [entry for entry in loaded if entry is not None]

    if not as_df:
        return valid_mols

    return dm.to_df(
        valid_mols,
        smiles_column=smiles_column,
        mol_column=mol_column,
        include_private=include_private,
        include_computed=include_computed,
    )  # type: ignore

`to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)` ¶

Write molecules to a file.

Parameters:

Name	Type	Description	Default
`mols`	`Union[rdkit.Chem.rdchem.Mol, Sequence[rdkit.Chem.rdchem.Mol], pandas.core.frame.DataFrame]`	a dataframe, a molecule or a list of molecule.	required
`urlpath`	`Union[str, os.PathLike, TextIO]`	Path to a file or a file-like object. Path can be remote or local.	required
`smiles_column`	`Optional[str]`	Column name to extract the molecule.	`'smiles'`
`mol_column`	`str`	Column name to extract the molecule. It takes precedence over `smiles_column`.	`None`

Source code in datamol/io.py

def to_sdf(
    mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
):
    """Write molecules to an SDF destination.

    Args:
        mols: a dataframe, a molecule or a list of molecule. None entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name to extract the molecule. Only used when `mols`
            is a dataframe.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. Only used when `mols` is a dataframe.
    """

    # Normalise the input to a plain list of molecules.
    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
    elif isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    writable = [entry for entry in mols if entry is not None]

    def _write_all(target):
        # Write every molecule to `target` through a single SDWriter, then close it.
        sd_writer = Chem.SDWriter(target)
        for entry in writable:
            sd_writer.write(entry)
        sd_writer.close()

    if isinstance(urlpath, io.IOBase):
        # File-like object
        _write_all(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath, mode="w") as handle:
            _write_all(handle)

`to_smi(mols, urlpath, error_if_empty=False)` ¶

Save a list of molecules in an .smi file.

Parameters:

Name	Type	Description	Default
`mols`	`Sequence[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`urlpath`	`Union[str, os.PathLike, TextIO]`	Path to a file or a file-like object. Path can be remote or local.	required
`error_if_empty`	`bool`	whether to raise an error if the input list is empty.	`False`

Source code in datamol/io.py

def to_smi(
    mols: Sequence[Chem.rdchem.Mol],
    urlpath: Union[str, os.PathLike, TextIO],
    error_if_empty: bool = False,
):
    """Write a list of molecules to an `.smi` destination.

    Args:
        mols: a list of molecules. None entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        error_if_empty: whether to raise an error if the input list is empty. The
            check is done before None entries are filtered out.

    Raises:
        ValueError: if `mols` is empty and `error_if_empty` is True.
    """

    if len(mols) == 0 and error_if_empty:
        raise ValueError("The list of mols/smiles provided is empty.")

    writable = [entry for entry in mols if entry is not None]

    def _write_all(target):
        # Write every molecule to `target` with no header line, then close the writer.
        smiles_writer = Chem.SmilesWriter(target, includeHeader=False, nameHeader="")
        for entry in writable:
            smiles_writer.write(entry)
        smiles_writer.close()

    if isinstance(urlpath, io.IOBase):
        # File-like object
        _write_all(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath, "w") as handle:
            _write_all(handle)

`read_smi(urlpath)` ¶

Read a list of smiles from an .smi file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, os.PathLike]`	Path to a file or a file-like object. Path can be remote or local. Note: file-like object are not supported yet.	required

Source code in datamol/io.py

def read_smi(
    urlpath: Union[str, os.PathLike],
) -> Sequence[Chem.rdchem.Mol]:
    """Read a list of smiles from an `.smi` file.

    Args:
        urlpath: Path to a file. Path can be remote or local.
            Note: file-like object are not supported yet.

    Returns:
        The molecules read from the file; entries the supplier yields as None are dropped.
    """

    # NOTE(hadim): the temporary local file copy
    # is because `SmilesMolSupplier` does not support
    # using file-like object, only path.

    if fsspec.utils.can_be_local(str(urlpath)):
        supplier = Chem.SmilesMolSupplier(str(urlpath), titleLine=0)
        return [mol for mol in supplier if mol is not None]

    # Copy to a local temporary path if the path is a remote one.
    # `mkstemp` returns an already-open OS-level descriptor along with the path;
    # close it right away so it is not leaked (only the path is needed).
    fd, tmp_name = tempfile.mkstemp()
    os.close(fd)
    tmp_path = pathlib.Path(tmp_name)

    try:
        dm.utils.fs.copy_file(urlpath, tmp_path)

        # Read the molecules
        supplier = Chem.SmilesMolSupplier(str(tmp_path), titleLine=0)
        return [mol for mol in supplier if mol is not None]
    finally:
        # Delete the local temporary path, even when the copy or the parsing failed.
        tmp_path.unlink()

Molecule similarity and distance¶

`pdist(mols, n_jobs=1, squareform=True, **fp_args)` ¶

Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.

Parameters:

Name	Type	Description	Default
`mols`	`List[Union[str, rdkit.Chem.rdchem.Mol]]`	list of molecules	required
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores.	`1`
`squareform`	`bool`	Whether to return in square form (matrix) or in a condensed form (1D vector).	`True`
`**fp_args`		list of args to pass to `to_fp()`.	`{}`

Returns:

Type	Description
`ndarray`	dist_mat

Source code in datamol/similarity.py

def pdist(
    mols: List[Union[str, Chem.rdchem.Mol]],
    n_jobs: Optional[int] = 1,
    squareform: bool = True,
    **fp_args,
) -> np.ndarray:
    """Pairwise tanimoto distance between the fingerprints of all input molecules.

    The distance is computed with scipy's `"jaccard"` metric on the fingerprint arrays.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        squareform: Whether to return in square form (matrix) or in a condensed
            form (1D vector).
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        dist_mat
    """

    # Every molecule is fingerprinted as a numpy array with the same arguments.
    fingerprint = functools.partial(dm.to_fp, as_array=True, **fp_args)
    fp_matrix = np.array(dm.parallelized(fingerprint, mols, n_jobs=n_jobs))

    condensed = distance.pdist(fp_matrix, metric="jaccard")

    if not squareform:
        return condensed

    return distance.squareform(condensed, force="tomatrix")

`cdist(mols1, mols2, n_jobs=1, **fp_args)` ¶

Compute the tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.

Parameters:

Name	Type	Description	Default
`mols1`	`List[Union[str, rdkit.Chem.rdchem.Mol]]`	list of molecules.	required
`mols2`	`List[Union[str, rdkit.Chem.rdchem.Mol]]`	list of molecules.	required
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores.	`1`
`**fp_args`		list of args to pass to `to_fp()`.	`{}`

Returns:

Type	Description
`ndarray`	distmat

Source code in datamol/similarity.py

def cdist(
    mols1: List[Union[str, Chem.rdchem.Mol]],
    mols2: List[Union[str, Chem.rdchem.Mol]],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Tanimoto distance between the fingerprints of each pair of molecules taken
    from the two input collections.

    The distance is computed with scipy's `"jaccard"` metric on the fingerprint arrays.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat
    """

    # The same fingerprint settings are used for both collections.
    fingerprint = functools.partial(dm.to_fp, as_array=True, **fp_args)

    fp_matrix_1 = np.array(dm.parallelized(fingerprint, mols1, n_jobs=n_jobs))
    fp_matrix_2 = np.array(dm.parallelized(fingerprint, mols2, n_jobs=n_jobs))

    return distance.cdist(fp_matrix_1, fp_matrix_2, metric="jaccard")

Working with fingerprints¶

`to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)` ¶

Compute the molecular fingerprint given a molecule or a SMILES.

Parameters:

Name	Type	Description	Default
`mol`	`Union[str, rdkit.Chem.rdchem.Mol]`	a molecule or a SMILES.	required
`as_array`	`bool`	Whether to return a numpy array or an RDKit vec. Default to True.	`True`
`fp_type`	`str`	The type of the fingerprint. See `dm.list_supported_fingerprints()` for a complete list.	`'ecfp'`
`fold_size`	`int`	If set, fold the fingerprint to the `fold_size`. If set, returned array is always a numpy array.	`None`
`fp_args`		Arguments to build the fingerprint. Refer to the official RDKit documentation.	`{}`

Returns:

Type	Description
`Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect]`	A fingerprint vector or None

Source code in datamol/fp.py

def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    as_array: bool = True,
    fp_type: str = "ecfp",
    fold_size: Optional[int] = None,
    **fp_args,
) -> Optional[Union[np.ndarray, SparseBitVect, ExplicitBitVect]]:
    """Compute the molecular fingerprint given a molecule or a SMILES.

    Args:
        mol: a molecule or a SMILES.
        as_array: Whether to return a numpy array or an RDKit vec. Defaults to True.
        fp_type: The type of the fingerprint. See `dm.list_supported_fingerprints()` for a
            complete list.
        fold_size: If set, fold the fingerprint to the `fold_size` with `fold_count_fp()`.
            If set, the returned value is always a numpy array, whatever `as_array` is.
        fp_args: Arguments to build the fingerprint. Refer to the official RDKit documentation.
            The deprecated names `fp_size` and `use_features` are still accepted and
            are translated to `nBits` and `useFeatures` with a `DeprecationWarning`.

    Returns:
        The fingerprint: a numpy array when `fold_size` is set or `as_array` is True,
        otherwise the object returned by the underlying fingerprint function.

    Raises:
        ValueError: If `fp_type` is not a supported fingerprint or if `mol` is a SMILES
            that cannot be converted to a molecule (or is None).
    """

    # Get fp function
    fp_func = _FP_FUNCS.get(fp_type)

    if fp_func is None:
        raise ValueError(
            f"The fingerprint '{fp_type}' is not available. Use `dm.list_supported_fingerprints()` to "
            "get a complete list of the available fingerprints."
        )

    # Convert input to mol if needed
    mol_obj = dm.to_mol(mol) if isinstance(mol, str) else mol

    if mol_obj is None:
        raise ValueError(f"It seems like the input molecule '{mol}' is invalid.")

    # Deal with new API introduced in >=0.4 + throw a warning if needed.
    # `stacklevel=2` attributes the warning to the caller of `to_fp()` so that the
    # default warning filters do not hide it behind the library's own module.
    for old_name, new_name in (("fp_size", "nBits"), ("use_features", "useFeatures")):
        if old_name in fp_args:
            warnings.warn(
                f"Using `{old_name}` is now deprecated and will be removed in datamol 0.5.0. "
                f"Please use `{new_name}` instead.",
                DeprecationWarning,
                stacklevel=2,
            )
            fp_args[new_name] = fp_args.pop(old_name)

    # Insert default values without overriding what the caller provided.
    for key, value in _FP_DEFAULT_ARGS[fp_type].items():
        fp_args.setdefault(key, value)

    # Compute the fingerprint
    fp = fp_func(mol_obj, **fp_args)

    # Folding already produces a numpy array, so no further conversion is needed.
    if fold_size is not None:
        return fold_count_fp(fp, dim=fold_size)

    # Convert to a numpy array
    if as_array:
        return fp_to_array(fp)

    return fp

`fp_to_array(fp)` ¶

Convert rdkit fingerprint to numpy array.

Note

This implementation has been shown to be faster than using DataStructs.ConvertToNumpyArray by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.

Parameters:

Name	Type	Description	Default
`fp`	`Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect, rdkit.DataStructs.cDataStructs.UIntSparseIntVect]`	The fingerprint.	required

Source code in datamol/fp.py

def fp_to_array(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]
) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    Note:
        This implementation has been shown to be faster than using
        `DataStructs.ConvertToNumpyArray` by a factor of ~4.
        See https://github.com/rdkit/rdkit/discussions/3863.

    Args:
        fp: The fingerprint. A numpy array is returned unchanged. Bit vectors
            (`SparseBitVect`, `ExplicitBitVect`) give a 0/1 array and sparse int
            vectors give an array holding the value of every non-zero element.

    Returns:
        The fingerprint as a numpy array.

    Raises:
        ValueError: If the type of `fp` is not one of the supported types.
    """

    if isinstance(fp, np.ndarray):
        fp_out = fp

    elif isinstance(fp, SparseBitVect):
        tmp = np.zeros(fp.GetNumBits(), dtype=int)
        # Force an integer dtype: an empty list of bits would otherwise become a
        # float array, which numpy refuses to use as an index.
        on_bits = np.array(list(fp.GetOnBits()), dtype=np.intp)
        tmp[on_bits] = 1
        fp_out = tmp

    elif isinstance(fp, ExplicitBitVect):
        # The bit string is made of the characters "0" and "1"; subtracting the
        # code of "0" turns the raw bytes into 0/1 values.
        fp_out = np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")

    elif isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        tmp = np.zeros(fp.GetLength(), dtype=int)
        elements = fp.GetNonzeroElements()
        # With no non-zero element there is nothing to unpack into
        # (indices, values); the all-zero array is already the answer.
        if len(elements) > 0:
            bit_idx, values = np.array(list(elements.items())).T
            tmp[bit_idx] = values
        fp_out = tmp

    else:
        raise ValueError(
            f"The fingerprint of type '{type(fp)}' is not supported. "
            "Please open a ticket at https://github.com/datamol-org/datamol/issues."
        )

    return fp_out

`list_supported_fingerprints()` ¶

Return the supported fingerprints in datamol.

Source code in datamol/fp.py

def list_supported_fingerprints():
    """Return the fingerprints supported by datamol.

    The returned object is the registry `to_fp()` looks `fp_type` up in: its keys
    are the accepted fingerprint names.
    """

    supported = _FP_FUNCS
    return supported

`fold_count_fp(fp, dim=1024, binary=False)` ¶

Fast folding of a count fingerprint to the specified dimension.

Parameters:

Name	Type	Description	Default
`fp`	`Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect]`	A fingerprint.	required
`dim`	`int`	The dimension of the folded array.	`1024`
`binary`	`bool`	Whether to fold into a binary array (True) or to keep a count vector (False).	`False`

Returns:

Type	Description
`folded`	returns folded array to the provided dimension.

Source code in datamol/fp.py

def fold_count_fp(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect],
    dim: int = 1024,
    binary: bool = False,
):
    """Fast folding of a count fingerprint to the specified dimension.

    Every non-zero element of the fingerprint is added to the position
    `index % dim` of the folded array.

    Args:
        fp: A fingerprint. Only RDKit sparse int vectors (`UIntSparseIntVect`,
            `IntSparseIntVect`, `LongSparseIntVect`, `ULongSparseIntVect`) and
            `SparseBitVect` are handled.
        dim: The dimension of the folded array.
        binary: Whether to fold into a binary array (True) or to keep a count
            vector (False).

    Returns:
        folded: returns folded array to the provided dimension.

    Raises:
        ValueError: If `fp` is not one of the handled fingerprint types.
    """
    if isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        counts = fp.GetNonzeroElements()

    elif isinstance(fp, SparseBitVect):
        # Every ON bit counts once; integer counts avoid adding floats into the
        # integer folded array.
        counts = dict.fromkeys(fp.GetOnBits(), 1)

    else:
        raise ValueError(f"The fingerprint is of wrong type: {type(fp)}")

    # Create the folded fp
    folded = np.zeros(dim, dtype="int")

    # An empty fingerprint folds to all zeros. Skipping the accumulation also
    # avoids handing numpy an empty (float) index array.
    if len(counts) > 0:
        # Fold the indices and accumulate the values falling on the same position.
        indices = np.array(list(counts.keys())) % dim
        values = np.array(list(counts.values()))
        np.add.at(folded, indices, values)

    if binary:
        folded = np.clip(folded, a_min=0, a_max=1)

    return folded

Cluster molecules¶

`cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)` ¶

Cluster a set of molecules using the butina clustering algorithm and a given threshold.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`cutoff`	`float`	Cutoff for the clustering. Defaults to 0.2.	`0.2`
`feature_fn`	`Callable`	A feature function that takes a Chem.rdchem.Mol object and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None.	`None`
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.	`1`

Source code in datamol/cluster.py

def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Optional[Callable] = None,
    n_jobs: Optional[int] = 1,
):
    """Cluster a set of molecules using the butina clustering algorithm and a given threshold.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Defaults to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        cluster_indices: the clusters as returned by `Butina.ClusterData`, each one
            holding the indices of its members in `mols`.
        cluster_mols: the members of each cluster, in the same order as
            `cluster_indices`. A cluster with a single member is a one-element
            list; a larger cluster is a tuple.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    n_mols = len(mols)

    # Flattened lower triangle of the distance matrix: for each molecule, the
    # distances to all the molecules that come before it.
    dists = []
    for i in range(1, n_mols):
        dists.extend(
            DataStructs.BulkTanimotoSimilarity(features[i], features[:i], returnDistance=True)
        )

    # now cluster the data
    cluster_indices = Butina.ClusterData(dists, n_mols, cutoff, isDistData=True)

    # Decide on the cluster size rather than on the type of its members, so that
    # a single-member cluster is always wrapped in a list whatever `mols` contains.
    clustered = [
        [mols[cluster[0]]] if len(cluster) == 1 else tuple(mols[idx] for idx in cluster)
        for cluster in cluster_indices
    ]

    return cluster_indices, clustered

`pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)` ¶

Pick a set of diverse molecules based on their fingerprints.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`npick`	`int`	Number of element to pick from mols, including the preselection.	required
`initial_picks`	`List[int]`	Starting list of indices of the molecules that should be in the set of picked molecules. Defaults to None.	`None`
`feature_fn`	`Callable`	A feature function that takes a Chem.rdchem.Mol object and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None.	`None`
`dist_fn`	`Callable`	A function that takes two indices (i, j) and returns the distance between the corresponding molecules. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.	`None`
`seed`	`int`	seed for reproducibility	`42`
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.	`1`

Returns:

Type	Description
`picked_inds`	Indices of the molecules that have been picked. It is returned together with `mols`, the molecules that have been picked.

Source code in datamol/cluster.py

def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of diverse molecules based on their fingerprints.

    The selection is delegated to the `LazyPick` method of a `MaxMinPicker`.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        initial_picks: Starting list of indices of the molecules that should be in the
            set of picked molecules. Defaults to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between the corresponding molecules. You might use partial to
            set the fingerprints as input. By default, the Tanimoto distance
            (1 - Tanimoto similarity) is used. Defaults to None.
        seed: seed for reproducibility
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been picked
        mols: molecules that have been picked
    """

    featurizer = (
        feature_fn if feature_fn is not None else functools.partial(dm.to_fp, as_array=False)
    )
    features = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    def tanimoto_distance(i, j, features=features):
        similarity = DataStructs.TanimotoSimilarity(features[i], features[j])
        return 1.0 - similarity

    pairwise = tanimoto_distance if dist_fn is None else dist_fn
    first_picks = initial_picks if initial_picks is not None else []

    selection = MaxMinPicker().LazyPick(
        pairwise, len(mols), npick, firstPicks=first_picks, seed=seed
    )

    picked_inds = np.array(selection)
    return picked_inds, [mols[idx] for idx in picked_inds]

`pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)` ¶

Pick a set of npick centroids from a list of molecules.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules.	required
`npick`	`int`	Number of element to pick from mols, including the preselection.	`0`
`threshold`	`float`	Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.	`0.5`
`initial_picks`	`List[int]`	Starting list of indices of the molecules that should be in the set of picked molecules. Defaults to None.	`None`
`feature_fn`	`Callable`	A feature function that takes a Chem.rdchem.Mol object and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None.	`None`
`dist_fn`	`Callable`	A function that takes two indices (i, j) and returns the distance between the corresponding molecules. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.	`None`
`seed`	`int`	seed for reproducibility	`42`
`method`	`str`	Picking method to use. One of `sphere`, `maxmin` or any supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`	`'sphere'`
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.	`1`

Returns:

Type	Description
`picked_inds`	Indices of the molecules that have been selected as centroids. It is returned together with `mols`, the molecules that have been picked.

Source code in datamol/cluster.py

def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
            It must be non-zero for the hierarchical clustering methods.
        threshold: Minimum distance between centroids for `maxmin` and sphere exclusion (`sphere`) methods.
        initial_picks: Starting list of indices of the molecules that should be in the
            set of picked molecules. It is discarded (with a warning) by the
            hierarchical clustering methods. Defaults to None.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between the corresponding molecules. You might use partial to
            set the fingerprints as input. By default, the Tanimoto distance
            (1 - Tanimoto similarity) is used. Defaults to None.
        seed: seed for reproducibility. Only passed to the `maxmin` picker.
        method: Picking method to use. One of  `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: indices of the molecules that have been selected as centroids
        mols: molecules that have been picked

    Raises:
        ValueError: If `method` is unknown, or if a hierarchical clustering method
            is requested with `npick` left to 0.
    """

    n_mols = len(mols)
    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )

    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn, n_mols, threshold=threshold, pickSize=npick, firstPicks=initial_picks
        )

    elif method.upper() in ClusterMethod.names.keys() and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Distances for every pair (i, j) with j < i, i.e. the lower triangle of
        # the distance matrix. Use `dist_fn` so that a caller-provided distance is
        # honoured here as it is by the other methods, and forward `n_jobs`.
        pair_indices = list(zip(*np.tril_indices(n_mols, k=-1)))
        dist_mat = dm.parallelized(dist_fn, pair_indices, arg_type="args", n_jobs=n_jobs)
        dist_mat = np.asarray(dist_mat)
        picker = HierarchicalClusterPicker(ClusterMethod.names[method.upper()])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)

    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols

`assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)` ¶

Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

Parameters:

Name	Type	Description	Default
`mols`	`List[rdkit.Chem.rdchem.Mol]`	a list of molecules to assign to centroids	required
`centroids`	`List[rdkit.Chem.rdchem.Mol]`	list of molecules to use as centroid	required
`feature_fn`	`Callable`	A feature function that takes a Chem.rdchem.Mol object and return molecular features. By default, the `dm.to_fp()` is used. Default to None.	`None`
`dist_fn`	`Callable`	A function that takes two indices (i, j) and returns the distance between the corresponding molecules. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None.	`None`
`n_jobs`	`Optional[int]`	Number of jobs for parallelization. Set to 1 for no parallelization. Set to None to use all available cores.	`1`

Returns:

Type	Description
`clusters_map`	Dict mapping each centroid index to the indices of the molecules in its cluster. It is returned together with `clusters_list`, the list of all molecules in each cluster. The cluster index follows the index of the centroid. Note that the centroid molecule is not added to the cluster.

Source code in datamol/cluster.py

def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids
        centroids: list of molecules to use as centroid
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. By default, `dm.to_fp()` is used.
            Defaults to None.
        dist_fn: A function that takes two indices (i, j) and returns the
            distance between the corresponding molecules. The indices refer to the
            concatenation of `mols` and `centroids`: `i` is the position of a
            molecule in `mols` and `j` is `len(mols)` plus the position of a
            centroid in `centroids`. You might use partial to set the fingerprints
            as input. By default, the Tanimoto distance (1 - Tanimoto similarity)
            is used. Defaults to None.
        n_jobs: Number of jobs for parallelization. Set to 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict mapping the position of each centroid in `centroids` to the
            indices (in `mols`) of the molecules assigned to it
        clusters_list: list of all molecules in each cluster. The cluster index follows the index of the centroid.
            Note that the centroid molecule is not added to the cluster.
    """

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    all_mols = list(mols) + list(centroids)
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    def index_metric(u, v):
        # `cdist` hands every row over as a length-1 array holding one index:
        # unwrap it so that `dist_fn` receives the plain integers it documents.
        return dist_fn(int(u[0]), int(v[0]))

    n_mols = len(mols)
    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]

    # One index per row; centroid indices are shifted past the molecules.
    query_inds = np.expand_dims(np.arange(n_mols, dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + n_mols

    # Go through `dist_fn` (not the built-in default) so that a caller-provided
    # distance is actually used.
    dist_mat = distance.cdist(query_inds, centroid_inds, metric=index_metric)
    closest = np.argmin(dist_mat, axis=1)

    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])

    return clusters_map, clusters_list

Molecule as a graph¶

`to_graph(mol)` ¶

Convert a molecule to a networkx graph. A list of properties is added to every node and edge.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule.	required

Returns:

Type	Description
`mol_graph (networkx.Graph)`	a graph representing the molecule.

Source code in datamol/graph.py

def to_graph(mol: Chem.rdchem.Mol):
    """Convert a molecule to a networkx graph. Atoms become nodes (keyed by atom
    index) and bonds become edges; a set of properties is attached to each of them.

    Args:
        mol (Chem.Mol): a molecule.

    Returns:
        mol_graph (networkx.Graph): a graph representing the molecule.
    """

    nx = _get_networkx()
    graph = nx.Graph()

    # Nodes: one per atom, carrying the atom properties.
    for atom in mol.GetAtoms():
        node_attrs = {
            "atomic_num": atom.GetAtomicNum(),
            "formal_charge": atom.GetFormalCharge(),
            "chiral_tag": atom.GetChiralTag(),
            "hybridization": atom.GetHybridization(),
            "num_explicit_hs": atom.GetNumExplicitHs(),
            "implicit_valence": atom.GetImplicitValence(),
            "degree": atom.GetDegree(),
            "symbol": atom.GetSymbol(),
            "ring_atom": atom.IsInRing(),
            "is_aromatic": atom.GetIsAromatic(),
        }
        graph.add_node(atom.GetIdx(), **node_attrs)

    # Edges: one per bond, carrying the bond type.
    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        graph.add_edge(begin_idx, end_idx, bond_type=bond.GetBondType())

    return graph

`get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False)` ¶

Get all simple paths between two atoms of a molecule.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	a molecule	required
`atom_idx_1`	`int`	Atom index 1.	required
`atom_idx_2`	`int`	Atom index 2.	required
`ignore_cycle_basis`	`bool`	Whether to ignore cycle basis. Defaults to False.	`False`

Returns:

Type	Description
`list`	The simple paths between the two atoms, each path being a list of atom indices.

Source code in datamol/graph.py

def get_all_path_between(
    mol: Chem.rdchem.Mol,
    atom_idx_1: int,
    atom_idx_2: int,
    ignore_cycle_basis: bool = False,
):
    """Get all simple paths between two atoms of a molecule.

    The paths are searched on the graph built from the adjacency matrix of the
    molecule.

    Args:
        mol (Chem.Mol): a molecule
        atom_idx_1 (int): Atom index 1.
        atom_idx_2 (int): Atom index 2.
        ignore_cycle_basis: Whether to ignore cycle basis. When True, any path
            that goes through all the atoms of a ring of the molecule is dropped.
            Defaults to False.

    Returns:
        list: the simple paths between the two atoms, each path being a list of
            atom indices.
    """

    nx = _get_networkx()

    graph = nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol))
    candidates = nx.all_simple_paths(graph, source=atom_idx_1, target=atom_idx_2)

    if not ignore_cycle_basis:
        return list(candidates)

    ring_atom_sets = [set(ring) for ring in mol.GetRingInfo().AtomRings()]

    def covers_a_ring(candidate):
        # A path is rejected as soon as it contains every atom of one ring.
        visited = set(candidate)
        return any(ring.issubset(visited) for ring in ring_atom_sets)

    return [candidate for candidate in candidates if not covers_a_ring(candidate)]

Constants¶

`PERIODIC_TABLE: None` ¶

`TRIPLE_BOND: None` ¶

`DOUBLE_BOND: None` ¶

`SINGLE_BOND: None` ¶

`AROMATIC_BOND: None` ¶

Control RDKit logging¶

`without_rdkit_log` ¶

Context manager to disable RDKit logs. By default all logs are disabled.

Examples:

import datamol as dm

with dm.without_rdkit_log():
    mol = dm.to_mol("CCCCO")  # potential RDKit logs won't show

`enable_rdkit_log()` ¶

Enable all rdkit logs.

Source code in datamol/log.py

def enable_rdkit_log():
    """Enable all rdkit logs, i.e. every level listed in `RDLogger._levels`."""
    for level in RDLogger._levels:
        rdBase.EnableLog(level)

`disable_rdkit_log()` ¶

Disable all rdkit logs.

Source code in datamol/log.py

def disable_rdkit_log():
    """Disable all rdkit logs, i.e. every level listed in `RDLogger._levels`."""
    for level in RDLogger._levels:
        rdBase.DisableLog(level)

Toy dataset¶

`freesolv()` ¶

Return the FreeSolv dataset as a dataframe.

The dataset contains 642 molecules and the following columns: ['iupac', 'smiles', 'expt', 'calc'].

Warning

This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.

Source code in datamol/data.py

def freesolv():
    """Load the FreeSolv dataset bundled with datamol and return it as a dataframe.

    The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.
    """

    # The CSV file is read from the package resources of `datamol`.
    stream = pkg_resources.resource_stream("datamol", "data/freesolv.csv")
    with stream as handle:
        return pd.read_csv(handle)

datamol¶

Working with molecules¶

The basics¶

to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True) ¶

copy_mol(mol) ¶

reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True) ¶

randomize_atoms(mol) ¶

to_neutral(mol) ¶

set_mol_props(mol, props, copy=False) ¶

copy_mol_props(source, destination) ¶

atom_indices_to_mol(mol, copy=False) ¶

same_mol(mol1, mol2) ¶

Fix, sanitize and standardize¶

sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False) ¶

sanitize_first(mols, charge_neutral=False, sanifix=True) ¶

sanitize_smiles(smiles, isomeric=True) ¶

standardize_smiles(smiles, tautomer=False) ¶

standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True) ¶

fix_valence_charge(mol, inplace=False) ¶

incorrect_valence(a, update=False) ¶

decrease_bond(bond) ¶

fix_valence(mol, inplace=False, allow_ring_break=False) ¶

adjust_singleton(mol) ¶

remove_dummies(mol, dummy='*') ¶

fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False) ¶

replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True) ¶

keep_largest_fragment(mol) ¶

is_transition_metal(at) ¶

set_dative_bonds(mol, from_atoms=(7, 8)) ¶

Enumerate¶

enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True) ¶

enumerate_tautomers(mol, n_variants=20) ¶

Convert molecule(s)¶

to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False) ¶

to_selfies(mol) ¶

from_selfies(selfies, as_mol=False) ¶

to_smarts(mol, keep_hs=True) ¶

to_inchi(mol) ¶

to_inchikey(mol) ¶

from_inchi(inchi, sanitize=True, remove_hs=True) ¶

to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False) ¶

from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True) ¶

Input/Output¶

read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs) ¶

read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs) ¶

read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True) ¶

to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None) ¶

to_smi(mols, urlpath, error_if_empty=False) ¶

read_smi(urlpath) ¶

Molecule similarity and distance¶

pdist(mols, n_jobs=1, squareform=True, **fp_args) ¶

cdist(mols1, mols2, n_jobs=1, **fp_args) ¶

Working with fingerprints¶

to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args) ¶

fp_to_array(fp) ¶

list_supported_fingerprints() ¶

fold_count_fp(fp, dim=1024, binary=False) ¶

Cluster molecules¶

cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1) ¶

pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1) ¶

pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1) ¶

assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1) ¶

Molecule as a graph¶

to_graph(mol) ¶

get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False) ¶

Constants¶

PERIODIC_TABLE: None ¶

TRIPLE_BOND: None ¶

DOUBLE_BOND: None ¶

SINGLE_BOND: None ¶

AROMATIC_BOND: None ¶

Control RDKit logging¶

without_rdkit_log ¶

enable_rdkit_log() ¶

disable_rdkit_log() ¶

Toy dataset¶

freesolv() ¶

`datamol`¶

`to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)` ¶

`copy_mol(mol)` ¶

`reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)` ¶

`randomize_atoms(mol)` ¶

`to_neutral(mol)` ¶

`set_mol_props(mol, props, copy=False)` ¶

`copy_mol_props(source, destination)` ¶

`atom_indices_to_mol(mol, copy=False)` ¶

`same_mol(mol1, mol2)` ¶

`sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)` ¶

`sanitize_first(mols, charge_neutral=False, sanifix=True)` ¶

`sanitize_smiles(smiles, isomeric=True)` ¶

`standardize_smiles(smiles, tautomer=False)` ¶

`standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)` ¶

`fix_valence_charge(mol, inplace=False)` ¶

`incorrect_valence(a, update=False)` ¶

`decrease_bond(bond)` ¶

`fix_valence(mol, inplace=False, allow_ring_break=False)` ¶

`adjust_singleton(mol)` ¶

`remove_dummies(mol, dummy='*')` ¶

`fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)` ¶

`replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)` ¶

`keep_largest_fragment(mol)` ¶

`is_transition_metal(at)` ¶

`set_dative_bonds(mol, from_atoms=(7, 8))` ¶

`enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True)` ¶

`enumerate_tautomers(mol, n_variants=20)` ¶

`to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False)` ¶

`to_selfies(mol)` ¶

`from_selfies(selfies, as_mol=False)` ¶

`to_smarts(mol, keep_hs=True)` ¶

`to_inchi(mol)` ¶

`to_inchikey(mol)` ¶

`from_inchi(inchi, sanitize=True, remove_hs=True)` ¶

`to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False)` ¶

`from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True)` ¶

`read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)` ¶

`read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)` ¶

`read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True)` ¶

`to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)` ¶

`to_smi(mols, urlpath, error_if_empty=False)` ¶

`read_smi(urlpath)` ¶

`pdist(mols, n_jobs=1, squareform=True, **fp_args)` ¶

`cdist(mols1, mols2, n_jobs=1, **fp_args)` ¶

`to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)` ¶

`fp_to_array(fp)` ¶

`list_supported_fingerprints()` ¶

`fold_count_fp(fp, dim=1024, binary=False)` ¶

`cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)` ¶

`pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)` ¶

`pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)` ¶

`assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)` ¶

`to_graph(mol)` ¶

`get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False)` ¶

`PERIODIC_TABLE: None` ¶

`TRIPLE_BOND: None` ¶

`DOUBLE_BOND: None` ¶

`SINGLE_BOND: None` ¶

`AROMATIC_BOND: None` ¶

`without_rdkit_log` ¶

`enable_rdkit_log()` ¶

`disable_rdkit_log()` ¶

`freesolv()` ¶