datamol
¶
Datamol is designed to be used with a single import (import datamol as dm
). Most of the functions are available in datamol.*
. The other ones are available through their specific modules.
The sections below show the directly available Datamol functions. For other modules, please browse the API using the left menu.
Working with molecules¶
The basics¶
to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)
¶
Convert an input molecule (smiles representation) into a Chem.rdchem.Mol
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
str |
SMILES of a molecule or a molecule. |
required |
add_hs |
bool |
Whether hydrogens should be added to the molecule. |
False |
explicit_only |
bool |
Whether to only add explicit hydrogen or both
(implicit and explicit) when add_hs is set to True. |
False |
ordered |
bool |
Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation. |
False |
kekulize |
bool |
Whether to perform kekulization of the input molecules. |
False |
sanitize |
bool |
Whether to apply rdkit sanitization when input is a SMILES. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: the molecule if some conversion have been made. If the conversion fails None is returned so make sure that you handle this case on your own. |
Source code in datamol/mol.py
def to_mol(
    mol: str,
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Build a `Chem.rdchem.Mol` from a SMILES string, or post-process an existing molecule.

    Args:
        mol: A SMILES string or an already-built molecule.
        add_hs: Whether hydrogens should be added to the molecule.
        explicit_only: When `add_hs` is True, whether to add only explicit
            hydrogens instead of both implicit and explicit ones.
        ordered: Whether to reorder the atoms so that the same molecule always
            ends up with the same atom order, regardless of the input SMILES.
        kekulize: Whether to kekulize the molecule.
        sanitize: Whether to apply RDKit sanitization when the input is a SMILES.

    Returns:
        The resulting molecule, or None when the conversion failed. Callers
        are expected to handle the None case themselves.

    Raises:
        ValueError: If `mol` is neither a string nor a `Chem.rdchem.Mol`.
    """
    if isinstance(mol, str):
        result = Chem.MolFromSmiles(mol, sanitize=sanitize)
        # Without sanitization the property cache is not computed by the parser,
        # so refresh it (non-strict) for the steps below.
        if result is not None and not sanitize:
            result.UpdatePropertyCache(False)
    elif isinstance(mol, Chem.rdchem.Mol):
        result = mol
    else:
        raise ValueError(f"Input should be a Chem.rdchem.Mol or a string instead of '{type(mol)}'")

    # Each optional step is skipped as soon as there is no molecule to work on.
    if add_hs and result is not None:
        result = Chem.AddHs(result, explicitOnly=explicit_only, addCoords=True)

    if ordered and result is not None:
        result = reorder_atoms(result)

    if kekulize and result is not None:
        # The return value is ignored: the call acts on `result` itself.
        Chem.Kekulize(result, clearAromaticFlags=False)

    return result
copy_mol(mol)
¶
Copy a molecule and return a new one.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule to copy. |
required |
Source code in datamol/mol.py
def copy_mol(mol: Chem.rdchem.Mol) -> Chem.rdchem.Mol:
    """Return a deep copy of a molecule.

    Args:
        mol: the molecule to duplicate.

    Returns:
        A new object produced by `copy.deepcopy`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate
reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)
¶
Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
break_ties |
bool |
Force breaking of ranked ties. |
True |
include_chirality |
bool |
Use chiral information when computing rank. |
True |
include_isotopes |
bool |
Use isotope information when computing rank. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def reorder_atoms(
    mol: Chem.rdchem.Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule following their canonical ranks.

    The goal is a single atom order for the same molecule, regardless of how
    it was originally represented.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        The renumbered molecule. A molecule without atoms is returned as is.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        Chem.CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        )
    )

    # Atom indices sorted by increasing rank; equal ranks keep index order.
    ordering = sorted(range(len(ranks)), key=lambda idx: (ranks[idx], idx))
    return Chem.RenumberAtoms(mol, ordering)
randomize_atoms(mol)
¶
Randomize the position of the atoms in a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def randomize_atoms(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Renumber the atoms of a molecule in a random order.

    Args:
        mol: a molecule.

    Returns:
        The renumbered molecule. A molecule without atoms is returned as is.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    # Shuffle the identity permutation with the module-level `random` generator.
    permutation = list(range(n_atoms))
    random.shuffle(permutation)
    return Chem.RenumberAtoms(mol, permutation)
to_neutral(mol)
¶
Neutralize the charge of a molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def to_neutral(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Reset the formal charge of charged atoms to zero.

    Negatively charged atoms are always reset. Positively charged atoms are
    reset only when their explicit valence is at least the default valence of
    the element. The atoms of the given molecule are edited directly and the
    same object is returned.

    Args:
        mol: a molecule.

    Returns:
        The input molecule after the charge edits, or None if `mol` is None.
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        if atom.GetFormalCharge() >= 0:
            saturated = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (saturated and atom.GetFormalCharge() > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol
set_mol_props(mol, props, copy=False)
¶
Set properties to a mol from a dict.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
the mol where to copy the props. |
required |
props |
Dict[str, Any] |
the props to copy. |
required |
copy |
bool |
whether to copy the provided mol |
False |
Source code in datamol/mol.py
def set_mol_props(
    mol: Chem.rdchem.Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Chem.rdchem.Mol:
    """Store every entry of a dict as a property on a molecule.

    Booleans, integers and floats are stored with their typed setter; any
    other value is stored as its `str()` representation.

    Args:
        mol: the mol receiving the props.
        props: the props to set, keyed by property name.
        copy: whether to work on a copy of the provided mol.

    Returns:
        The molecule carrying the properties (the copy when `copy` is True).
    """
    if copy is True:
        mol = dm.copy_mol(mol)

    for name, value in props.items():
        # `bool` is tested first because it is a subclass of `int`.
        if isinstance(value, bool):
            setter = mol.SetBoolProp
        elif isinstance(value, int):
            setter = mol.SetIntProp
        elif isinstance(value, float):
            setter = mol.SetDoubleProp
        else:
            setter = mol.SetProp
            value = str(value)
        setter(name, value)

    return mol
copy_mol_props(source, destination)
¶
Copy properties from one source molecule to another destination molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
source |
Mol |
a molecule to copy from. |
required |
destination |
Mol |
a molecule to copy to. |
required |
Source code in datamol/mol.py
def copy_mol_props(source: Chem.rdchem.Mol, destination: Chem.rdchem.Mol):
    """Transfer the properties of one molecule onto another.

    The properties are read from `source` as a dict and set on `destination`
    itself (no copy of `destination` is made).

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
    """
    dm.set_mol_props(destination, source.GetPropsAsDict())
atom_indices_to_mol(mol, copy=False)
¶
Add the molAtomMapNumber
property to each atoms.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule |
required |
copy |
bool |
Whether to copy the molecule. |
False |
Source code in datamol/mol.py
def atom_indices_to_mol(mol: Chem.rdchem.Mol, copy: bool = False):
    """Tag every atom with a `molAtomMapNumber` property holding its index.

    Args:
        mol: a molecule
        copy: Whether to work on a copy of the molecule.

    Returns:
        The tagged molecule (the copy when `copy` is True).
    """
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        # The property is stored as a string.
        atom.SetProp("molAtomMapNumber", str(atom.GetIdx()))

    return target
same_mol(mol1, mol2)
¶
Check two molecules are the same by comparing their InChiKey.
Invalid molecules (None) are always considered as not the same.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol1 |
Optional[rdkit.Chem.rdchem.Mol] |
A molecule. |
required |
mol2 |
Optional[rdkit.Chem.rdchem.Mol] |
A molecule. |
required |
Source code in datamol/mol.py
def same_mol(mol1: Optional[Chem.rdchem.Mol], mol2: Optional[Chem.rdchem.Mol]):
    """Tell whether two molecules are the same by comparing their InChiKey.

    Invalid molecules (None) are never considered the same, even when both
    inputs are None.

    Args:
        mol1: A molecule.
        mol2: A molecule.

    Returns:
        True when both molecules are not None and share the same InChiKey.
    """
    if mol1 is None:
        return False
    if mol2 is None:
        return False

    key1 = dm.to_inchikey(mol1)
    key2 = dm.to_inchikey(mol2)
    return key1 == key2
Fix, sanitize and standardize¶
sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)
¶
An augmented version of RDKit sanitize=True
. It uses a
mol-SMILES-mol conversion to catch potential aromaticity errors
and try to fix aromatic nitrogen (using the popular sanifix4 script).
Optionally, it can neutralize the charge of the molecule.
Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.
Note #2: The molecule's properties will be preserved but the atom's properties will be lost.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
charge_neutral |
bool |
whether charge neutralization should be applied. |
False |
sanifix |
bool |
whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens. |
True |
verbose |
bool |
Whether displaying a warning about multiple conformers. |
True |
add_hs |
bool |
Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens. |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: a molecule. |
Source code in datamol/mol.py
def sanitize_mol(
    mol: Chem.rdchem.Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Sanitize a molecule more thoroughly than RDKit `sanitize=True`.

    The molecule goes through a mol-SMILES-mol round trip to catch potential
    aromaticity errors, optionally preceded by an attempt to fix aromatic
    nitrogens (sanifix4 script) and by charge neutralization.

    Note #1: Only the first conformer (if present) will be preserved and
    a warning will be displayed if more than one conformer is detected.

    Note #2: The molecule's properties will be preserved but the atom's
    properties will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.
        verbose: Whether to display a warning about multiple conformers.
        add_hs: Add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        The sanitized molecule, or None when the input is None or any step fails.
    """
    if mol is None:
        return mol

    # Read the molecule-level properties from a copy, before any edit.
    properties = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    if mol is None:
        return None

    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            f"The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # The round trip may raise on occasional aromaticity errors.
    try:
        # `cxsmiles` is used here to preserve the first conformer.
        mol = to_mol(dm.to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs)  # type: ignore
    except Exception:
        return None

    if mol is None:
        return None

    # Put the saved properties back on the rebuilt molecule.
    return dm.set_mol_props(mol, properties)
sanitize_first(mols, charge_neutral=False, sanifix=True)
¶
Sanitize a list of molecules and return the first valid molecule seen in the list.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
charge_neutral |
bool |
whether charge neutralization should be applied. |
False |
sanifix |
bool |
whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens. |
True |
Returns:
Type | Description |
---|---|
mol |
a molecule. |
Source code in datamol/mol.py
def sanitize_first(mols: List[Chem.rdchem.Mol], charge_neutral: bool = False, sanifix: bool = True):
    """Return the first molecule of a list that survives sanitization.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        The first sanitized molecule that is truthy, or None when there is none.
    """
    for candidate in mols:
        sanitized = sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        if sanitized:
            return sanitized

    return None
sanitize_smiles(smiles, isomeric=True)
¶
Takes SMILES string and returns its sanitized version.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
str |
smiles to be sanitized. |
required |
isomeric |
bool |
Whether to include information about stereochemistry in the SMILES. |
True |
Returns:
Type | Description |
---|---|
Optional[str] |
sanitized smiles. |
Source code in datamol/mol.py
def sanitize_smiles(smiles: str, isomeric: bool = True) -> Optional[str]:
    """Take a SMILES string and return its sanitized version.

    The SMILES is parsed without RDKit sanitization, passed through
    `sanitize_mol` (without charge neutralization) and converted back to SMILES.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        The sanitized SMILES, or None when any step fails.
    """
    try:
        mol = dm.to_mol(smiles, sanitize=False)
        mol = dm.sanitize_mol(mol, False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        smiles = dm.to_smiles(mol, isomeric=isomeric)  # type: ignore
    except Exception:
        # Deliberately not a bare `except:` so that KeyboardInterrupt and
        # SystemExit still propagate to the caller.
        return None

    return smiles
standardize_smiles(smiles, tautomer=False)
¶
Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
smiles |
str |
Smiles to standardize |
required |
tautomer |
bool |
Whether to canonicalize tautomers |
False |
Returns:
Type | Description |
---|---|
standard_smiles |
the standardized smiles |
Source code in datamol/mol.py
def standardize_smiles(smiles: str, tautomer: bool = False):
    r"""
    Standardize a SMILES string.

    Convenience wrapper around the RDKit SMILES standardizer, optionally
    followed by tautomeric canonicalization.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """
    standardized = rdMolStandardize.StandardizeSmiles(smiles)
    if not tautomer:
        return standardized
    return canonicalize_tautomer_smiles(standardized)
standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)
¶
This function returns a standardized version of the given molecule, with or without disconnecting the metals. The process is applied in the order of the arguments.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
The molecule to standardize. |
required |
disconnect_metals |
bool |
Whether to disconnect the metallic atoms from non-metals |
False |
normalize |
bool |
Whether to apply normalization (correct functional groups and recombine charges). |
True |
reionize |
bool |
Whether to apply molecule reionization |
True |
uncharge |
bool |
Whether to remove all charge from molecule |
False |
stereo |
bool |
Whether to attempt to assign stereochemistry |
True |
Returns:
Type | Description |
---|---|
mol |
The standardized molecule. |
Source code in datamol/mol.py
def standardize_mol(
    mol: Chem.rdchem.Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
):
    r"""
    Return a standardized copy of the given molecule.

    The enabled steps are applied in the order of the arguments: metal
    disconnection, normalization, reionization, uncharging and finally
    stereochemistry assignment.

    Arguments:
        mol: The molecule to standardize.
        disconnect_metals: Whether to disconnect the metallic atoms from non-metals
        normalize: Whether to apply normalization (correct functional groups and recombine charges).
        reionize: Whether to apply molecule reionization
        uncharge: Whether to remove all charge from molecule
        stereo: Whether to attempt to assign stereochemistry

    Returns:
        mol: The standardized molecule.
    """
    # Every step runs on a copy of the input.
    standardized = copy_mol(mol)

    if disconnect_metals:
        disconnector = rdMolStandardize.MetalDisconnector()
        standardized = disconnector.Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        Chem.AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized
fix_valence_charge(mol, inplace=False)
¶
Fix valence issues that are due to incorrect charges.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
Input molecule with incorrect valence for some atoms |
required |
inplace |
bool |
Whether to modify in place or make a copy. |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
Fixed molecule via charge correction or original molecule if failed. |
Source code in datamol/mol.py
def fix_valence_charge(mol: Chem.rdchem.Mol, inplace: bool = False) -> Optional[Chem.rdchem.Mol]:
    """Fix valence issues that are due to incorrect charges.

    When RDKit validation reports at least one problem, the formal charge of
    every atom is set to (implicit valence + explicit valence - default
    valence of the element).

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The molecule after charge correction, or the input molecule untouched
        when validation reports nothing.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)
    for atom in mol.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = dm.PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return mol
incorrect_valence(a, update=False)
¶
Check whether an atom, or any atom of a molecule, has an invalid valence.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
a |
Union[rdkit.Chem.rdchem.Mol, rdkit.Chem.rdchem.Atom] |
atom or molecule to check for valence issue. |
required |
update |
bool |
Update owning molecule property cache first. |
False |
Returns:
Type | Description |
---|---|
bool |
Whether the input atom (or molecule) valence is incorrect. |
Source code in datamol/mol.py
def incorrect_valence(a: Union[Chem.rdchem.Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Check whether an atom, or a whole molecule, has a valence problem.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first (only used when
            `a` is an atom).

    Returns:
        True when the valence is incorrect. For a molecule this means RDKit
        validation reports at least one problem. For an atom it means the
        implicit valence is 0 and the explicit valence exceeds the largest
        valence listed for the element.
    """
    if isinstance(a, Chem.rdchem.Mol):
        a.UpdatePropertyCache(False)
        validator = rdMolStandardize.RDKitValidation()
        issues = validator.validate(a)
        return len(issues) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    if a.GetImplicitValence() != 0:
        return False

    explicit_valence = a.GetExplicitValence()
    return explicit_valence > max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))
decrease_bond(bond)
¶
Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
bond |
Bond |
a bond. |
required |
Source code in datamol/mol.py
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Give the bond type obtained after removing one bond order from a bond.

    Note that you should first kekulize your molecules and remove
    non-standard bonds.

    Args:
        bond: a bond.

    Returns:
        The double bond type for a triple bond, the single bond type for a
        double bond, None for a single bond, and the input bond itself for
        any other bond type.
    """
    bond_type = bond.GetBondType()

    if bond_type == TRIPLE_BOND:
        lowered = DOUBLE_BOND
    elif bond_type == DOUBLE_BOND:
        lowered = SINGLE_BOND
    elif bond_type == SINGLE_BOND:
        lowered = None
    else:
        # Non-standard bond types are handed back unchanged.
        lowered = bond

    return lowered
fix_valence(mol, inplace=False, allow_ring_break=False)
¶
Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
|
input molecule with incorrect valence for some atoms |
required |
inplace |
bool |
Whether to modify in place or make a copy |
False |
allow_ring_break |
bool |
Whether bond removal involving ring is allowed. |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
The molecule with potential valence issues fixed, the original molecule when nothing is broken, or None if the fix failed. |
Source code in datamol/mol.py
def fix_valence(
    mol, inplace: bool = False, allow_ring_break: bool = False
) -> Optional[Chem.rdchem.Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The molecule with potential valence issues fixed, the (possibly copied)
        input molecule when validation reports nothing, or None when an
        exception occurs while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    validator = rdMolStandardize.RDKitValidation()
    if len(validator.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        fixed = Chem.RemoveHs(
            mol,
            implicitOnly=False,
            updateExplicitCount=True,
            sanitize=False,
        )
        fixed.UpdatePropertyCache(False)

        # First pass: drop hydrogens (and positive charge) one at a time from
        # atoms whose valence is incorrect.
        for atom in fixed.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                n_hydrogens = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, n_hydrogens - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
                atom.UpdatePropertyCache(False)

        editable = Chem.RWMol(fixed)

        # Second pass: collect the bonds touching at least one offending atom.
        # Both ends are always evaluated, hence the list inside `any`.
        suspect_bonds = [
            bond
            for bond in editable.GetBonds()
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]

        for bond in suspect_bonds:
            begin = bond.GetBeginAtom()
            end = bond.GetEndAtom()

            # An earlier edit may already have fixed both ends.
            if incorrect_valence(begin) or incorrect_valence(end):
                lowered = decrease_bond(bond)

                if allow_ring_break or (lowered or not bond.IsInRing()):
                    editable.RemoveBond(begin.GetIdx(), end.GetIdx())
                    # A None result means the bond is removed without replacement.
                    if lowered is not None:
                        editable.AddBond(begin.GetIdx(), end.GetIdx(), lowered)

                begin.UpdatePropertyCache(False)
                end.UpdatePropertyCache(False)

        fixed = editable.GetMol()

    except Exception:
        return None

    return fixed
adjust_singleton(mol)
¶
Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and methane fragment will be removed in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Source code in datamol/mol.py
def adjust_singleton(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Remove all atoms that are essentially disconnected singleton nodes in the molecular graph.

    For example, the chlorine atom and methane fragment will be removed in
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the atoms whose explicit valence is 0.
    """
    editable = Chem.RWMol(mol)

    singleton_indices = [
        atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0
    ]

    # Remove from the highest index down so the remaining indices stay valid.
    for atom_idx in sorted(singleton_indices, reverse=True):
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()
remove_dummies(mol, dummy='*')
¶
Remove dummy atoms from molecules.
Source code in datamol/mol.py
def remove_dummies(mol: Chem.rdchem.Mol, dummy: str = "*") -> Optional[Chem.rdchem.Mol]:
    """Remove dummy atoms from molecules.

    Dummies are first replaced by hydrogens, which are then removed. If that
    raises, the dummy substructure is deleted from the molecule instead.

    Args:
        mol: a molecule.
        dummy: SMILES of the dummy atom to remove.

    Returns:
        The molecule without the dummy atoms.
    """
    dummy_mol = dm.to_mol(dummy)

    try:
        replaced = Chem.ReplaceSubstructs(mol, dummy_mol, dm.to_mol("[H]"), True)[0]
        return Chem.RemoveHs(replaced)
    except Exception:
        return Chem.DeleteSubstructs(mol, dummy_mol)
fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)
¶
Fix error in molecule using a greedy approach.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
input molecule to fix |
required |
n_iter |
int |
Number of valence fix iteration to apply |
1 |
remove_singleton |
bool |
Whether adjust_singleton should be applied |
False |
largest_only |
bool |
Whether only the largest fragment should be kept |
False |
inplace |
bool |
Whether to return a copy of the mol or perform in place operation |
False |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
Fixed molecule. |
Source code in datamol/mol.py
def fix_mol(
    mol: Chem.rdchem.Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Chem.rdchem.Mol]:
    """Fix errors in a molecule using a greedy approach.

    The molecule is sanitized (falling back to the unsanitized molecule when
    that fails), stripped of dummy atoms, then passed `n_iter` times through
    `fix_valence` before the optional clean-up steps.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule.
    """
    if not inplace:
        mol = copy.copy(mol)

    fixed = sanitize_mol(mol) or mol  # fall back to mol when the fixer fails
    if fixed is None:
        return fixed

    fixed = remove_dummies(fixed)

    for _ in range(n_iter):
        fixed = fix_valence(fixed)

    if remove_singleton:
        fixed = adjust_singleton(fixed)

    if largest_only:
        # fixed = max(Chem.rdmolops.GetMolFrags(fixed, asMols=True, sanitizeFrags=False), key=lambda m: m.GetNumAtoms())
        fixed = rdMolStandardize.FragmentParent(fixed, skipStandardize=True)

    return fixed
replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)
¶
Replace dummy atoms in a molecule with another atom.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
molecule with dummies |
required |
atom |
str |
replacement atom, default is carbon |
'C' |
dummy |
str |
dummy atom representation |
'*' |
replace_all |
bool |
Whether to replace all dummies |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol: Molecule with dummy replaced |
Source code in datamol/mol.py
def replace_dummies_atoms(
    mol: Chem.rdchem.Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Replace dummy atoms in a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom, default is carbon
        dummy: dummy atom representation
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced (the first result of the substitution).
    """
    dummy_query = Chem.MolFromSmiles(dummy)
    substitute = Chem.MolFromSmiles(atom)

    candidates = Chem.ReplaceSubstructs(mol, dummy_query, substitute, replaceAll=replace_all)
    return candidates[0]
keep_largest_fragment(mol)
¶
Only keep largest fragment of each molecule.
Source code in datamol/mol.py
def keep_largest_fragment(mol: Chem.rdchem.Mol) -> Optional[Chem.rdchem.Mol]:
    """Only keep the largest fragment of a molecule.

    Args:
        mol: a molecule.

    Returns:
        The fragment with the most atoms, or `mol` itself when no fragment
        is found.
    """
    fragments = rdmolops.GetMolFrags(mol, asMols=True)

    def _atom_count(fragment):
        return fragment.GetNumAtoms()

    return max(fragments, key=_atom_count, default=mol)
is_transition_metal(at)
¶
Check if atom is a transition metal.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
at |
Atom |
an atom. |
required |
Source code in datamol/mol.py
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Tell whether an atom is considered a transition metal.

    The check is purely on the atomic number: 22-29, 40-47 and 72-79
    (bounds included).

    Args:
        at: an atom.

    Returns:
        True when the atomic number falls in one of the ranges above.
    """
    atomic_num = at.GetAtomicNum()
    return 22 <= atomic_num <= 29 or 40 <= atomic_num <= 47 or 72 <= atomic_num <= 79
set_dative_bonds(mol, from_atoms=(7, 8))
¶
Replaces some single bonds between metals and atoms with atomic numbers in fromAtoms with dative bonds. The replacement is only done if the atom has "too many" bonds.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
molecule with bond to modify |
required |
from_atoms |
Tuple[int, int] |
List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered. |
(7, 8) |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
The modified molecule. |
Source code in datamol/mol.py
def set_dative_bonds(
    mol: Chem.rdchem.Mol, from_atoms: Tuple[int, int] = (7, 8)
) -> Optional[Chem.rdchem.Mol]:
    """Turn some single bonds between transition metals and selected atoms into dative bonds.

    A bond is replaced only when the neighbor of the metal is listed in
    `from_atoms`, has "too many" bonds (explicit valence above the default
    valence of its element) and is linked to the metal by a single bond.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule, as the editable `Chem.RWMol` built from `mol`.
    """
    editable = Chem.RWMol(mol)
    editable.UpdatePropertyCache(strict=False)

    transition_metals = [atom for atom in editable.GetAtoms() if is_transition_metal(atom)]

    for metal in transition_metals:
        for neighbor in metal.GetNeighbors():
            # The neighbor may be listed by atomic number or by symbol.
            listed = neighbor.GetAtomicNum() in from_atoms or neighbor.GetSymbol() in from_atoms
            if not listed:
                continue

            default_valence = PERIODIC_TABLE.GetDefaultValence(neighbor.GetAtomicNum())
            if neighbor.GetExplicitValence() <= default_valence:
                continue

            link = editable.GetBondBetweenAtoms(neighbor.GetIdx(), metal.GetIdx())
            if link.GetBondType() != SINGLE_BOND:
                continue

            # Swap the single bond for a dative bond going from neighbor to metal.
            editable.RemoveBond(neighbor.GetIdx(), metal.GetIdx())
            editable.AddBond(neighbor.GetIdx(), metal.GetIdx(), DATIVE_BOND)

    return editable
Enumerate¶
enumerate_stereoisomers(mol, n_variants=20, undefined_only=False, rationalise=True)
¶
Enumerate the stereocenters and bonds of the current molecule.
Original source: the openff-toolkit
lib.
Warning: this function can be computationally intensive.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
|
The molecule whose state we should enumerate. |
required |
n_variants |
int |
The maximum amount of molecules that should be returned. |
20 |
undefined_only |
bool |
If we should enumerate all stereocenters and bonds or only those with undefined stereochemistry. |
False |
rationalise |
bool |
If we should try to build and rationalise the molecule to ensure it can exist. |
True |
Source code in datamol/mol.py
def enumerate_stereoisomers(
    mol,
    n_variants: int = 20,
    undefined_only: bool = False,
    rationalise: bool = True,
):
    """Enumerate the stereocenters and bonds of the current molecule.

    Original source: the `openff-toolkit` lib.

    Warning: this function can be computationally intensive.

    Args:
        mol: The molecule whose state we should enumerate.
        n_variants: The maximum amount of molecules that should be returned.
        undefined_only: If we should enumerate all stereocenters and bonds or only those
            with undefined stereochemistry.
        rationalise: If we should try to build and rationalise the molecule to ensure it
            can exist.

    Returns:
        A list of stereoisomers. The list is empty when the enumeration raises.
    """
    from rdkit.Chem.EnumerateStereoisomers import EnumerateStereoisomers
    from rdkit.Chem.EnumerateStereoisomers import StereoEnumerationOptions

    # safety first
    mol = copy_mol(mol)

    # in case any bonds/centers are missing stereo chem flag it here
    Chem.AssignStereochemistry(mol, force=False, flagPossibleStereoCenters=True, cleanIt=True)  # type: ignore
    Chem.FindPotentialStereoBonds(mol, cleanIt=True)  # type: ignore

    # set up the options
    stereo_opts = StereoEnumerationOptions(
        tryEmbedding=rationalise,
        onlyUnassigned=undefined_only,
        maxIsomers=n_variants,
    )

    try:
        isomers = tuple(EnumerateStereoisomers(mol, options=stereo_opts))
    except Exception:
        # Best effort: enumeration errors yield an empty result. This is
        # `except Exception` rather than a bare `except:` so that
        # KeyboardInterrupt and SystemExit still propagate.
        # NOTE(hadim): often got "Stereo atoms should be specified before specifying CIS/TRANS bond stereochemistry"
        # for the ligand of reference (coming from the PDB). Not sure how to handle that.
        isomers = []

    variants = []
    for isomer in isomers:
        # isomer has CIS/TRANS tags so convert back to E/Z
        Chem.SetDoubleBondNeighborDirections(isomer)  # type: ignore
        Chem.AssignStereochemistry(isomer, force=True, cleanIt=True)  # type: ignore
        variants.append(isomer)

    return variants
enumerate_tautomers(mol, n_variants=20)
¶
Enumerate the possible tautomers of the current molecule.
Original source: the openff-toolkit
lib.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
The molecule whose state we should enumerate. |
required |
n_variants |
int |
The maximum amount of molecules that should be returned. |
20 |
Source code in datamol/mol.py
def enumerate_tautomers(mol: Chem.rdchem.Mol, n_variants: int = 20):
    """Enumerate tautomers of a molecule with RDKit's `TautomerEnumerator`.

    Original source: the `openff-toolkit` lib.

    Args:
        mol: The molecule whose tautomeric states should be enumerated. The
            enumeration runs on the object returned by `copy_mol(mol)`, not
            on the object passed in.
        n_variants: The maximum number of tautomers requested from the
            enumerator.

    Returns:
        A list holding the enumerated tautomers.
    """
    # Never hand the caller's object to the enumerator.
    working_mol = copy_mol(mol)

    tautomer_enumerator = rdMolStandardize.TautomerEnumerator()
    tautomer_enumerator.SetMaxTautomers(n_variants)

    return list(tautomer_enumerator.Enumerate(working_mol))
Convert molecule(s)¶
to_smiles(mol, canonical=True, isomeric=True, ordered=False, explicit_bonds=False, explicit_hs=False, randomize=False, cxsmiles=False, allow_to_fail=False)
¶
Convert a mol to a SMILES.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
canonical |
bool |
if false no attempt will be made to canonicalize the molecule. |
True |
isomeric |
bool |
whether to include information about stereochemistry in the SMILES. |
True |
ordered |
bool |
whether to force reordering of the atoms first. |
False |
explicit_bonds |
bool |
if true, all bond orders will be explicitly indicated in the output SMILES. |
False |
explicit_hs |
bool |
if true, all H counts will be explicitly indicated in the output SMILES. |
False |
randomize |
bool |
whether to randomize the generated smiles. Overrides `canonical`. |
False |
cxsmiles |
bool |
Whether to return a CXSMILES instead of a SMILES. |
False |
allow_to_fail |
bool |
Raise an error if the conversion to SMILES fails. Return None otherwise. |
False |
Source code in datamol/convert.py
def to_smiles(
    mol: Chem.rdchem.Mol,
    canonical: bool = True,
    isomeric: bool = True,
    ordered: bool = False,
    explicit_bonds: bool = False,
    explicit_hs: bool = False,
    randomize: bool = False,
    cxsmiles: bool = False,
    allow_to_fail: bool = False,
) -> Optional[str]:
    """Serialize a molecule to a SMILES (or CXSMILES) string.

    Args:
        mol: a molecule.
        canonical: if False, no attempt is made to canonicalize the output.
        isomeric: whether to include stereochemistry information in the SMILES.
        ordered: whether to reorder the atoms first. Only applied when
            `canonical` is False.
        explicit_bonds: if True, all bond orders are written explicitly.
        explicit_hs: if True, all H counts are written explicitly.
        randomize: whether to randomize the atoms before writing. Forces
            `canonical` to False.
        cxsmiles: whether to return a CXSMILES instead of a SMILES.
        allow_to_fail: if True, an exception raised by the conversion is
            re-raised; if False, None is returned instead.

    Returns:
        The SMILES string, or None when the conversion raised and
        `allow_to_fail` is False.
    """
    # Reordering is skipped whenever a canonical output is requested.
    if canonical is False and ordered:
        mol = dm.reorder_atoms(mol)

    if randomize:
        mol = dm.randomize_atoms(mol)
        canonical = False

    # Both RDKit writers are called with the same keyword arguments.
    writer_kwargs = dict(
        isomericSmiles=isomeric,
        canonical=canonical,
        allBondsExplicit=explicit_bonds,
        allHsExplicit=explicit_hs,
    )

    try:
        if cxsmiles:
            return Chem.MolToCXSmiles(mol, **writer_kwargs)  # type: ignore
        return Chem.MolToSmiles(mol, **writer_kwargs)  # type: ignore
    except Exception as e:
        if allow_to_fail:
            raise e
        return None
to_selfies(mol)
¶
Convert a mol to SELFIES.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule or a SMILES. |
required |
Returns:
Type | Description |
---|---|
Optional[str] |
selfies: SELFIES string. |
Source code in datamol/convert.py
def to_selfies(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Encode a molecule (or a SMILES string) as a SELFIES string.

    Args:
        mol: a molecule or a SMILES. None is passed through as None.

    Returns:
        selfies: the SELFIES string, or None when the input is None or the
            encoder returned -1.
    """
    if mol is None:
        return None

    # The encoder is fed a SMILES, so molecule objects are serialized first.
    smiles = to_smiles(mol) if isinstance(mol, Chem.rdchem.Mol) else mol
    encoded = sf.encoder(smiles)  # type: ignore

    # A -1 result is treated as a failed encoding.
    return None if encoded == -1 else encoded
from_selfies(selfies, as_mol=False)
¶
Convert a SELFIES to a smiles or a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
selfies |
str |
a selfies. |
required |
as_mol |
bool |
whether to return a mol or a smiles. |
False |
Returns:
Type | Description |
---|---|
Union[str, rdkit.Chem.rdchem.Mol] |
smiles or mol. |
Source code in datamol/convert.py
def from_selfies(selfies: str, as_mol: bool = False) -> Optional[Union[str, Chem.rdchem.Mol]]:
    """Decode a SELFIES string into a SMILES or a molecule.

    Args:
        selfies: a SELFIES string. None is passed through as None.
        as_mol: whether to return a mol (True) or a SMILES (False).

    Returns:
        The decoded SMILES, or the result of `dm.to_mol()` on it when
        `as_mol` is True. None when the input or the decoded SMILES is None.
    """
    if selfies is None:
        return None

    decoded = sf.decoder(selfies)

    # Nothing to convert: hand back whatever the decoder produced.
    if decoded is None or not as_mol:
        return decoded

    return dm.to_mol(decoded)
to_smarts(mol, keep_hs=True)
¶
Convert a molecule to a smarts.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule. |
required |
keep_hs |
bool |
Whether to keep hydrogen. This will increase the count of H atoms for atoms with attached hydrogens to create a valid smarts. e.g. [H]-[CH2]-[] -> [H]-[CH3]-[] |
True |
Returns:
Type | Description |
---|---|
Optional[str] |
smarts of the molecule |
Source code in datamol/convert.py
def to_smarts(mol: Union[str, Chem.rdchem.Mol], keep_hs: bool = True) -> Optional[str]:
    """Convert a molecule to a smarts.

    The atoms are temporarily tagged with isotope 42 so that the SMILES writer
    emits every atom with its attributes fully specified; the isotope labels
    are then stripped from the resulting string.

    Args:
        mol: a molecule or a SMILES. A molecule object is copied before being
            modified, so the caller's object is left untouched.
        keep_hs: Whether to keep hydrogen. This will increase the count of H atoms
            for atoms with attached hydrogens to create a valid smarts.
            e.g. [H]-[CH2]-[*] -> [H]-[CH3]-[*]

    Returns:
        smarts of the molecule, or None when the input is None, the SMILES
        cannot be parsed, or the SMILES serialization fails.
    """
    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        # `dm.to_mol` returns None when the SMILES cannot be converted.
        if mol is None:
            return None
    else:
        # The loop below rewrites isotopes and explicit H counts: work on a
        # copy instead of mutating the molecule owned by the caller.
        mol = Chem.Mol(mol)  # type: ignore

    # Change the isotope to 42
    for atom in mol.GetAtoms():  # type: ignore
        if keep_hs:
            n_h_neighbors = sum(na.GetAtomicNum() == 1 for na in atom.GetNeighbors())
            if n_h_neighbors:
                atom.SetNumExplicitHs(atom.GetTotalNumHs() + n_h_neighbors)
        atom.SetIsotope(42)

    # Print out the smiles, all the atom attributes will be fully specified
    smarts = to_smiles(mol, isomeric=True, explicit_bonds=True)
    if smarts is None:
        return None

    # Remove the 42 isotope labels
    return re.sub(r"\[42", "[", smarts)
to_inchi(mol)
¶
Convert a mol to Inchi.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule. |
required |
Source code in datamol/convert.py
def to_inchi(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        The InChI string, or None when the input is None or the SMILES
        cannot be converted to a molecule.
    """
    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        # `dm.to_mol` returns None on failure; do not forward it to RDKit.
        if mol is None:
            return None

    return Chem.MolToInchi(mol)
to_inchikey(mol)
¶
Convert a mol to Inchi key.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule |
required |
Source code in datamol/convert.py
def to_inchikey(mol: Union[str, Chem.rdchem.Mol]) -> Optional[str]:
    """Convert a mol to Inchi key.

    Args:
        mol: a molecule or a SMILES.

    Returns:
        The InChI key, or None when the input is None or the SMILES cannot
        be converted to a molecule.
    """
    if mol is None:
        return None

    if isinstance(mol, str):
        mol = dm.to_mol(mol)
        # `dm.to_mol` returns None on failure; do not forward it to RDKit.
        if mol is None:
            return None

    return Chem.MolToInchiKey(mol)
from_inchi(inchi, sanitize=True, remove_hs=True)
¶
Convert an InChi to a mol.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
inchi |
Optional[str] |
an inchi string. |
required |
sanitize |
bool |
do sanitize. |
True |
remove_hs |
bool |
do remove hs. |
True |
Returns:
Type | Description |
---|---|
Optional[rdkit.Chem.rdchem.Mol] |
mol |
Source code in datamol/convert.py
def from_inchi(
    inchi: Optional[str],
    sanitize: bool = True,
    remove_hs: bool = True,
) -> Optional[Chem.rdchem.Mol]:
    """Build a molecule from an InChI string.

    Args:
        inchi: an InChI string. None is passed through as None.
        sanitize: forwarded to `Chem.MolFromInchi` as `sanitize`.
        remove_hs: forwarded to `Chem.MolFromInchi` as `removeHs`.

    Returns:
        Whatever `Chem.MolFromInchi` returns, or None when `inchi` is None.
    """
    parsed = None
    if inchi is not None:
        parsed = Chem.MolFromInchi(inchi, sanitize=sanitize, removeHs=remove_hs)
    return parsed
to_df(mols, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, render_df_mol=True, render_all_df_mol=False)
¶
Convert a list of mols to a dataframe using each mol properties as a column.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a molecule. |
required |
smiles_column |
Optional[str] |
name of the SMILES column. |
'smiles' |
mol_column |
str |
Name of the mol column. If not None, rdkit.Chem.PandasTools is used to add a molecule column. |
None |
include_private |
bool |
Include private properties in the columns. |
False |
include_computed |
bool |
Include computed properties in the columns. |
False |
render_df_mol |
bool |
whether to render the molecule in the dataframe to images. If called once, it will be applied for the newly created dataframe with mol in it. |
True |
render_all_df_mol |
bool |
Whether to render all pandas dataframe mol column as images. |
False |
Source code in datamol/convert.py
def to_df(
    mols: List[Chem.rdchem.Mol],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    render_df_mol: bool = True,
    render_all_df_mol: bool = False,
) -> Optional[pd.DataFrame]:
    """Build a dataframe from a list of mols, one row per molecule and one
    column per molecule property.

    Args:
        mols: a list of molecules.
        smiles_column: name of the SMILES column. No SMILES column is added
            when None.
        mol_column: name of the column holding the molecule objects. No mol
            column is added when None.
        include_private: Include private properties in the columns.
        include_computed: Include computed properties in the columns.
        render_df_mol: whether to switch the returned dataframe to image
            rendering of its molecules. Only applied when `mol_column` is set.
        render_all_df_mol: Whether to render all pandas dataframe mol column as images.

    Returns:
        The dataframe with the optional SMILES/mol columns followed by the
        property columns.
    """
    frame = pd.DataFrame()

    # SMILES are computed before the properties are collected.
    if smiles_column is not None:
        frame[smiles_column] = [dm.to_smiles(m) for m in mols]

    if mol_column is not None:
        frame[mol_column] = mols

    # One property dict per molecule.
    prop_rows = []
    for m in mols:
        prop_rows.append(
            m.GetPropsAsDict(
                includePrivate=include_private,
                includeComputed=include_computed,
            )
        )
    props_frame = pd.DataFrame(prop_rows)

    name_clash = smiles_column is not None and smiles_column in props_frame.columns
    if name_clash:
        logger.warning(
            f"The SMILES column name provided ('{smiles_column}') is already present in the properties"
            " of the molecules. THe returned dataframe will two columns with the same name."
        )

    # Property columns are placed to the right of the SMILES/mol columns.
    frame = pd.concat([frame, props_frame], axis=1)

    if mol_column is not None and render_df_mol is True:
        # NOTE(hadim): replace by `PandaTools.ChangeMoleculeRendering` once
        # https://github.com/rdkit/rdkit/issues/3563 is fixed.
        _ChangeMoleculeRendering(frame)

    if render_all_df_mol:
        PandasTools.RenderImagesInAllDataFrames()

    return frame
from_df(df, smiles_column='smiles', mol_column=None, conserve_smiles=False, sanitize=True)
¶
Convert a dataframe to a list of mols.
Note
If smiles_column
is used to build the molecules, this property
is removed from the molecules' properties. You can decide to conserve
the SMILES column by setting conserve_smiles
to True.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
df |
DataFrame |
a dataframe. |
required |
smiles_column |
Optional[str] |
Column name to extract the molecule. |
'smiles' |
mol_column |
str |
Column name to extract the molecule. It takes
precedence over `smiles_column`. |
None |
conserve_smiles |
bool |
Whether to conserve the SMILES in the mols' props. |
False |
sanitize |
bool |
Whether to sanitize if |
True |
Source code in datamol/convert.py
def from_df(
    df: pd.DataFrame,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    conserve_smiles: bool = False,
    sanitize: bool = True,
) -> List[Chem.rdchem.Mol]:
    """Turn each row of a dataframe into a molecule carrying the remaining
    row values as properties.

    Note:
        If `smiles_column` is used to build the molecules, this property
        is removed from the molecules' properties. You can decide to conserve
        the SMILES column by setting `conserve_smiles` to True.

    Args:
        df: a dataframe.
        smiles_column: Column name holding the SMILES used to build the molecules.
        mol_column: Column name holding molecule objects. It takes
            precedence over `smiles_column`. When None, the columns are
            scanned for one whose first value is a molecule.
        conserve_smiles: Whether to conserve the SMILES in the mols' props.
        sanitize: Whether to sanitize when the molecules are built from
            `smiles_column`.

    Returns:
        One entry per row: the molecule, or None when the row holds no
        molecule or its SMILES could not be converted.

    Raises:
        ValueError: when both `smiles_column` and `mol_column` are None.
    """
    if smiles_column is None and mol_column is None:
        raise ValueError("Either `smiles_column` or `mol_column` must be not None.")

    if len(df) == 0:
        return []

    # Auto-detection of the mol column from the first row; when several
    # columns qualify, the last one is kept.
    if mol_column is None:
        for name in df.columns:
            if isinstance(df[name].iloc[0], Chem.rdchem.Mol):
                mol_column = name

    def _build(record):
        props = record.to_dict()

        if mol_column is not None:
            mol = props.pop(mol_column)
        else:
            # The SMILES stays in the properties only when explicitly requested.
            smiles = props[smiles_column] if conserve_smiles else props.pop(smiles_column)
            mol = dm.to_mol(smiles, sanitize=sanitize)

        if mol is None:
            return None

        dm.set_mol_props(mol, props)
        return mol

    return df.apply(_build, axis=1).tolist()
Input/Output¶
read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)
¶
Read a CSV file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
smiles_column |
str |
Use this column to build a mol column. |
None |
mol_column |
str |
Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file. |
'mol' |
kwargs |
|
Arguments to pass to |
{} |
Returns:
Type | Description |
---|---|
DataFrame |
df: a |
Source code in datamol/io.py
def read_csv(
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Load a CSV file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column used to build the mol column. No mol column is
            added when None.
        mol_column: Name to give to the mol column built from `smiles_column`.
            Avoid building it when loading a very large file.
        kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """
    frame: pd.DataFrame = pd.read_csv(urlpath, **kwargs)  # type: ignore

    # Without a SMILES column there is nothing to build molecules from.
    if smiles_column is None:
        return frame

    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame
read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)
¶
Read an excel file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
sheet_name |
Union[str, int, list] |
see |
0 |
smiles_column |
str |
Use this column to build a mol column. |
None |
mol_column |
str |
Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file. |
'mol' |
kwargs |
|
Arguments to pass to |
{} |
Returns:
Type | Description |
---|---|
DataFrame |
df: a |
Source code in datamol/io.py
def read_excel(
    urlpath: Union[str, os.PathLike, TextIO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: str = None,
    mol_column: str = "mol",
    **kwargs,
) -> pd.DataFrame:
    """Read an excel file.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Use this column to build a mol column. No mol column
            is added when None.
        mol_column: Name to give to the mol column built from `smiles_column`.
            Avoid building it when loading a very large file.
        kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`, or a dict of `pandas.DataFrame` when
            `sheet_name` selects several sheets (a list, or None).
    """
    df = pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs)  # type: ignore

    if smiles_column is not None:
        # With a list of sheets (or `sheet_name=None`) pandas returns a dict
        # of dataframes: the mol column is then added to each of them.
        frames = df.values() if isinstance(df, dict) else [df]
        for frame in frames:
            PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)

    return df
read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True)
¶
Read an SDF file.
Note: This function is meant to be used with dataset that fit in-memory.
For a more advanced usage we suggest you to use directly Chem.ForwardSDMolSupplier
.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
sanitize |
bool |
Whether to sanitize the molecules. |
True |
as_df |
bool |
Whether to return a list mol or a pandas DataFrame. |
False |
smiles_column |
Optional[str] |
Name of the SMILES column. Only relevant if |
'smiles' |
mol_column |
str |
Name of the mol column. Only relevant if |
None |
include_private |
bool |
Include private properties in the columns. Only relevant if
|
False |
include_computed |
bool |
Include computed properties in the columns. Only relevant if
|
False |
strict_parsing |
bool |
If set to false, the parser is more lax about correctness of the contents. |
True |
Source code in datamol/io.py
def read_sdf(
    urlpath: Union[str, os.PathLike, TextIO],
    sanitize: bool = True,
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
    include_private: bool = False,
    include_computed: bool = False,
    strict_parsing: bool = True,
) -> Union[List[Chem.rdchem.Mol], pd.DataFrame]:
    """Load the molecules of an SDF file.

    Note: This function is meant to be used with dataset that fit _in-memory_.
    For a more advanced usage we suggest you to use directly `Chem.ForwardSDMolSupplier`.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
            Paths ending with `.gz` or `.gzip` are decompressed on the fly.
        sanitize: Whether to sanitize the molecules.
        as_df: Whether to return a list of mols or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns. Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns. Only relevant if
            `as_df` is True.
        strict_parsing: If set to false, the parser is more lax about correctness of the contents.

    Returns:
        The molecules that could be read (None entries are dropped), as a list
        or as the dataframe built by `dm.to_df()`.
    """

    def _parse(handle):
        # Read every record of the open handle into memory.
        supplier = Chem.ForwardSDMolSupplier(
            handle,
            sanitize=sanitize,
            strictParsing=strict_parsing,
        )
        return list(supplier)

    if isinstance(urlpath, io.IOBase):
        # File-like object
        mols = _parse(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath) as f:
            # Handle gzip file if needed
            if str(urlpath).endswith((".gz", ".gzip")):
                f = gzip.open(f)
            mols = _parse(f)

    # Discard None values
    mols = [m for m in mols if m is not None]

    if not as_df:
        return mols

    return dm.to_df(
        mols,
        smiles_column=smiles_column,
        mol_column=mol_column,
        include_private=include_private,
        include_computed=include_computed,
    )  # type: ignore
to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)
¶
Write molecules to a file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
Union[rdkit.Chem.rdchem.Mol, Sequence[rdkit.Chem.rdchem.Mol], pandas.core.frame.DataFrame] |
a dataframe, a molecule or a list of molecule. |
required |
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
smiles_column |
Optional[str] |
Column name to extract the molecule. |
'smiles' |
mol_column |
str |
Column name to extract the molecule. It takes
precedence over |
None |
Source code in datamol/io.py
def to_sdf(
    mols: Union[Chem.rdchem.Mol, Sequence[Chem.rdchem.Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, TextIO],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = None,
):
    """Write molecules to an SDF file.

    Args:
        mols: a dataframe, a molecule or a list of molecule. A dataframe is
            first converted with `dm.from_df()`. None entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name to extract the molecule. Only used when
            `mols` is a dataframe.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. Only used when `mols` is a dataframe.
    """
    # Normalize the input to a list of molecules.
    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)
    elif isinstance(mols, Chem.rdchem.Mol):
        mols = [mols]

    # Filter out None values
    to_write = [m for m in mols if m is not None]

    def _dump(handle):
        # Write every molecule to the open handle, then close the writer.
        writer = Chem.SDWriter(handle)
        for m in to_write:
            writer.write(m)
        writer.close()

    if isinstance(urlpath, io.IOBase):
        # File-like object
        _dump(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath, mode="w") as f:
            _dump(f)
to_smi(mols, urlpath, error_if_empty=False)
¶
Save a list of molecules in an .smi
file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
Sequence[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
urlpath |
Union[str, os.PathLike, TextIO] |
Path to a file or a file-like object. Path can be remote or local. |
required |
error_if_empty |
bool |
whether to raise an error if the input list is empty. |
False |
Source code in datamol/io.py
def to_smi(
    mols: Sequence[Chem.rdchem.Mol],
    urlpath: Union[str, os.PathLike, TextIO],
    error_if_empty: bool = False,
):
    """Write a list of molecules to an `.smi` file.

    Args:
        mols: a list of molecules. None entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        error_if_empty: whether to raise an error if the input list is empty.
            The check is done before the None entries are removed.

    Raises:
        ValueError: when `mols` is empty and `error_if_empty` is True.
    """
    if len(mols) == 0 and error_if_empty:
        raise ValueError("The list of mols/smiles provided is empty.")

    # Filter out None values
    to_write = [m for m in mols if m is not None]

    def _dump(handle):
        # Write every molecule to the open handle, then close the writer.
        writer = Chem.SmilesWriter(handle, includeHeader=False, nameHeader="")
        for m in to_write:
            writer.write(m)
        writer.close()

    if isinstance(urlpath, io.IOBase):
        # File-like object
        _dump(urlpath)
    else:
        # Regular local or remote paths
        with fsspec.open(urlpath, "w") as f:
            _dump(f)
read_smi(urlpath)
¶
Read a list of smiles from an .smi
file.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urlpath |
Union[str, os.PathLike] |
Path to a file or a file-like object. Path can be remote or local. Note: file-like object are not supported yet. |
required |
Source code in datamol/io.py
def read_smi(
    urlpath: Union[str, os.PathLike],
) -> Sequence[Chem.rdchem.Mol]:
    """Read a list of smiles from an `.smi` file.

    Args:
        urlpath: Path to a file. Path can be remote or local.
            Note: file-like object are not supported yet.

    Returns:
        The molecules read from the file; entries the supplier yields as
        None are dropped.
    """
    # NOTE(hadim): the temporary local file copy
    # is because `SmilesMolSupplier` does not support
    # using file-like object, only path.
    if fsspec.utils.can_be_local(str(urlpath)):
        supplier = Chem.SmilesMolSupplier(str(urlpath), titleLine=0)
        return [mol for mol in supplier if mol is not None]

    # Copy to a local temporary path if the path is a remote one.
    # `mkstemp` also returns an open OS-level file descriptor: close it right
    # away, only the path is needed.
    fd, tmp_name = tempfile.mkstemp()
    os.close(fd)
    active_path = pathlib.Path(tmp_name)

    try:
        dm.utils.fs.copy_file(urlpath, active_path)

        # Read the molecules
        supplier = Chem.SmilesMolSupplier(str(active_path), titleLine=0)
        return [mol for mol in supplier if mol is not None]
    finally:
        # Delete the local temporary path, even when copying or parsing failed.
        active_path.unlink()
Molecule similarity and distance¶
pdist(mols, n_jobs=1, squareform=True, **fp_args)
¶
Compute the pairwise tanimoto distance between the fingerprints of all the molecules in the input set.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[Union[str, rdkit.Chem.rdchem.Mol]] |
list of molecules |
required |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
squareform |
bool |
Whether to return in square form (matrix) or in a condensed form (1D vector). |
True |
**fp_args |
|
list of args to pass to |
{} |
Returns:
Type | Description |
---|---|
ndarray |
dist_mat |
Source code in datamol/similarity.py
def pdist(
    mols: List[Union[str, Chem.rdchem.Mol]],
    n_jobs: Optional[int] = 1,
    squareform: bool = True,
    **fp_args,
) -> np.ndarray:
    """Pairwise tanimoto distances between the fingerprints of all the
    molecules of a single collection.

    Args:
        mols: list of molecules
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        squareform: Whether to return in square form (matrix) or in a condensed
            form (1D vector).
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        dist_mat: the distances computed with scipy's "jaccard" metric.
    """
    # Every molecule is featurized as a numpy fingerprint.
    featurize = functools.partial(dm.to_fp, as_array=True, **fp_args)
    fingerprints = np.array(dm.parallelized(featurize, mols, n_jobs=n_jobs))

    condensed = distance.pdist(fingerprints, metric="jaccard")
    if not squareform:
        return condensed

    return distance.squareform(condensed, force="tomatrix")
cdist(mols1, mols2, n_jobs=1, **fp_args)
¶
Compute the tanimoto distance between the fingerprints of each pair of molecules of the two collections of inputs.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols1 |
List[Union[str, rdkit.Chem.rdchem.Mol]] |
list of molecules. |
required |
mols2 |
List[Union[str, rdkit.Chem.rdchem.Mol]] |
list of molecules. |
required |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Let to 1 for no parallelization. Set to None to use all available cores. |
1 |
**fp_args |
|
list of args to pass to |
{} |
Returns:
Type | Description |
---|---|
ndarray |
distmat |
Source code in datamol/similarity.py
def cdist(
    mols1: List[Union[str, Chem.rdchem.Mol]],
    mols2: List[Union[str, Chem.rdchem.Mol]],
    n_jobs: Optional[int] = 1,
    **fp_args,
) -> np.ndarray:
    """Tanimoto distances between the fingerprints of every pair made of one
    molecule of `mols1` and one molecule of `mols2`.

    Args:
        mols1: list of molecules.
        mols2: list of molecules.
        n_jobs: Number of jobs for parallelization. Leave at 1 for no
            parallelization. Set to None to use all available cores.
        **fp_args: list of args to pass to `to_fp()`.

    Returns:
        distmat: the distances computed with scipy's "jaccard" metric, with
            one row per molecule of `mols1` and one column per molecule of `mols2`.
    """
    # Both collections are featurized with the same fingerprint settings.
    featurize = functools.partial(dm.to_fp, as_array=True, **fp_args)

    fingerprint_sets = []
    for collection in (mols1, mols2):
        fingerprints = dm.parallelized(featurize, collection, n_jobs=n_jobs)
        fingerprint_sets.append(np.array(fingerprints))

    left, right = fingerprint_sets
    return distance.cdist(left, right, metric="jaccard")
Working with fingerprints¶
to_fp(mol, as_array=True, fp_type='ecfp', fold_size=None, **fp_args)
¶
Compute the molecular fingerprint given a molecule or a SMILES.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Union[str, rdkit.Chem.rdchem.Mol] |
a molecule or a SMILES. |
required |
as_array |
bool |
Whether to return a numpy array or an RDKit vec. Default to True. |
True |
fp_type |
str |
The type of the fingerprint. See |
'ecfp' |
fold_size |
int |
If set, fold the fingerprint to the |
None |
fp_args |
|
Arguments to build the fingerprint. Refer to the official RDKit documentation. |
{} |
Returns:
Type | Description |
---|---|
Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect] |
A fingerprint vector or None |
Source code in datamol/fp.py
def to_fp(
    mol: Union[str, Chem.rdchem.Mol],
    as_array: bool = True,
    fp_type: str = "ecfp",
    fold_size: int = None,
    **fp_args,
) -> Optional[Union[np.ndarray, SparseBitVect, ExplicitBitVect]]:
    """Compute a molecular fingerprint from a molecule or a SMILES.

    Args:
        mol: a molecule or a SMILES.
        as_array: Whether to return a numpy array or an RDKit vec. Default to True.
        fp_type: The type of the fingerprint. See `dm.list_supported_fingerprints()` for a
            complete list.
        fold_size: If set, fold the fingerprint to the `fold_size`. If set, returned array is
            always a numpy array.
        fp_args: Arguments to build the fingerprint. Refer to the official RDKit documentation.
            The deprecated names `fp_size` and `use_features` are still accepted and
            mapped to `nBits` and `useFeatures`.

    Returns:
        A fingerprint vector.

    Raises:
        ValueError: when `fp_type` is not a supported fingerprint or when the
            input molecule is invalid.
    """
    # Resolve the fingerprint function first so an unknown type fails early.
    fp_func = _FP_FUNCS.get(fp_type)
    if fp_func is None:
        raise ValueError(
            f"The fingerprint '{fp_type}' is not available. Use `dm.list_supported_fingerprints()` to "
            "get a complete list of the available fingerprints."
        )

    # SMILES inputs are converted; anything else is used as given.
    mol_obj = dm.to_mol(mol) if isinstance(mol, str) else mol
    if mol_obj is None:
        raise ValueError(f"It seems like the input molecule '{mol}' is invalid.")
    mol = mol_obj

    # Deal with new API introduced in >=0.4 + throw a warning if needed.
    if "fp_size" in fp_args:
        warnings.warn(
            "Using `fp_size` is now deprecated and will be removed in datamol 0.5.0. Please use `nBits` instead.",
            DeprecationWarning,
        )
        fp_args["nBits"] = fp_args.pop("fp_size")

    if "use_features" in fp_args:
        warnings.warn(
            "Using `use_features` is now deprecated and will be removed in datamol 0.5.0. Please use `useFeatures` instead.",
            DeprecationWarning,
        )
        fp_args["useFeatures"] = fp_args.pop("use_features")

    # Defaults of the fingerprint type, overridden by the caller's arguments.
    call_args = {**_FP_DEFAULT_ARGS[fp_type], **fp_args}
    fp = fp_func(mol, **call_args)

    # Fold the fp if needed.
    if fold_size is not None:
        fp = fold_count_fp(fp, dim=fold_size)

    # Convert to a numpy array
    if as_array and not fold_size:
        fp = fp_to_array(fp)

    return fp
fp_to_array(fp)
¶
Convert rdkit fingerprint to numpy array.
Note
This implementation has shown to be faster than using DataStructs.ConvertToNumpyArray
by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fp |
Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect, rdkit.DataStructs.cDataStructs.UIntSparseIntVect] |
The fingerprint. |
required |
Source code in datamol/fp.py
def fp_to_array(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect, UIntSparseIntVect]
) -> np.ndarray:
    """Convert rdkit fingerprint to numpy array.

    Note:
        This implementation has shown to be faster than using `DataStructs.ConvertToNumpyArray`
        by a factor of ~4. See https://github.com/rdkit/rdkit/discussions/3863.

    Args:
        fp: The fingerprint. Numpy arrays are returned as is; bit vectors and
            sparse int vectors are expanded to a dense array. Fingerprints
            without any bit set give an all-zero array.

    Returns:
        The fingerprint as a numpy array.

    Raises:
        ValueError: when the type of `fp` is not supported.
    """
    if isinstance(fp, np.ndarray):
        return fp

    if isinstance(fp, SparseBitVect):
        fp_out = np.zeros(fp.GetNumBits(), dtype=int)
        # Force an integer dtype: with no on-bit, `np.array([])` would be a
        # float array, which numpy refuses to use as an index.
        on_bits = np.array(list(fp.GetOnBits()), dtype=int)
        fp_out[on_bits] = 1
        return fp_out

    if isinstance(fp, ExplicitBitVect):
        # The bit string is made of "0"/"1" characters: subtracting the code
        # of "0" from the raw bytes gives the bit values.
        return np.frombuffer(fp.ToBitString().encode(), "u1") - ord("0")

    if isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        fp_out = np.zeros(fp.GetLength(), dtype=int)
        elements = fp.GetNonzeroElements()
        # An empty mapping cannot be unpacked into (indices, values).
        if elements:
            bit_idx, values = np.array(list(elements.items())).T
            fp_out[bit_idx] = values
        return fp_out

    raise ValueError(
        f"The fingerprint of type '{type(fp)}' is not supported. "
        "Please open a ticket at https://github.com/datamol-org/datamol/issues."
    )
list_supported_fingerprints()
¶
Return the supported fingerprints in datamol.
Source code in datamol/fp.py
def list_supported_fingerprints():
    """Return the fingerprints supported by datamol.

    Returns:
        The module-level `_FP_FUNCS` object itself (not a copy), whose keys
        are the values accepted as `fp_type` by `to_fp()`.
    """
    supported = _FP_FUNCS
    return supported
fold_count_fp(fp, dim=1024, binary=False)
¶
Fast folding of a count fingerprint to the specified dimension.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
fp |
Union[numpy.ndarray, rdkit.DataStructs.cDataStructs.SparseBitVect, rdkit.DataStructs.cDataStructs.ExplicitBitVect] |
A fingerprint. |
required |
dim |
int |
The dimension of the folded array. |
1024 |
binary |
bool |
Whether to fold into a binary array or use a count vector. |
False |
Returns:
Type | Description |
---|---|
folded |
returns folded array to the provided dimension. |
Source code in datamol/fp.py
def fold_count_fp(
    fp: Union[np.ndarray, SparseBitVect, ExplicitBitVect],
    dim: int = 1024,
    binary: bool = False,
):
    """Fast folding of a count fingerprint to the specified dimension.

    Every set bit (or non-zero element) at index `i` is accumulated at
    position `i % dim` of the folded array.

    Args:
        fp: A fingerprint: an RDKit sparse int vector or an RDKit bit vector
            (sparse or explicit). Numpy arrays are not supported.
        dim: The dimension of the folded array.
        binary: Whether to fold into a binary array or use a count vector.

    Returns:
        folded: returns folded array to the provided dimension.

    Raises:
        ValueError: when the type of `fp` is not supported.
    """
    if isinstance(
        fp,
        (
            UIntSparseIntVect,
            IntSparseIntVect,
            LongSparseIntVect,
            ULongSparseIntVect,
        ),
    ):
        counts = fp.GetNonzeroElements()
    elif isinstance(fp, (SparseBitVect, ExplicitBitVect)):
        # Bit vectors only tell which bits are on: each one counts for 1.
        counts = {bit: 1 for bit in fp.GetOnBits()}
    else:
        raise ValueError(f"The fingerprint is of wrong type: {type(fp)}")

    # Fold the indices. The explicit integer dtype keeps the arrays usable as
    # indices/values even when the fingerprint has no bit set.
    folded_idx = np.array([bit % dim for bit in counts], dtype=int)
    values = np.array(list(counts.values()), dtype=int)

    # Create the folded fp; `np.add.at` accumulates indices hit several times.
    folded = np.zeros(dim, dtype="int")
    np.add.at(folded, folded_idx, values)

    if binary:
        folded = np.clip(folded, a_min=0, a_max=1)

    return folded
Cluster molecules¶
cluster_mols(mols, cutoff=0.2, feature_fn=None, n_jobs=1)
¶
Cluster a set of molecules using the butina clustering algorithm and a given threshold.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
cutoff |
float |
Cutoff for the clustering. Defaults to 0.2. |
0.2 |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None. |
None |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
Source code in datamol/cluster.py
def cluster_mols(
    mols: List[Chem.rdchem.Mol],
    cutoff: float = 0.2,
    feature_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    """Group molecules with the Butina clustering algorithm.

    Pairwise Tanimoto distances between the molecule features are computed
    and handed to `Butina.ClusterData` together with `cutoff`.

    Args:
        mols: a list of molecules.
        cutoff: Cutoff for the clustering. Defaults to 0.2.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. When None, `dm.to_fp()` (with
            `as_array=False`) is used.
        n_jobs: Number of jobs used to compute the features. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        cluster_indices: the clusters as returned by `Butina.ClusterData`,
            expressed as indices into `mols`.
        cluster_mols: the molecules of each cluster, in the same order as
            `cluster_indices`. A cluster made of a single molecule is a
            one-element list.
    """
    featurizer = feature_fn
    if featurizer is None:
        featurizer = functools.partial(dm.to_fp, as_array=False)

    fps = dm.parallelized(featurizer, mols, n_jobs=n_jobs)
    total = len(mols)

    # Flattened lower triangle of the distance matrix, built row by row:
    # row `k` holds the distances from molecule `k` to molecules `0..k-1`.
    condensed = []
    for row in range(1, total):
        condensed.extend(
            DataStructs.BulkTanimotoSimilarity(fps[row], fps[:row], returnDistance=True)
        )

    cluster_indices = Butina.ClusterData(condensed, total, cutoff, isDistData=True)

    grouped = []
    for members in cluster_indices:
        selected = operator.itemgetter(*members)(mols)
        # `itemgetter` with a single index yields the bare molecule: wrap it.
        grouped.append([selected] if isinstance(selected, Chem.rdchem.Mol) else selected)

    return cluster_indices, grouped
pick_diverse(mols, npick, initial_picks=None, feature_fn=None, dist_fn=None, seed=42, n_jobs=1)
¶
Pick a set of diverse molecules based on their fingerprints.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
npick |
int |
Number of element to pick from mols, including the preselection. |
required |
initial_picks |
List[int] |
Starting list of index for molecules that should be in the set of picked molecules. Default to None. |
None |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None. |
None |
seed |
int |
seed for reproducibility |
42 |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
picked_inds |
Indices of the molecules that have been picked. mols: the molecules that have been picked. |
Source code in datamol/cluster.py
def pick_diverse(
    mols: List[Chem.rdchem.Mol],
    npick: int,
    initial_picks: List[int] = None,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    n_jobs: Optional[int] = 1,
):
    r"""Select a diverse subset of molecules with RDKit's `MaxMinPicker`.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        initial_picks: Indices of molecules that must be part of the picked
            set. None means no preselection.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. When None, `dm.to_fp()` (with
            `as_array=False`) is used. Features are computed even when a
            custom `dist_fn` is given.
        dist_fn: A function that takes two indexes (i, j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. When None, the Tanimoto distance
            (1 - Tanimoto similarity) between the features is used.
        seed: seed for reproducibility.
        n_jobs: Number of jobs used to compute the features. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: numpy array with the indices of the picked molecules.
        mols: the picked molecules, in picking order.
    """
    featurizer = functools.partial(dm.to_fp, as_array=False) if feature_fn is None else feature_fn
    fps = dm.parallelized(featurizer, mols, n_jobs=n_jobs)

    def tanimoto_distance(a, b, fps=fps):
        return 1.0 - DataStructs.TanimotoSimilarity(fps[a], fps[b])

    metric = tanimoto_distance if dist_fn is None else dist_fn
    preselected = initial_picks if initial_picks is not None else []

    chosen = MaxMinPicker().LazyPick(metric, len(mols), npick, firstPicks=preselected, seed=seed)
    picked_inds = np.array(chosen)

    return picked_inds, [mols[k] for k in picked_inds]
pick_centroids(mols, npick=0, initial_picks=None, threshold=0.5, feature_fn=None, dist_fn=None, seed=42, method='sphere', n_jobs=1)
¶
Pick a set of npick
centroids from a list of molecules.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules. |
required |
npick |
int |
Number of element to pick from mols, including the preselection. |
0 |
threshold |
float |
Minimum distance between centroids for the `maxmin` and sphere exclusion (`sphere`) methods. |
0.5 |
initial_picks |
List[int] |
Starting list of index for molecules that should be in the set of picked molecules. Default to None. |
None |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None. |
None |
seed |
int |
seed for reproducibility |
42 |
method |
str |
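Picking method to use. One of `sphere`, `maxmin` or any supported rdkit hierarchical clustering method such as `centroid`, `clink`, `upgma`. |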
'sphere' |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
picked_inds |
Indices of the molecules that have been selected as centroids. mols: the molecules that have been picked. |
Source code in datamol/cluster.py
def pick_centroids(
    mols: List[Chem.rdchem.Mol],
    npick: int = 0,
    initial_picks: List[int] = None,
    threshold: float = 0.5,
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    seed: int = 42,
    method: str = "sphere",
    n_jobs: Optional[int] = 1,
):
    r"""Pick a set of `npick` centroids from a list of molecules.

    Args:
        mols: a list of molecules.
        npick: Number of elements to pick from mols, including the preselection.
        threshold: Minimum distance between centroids for `maxmin` and sphere
            exclusion (`sphere`) methods.
        initial_picks: Indices of molecules that must be part of the picked
            set. None means no preselection. Discarded (with a warning) by the
            hierarchical clustering methods.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. When None, `dm.to_fp()` (with
            `as_array=False`) is used.
        dist_fn: A function that takes two indexes (i, j) and returns the
            distance between them. You might use partial to set the
            fingerprints as input. When None, the Tanimoto distance
            (1 - Tanimoto similarity) between the features is used. It is
            honored by every picking method, hierarchical ones included.
        seed: seed for reproducibility. Only forwarded to the `maxmin` picker.
        method: Picking method to use. One of `sphere`, `maxmin` or any
            supported rdkit hierarchical clustering method such as `centroid`,
            `clink`, `upgma`. Hierarchical methods require a non-zero `npick`.
        n_jobs: Number of jobs for parallelization (feature computation and,
            for hierarchical methods, pairwise distances). Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        picked_inds: numpy array with the indices of the molecules selected as
            centroids.
        mols: the molecules that have been picked.

    Raises:
        ValueError: If `method` is unknown, or if a hierarchical method is
            requested with `npick` equal to 0.
    """
    n_mols = len(mols)

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)
    features = dm.parallelized(feature_fn, mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    initial_picks = [] if initial_picks is None else initial_picks
    method_key = method.upper()

    if method == "maxmin":
        picker = MaxMinPicker()
        picked_inds, _ = picker.LazyPickWithThreshold(
            dist_fn,
            n_mols,
            pickSize=npick,
            threshold=threshold,
            firstPicks=initial_picks,
            seed=seed,
        )
    elif method == "sphere":
        picker = LeaderPicker()
        picked_inds = picker.LazyPick(
            dist_fn, n_mols, threshold=threshold, pickSize=npick, firstPicks=initial_picks
        )
    elif method_key in ClusterMethod.names and npick:
        if initial_picks:
            logger.warning(
                "Initial picks is not supported by hierarchical clustering. You pick has been discarded."
            )

        # Lower triangle of the distance matrix (diagonal excluded), computed
        # with the caller's metric and the requested level of parallelism.
        index_pairs = list(zip(*np.tril_indices(n_mols, k=-1)))
        dist_mat = dm.parallelized(dist_fn, index_pairs, arg_type="args", n_jobs=n_jobs)
        dist_mat = np.asarray(dist_mat)

        picker = HierarchicalClusterPicker(ClusterMethod.names[method_key])
        picked_inds = picker.Pick(dist_mat, n_mols, npick)
    else:
        raise ValueError(f"Picking method {method} with {npick} elements to pick is not supported.")

    picked_inds = np.array(picked_inds)
    picked_mols = [mols[x] for x in picked_inds]

    return picked_inds, picked_mols
assign_to_centroids(mols, centroids, feature_fn=None, dist_fn=None, n_jobs=1)
¶
Assign molecules to centroids. Each molecule will be assigned to the closest centroid.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
a list of molecules to assign to centroids |
required |
centroids |
List[rdkit.Chem.rdchem.Mol] |
list of molecules to use as centroid |
required |
feature_fn |
Callable |
A feature function that takes a Chem.rdchem.Mol object
and returns molecular features. By default, `dm.to_fp()` is used. Defaults to None. |
None |
dist_fn |
Callable |
A function that takes two indexes (i, j) and returns the distance between them. You might use partial to set the fingerprints as input. By default, the Tanimoto distance (1 - Tanimoto similarity) is used. Defaults to None. |
None |
n_jobs |
Optional[int] |
Number of jobs for parallelization. Leave at 1 for no parallelization. Set to None to use all available cores. |
1 |
Returns:
Type | Description |
---|---|
clusters_map |
Dict mapping each centroid index to the indices of the molecules assigned to it. clusters_list: list of the molecules in each cluster; the cluster index follows the index of the centroid. Note that the centroid molecule is not added to the cluster. |
Source code in datamol/cluster.py
def assign_to_centroids(
    mols: List[Chem.rdchem.Mol],
    centroids: List[Chem.rdchem.Mol],
    feature_fn: Callable = None,
    dist_fn: Callable = None,
    n_jobs: Optional[int] = 1,
):
    r"""Assign molecules to centroids. Each molecule will be assigned to the closest centroid.

    Args:
        mols: a list of molecules to assign to centroids.
        centroids: list of molecules to use as centroids. Must not be empty.
        feature_fn: A feature function that takes a Chem.rdchem.Mol object
            and returns molecular features. When None, `dm.to_fp()` (with
            `as_array=False`) is used.
        dist_fn: A function that takes two integer indexes (i, j) and returns
            the distance between them. Indexes refer to the concatenation
            `mols + centroids`: `i` is the position of a molecule in `mols` and
            `j` is `len(mols)` plus the position of a centroid in `centroids`.
            You might use partial to set the fingerprints as input. When None,
            the Tanimoto distance (1 - Tanimoto similarity) between the
            features is used.
        n_jobs: Number of jobs used to compute the features. Leave at 1 for no
            parallelization. Set to None to use all available cores.

    Returns:
        clusters_map: dict mapping each centroid index to the indices of the
            molecules assigned to it.
        clusters_list: list of all molecules in each cluster. The cluster
            index follows the index of the centroid. Note that the centroid
            molecule is not added to the cluster.

    Raises:
        ValueError: If `centroids` is empty.
    """
    if len(centroids) == 0:
        raise ValueError("At least one centroid is required to assign molecules.")

    if feature_fn is None:
        feature_fn = functools.partial(dm.to_fp, as_array=False)

    # Molecules first, centroids after: centroid `k` sits at `len(mols) + k`.
    all_mols = list(mols) + list(centroids)
    features = dm.parallelized(feature_fn, all_mols, n_jobs=n_jobs)

    def distij(i, j, features=features):
        return 1.0 - DataStructs.TanimotoSimilarity(features[i], features[j])

    if dist_fn is None:
        dist_fn = distij

    def index_metric(u, v):
        # `cdist` passes every "point" as a 1-element float array: unwrap it
        # into the plain integer index that `dist_fn` expects.
        return dist_fn(int(u[0]), int(v[0]))

    clusters_map = ddict(list)
    clusters_list = [[] for _ in centroids]

    query_inds = np.expand_dims(np.arange(len(mols), dtype=int), axis=1)
    centroid_inds = np.expand_dims(np.arange(len(centroids), dtype=int), axis=1) + len(mols)

    dist_mat = distance.cdist(query_inds, centroid_inds, metric=index_metric)
    closest = np.argmin(dist_mat, axis=1)

    for ind, cluster_ind in enumerate(closest):  # type: ignore
        clusters_map[cluster_ind].append(ind)
        clusters_list[cluster_ind].append(mols[ind])

    return clusters_map, clusters_list
Molecule as a graph¶
to_graph(mol)
¶
Convert a molecule to a networkx graph. A set of properties is added to every node and edge.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule. |
required |
Returns:
Type | Description |
---|---|
mol_graph (networkx.Graph) |
a graph representing the molecule. |
Source code in datamol/graph.py
def to_graph(mol: Chem.rdchem.Mol):
    """Build a networkx graph from a molecule.

    Each atom becomes a node keyed by its atom index and each bond becomes an
    edge between its begin and end atom indices.

    Args:
        mol (Chem.Mol): a molecule.

    Returns:
        mol_graph (networkx.Graph): a graph representing the molecule. Nodes
            carry the attributes `atomic_num`, `formal_charge`, `chiral_tag`,
            `hybridization`, `num_explicit_hs`, `implicit_valence`, `degree`,
            `symbol`, `ring_atom` and `is_aromatic`; edges carry `bond_type`.
    """
    nx = _get_networkx()
    graph = nx.Graph()

    for atom in mol.GetAtoms():
        node_attrs = {
            "atomic_num": atom.GetAtomicNum(),
            "formal_charge": atom.GetFormalCharge(),
            "chiral_tag": atom.GetChiralTag(),
            "hybridization": atom.GetHybridization(),
            "num_explicit_hs": atom.GetNumExplicitHs(),
            "implicit_valence": atom.GetImplicitValence(),
            "degree": atom.GetDegree(),
            "symbol": atom.GetSymbol(),
            "ring_atom": atom.IsInRing(),
            "is_aromatic": atom.GetIsAromatic(),
        }
        graph.add_node(atom.GetIdx(), **node_attrs)

    for bond in mol.GetBonds():
        begin_idx = bond.GetBeginAtomIdx()
        end_idx = bond.GetEndAtomIdx()
        graph.add_edge(begin_idx, end_idx, bond_type=bond.GetBondType())

    return graph
get_all_path_between(mol, atom_idx_1, atom_idx_2, ignore_cycle_basis=False)
¶
Get all simple paths between two atoms of a molecule.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
a molecule |
required |
atom_idx_1 |
int |
Atom index 1. |
required |
atom_idx_2 |
int |
Atom index 2. |
required |
ignore_cycle_basis |
bool |
Whether to ignore cycle basis. Defaults to False. |
False |
Returns:
Type | Description |
---|---|
list |
List of all the simple paths between the two atoms, each path being a list of atom indices. |
Source code in datamol/graph.py
def get_all_path_between(
    mol: Chem.rdchem.Mol,
    atom_idx_1: int,
    atom_idx_2: int,
    ignore_cycle_basis: bool = False,
):
    """Enumerate every simple path linking two atoms of a molecule.

    The molecule's adjacency matrix is turned into a networkx graph and the
    paths are enumerated with `networkx.all_simple_paths`.

    Args:
        mol (Chem.Mol): a molecule
        atom_idx_1 (int): Atom index 1 (start of the paths).
        atom_idx_2 (int): Atom index 2 (end of the paths).
        ignore_cycle_basis: When True, drop every path that contains all the
            atoms of at least one ring reported by `mol.GetRingInfo()`.
            Defaults to False.

    Returns:
        list: the paths found, each one being a list of atom indices.
    """
    nx = _get_networkx()

    graph = nx.Graph(Chem.rdmolops.GetAdjacencyMatrix(mol))
    candidates = nx.all_simple_paths(graph, source=atom_idx_1, target=atom_idx_2)

    if not ignore_cycle_basis:
        return list(candidates)

    ring_atom_sets = [set(ring) for ring in mol.GetRingInfo().AtomRings()]

    # Keep a path only if it does not go through every atom of some ring.
    return [
        candidate
        for candidate in candidates
        if not any(ring.issubset(candidate) for ring in ring_atom_sets)
    ]
Constants¶
PERIODIC_TABLE: None
¶
TRIPLE_BOND: None
¶
DOUBLE_BOND: None
¶
SINGLE_BOND: None
¶
AROMATIC_BOND: None
¶
Control RDKit logging¶
without_rdkit_log
¶
Context manager to disable RDKit logs. By default all logs are disabled.
Examples:
import datamol as dm
with dm.without_rdkit_log():
mol = dm.to_mol("CCCCO") # potential RDKit logs won't show
enable_rdkit_log()
¶
Enable all rdkit logs.
Source code in datamol/log.py
def enable_rdkit_log():
    """Turn on every RDKit log level listed in `RDLogger._levels`."""
    for level in RDLogger._levels:
        rdBase.EnableLog(level)
disable_rdkit_log()
¶
Disable all rdkit logs.
Source code in datamol/log.py
def disable_rdkit_log():
    """Turn off every RDKit log level listed in `RDLogger._levels`."""
    for level in RDLogger._levels:
        rdBase.DisableLog(level)
Toy dataset¶
freesolv()
¶
Return the FreeSolv dataset as a dataframe.
The dataset contains 642 molecules and the following columns:
['iupac', 'smiles', 'expt', 'calc']
.
Warning
This dataset is only meant to be used as a toy dataset for pedagogic and testing purposes. It is not a dataset for benchmarking, analysis or model training.
Source code in datamol/data.py
def freesolv():
    """Load the FreeSolv toy dataset as a pandas dataframe.

    The CSV file `data/freesolv.csv` shipped inside the `datamol` package is
    read. The dataset contains 642 molecules and the following columns:
    `['iupac', 'smiles', 'expt', 'calc']`.

    Warning:
        This dataset is only meant to be used as a toy dataset for pedagogic and
        testing purposes. **It is not** a dataset for benchmarking, analysis or
        model training.

    Returns:
        The content of the CSV file, as parsed by `pandas.read_csv`.
    """
    with pkg_resources.resource_stream("datamol", "data/freesolv.csv") as stream:
        return pd.read_csv(stream)