datamol.descriptors
¶
Various molecular descriptors¶
compute_many_descriptors(mol, properties_fn=None, add_properties=True)
¶
Compute a list of opinionated molecular properties.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mol |
Mol |
A molecule. |
required |
properties_fn |
Dict[str, Union[Callable, str]] |
A dict mapping property names to functions that compute properties. If None,
a default set of properties is used. If a value is a string,
`any_rdkit_descriptor()` is used to retrieve the descriptor function. |
None |
add_properties |
bool |
Whether to also compute the default properties in addition to those given in `properties_fn`. |
True |
Returns:
Type | Description |
---|---|
dict |
Computed properties as a dict. |
Source code in datamol/descriptors/descriptors.py
def compute_many_descriptors(
    mol: Mol,
    properties_fn: Dict[str, Union[Callable, str]] = None,
    add_properties: bool = True,
) -> dict:
    """Compute a set of opinionated molecular properties.

    Args:
        mol: A molecule.
        properties_fn: A dict mapping a property name to either a callable that
            takes the molecule and returns the property value, or a string.
            A string is resolved to a callable with `any_rdkit_descriptor()`.
            If None, only the default properties are computed. The dict passed
            in is never modified.
        add_properties: When `properties_fn` is given, whether to also compute
            the default properties. A default never replaces an entry of
            `properties_fn` that uses the same name.

    Returns:
        Computed properties as a dict keyed by property name.
    """
    if properties_fn is None:
        selected = _DEFAULT_PROPERTIES_FN
    else:
        # Merge into a copy: calling `setdefault` on the argument itself would
        # silently add the default entries to the caller's dict.
        selected = dict(properties_fn)
        if add_properties:
            for name, fn in _DEFAULT_PROPERTIES_FN.items():
                selected.setdefault(name, fn)

    props = {}
    for name, fn in selected.items():
        if isinstance(fn, str):
            # String entries name a descriptor to look up.
            fn = any_rdkit_descriptor(fn)
        props[name] = fn(mol)
    return props
batch_compute_many_descriptors(mols, properties_fn=None, add_properties=True, n_jobs=1, batch_size=None, progress=False, progress_leave=True)
¶
Compute a list of opinionated molecular properties on a list of molecules.
Parameters:
Name | Type | Description | Default |
---|---|---|---|
mols |
List[rdkit.Chem.rdchem.Mol] |
A list of molecules. |
required |
properties_fn |
Dict[str, Union[Callable, str]] |
A dict mapping property names to functions that compute properties. If None,
a default set of properties is used. If a value is a string,
`any_rdkit_descriptor()` is used to retrieve the descriptor function. |
None |
add_properties |
bool |
Whether to also compute the default properties in addition to those given in `properties_fn`. |
True |
Returns:
Type | Description |
---|---|
DataFrame |
A dataframe of computed properties with one row per input molecule. |
Source code in datamol/descriptors/descriptors.py
def batch_compute_many_descriptors(
    mols: List[Mol],
    properties_fn: Dict[str, Union[Callable, str]] = None,
    add_properties: bool = True,
    n_jobs: int = 1,
    batch_size: int = None,
    progress: bool = False,
    progress_leave: bool = True,
) -> pd.DataFrame:
    """Compute a set of opinionated molecular properties for many molecules.

    Each molecule is passed to `compute_many_descriptors()` through
    `parallelized()`, and the per-molecule results are assembled into a
    dataframe.

    Args:
        mols: A list of molecules.
        properties_fn: Forwarded unchanged to `compute_many_descriptors()`.
        add_properties: Forwarded unchanged to `compute_many_descriptors()`.
        n_jobs: Forwarded to `parallelized()` as `n_jobs`.
        batch_size: Forwarded to `parallelized()` as `batch_size`.
        progress: Forwarded to `parallelized()` as `progress`.
        progress_leave: Forwarded to `parallelized()` as the `leave` entry of
            `tqdm_kwargs`.

    Returns:
        A dataframe built from the per-molecule results.
    """
    # A partial (rather than a closure) carries the fixed keyword arguments
    # along with the per-molecule function.
    per_mol_fn = functools.partial(
        compute_many_descriptors,
        properties_fn=properties_fn,
        add_properties=add_properties,
    )
    tqdm_options = dict(leave=progress_leave)

    records = parallelized(
        per_mol_fn,
        mols,
        n_jobs=n_jobs,
        batch_size=batch_size,
        progress=progress,
        tqdm_kwargs=tqdm_options,
    )
    return pd.DataFrame(records)
mw
¶
CalcExactMolWt( (Mol)mol [, (bool)onlyHeavy=False]) -> float : returns the molecule's exact molecular weight
C++ signature :
double CalcExactMolWt(RDKit::ROMol [,bool=False])
fsp3
¶
CalcFractionCSP3( (Mol)mol) -> float : returns the fraction of C atoms that are SP3 hybridized
C++ signature :
double CalcFractionCSP3(RDKit::ROMol)
n_hba
¶
CalcNumHBA( (Mol)mol) -> int : returns the number of H-bond acceptors for a molecule
C++ signature :
unsigned int CalcNumHBA(RDKit::ROMol)
n_hbd
¶
CalcNumHBD( (Mol)mol) -> int : returns the number of H-bond donors for a molecule
C++ signature :
unsigned int CalcNumHBD(RDKit::ROMol)
n_rings
¶
CalcNumRings( (Mol)mol) -> int : returns the number of rings for a molecule
C++ signature :
unsigned int CalcNumRings(RDKit::ROMol)
n_hetero_atoms
¶
CalcNumHeteroatoms( (Mol)mol) -> int : returns the number of heteroatoms for a molecule
C++ signature :
unsigned int CalcNumHeteroatoms(RDKit::ROMol)
n_heavy_atoms(mol)
¶
Number of heavy atoms in a molecule.
Source code in rdkit/Chem/Lipinski.py
def HeavyAtomCount(mol):
    """Return the number of heavy atoms in a molecule."""
    heavy_atom_total = mol.GetNumHeavyAtoms()
    return heavy_atom_total
n_rotatable_bonds
¶
CalcNumRotatableBonds( (Mol)mol, (bool)strict) -> int : returns the number of rotatable bonds for a molecule. strict = NumRotatableBondsOptions.NonStrict - Simple rotatable bond definition. strict = NumRotatableBondsOptions.Strict - (default) does not count things like amide or ester bonds strict = NumRotatableBondsOptions.StrictLinkages - handles linkages between ring systems. - Single bonds between aliphatic ring Cs are always rotatable. This means that the central bond in CC1CCCC(C)C1-C1C(C)CCCC1C is now considered rotatable; it was not before - Heteroatoms in the linked rings no longer affect whether or not the linking bond is rotatable - the linking bond in systems like Cc1cccc(C)c1-c1c(C)cccc1 is now considered non-rotatable
C++ signature :
unsigned int CalcNumRotatableBonds(RDKit::ROMol,bool)
CalcNumRotatableBonds( (Mol)mol [, (NumRotatableBondsOptions)strict=rdkit.Chem.rdMolDescriptors.NumRotatableBondsOptions.Default]) -> int : returns the number of rotatable bonds for a molecule. strict = NumRotatableBondsOptions.NonStrict - Simple rotatable bond definition. strict = NumRotatableBondsOptions.Strict - (default) does not count things like amide or ester bonds strict = NumRotatableBondsOptions.StrictLinkages - handles linkages between ring systems. - Single bonds between aliphatic ring Cs are always rotatable. This means that the central bond in CC1CCCC(C)C1-C1C(C)CCCC1C is now considered rotatable; it was not before - Heteroatoms in the linked rings no longer affect whether or not the linking bond is rotatable - the linking bond in systems like Cc1cccc(C)c1-c1c(C)cccc1 is now considered non-rotatable
C++ signature :
unsigned int CalcNumRotatableBonds(RDKit::ROMol [,RDKit::Descriptors::NumRotatableBondsOptions=rdkit.Chem.rdMolDescriptors.NumRotatableBondsOptions.Default])
n_aliphatic_rings(x, y=<Boost.Python.function object at 0x5556694aacd0>)
¶
CalcNumAliphaticRings( (Mol)mol) -> int : returns the number of aliphatic (containing at least one non-aromatic bond) rings for a molecule
C++ signature :
unsigned int CalcNumAliphaticRings(RDKit::ROMol)
Source code in rdkit/Chem/Lipinski.py
# Wraps `_cfn` in a plain Python callable. `_cfn` is bound as the default of
# `y`, so the wrapper keeps calling the object `_cfn` referred to when this
# line ran, even if `_cfn` is reassigned afterwards.
# NOTE(review): per the rendered docs for this entry, `_cfn` is presumably
# CalcNumAliphaticRings here — confirm against the surrounding source.
_fn = lambda x, y=_cfn: y(x)
n_aromatic_rings(x, y=<Boost.Python.function object at 0x555669355a90>)
¶
CalcNumAromaticRings( (Mol)mol) -> int : returns the number of aromatic rings for a molecule
C++ signature :
unsigned int CalcNumAromaticRings(RDKit::ROMol)
Source code in rdkit/Chem/Lipinski.py
# Wraps `_cfn` in a plain Python callable. `_cfn` is bound as the default of
# `y`, so the wrapper keeps calling the object `_cfn` referred to when this
# line ran, even if `_cfn` is reassigned afterwards.
# NOTE(review): per the rendered docs for this entry, `_cfn` is presumably
# CalcNumAromaticRings here — confirm against the surrounding source.
_fn = lambda x, y=_cfn: y(x)
n_saturated_rings(x, y=<Boost.Python.function object at 0x5556693b0a20>)
¶
CalcNumSaturatedRings( (Mol)mol) -> int : returns the number of saturated rings for a molecule
C++ signature :
unsigned int CalcNumSaturatedRings(RDKit::ROMol)
Source code in rdkit/Chem/Lipinski.py
# Wraps `_cfn` in a plain Python callable. `_cfn` is bound as the default of
# `y`, so the wrapper keeps calling the object `_cfn` referred to when this
# line ran, even if `_cfn` is reassigned afterwards.
# NOTE(review): per the rendered docs for this entry, `_cfn` is presumably
# CalcNumSaturatedRings here — confirm against the surrounding source.
_fn = lambda x, y=_cfn: y(x)
n_radical_electrons(mol)
¶
The number of radical electrons the molecule has (says nothing about spin state)
>>> NumRadicalElectrons(Chem.MolFromSmiles('CC'))
0
>>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH3]'))
0
>>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH2]'))
1
>>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH]'))
2
>>> NumRadicalElectrons(Chem.MolFromSmiles('C[C]'))
3
Source code in rdkit/Chem/Descriptors.py
def NumRadicalElectrons(mol):
    """ Total number of radical electrons over all atoms of the molecule
    (says nothing about spin state)

    >>> NumRadicalElectrons(Chem.MolFromSmiles('CC'))
    0
    >>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH3]'))
    0
    >>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH2]'))
    1
    >>> NumRadicalElectrons(Chem.MolFromSmiles('C[CH]'))
    2
    >>> NumRadicalElectrons(Chem.MolFromSmiles('C[C]'))
    3

    """
    # Accumulate the per-atom counts; a molecule without atoms yields 0.
    radical_total = 0
    for current_atom in mol.GetAtoms():
        radical_total += current_atom.GetNumRadicalElectrons()
    return radical_total
tpsa
¶
CalcTPSA( (Mol)mol [, (bool)force=False [, (bool)includeSandP=False]]) -> float : returns the TPSA value for a molecule
C++ signature :
double CalcTPSA(RDKit::ROMol [,bool=False [,bool=False]])
qed(mol, w=QEDproperties(MW=0.66, ALOGP=0.46, HBA=0.05, HBD=0.61, PSA=0.06, ROTB=0.65, AROM=0.48, ALERTS=0.95), qedProperties=None)
¶
Calculate the weighted sum of ADS mapped properties
some examples from the QED paper, reference values from Peter G's original implementation
>>> m = Chem.MolFromSmiles('N=C(CCSCc1csc(N=C(N)N)n1)NS(N)(=O)=O')
>>> qed(m)
0.253...
>>> m = Chem.MolFromSmiles('CNC(=NCCSCc1nc[nH]c1C)NC#N')
>>> qed(m)
0.234...
>>> m = Chem.MolFromSmiles('CCCCCNC(=N)NN=Cc1c[nH]c2ccc(CO)cc12')
>>> qed(m)
0.234...
Source code in rdkit/Chem/QED.py
@setDescriptorVersion(version='1.1.0')
def qed(mol, w=WEIGHT_MEAN, qedProperties=None):
    """ Calculate the weighted sum of ADS mapped properties

    Every property is mapped through `ads` with its entry of `adsParameters`;
    the result is exp(sum(w_i * ln(d_i)) / sum(w_i)), i.e. a weighted
    geometric mean of the mapped values d_i. When `qedProperties` is None the
    properties are computed from `mol` with `properties(mol)`.

    some examples from the QED paper, reference values from Peter G's original implementation

    >>> m = Chem.MolFromSmiles('N=C(CCSCc1csc(N=C(N)N)n1)NS(N)(=O)=O')
    >>> qed(m)
    0.253...
    >>> m = Chem.MolFromSmiles('CNC(=NCCSCc1nc[nH]c1C)NC#N')
    >>> qed(m)
    0.234...
    >>> m = Chem.MolFromSmiles('CCCCCNC(=N)NN=Cc1c[nH]c2ccc(CO)cc12')
    >>> qed(m)
    0.234...

    """
    props = properties(mol) if qedProperties is None else qedProperties

    # One mapped value per property, in the field order of the properties tuple.
    mapped = []
    for name, value in props._asdict().items():
        mapped.append(ads(value, adsParameters[name]))

    # Weights and mapped values are paired positionally.
    weighted_log_total = sum(
        weight * math.log(desirability) for weight, desirability in zip(w, mapped)
    )
    weight_total = sum(w)
    return math.exp(weighted_log_total / weight_total)
clogp(*x, **y)
¶
Wildman-Crippen LogP value
Uses an atom-based scheme based on the values in the paper: S. A. Wildman and G. M. Crippen JCICS 39 868-873 (1999)
Arguments
- inMol: a molecule
- addHs: (optional) toggles adding of Hs to the molecule for the calculation. If true, hydrogens will be added to the molecule and used in the calculation.
Source code in rdkit/Chem/Crippen.py
# Wildman-Crippen LogP: forwards every positional and keyword argument to
# CalcCrippenDescriptors and keeps only element 0 of what it returns.
MolLogP = lambda *x, **y: rdMolDescriptors.CalcCrippenDescriptors(*x, **y)[0]
sas(m)
¶
Source code in SA_Score/sascorer.py
def calculateScore(m):
    """Compute the SA score (`sascore`) of a molecule.

    The raw score is the sum of three terms:

    * a fragment score: the count-weighted mean of the `_fscores` entries of
      the radius-2 Morgan fingerprint elements (-4 for unknown elements);
    * a features score: the negated sum of size, stereo-centre, spiro,
      bridgehead and macrocycle penalties;
    * a fingerprint-density correction, applied when the molecule has more
      atoms than distinct fingerprint elements.

    The raw value is then mapped linearly onto a 1-10 scale, smoothed above 8
    and clamped.

    Args:
        m: the molecule to score.

    Returns:
        The score as a float in the range [1.0, 10.0]. The density correction
        raises the raw score and so lowers the returned value, and its comment
        says it makes molecules "easier to synthetise", so lower values
        presumably mean easier synthesis.

    Raises:
        ZeroDivisionError: if the fingerprint of `m` has no non-zero elements.
    """
    if _fscores is None:
        readFragmentScores()

    # fragment score
    fp = rdMolDescriptors.GetMorganFingerprint(m,
                                               2)  # <- 2 is the *radius* of the circular fingerprint
    fps = fp.GetNonzeroElements()
    score1 = 0.
    nf = 0
    for bitId, v in fps.items():
        nf += v
        score1 += _fscores.get(bitId, -4) * v
    # NOTE: nf stays 0 when `fps` is empty, so this division raises.
    score1 /= nf

    # features score
    nAtoms = m.GetNumAtoms()
    nChiralCenters = len(Chem.FindMolChiralCenters(m, includeUnassigned=True))
    ri = m.GetRingInfo()
    nBridgeheads, nSpiro = numBridgeheadsAndSpiro(m, ri)
    # Only the presence of a ring with more than 8 atoms matters below.
    hasMacrocycle = any(len(ring) > 8 for ring in ri.AtomRings())

    sizePenalty = nAtoms**1.005 - nAtoms
    stereoPenalty = math.log10(nChiralCenters + 1)
    spiroPenalty = math.log10(nSpiro + 1)
    bridgePenalty = math.log10(nBridgeheads + 1)
    macrocyclePenalty = 0.
    # ---------------------------------------
    # This differs from the paper, which defines:
    #  macrocyclePenalty = math.log10(nMacrocycles+1)
    # This form generates better results when 2 or more macrocycles are present
    if hasMacrocycle:
        macrocyclePenalty = math.log10(2)

    score2 = 0. - sizePenalty - stereoPenalty - spiroPenalty - bridgePenalty - macrocyclePenalty

    # correction for the fingerprint density
    # not in the original publication, added in version 1.1
    # to make highly symmetrical molecules easier to synthetise
    score3 = 0.
    if nAtoms > len(fps):
        score3 = math.log(float(nAtoms) / len(fps)) * .5

    sascore = score1 + score2 + score3

    # need to transform "raw" value into scale between 1 and 10
    # (named rawMin/rawMax so the `min`/`max` builtins are not shadowed)
    rawMin = -4.0
    rawMax = 2.5
    sascore = 11. - (sascore - rawMin + 1) / (rawMax - rawMin) * 9.
    # smooth the 10-end
    if sascore > 8.:
        sascore = 8. + math.log(sascore + 1. - 9.)
    if sascore > 10.:
        sascore = 10.0
    elif sascore < 1.:
        sascore = 1.0

    return sascore