Skip to content

datamol.mol

add_hs(mol, explicit_only=False, add_coords=False, only_on_atoms=None, add_residue_info=False)

Adds hydrogens to the molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
explicit_only bool

whether to only add explicit hydrogens.

False
add_coords bool

whether to add 3D coordinates to the hydrogens.

False
only_on_atoms Optional[List[int]]

a list of atoms to add hydrogens only on.

None
add_residue_info bool

whether to add residue information to the hydrogens. Useful for PDB files.

False
Source code in datamol/mol.py
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
def add_hs(
    mol: Mol,
    explicit_only: bool = False,
    add_coords: bool = False,
    only_on_atoms: Optional[List[int]] = None,
    add_residue_info: bool = False,
):
    """Add hydrogens to a molecule by delegating to `AddHs`.

    Args:
        mol: a molecule.
        explicit_only: whether to only add explicit hydrogens
            (forwarded as `explicitOnly`).
        add_coords: whether to add 3D coordinates to the hydrogens
            (forwarded as `addCoords`).
        only_on_atoms: a list of atoms to add hydrogens only on
            (forwarded as `onlyOnAtoms`).
        add_residue_info: whether to add residue information to the hydrogens.
            Useful for PDB files (forwarded as `addResidueInfo`).

    Returns:
        The molecule returned by `AddHs`.
    """
    options = dict(
        explicitOnly=explicit_only,
        addCoords=add_coords,
        onlyOnAtoms=only_on_atoms,
        addResidueInfo=add_residue_info,
    )
    return AddHs(mol, **options)

adjust_singleton(mol)

Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and the methane fragment will be removed from "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
Source code in datamol/mol.py
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
def adjust_singleton(mol: Mol) -> Optional[Mol]:
    """Drop every atom whose explicit valence is zero.

    Such atoms are essentially disconnected singleton nodes of the molecular
    graph. For example, in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C" the chlorine atom
    and the methane fragment are removed, but the ethane fragment is kept.

    Args:
        mol: a molecule.

    Returns:
        A new molecule built from an editable copy of `mol` without the
        singleton atoms.
    """
    editable = RWMol(mol)
    singleton_ids = [
        atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0
    ]

    # Remove from the highest index down; presumably deleting an atom shifts
    # the indices of the atoms that follow it.
    for idx in sorted(singleton_ids, reverse=True):
        editable.RemoveAtom(idx)

    return editable.GetMol()

atom_indices_to_mol(mol, copy=False)

Add the molAtomMapNumber property to each atom.

Parameters:

Name Type Description Default
mol Mol

a molecule

required
copy bool

Whether to copy the molecule.

False
Source code in datamol/mol.py
850
851
852
853
854
855
856
857
858
859
860
861
862
863
def atom_indices_to_mol(mol: Mol, copy: bool = False):
    """Store each atom's index in its `molAtomMapNumber` property.

    Args:
        mol: a molecule
        copy: Whether to copy the molecule. The copy is only made when the
            value is exactly `True`; otherwise `mol` is modified in place.

    Returns:
        The annotated molecule (the copy when one was made, else `mol`).
    """
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        # The property value is the index rendered as a string.
        atom.SetProp("molAtomMapNumber", str(atom.GetIdx()))

    return target

atom_list_to_bond(mol, atom_indices, bond_as_idx=False)

Return a list of existing bond indices between a list of atom indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
atom_indices List[int]

A list of atom indices.

required
Source code in datamol/mol.py
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
def atom_list_to_bond(
    mol: Mol,
    atom_indices: List[int],
    bond_as_idx: bool = False,
):
    """Return the existing bonds of `mol` whose two end atoms are both in
    `atom_indices`.

    Args:
        mol: A molecule.
        atom_indices: A list of atom indices.
        bond_as_idx: Whether to return the bond indices instead of the bond
            objects.

    Returns:
        A list of bonds (or of bond indices when `bond_as_idx` is true), in
        the order in which `mol.GetBonds()` yields them. The list is empty
        when no bond links two of the given atoms.
    """
    # Scan the bonds directly instead of going through `PathToSubmol`: that
    # helper expects a path made of bond indices, not atom indices, so feeding
    # it atom indices selects the wrong bonds (or fails on out-of-range ids).
    selected = set(atom_indices)

    bonds = []
    for bond in mol.GetBonds():
        if bond.GetBeginAtomIdx() in selected and bond.GetEndAtomIdx() in selected:
            bonds.append(bond.GetIdx() if bond_as_idx else bond)

    return bonds

clear_mol_props(mol, copy=True, include_private=False, include_computed=False)

Clear all properties from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
copy bool

Whether to copy the molecule.

True
Source code in datamol/mol.py
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
def clear_mol_props(
    mol: Mol,
    copy: bool = True,
    include_private: bool = False,
    include_computed: bool = False,
):
    """Remove properties from a molecule.

    The properties to clear are the ones listed by `GetPropsAsDict`.

    Args:
        mol: A molecule.
        copy: Whether to copy the molecule. When false, `mol` itself is
            modified.
        include_private: Forwarded to `GetPropsAsDict` as `includePrivate`.
        include_computed: Forwarded to `GetPropsAsDict` as `includeComputed`.

    Returns:
        The molecule whose properties were cleared.
    """
    target = copy_mol(mol) if copy else mol

    names = target.GetPropsAsDict(
        includePrivate=include_private,
        includeComputed=include_computed,
    )
    for name in names:
        target.ClearProp(name)

    return target

compute_ring_system(mol, include_spiro=True)

Compute the list of ring systems in a molecule. This is based on RDKit's cookbook: https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

Parameters:

Name Type Description Default
mol Mol

input molecule

required
include_spiro bool

whether to include spiro rings.

True

Returns:

Name Type Description
ring_system List[Set[int]]

list of ring system (atom indices).

Source code in datamol/mol.py
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
def compute_ring_system(mol: Mol, include_spiro: bool = True) -> List[Set[int]]:
    """Group the rings of a molecule into ring systems.

    Rings that share atoms are merged into a single set of atom indices.
    This is based on RDKit's cookbook:
    https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

    Args:
        mol: input molecule
        include_spiro: whether to include spiro rings. When true, one shared
            atom is enough to merge two rings; otherwise at least two shared
            atoms are required.

    Returns:
        ring_system: list of ring systems, each a set of atom indices.
    """
    min_shared = 1 if include_spiro else 2

    ring_systems = []
    for ring in mol.GetRingInfo().AtomRings():
        merged = set(ring)
        untouched = []
        for existing in ring_systems:
            # `merged` may already have absorbed earlier systems, so a ring can
            # bridge several previously separate systems.
            if len(merged & existing) >= min_shared:
                merged |= existing
            else:
                untouched.append(existing)
        untouched.append(merged)
        ring_systems = untouched

    return ring_systems

copy_mol(mol)

Copy a molecule and return a new one.

Parameters:

Name Type Description Default
mol Mol

a molecule to copy.

required
Source code in datamol/mol.py
58
59
60
61
62
63
64
def copy_mol(mol: Mol) -> Mol:
    """Return a deep copy of a molecule.

    Args:
        mol: a molecule to copy.

    Returns:
        A new object produced by `copy.deepcopy`; the input is left untouched.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate

copy_mol_props(source, destination, include_private=False, include_computed=False)

Copy properties from one source molecule to another destination molecule.

Parameters:

Name Type Description Default
source Mol

a molecule to copy from.

required
destination Mol

a molecule to copy to.

required
include_private bool

Include private properties.

False
include_computed bool

Include computed properties.

False
Source code in datamol/mol.py
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
def copy_mol_props(
    source: Mol,
    destination: Mol,
    include_private: bool = False,
    include_computed: bool = False,
):
    """Transfer the properties of `source` onto `destination`.

    The properties are read with `GetPropsAsDict` and written with
    `set_mol_props`. Nothing is returned.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
        include_private: Include private properties.
        include_computed: Include computed properties.
    """
    set_mol_props(
        destination,
        source.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        ),
    )

decrease_bond(bond)

Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bonds.

Parameters:

Name Type Description Default
bond Chem.rdchem.Bond

a bond.

required
Source code in datamol/mol.py
542
543
544
545
546
547
548
549
550
551
552
553
554
555
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Return the bond order one step below that of the input bond.

    Note that you should first kekulize your molecules and remove
    non-standard bonds.

    Args:
        bond: a bond.

    Returns:
        `DOUBLE_BOND` for a triple bond, `SINGLE_BOND` for a double bond,
        `None` for a single bond (nothing is left once it is removed), and
        the `bond` object itself for any other bond type.
    """
    order = bond.GetBondType()

    if order == TRIPLE_BOND:
        lowered = DOUBLE_BOND
    elif order == DOUBLE_BOND:
        lowered = SINGLE_BOND
    elif order == SINGLE_BOND:
        lowered = None
    else:
        # Not one of the three handled orders: hand the bond back unchanged.
        lowered = bond

    return lowered

fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)

Fix error in molecule using a greedy approach.

Parameters:

Name Type Description Default
mol Mol

input molecule to fix

required
n_iter int

Number of valence fix iteration to apply

1
remove_singleton bool

Whether adjust_singleton should be applied

False
largest_only bool

Whether only the largest fragment should be kept

False
inplace bool

Whether to return a copy of the mol or perform in place operation

False

Returns:

Type Description
Optional[Mol]

Fixed molecule.

Source code in datamol/mol.py
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
def fix_mol(
    mol: Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Mol]:
    """Fix error in molecule using a greedy approach.

    The molecule is sanitized (falling back to the input when sanitization
    yields nothing), its dummy atoms are removed, and `fix_valence` is applied
    `n_iter` times. Singleton removal and largest-fragment selection are then
    applied when requested.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule, or None when there is no molecule to work on or when
        one of the fixing steps fails.
    """

    if not inplace:
        mol = copy.copy(mol)

    # Fall back to the unsanitized molecule when `sanitize_mol` yields nothing.
    m = sanitize_mol(mol) or mol
    if m is None:
        return None

    m = remove_dummies(m)
    if m is None:
        return None

    for _ in range(n_iter):
        m = fix_valence(m)
        if m is None:
            # `fix_valence` reports a failure with None; stop here instead of
            # passing None to the steps below, which cannot handle it.
            return None

    if remove_singleton:
        m = adjust_singleton(m)

    if largest_only:
        m = rdMolStandardize.FragmentParent(m, skipStandardize=True)

    return m

fix_valence(mol, inplace=False, allow_ring_break=False)

Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.

Parameters:

Name Type Description Default
mol Mol

input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy

False
allow_ring_break bool

Whether bond removal involving ring is allowed.

False

Returns:

Type Description
Optional[Mol]

Fixed potential valence issue in molecule or original molecule when nothing is broken

Optional[Mol]

or None if the fix failed.

Source code in datamol/mol.py
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
def fix_valence(mol: Mol, inplace: bool = False, allow_ring_break: bool = False) -> Optional[Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    The fix runs in two passes: hydrogens (and formal charge) are first reduced
    on atoms flagged by `incorrect_valence`, then bonds that still touch a
    flagged atom are lowered by one order or removed.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The molecule as is (or its copy) when validation reports no issue, the
        modified molecule otherwise, or None if an exception was raised while
        fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        # Strip the hydrogens without sanitizing, keeping the explicit
        # hydrogen counts up to date on the remaining atoms.
        m = remove_hs(
            mol,
            implicit_only=False,
            update_explicit_count=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # first pass using explicit false count
        # While an atom is flagged and still carries hydrogens, drop one
        # hydrogen and lower its formal charge by one (never below zero).
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
            atom.UpdatePropertyCache(False)

        # Second pass: work on an editable molecule and keep only the bonds
        # that touch at least one atom still flagged as incorrect.
        em = RWMol(m)
        bonds = em.GetBonds()
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-check: an earlier edit in this loop may already have fixed
            # both end atoms.
            if incorrect_valence(a1) or incorrect_valence(a2):
                # `mbond` is the lowered bond order, or None for a single bond.
                mbond = decrease_bond(bond)
                # A single ring bond (mbond is None and the bond is in a ring)
                # is left alone unless ring breaking is allowed.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    # Re-create the bond with the lowered order when there is one.
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Any error raised while fixing is reported to the caller as None.
        return None

    return m

fix_valence_charge(mol, inplace=False)

Fix valence issues that are due to incorrect charges.

Parameters:

Name Type Description Default
mol Mol

Input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy.

False

Returns:

Type Description
Optional[Mol]

Fixed molecule via charge correction or original molecule if failed.

Source code in datamol/mol.py
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
def fix_valence_charge(mol: Mol, inplace: bool = False) -> Optional[Mol]:
    """Fix valence issues that are due to incorrect charges.

    When validation reports a problem, the formal charge of every atom is set
    to its implicit plus explicit valence minus the default valence of its
    element.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The input molecule untouched when validation reports no issue,
        otherwise the molecule (or its copy) with the recomputed charges.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)
    for atom in mol.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return mol

hash_mol(mol, hash_scheme='all')

Generate a unique hash code for a molecule based on chemistry. If two molecules are chemically “the same”, they should have the same hash.

Using molhash adds value beyond using SMILES because it:

  • Ignores SMILES features that are not chemically meaningful (e.g. atom map numbers).
  • Canonicalizes enhanced stereochemistry groups. For example C[C@H](O)CC |&1:1| and C[C@@H](O)CC |&1:1| have the same molhash.
  • Canonicalizes S group data (for example, polymer data).

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
hash_scheme str

There are three hash schemes: - "all": most strict hash scheme utilizing all layers. - "no_stereo": excludes stereo sensitive layers. - "no_tautomers": excludes tautomer sensitive layers.

'all'

Returns:

Type Description
str

The hash as a 40 chars string.

Source code in datamol/mol.py
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
def hash_mol(mol: Mol, hash_scheme: str = "all") -> str:
    """Generate a hash code for a molecule based on its chemistry. Two molecules that
    are chemically "the same" should get the same hash.

    Compared to SMILES, molhash:

    - Ignores SMILES features that are not chemically meaningful (e.g. atom map numbers).
    - Canonicalizes enhanced stereochemistry groups. For example `C[C@H](O)CC |&1:1|`
    and `C[C@@H](O)CC |&1:1|` have the same molhash.
    - Canonicalizes S group data (for example, polymer data).

    Args:
        mol: A molecule.
        hash_scheme: One of three hash schemes:
            - "all": most strict hash scheme utilizing all layers.
            - "no_stereo": excludes stereo sensitive layers.
            - "no_tautomers": excludes tautomer sensitive layers.

    Returns:
        The hash as a 40 chars string.

    Raises:
        NotImplementedError: when the installed RDKit is older than 2022.09.
        ValueError: when `hash_scheme` is not one of the supported names.
    """

    if is_lower_than_current_rdkit_version("2022.09"):
        raise NotImplementedError("`datamol.hash_mol() is only available with RDKit>=2022.09.")

    # Imported here because the module is only used past the version check above.
    from rdkit.Chem import RegistrationHash

    layers = RegistrationHash.GetMolLayers(mol)

    # Public scheme name -> attribute of `RegistrationHash.HashScheme`.
    scheme_attributes = (
        ("all", "ALL_LAYERS"),
        ("no_stereo", "STEREO_INSENSITIVE_LAYERS"),
        ("no_tautomers", "TAUTOMER_INSENSITIVE_LAYERS"),
    )
    for label, attribute in scheme_attributes:
        if hash_scheme == label:
            selected_scheme = getattr(RegistrationHash.HashScheme, attribute)
            break
    else:
        raise ValueError(
            f"`hash_scheme` is invalid. Please choose from: 'all', 'no_stereo' or 'no_tautomers'."
        )

    return RegistrationHash.GetMolHash(all_layers=layers, hash_scheme=selected_scheme)

incorrect_valence(a, update=False)

Check whether an atom, or any atom of a molecule, has an invalid valence.

Parameters:

Name Type Description Default
a Union[Mol, Chem.rdchem.Atom]

atom or molecule to check for valence issue.

required
update bool

Update owning molecule property cache first.

False

Returns:

Type Description
bool

Whether the input atom valence is incorrect.

Source code in datamol/mol.py
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
def incorrect_valence(a: Union[Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Tell whether an atom, or a whole molecule, has a valence problem.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first (only used when
            `a` is an atom).

    Returns:
        For a molecule, True when validation reports at least one issue.
        For an atom, True when its implicit valence is zero and its explicit
        valence exceeds the largest value of its element's valence list.
    """
    if isinstance(a, Mol):
        a.UpdatePropertyCache(False)
        issues = rdMolStandardize.RDKitValidation().validate(a)
        return len(issues) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    if a.GetImplicitValence() != 0:
        return False

    explicit_valence = a.GetExplicitValence()
    return explicit_valence > max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))

is_transition_metal(at)

Check if atom is a transition metal.

Parameters:

Name Type Description Default
at Chem.rdchem.Atom

an atom.

required
Source code in datamol/mol.py
732
733
734
735
736
737
738
739
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Check if atom is a transition metal.

    An atom qualifies when its atomic number falls in one of the inclusive
    ranges 22-29, 40-47 or 72-79.

    Args:
        at: an atom.

    Returns:
        True when the atomic number is in one of the ranges above.
    """
    atomic_num = at.GetAtomicNum()
    metal_ranges = ((22, 29), (40, 47), (72, 79))
    return any(low <= atomic_num <= high for low, high in metal_ranges)

keep_largest_fragment(mol)

Only keep largest fragment of each molecule.

Source code in datamol/mol.py
723
724
725
726
727
728
729
def keep_largest_fragment(mol: Mol) -> Optional[Mol]:
    """Only keep largest fragment of each molecule.

    Args:
        mol: a molecule.

    Returns:
        The fragment with the most atoms among those returned by
        `GetMolFrags`, or `mol` itself when there is no fragment.
    """
    fragments = GetMolFrags(mol, asMols=True)
    return max(fragments, key=lambda frag: frag.GetNumAtoms(), default=mol)

make_scaffold_generic(mol, include_bonds=False)

Make the atom in a scaffold or molecule generic.

Parameters:

Name Type Description Default
mol Mol

A molecule or a scaffold.

required
include_bonds bool

Whether we should also update bond order or keep as is.

False
Source code in datamol/mol.py
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
def make_scaffold_generic(mol: Mol, include_bonds: bool = False):
    """Turn the atoms (and optionally the bonds) of a scaffold or molecule generic.

    Every atom whose atomic number is not 1 gets atomic number 0. All atoms
    have their formal charge, chiral tag, no-implicit flag and explicit
    hydrogen count reset. The molecule is modified in place.

    Args:
        mol: A molecule or a scaffold.
        include_bonds: Whether we should also update bond order or keep as is.
            When true, every bond type is set to `UNSPECIFIED_BOND`.

    Returns:
        The same molecule object, after modification.
    """
    for atom in mol.GetAtoms():
        is_hydrogen = atom.GetAtomicNum() == 1
        if not is_hydrogen:
            atom.SetAtomicNum(0)

        atom.SetFormalCharge(0)
        atom.SetChiralTag(rdchem.ChiralType.CHI_UNSPECIFIED)
        atom.SetNoImplicit(0)
        atom.SetNumExplicitHs(0)

    if include_bonds:
        for generic_bond in mol.GetBonds():
            generic_bond.SetBondType(UNSPECIFIED_BOND)

    # Refresh the property cache and the ring information after the edits.
    mol.UpdatePropertyCache()
    Chem.GetSymmSSSR(mol)  # type: ignore

    return mol

protect_atoms(mol, substruct=None, atoms=None, in_place=False)

Protect a list of atoms or substruct in a molecule.

The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions where "protected" atoms are disallowed from taking part in reactions.

Parameters:

Name Type Description Default
mol Mol

input molecule to protect

required
substruct Optional[Mol]

optional substructure query to identify atoms to protect

None
atoms Optional[Union[List[int], int]]

optional list of atom indices to protect

None
in_place bool

whether to perform the protection in place or return a copy of the molecule

False
Source code in datamol/mol.py
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
def protect_atoms(
    mol: Mol,
    substruct: Optional[Mol] = None,
    atoms: Optional[Union[List[int], int]] = None,
    in_place: bool = False,
) -> Mol:
    """Protect a list of atoms or substruct in a molecule.

    The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions
    where "protected" atoms are disallowed from taking part in reactions.

    Args:
        mol: input molecule to protect
        substruct: optional substructure query to identify atoms to protect
        atoms: optional atom index, or list/tuple of atom indices, to protect.
            `None` entries are ignored. The sequence passed in is never modified.
        in_place: whether to perform the protection in place or return a copy of the molecule

    Returns:
        The protected molecule: `mol` itself when `in_place` is true, a copy otherwise.
    """
    if atoms is None:
        atom_ids = []
    elif isinstance(atoms, (tuple, list)):
        # Work on a private list: the caller's sequence must not grow when the
        # substructure matches are added, and tuples cannot be extended.
        atom_ids = list(atoms)
    else:
        atom_ids = [atoms]

    # Only copy when the caller did not ask for an in-place update.
    target = mol if in_place else copy_mol(mol)

    if substruct is not None:
        # Every atom of every match is protected.
        for match in target.GetSubstructMatches(substruct):
            atom_ids.extend(match)

    for idx in atom_ids:
        if idx is None:
            continue
        target.GetAtomWithIdx(idx).SetProp("_protected", "1")

    return target

randomize_atoms(mol)

Randomize the position of the atoms in a mol.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
def randomize_atoms(mol: Mol) -> Optional[Mol]:
    """Shuffle the order of the atoms of a molecule.

    Args:
        mol: a molecule.

    Returns:
        mol: the input itself when it has no atom, otherwise the result of
            `RenumberAtoms` applied with a randomly shuffled list of atom
            indices.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    permutation = list(range(n_atoms))
    random.shuffle(permutation)
    return RenumberAtoms(mol, permutation)

remove_dummies(mol, dummy='*')

Remove dummy atoms from molecules.

Source code in datamol/mol.py
646
647
648
649
650
651
652
653
654
655
656
657
def remove_dummies(mol: Mol, dummy: str = "*") -> Optional[Mol]:
    """Remove dummy atoms from molecules.

    The dummies are first replaced by hydrogens, which are then removed with
    `remove_hs`. If that raises, the dummy atoms are deleted from `mol`
    with `DeleteSubstructs` instead.

    Args:
        mol: a molecule.
        dummy: representation of the dummy atom, parsed with `to_mol`.

    Returns:
        The molecule without its dummy atoms.
    """
    dummy_query = to_mol(dummy)

    try:
        with_hydrogens = ReplaceSubstructs(mol, dummy_query, to_mol("[H]"), True)[0]
        return remove_hs(with_hydrogens)
    except Exception:
        return DeleteSubstructs(mol, dummy_query)

remove_hs(mol, implicit_only=False, update_explicit_count=False, sanitize=True)

Removes hydrogens from a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
implicit_only bool

whether to only remove implicit hydrogens.

False
update_explicit_count bool

whether to update the explicit hydrogen count.

False
sanitize bool

whether to sanitize the molecule after the hydrogens are removed.

True
Source code in datamol/mol.py
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
def remove_hs(
    mol: Mol,
    implicit_only: bool = False,
    update_explicit_count: bool = False,
    sanitize: bool = True,
):
    """Remove hydrogens from a molecule by delegating to `RemoveHs`.

    Args:
        mol: a molecule.
        implicit_only: whether to only remove implicit hydrogens
            (forwarded as `implicitOnly`).
        update_explicit_count: whether to update the explicit hydrogen count
            (forwarded as `updateExplicitCount`).
        sanitize: whether to sanitize the molecule after the hydrogens are removed.

    Returns:
        The molecule returned by `RemoveHs`.
    """
    options = dict(
        implicitOnly=implicit_only,
        updateExplicitCount=update_explicit_count,
        sanitize=sanitize,
    )
    return RemoveHs(mol, **options)

reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)

Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
break_ties bool

Force breaking of ranked ties.

True
include_chirality bool

Use chiral information when computing rank.

True
include_isotopes bool

Use isotope information when computing rank.

True

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
def reorder_atoms(
    mol: Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Mol]:
    """Renumber the atoms of a molecule following their canonical rank.

    It ensures a single atom order for the same molecule, regardless of its
    original representation.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the input itself when it has no atom, otherwise the renumbered
            molecule.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        )
    )

    # Atom indices sorted by rank; equal ranks keep ascending index order.
    ordering = sorted(range(len(ranks)), key=lambda idx: (ranks[idx], idx))
    return RenumberAtoms(mol, ordering)

replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)

Replace dummy atoms in a molecule with another atom.

Parameters:

Name Type Description Default
mol Mol

molecule with dummies

required
atom str

replacement atom, default is carbon

'C'
dummy str

dummy atom representation

'*'
replace_all bool

Whether to replace all dummies

True

Returns:

Name Type Description
mol Optional[Mol]

Molecule with dummy replaced

Source code in datamol/mol.py
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
def replace_dummies_atoms(
    mol: Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Mol]:
    """Replace the dummy atoms of a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom, default is carbon
        dummy: dummy atom representation
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced (the first result returned by
            `ReplaceSubstructs`).
    """
    dummy_query = to_mol(dummy)
    new_atom = to_mol(atom)
    candidates = ReplaceSubstructs(mol, dummy_query, new_atom, replaceAll=replace_all)
    return candidates[0]

same_mol(mol1, mol2, use_non_standard_inchikey=False)

Check two molecules are the same by comparing their InChiKey.

Invalid molecules (None) are always considered as not the same.

Parameters:

Name Type Description Default
mol1 Optional[Mol]

A molecule.

required
mol2 Optional[Mol]

A molecule.

required
use_non_standard_inchikey bool

Whether to use the standard or non-standard InChiKey.

False
Source code in datamol/mol.py
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
def same_mol(
    mol1: Optional[Mol],
    mol2: Optional[Mol],
    use_non_standard_inchikey: bool = False,
) -> bool:
    """Tell whether two molecules have the same InChiKey.

    Invalid molecules (None) are always considered as not the same.

    Args:
        mol1: A molecule.
        mol2: A molecule.
        use_non_standard_inchikey: Whether to use the standard or non-standard InChiKey.

    Returns:
        True when both molecules are not None and their keys are equal.
    """
    if mol1 is None or mol2 is None:
        return False

    # Pick the key function once, then compare both molecules with it.
    make_key = to_inchikey_non_standard if use_non_standard_inchikey else to_inchikey
    return make_key(mol1) == make_key(mol2)

sanitize_first(mols, charge_neutral=False, sanifix=True)

Sanitize a list of molecules and return the first valid molecule seen in the list.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True

Returns:

Name Type Description
mol Mol

a molecule.

Source code in datamol/mol.py
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
def sanitize_first(
    mols: List[Mol], charge_neutral: bool = False, sanifix: bool = True
) -> Optional[Mol]:
    """Sanitize molecules one by one and return the first one that is valid.

    Each molecule is passed to `sanitize_mol`; iteration stops at the first
    truthy result, so later molecules are not sanitized.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.

    Returns:
        mol: the first molecule that sanitizes successfully, or `None` when
            the list is empty or no molecule could be sanitized.
    """
    for mol in mols:
        sanitized = sanitize_mol(mol, charge_neutral=charge_neutral, sanifix=sanifix)
        if sanitized:
            return sanitized

    # Nothing in the list survived sanitization.
    return None

sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)

An augmented version of RDKit sanitize=True. It uses a mol-SMILES-mol conversion to catch potential aromaticity errors and try to fix aromatic nitrogen (using the popular sanifix4 script). Optionally, it can neutralize the charge of the molecule.

Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.

Note #2: The molecule's properties will be preserved but the atom's properties will be lost.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix script from James Davidson (sanifix4.py), which tries to adjust aromatic nitrogens.

True
verbose bool

Whether displaying a warning about multiple conformers.

True
add_hs bool

Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens.

False

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
def sanitize_mol(
    mol: Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Mol]:
    """Sanitize a molecule more thoroughly than RDKit `sanitize=True` alone.

    The molecule is round-tripped through a (CX)SMILES string so that
    aromaticity problems surface, after an optional attempt to repair
    aromatic nitrogens with the sanifix4 script. The charge can optionally
    be neutralized first.

    Note #1: Only the first conformer (if present) is kept; a warning is
    logged when the molecule holds more than one conformer.

    Note #2: Molecule-level properties are restored on the result, but
    atom-level properties are lost.

    Note #3: With `charge_neutral=True`, `to_neutral` is applied directly to
    the molecule passed in (it edits atom charges in place).

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that tries to adjust aromatic nitrogens.
        verbose: whether to log a warning about multiple conformers.
        add_hs: add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        mol: the sanitized molecule, or `None` when the input is `None` or
            the sanitization fails.
    """
    if mol is None:
        return mol

    # Snapshot the molecule-level properties (taken from a copy) so they can
    # be put back on the molecule rebuilt from SMILES.
    saved_props = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    # The fixing step may give up on the molecule.
    if mol is None:
        return None

    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            "The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # The round trip can raise on occasional aromaticity errors; treat that
    # as a failed sanitization.
    try:
        # CXSMILES is used so the first conformer survives the round trip.
        mol = to_mol(to_smiles(mol, cxsmiles=True), sanitize=True, add_hs=add_hs)
    except Exception:
        return None

    if mol is None:
        return None

    # Restore the properties captured at the start.
    return set_mol_props(mol, saved_props)

sanitize_smiles(smiles, isomeric=True)

Takes SMILES string and returns its sanitized version.

Parameters:

Name Type Description Default
smiles Optional[str]

smiles to be sanitized.

required
isomeric bool

Whether to include information about stereochemistry in the SMILES.

True

Returns:

Type Description
Optional[str]

sanitized smiles.

Source code in datamol/mol.py
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
def sanitize_smiles(smiles: Optional[str], isomeric: bool = True) -> Optional[str]:
    """Takes a SMILES string and returns its sanitized version.

    The SMILES is parsed without RDKit sanitization, passed through
    `sanitize_mol` (without charge neutralization) and written back as SMILES.
    Any failure along the way yields `None` instead of raising.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        sanitized smiles, or `None` if the input cannot be parsed, sanitized
        or converted back to SMILES.
    """
    try:
        mol = to_mol(smiles, sanitize=False)
        mol = sanitize_mol(mol, charge_neutral=False)
    except Exception:
        return None

    if mol is None:
        return None

    # `except Exception` rather than a bare `except`, so that KeyboardInterrupt
    # and SystemExit are not silently swallowed.
    try:
        return to_smiles(mol, isomeric=isomeric)
    except Exception:
        return None

set_dative_bonds(mol, from_atoms=(7, 8))

Replaces some single bonds between metals and atoms with atomic numbers in fromAtoms with dative bonds. The replacement is only done if the atom has "too many" bonds.

Parameters:

Name Type Description Default
mol Mol

molecule with bond to modify

required
from_atoms Tuple[int, int]

List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered.

(7, 8)

Returns:

Type Description
Optional[Mol]

The modified molecule.

Source code in datamol/mol.py
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
def set_dative_bonds(mol: Mol, from_atoms: Tuple[int, int] = (7, 8)) -> Optional[Mol]:
    """Turn selected single bonds between a transition metal and a donor atom
    into dative bonds.

    A bond is converted only when all of the following hold for the metal's
    neighbor: its atomic number or symbol is listed in `from_atoms`, its
    explicit valence exceeds the default valence of its element (it has
    "too many" bonds), and its bond to the metal is a single bond.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The editable molecule (`RWMol`) built from `mol`, with the bonds replaced.
    """
    editable = RWMol(mol)
    editable.UpdatePropertyCache(strict=False)

    for metal in [atom for atom in editable.GetAtoms() if is_transition_metal(atom)]:
        metal_idx = metal.GetIdx()

        for neighbor in metal.GetNeighbors():
            # Only the requested donor elements are candidates.
            is_donor = (
                neighbor.GetAtomicNum() in from_atoms or neighbor.GetSymbol() in from_atoms
            )
            if not is_donor:
                continue

            # The donor must carry more bonds than its default valence allows.
            default_valence = PERIODIC_TABLE.GetDefaultValence(neighbor.GetAtomicNum())
            if neighbor.GetExplicitValence() <= default_valence:
                continue

            # Only single bonds are converted.
            neighbor_idx = neighbor.GetIdx()
            bond = editable.GetBondBetweenAtoms(neighbor_idx, metal_idx)
            if bond.GetBondType() != SINGLE_BOND:
                continue

            # Re-create the bond as a dative bond pointing from donor to metal.
            editable.RemoveBond(neighbor_idx, metal_idx)
            editable.AddBond(neighbor_idx, metal_idx, DATIVE_BOND)

    return editable

set_mol_props(mol, props, copy=False)

Set properties to a mol from a dict.

Parameters:

Name Type Description Default
mol Mol

the mol where to copy the props.

required
props Dict[str, Any]

the props to copy.

required
copy bool

whether to copy the provided mol

False
Source code in datamol/mol.py
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
def set_mol_props(
    mol: Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Mol:
    """Write the entries of a dict onto a molecule as properties.

    The setter is chosen from the Python type of each value: `bool`, `int`
    and `float` values use the typed RDKit setters; everything else is
    stored as its `str()` representation.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to work on a copy of the provided mol instead of
            modifying it.

    Returns:
        The molecule carrying the properties (the copy when `copy` is True).
    """
    target = copy_mol(mol) if copy is True else mol

    for name, value in props.items():
        # `bool` is tested first because it is a subclass of `int`.
        if isinstance(value, bool):
            target.SetBoolProp(name, value)
            continue

        if isinstance(value, int):
            # Integers RDKit cannot store as an int property raise an
            # OverflowError; they are silently stored as a double instead.
            try:
                target.SetIntProp(name, value)
            except OverflowError:
                target.SetDoubleProp(name, value)
            continue

        if isinstance(value, float):
            target.SetDoubleProp(name, value)
            continue

        # Fallback: keep the textual representation.
        target.SetProp(name, str(value))

    return target

standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)

This function returns a standardized version of the given molecule. It relies on the RDKit rdMolStandardize module, which is largely inspired by MolVS.

Parameters:

Name Type Description Default
mol Mol

A molecule to standardize.

required
disconnect_metals bool

Disconnect metals that are defined as covalently bonded to non-metal. Depending on the source of the database, some compounds may be reported in salt form or associated to metallic ions (e.g. the sodium salt of a carboxylic compound). In most cases, these counter-ions are not relevant so the use of this function is required before further utilization of the dataset. In summary the process is the following:

  • Break covalent bonds between metals and organic atoms under certain conditions.
  • First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
  • For every bond broken, adjust the charges of the begin and end atoms accordingly.
False
normalize bool

Applies a series of standard transformations to correct functional groups and recombine charges. It corrects drawing errors and standardizes functional groups in the molecule as well as ensuring the overall proper charge of the compound. It includes:

  • Uncharge-separate sulfones
  • Charge-separate nitro groups
  • Charge-separate pyridine oxide
  • Charge-separate azide
  • Charge-separate diazo and azo groups
  • Charge-separate sulfoxides
  • Hydrazine-diazonium system
True
reionize bool

If one or more acidic functionalities are present in the molecule, this option ensures the correct neutral/ionized state for such functional groups. Molecules are uncharged by adding and/or removing hydrogens. For zwitterions, hydrogens are moved to eliminate charges where possible. However, in cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge. The algorithm works as follows:

  • Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
  • If the ionized acid is weaker than the protonated acid, swap proton and repeat.
True
uncharge bool

This option neutralizes the molecule by reversing the protonation state of protonated and deprotonated groups, if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid). In cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge to ensure a net zero charge.

False
stereo bool

Stereochemical information is corrected and/or added if missing using built-in RDKit functionality to force a clean recalculation of stereochemistry (AssignStereochemistry).

True

Returns:

Name Type Description
mol Mol

A standardized molecule.

Source code in datamol/mol.py
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
def standardize_mol(
    mol: Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
) -> Mol:
    r"""
    Return a standardized copy of the given molecule. The work is delegated to the
    RDKit [`rdMolStandardize` module](https://www.rdkit.org/docs/source/rdkit.Chem.MolStandardize.rdMolStandardize.html)
    which is largely inspired by [MolVS](https://github.com/mcs07/MolVS).

    The enabled steps always run in this order: metal disconnection, normalization,
    reionization, uncharging, stereochemistry assignment.

    Arguments:
        mol: A molecule to standardize.

        disconnect_metals: Disconnect metals that are drawn as covalently bonded to a non-metal.
            Depending on where a dataset comes from, compounds may be reported as salts or
            bound to metallic ions (e.g. the sodium salt of a carboxylic compound). Such
            counter-ions are usually irrelevant, so this step is needed before using the
            dataset further. In summary:

            - Break covalent bonds between metals and organic atoms under certain conditions.
            - First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
            - For every bond broken, adjust the charges of the begin and end atoms accordingly.

        normalize: Apply a series of standard transformations that correct functional groups and recombine charges.
            This fixes drawing errors, standardizes functional groups and ensures the overall
            proper charge of the compound. It includes:

            - Uncharge-separate sulfones
            - Charge-separate nitro groups
            - Charge-separate pyridine oxide
            - Charge-separate azide
            - Charge-separate diazo and azo groups
            - Charge-separate sulfoxides
            - Hydrazine-diazonium system

        reionize: When the molecule holds one or more acidic functionalities, make sure they are in the correct
            neutral/ionized state. Molecules are uncharged by adding and/or removing hydrogens.
            For zwitterions, hydrogens are moved to eliminate charges where possible. When a positive
            charge cannot be neutralized, an attempt is made to also keep the corresponding negative charge.
            The algorithm works as follows:

            - Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
            - If the ionized acid is weaker than the protonated acid, swap proton and repeat.

        uncharge: Neutralize the molecule by reversing the protonation state of protonated and deprotonated groups,
            if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid).
            When a positive charge cannot be neutralized, an attempt is made to also keep the
            corresponding negative charge to ensure a net zero charge.

        stereo: Clean and assign the stereochemical information with RDKit's `AssignStereochemistry`
            (called with `cleanIt=True` and `force=False`).

    Returns:
        mol: A standardized molecule.
    """
    # Work on a copy of the input molecule.
    standardized = copy_mol(mol)

    if disconnect_metals:
        standardized = rdMolStandardize.MetalDisconnector().Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        # Operates in place on the molecule.
        AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized

standardize_smiles(smiles, tautomer=False)

Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomer canonicalization.

Parameters:

Name Type Description Default
smiles str

Smiles to standardize

required
tautomer bool

Whether to canonicalize tautomers

False

Returns:

Name Type Description
standard_smiles str

the standardized smiles

Source code in datamol/mol.py
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
def standardize_smiles(smiles: str, tautomer: bool = False) -> str:
    r"""
    Apply the SMILES standardization procedure. This is a convenience function wrapped
    around the RDKit SMILES standardizer and the tautomer canonicalization.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """
    standard_smiles = rdMolStandardize.StandardizeSmiles(smiles)

    if not tautomer:
        return standard_smiles

    # Optional second pass: pick the canonical tautomer.
    return canonicalize_tautomer_smiles(standard_smiles)

strip_mol_to_core(mol, bond_cutter=None)

Strip a molecule to its core, i.e. remove all atoms not in the core. This method guesses the molecular core by finding the ring system.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
bond_cutter Mol

A molecule used to cut the bonds.

None
Source code in datamol/mol.py
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
def strip_mol_to_core(mol: Mol, bond_cutter: Optional[Mol] = None):
    """Strip a molecule to its core, i.e. remove all atoms not in the core.
    This method guesses the molecular core by finding the ring system.

    The bonds matched by `bond_cutter` are cut, the resulting dummy atoms are
    removed, the Murcko scaffold of the fragments is taken and its largest
    fragment is returned. When `bond_cutter` matches nothing, the Murcko
    scaffold of the whole molecule is returned instead.

    Args:
        mol: A molecule.
        bond_cutter: A query molecule whose matches are the pairs of atoms
            delimiting the bonds to cut. When `None`, a default SMARTS
            pattern is used.

    Returns:
        The core of the molecule.
    """

    if bond_cutter is None:
        bond_cutter = from_smarts("[R;!$(*=,#[!#6])]!@!=!#[*;$([A;!R][A;!R])]")

    with without_rdkit_log():

        # Each match is a pair of atom indices delimiting one bond to cut.
        matches = mol.GetSubstructMatches(bond_cutter)
        bond_inds = [mol.GetBondBetweenAtoms(i, j).GetIdx() for i, j in matches]

        if not bond_inds:
            # Nothing to cut: the core is the Murcko scaffold of the molecule.
            # It is only computed here, where it is actually needed.
            return MurckoScaffold.GetScaffoldForMol(mol)

        fragmented = rdmolops.FragmentOnBonds(mol, bond_inds)
        fragmented = remove_dummies(fragmented)
        fragmented = to_scaffold_murcko(fragmented)
        return keep_largest_fragment(fragmented)

substructure_matching_bonds(mol, query, **kwargs)

Perform a substructure match using GetSubstructMatches but instead of returning only the atom indices also return the bond indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
query Mol

A molecule used as a query to match against.

required
**kwargs Any

Any other arguments to pass to mol.GetSubstructMatches().

{}

Returns:

Name Type Description
atom_matches list

A list of lists of atom indices.

bond_matches list

A list of lists of bond indices.

Source code in datamol/mol.py
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
def substructure_matching_bonds(mol: Mol, query: Mol, **kwargs: Any) -> Tuple[list, list]:
    """Run a substructure search with `GetSubstructMatches` and report, for
    every match, the matched bond indices in addition to the atom indices.

    Args:
        mol: A molecule.
        query: A molecule used as a query to match against.
        **kwargs: Any other arguments to pass to `mol.GetSubstructMatches()`.
            `uniquify` defaults to True when not provided.

    Returns:
        atom_matches: A list with, for each match, the atom indices in `mol`.
        bond_matches: A list with, for each match, the bond indices in `mol`.
    """

    # NOTE(hadim): If more substructure functions are added here, consider moving it to
    # a dedicated `substructure` module.

    kwargs.setdefault("uniquify", True)

    atom_matches = list(mol.GetSubstructMatches(query, **kwargs))

    # Describe the query once: its atom indices and the atom pair of each bond.
    query_atoms = [atom.GetIdx() for atom in query.GetAtoms()]
    query_bonds = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in query.GetBonds()]

    bond_matches = []
    for match in atom_matches:
        # Query atom index -> index of the atom of `mol` it was matched to.
        query_to_mol = dict(zip(query_atoms, match))

        # Translate each query bond to atom pairs of `mol`...
        mol_atom_pairs = [(query_to_mol[begin], query_to_mol[end]) for begin, end in query_bonds]

        # ...then to the index of the bond joining each pair.
        bond_matches.append(
            [mol.GetBondBetweenAtoms(begin, end).GetIdx() for begin, end in mol_atom_pairs]
        )

    return atom_matches, bond_matches

to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)

Convert an input molecule (smiles representation) into a Mol.

Parameters:

Name Type Description Default
mol Union[str, Mol]

A SMILES or a molecule.

required
add_hs bool

Whether hydrogens should be added to the molecule.

False
explicit_only bool

Whether to add only explicit hydrogens or both implicit and explicit ones when add_hs is set to True.

False
ordered bool

Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation.

False
kekulize bool

Whether to perform kekulization of the input molecules.

False
sanitize bool

Whether to apply rdkit sanitization when input is a SMILES.

True

Returns:

Name Type Description
mol Optional[Mol]

the molecule if some conversion have been made. If the conversion fails

Optional[Mol]

None is returned so make sure that you handle this case on your own.

Source code in datamol/mol.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
def to_mol(
    mol: Union[str, Mol],
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Mol]:
    """Build a `Mol` from a SMILES string, or post-process an existing `Mol`.

    Args:
        mol: A SMILES or a molecule.
        add_hs: Whether hydrogens should be added to the molecule.
        explicit_only: When `add_hs` is True, whether to add only explicit
            hydrogens instead of both implicit and explicit ones.
        ordered: Whether the atoms should be reordered. This matters when the
            features computed later must always follow a single atom order for
            the same molecule, regardless of its original SMILES representation.
        kekulize: Whether to perform kekulization of the input molecule.
        sanitize: Whether to apply rdkit sanitization when input is a SMILES.

    Returns:
        mol: the resulting molecule, or `None` when the conversion fails, so
            make sure that you handle this case on your own.

    Raises:
        ValueError: when the input is neither a string nor a `Mol`.
    """

    if isinstance(mol, str):
        result = MolFromSmiles(mol, sanitize=sanitize)

        # Without sanitization the property cache is still refreshed
        # (non-strict mode).
        if result is not None and not sanitize:
            result.UpdatePropertyCache(False)
    elif isinstance(mol, Mol):
        result = mol
    else:
        raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'")

    # Parsing failed: nothing to post-process.
    if result is None:
        return None

    if add_hs:
        result = AddHs(result, explicitOnly=explicit_only, addCoords=True)

    if ordered and result is not None:
        result = reorder_atoms(result)

    if kekulize and result is not None:
        # In place; aromatic flags are kept.
        Kekulize(result, clearAromaticFlags=False)

    return result

to_neutral(mol)

Neutralize the charge of a molecule.

Parameters:

Name Type Description Default
mol Optional[Mol]

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
def to_neutral(mol: Optional[Mol]) -> Optional[Mol]:
    """Neutralize the charge of a molecule.

    The formal charge of an atom is reset to zero when it is negative, or
    when it is positive and the explicit valence of the atom has reached the
    default valence of its element. The atoms are edited in place.

    Args:
        mol: a molecule.

    Returns:
        mol: the same molecule object, or `None` when `None` was given.
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()

        if charge >= 0:
            # Non-negative charge: only a positively charged atom whose
            # valence is already filled gets neutralized.
            valence_filled = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (valence_filled and charge > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol

to_scaffold_murcko(mol, make_generic=False)

Extract the Murcko scaffold from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
make_generic bool

Whether to make the scaffold generic.

False
Source code in datamol/mol.py
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
def to_scaffold_murcko(mol: Mol, make_generic: bool = False):
    """Compute the Murcko scaffold of a molecule.

    Args:
        mol: A molecule.
        make_generic: Whether to make the scaffold generic.

    Returns:
        The scaffold, passed through `make_scaffold_generic` and `to_mol`
        when `make_generic` is True.
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)

    # NOTE(hadim): `GetScaffoldForMol` already does this;
    # not sure it is needed here.
    scaffold.UpdatePropertyCache()
    Chem.GetSymmSSSR(scaffold)  # type: ignore

    if not make_generic:
        return scaffold

    return to_mol(make_scaffold_generic(scaffold))

unique_id(mol)

A datamol unique molecule ID.

The ID is an MD5 hash of the non-standard InChiKey provided by dm.to_inchikey_non_standard(). It guarantees uniqueness for different tautomeric forms of the same molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
Source code in datamol/mol.py
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
def unique_id(mol: Mol) -> Optional[str]:
    """Compute the datamol unique ID of a molecule.

    The ID is the hexadecimal MD5 digest of the non-standard InChiKey given
    by `dm.to_inchikey_non_standard()`. The non-standard key is used so that
    different tautomeric forms of the same molecule are meant to receive
    different IDs.

    Args:
        mol: A molecule.

    Returns:
        The ID, or `None` when no InChiKey could be computed.
    """
    inchikey = to_inchikey_non_standard(mol)

    if inchikey is None:
        return None

    digest = hashlib.md5(inchikey.encode("utf-8"))
    return digest.hexdigest()