Skip to content

datamol.mol

add_hs(mol, explicit_only=False, add_coords=False, only_on_atoms=None, add_residue_info=False)

Adds hydrogens to the molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
explicit_only bool

whether to only add explicit hydrogens.

False
add_coords bool

whether to add 3D coordinates to the hydrogens.

False
only_on_atoms Optional[List[int]]

a list of atoms to add hydrogens only on.

None
add_residue_info bool

whether to add residue information to the hydrogens. Useful for PDB files.

False
Source code in datamol/mol.py
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
def add_hs(
    mol: Mol,
    explicit_only: bool = False,
    add_coords: bool = False,
    only_on_atoms: Optional[List[int]] = None,
    add_residue_info: bool = False,
):
    """Add hydrogens to a molecule.

    Thin wrapper that forwards every argument to `AddHs` under that
    function's own keyword names.

    Args:
        mol: a molecule.
        explicit_only: whether to only add explicit hydrogens.
        add_coords: whether to add 3D coordinates to the hydrogens.
        only_on_atoms: a list of atoms to add hydrogens only on.
        add_residue_info: whether to add residue information to the hydrogens.
            Useful for PDB files.

    Returns:
        The molecule returned by `AddHs`.
    """
    return AddHs(
        mol,
        explicitOnly=explicit_only,
        addCoords=add_coords,
        onlyOnAtoms=only_on_atoms,
        addResidueInfo=add_residue_info,
    )

adjust_singleton(mol)

Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and methane fragment will be removed in "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
Source code in datamol/mol.py
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
def adjust_singleton(mol: Mol) -> Optional[Mol]:
    """Drop every atom that is a disconnected singleton node of the molecular graph.

    An atom is considered a singleton when its explicit valence is 0.
    For example, the chlorine atom and methane fragment will be removed in
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the singleton atoms.
    """
    editable = RWMol(mol)

    # Highest index first, so that each removal leaves the indices still to be
    # removed untouched.
    singleton_indices = sorted(
        (atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0),
        reverse=True,
    )
    for atom_idx in singleton_indices:
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()

atom_indices_to_mol(mol, copy=False)

Add the molAtomMapNumber property to each atom.

Parameters:

Name Type Description Default
mol Mol

a molecule

required
copy bool

Whether to copy the molecule.

False
Source code in datamol/mol.py
803
804
805
806
807
808
809
810
811
812
813
814
815
816
def atom_indices_to_mol(mol: Mol, copy: bool = False):
    """Tag every atom with its own index through the `molAtomMapNumber` property.

    Args:
        mol: a molecule
        copy: when exactly `True`, the tagging is done on a copy made with
            `copy_mol`; otherwise the input molecule itself is modified.

    Returns:
        The tagged molecule (the copy or the input, depending on `copy`).
    """
    # Identity check kept on purpose: only the literal `True` triggers a copy.
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        index_label = str(atom.GetIdx())
        atom.SetProp("molAtomMapNumber", index_label)

    return target

atom_list_to_bond(mol, atom_indices, bond_as_idx=False)

Return a list of existing bond indices between a list of atom indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
atom_indices List[int]

A list of atom indices.

required
Source code in datamol/mol.py
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
def atom_list_to_bond(
    mol: Mol,
    atom_indices: List[int],
    bond_as_idx: bool = False,
):
    """Return the existing bonds whose two end atoms both belong to a list of
    atom indices.

    The bonds of `mol` are scanned directly. Atom indices must not be handed to
    a path-based helper that expects bond indices: doing so misses every bond
    whose own index does not also appear in `atom_indices`.

    Args:
        mol: A molecule.
        atom_indices: A list of atom indices.
        bond_as_idx: If True, return bond indices; otherwise return the bond
            objects of `mol`.

    Returns:
        A list of bonds (or bond indices), in the order given by `mol.GetBonds()`.
    """
    # A set gives O(1) membership tests and ignores duplicated indices.
    selected_atoms = set(atom_indices)

    bonds = []
    for bond in mol.GetBonds():
        if bond.GetBeginAtomIdx() in selected_atoms and bond.GetEndAtomIdx() in selected_atoms:
            bonds.append(bond.GetIdx() if bond_as_idx else bond)

    return bonds

clear_mol_props(mol, copy=True, include_private=False, include_computed=False)

Clear all properties from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
copy bool

Whether to copy the molecule.

True
Source code in datamol/mol.py
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
def clear_mol_props(
    mol: Mol,
    copy: bool = True,
    include_private: bool = False,
    include_computed: bool = False,
):
    """Remove the properties attached to a molecule.

    Args:
        mol: A molecule.
        copy: Whether to work on a copy of the molecule (made with `copy_mol`)
            instead of the input object.
        include_private: Forwarded to `GetPropsAsDict` as `includePrivate` to
            select which properties get cleared.
        include_computed: Forwarded to `GetPropsAsDict` as `includeComputed` to
            select which properties get cleared.

    Returns:
        The molecule whose selected properties were cleared.
    """
    target = copy_mol(mol) if copy else mol

    prop_names = target.GetPropsAsDict(
        includePrivate=include_private,
        includeComputed=include_computed,
    )
    for name in prop_names:
        target.ClearProp(name)

    return target

compute_ring_system(mol, include_spiro=True)

Compute the list of ring systems in a molecule. This is based on RDKit's cookbook: https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

Parameters:

Name Type Description Default
mol Mol

input molecule

required
include_spiro bool

whether to include spiro rings.

True

Returns:

Name Type Description
ring_system List[Set[int]]

list of ring system (atom indices).

Source code in datamol/mol.py
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
def compute_ring_system(mol: Mol, include_spiro: bool = True) -> List[Set[int]]:
    """Group the rings of a molecule into ring systems. This is based on RDKit's cookbook:
    https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

    Each ring is merged with every already known system it overlaps with.
    Sharing a single atom is enough when `include_spiro` is True; otherwise at
    least two shared atoms are required.

    Args:
        mol: input molecule
        include_spiro: whether to include spiro rings.

    Returns:
        ring_system: list of ring systems (sets of atom indices).
    """
    ring_systems = []

    for ring in mol.GetRingInfo().AtomRings():
        merged = set(ring)
        untouched = []

        for existing in ring_systems:
            n_shared = len(merged & existing)
            overlaps = n_shared > 1 or (include_spiro and n_shared > 0)
            if overlaps:
                merged = merged | existing
            else:
                untouched.append(existing)

        # The (possibly grown) current ring always goes last.
        ring_systems = untouched + [merged]

    return ring_systems

copy_mol(mol)

Copy a molecule and return a new one.

Parameters:

Name Type Description Default
mol Mol

a molecule to copy.

required
Source code in datamol/mol.py
57
58
59
60
61
62
63
def copy_mol(mol: Mol) -> Mol:
    """Return an independent deep copy of a molecule.

    Args:
        mol: a molecule to copy.

    Returns:
        The result of `copy.deepcopy` applied to `mol`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate

copy_mol_props(source, destination, include_private=False, include_computed=False)

Copy properties from one source molecule to another destination molecule.

Parameters:

Name Type Description Default
source Mol

a molecule to copy from.

required
destination Mol

a molecule to copy to.

required
include_private bool

Include private properties.

False
include_computed bool

Include computed properties.

False
Source code in datamol/mol.py
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
def copy_mol_props(
    source: Mol,
    destination: Mol,
    include_private: bool = False,
    include_computed: bool = False,
):
    """Transfer the properties of a source molecule onto a destination molecule.

    The properties are read with `GetPropsAsDict` and written with
    `set_mol_props`. Nothing is returned.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
        include_private: Include private properties.
        include_computed: Include computed properties.
    """
    set_mol_props(
        destination,
        source.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        ),
    )

decrease_bond(bond)

Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.

Parameters:

Name Type Description Default
bond Chem.rdchem.Bond

a bond.

required
Source code in datamol/mol.py
495
496
497
498
499
500
501
502
503
504
505
506
507
508
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Give the bond type obtained by taking one single bond away from a bond.
    Note that you should first kekulize your molecules and remove non-standard bond.

    Args:
        bond: a bond.

    Returns:
        `DOUBLE_BOND` for a triple bond, `SINGLE_BOND` for a double bond, `None`
        for a single bond, and the input bond itself for any other bond type.
    """
    current_type = bond.GetBondType()

    if current_type == TRIPLE_BOND:
        lowered = DOUBLE_BOND
    elif current_type == DOUBLE_BOND:
        lowered = SINGLE_BOND
    elif current_type == SINGLE_BOND:
        # Nothing is left once a single bond is taken away.
        lowered = None
    else:
        # Other bond types are handed back untouched.
        lowered = bond

    return lowered

fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)

Fix error in molecule using a greedy approach.

Parameters:

Name Type Description Default
mol Mol

input molecule to fix

required
n_iter int

Number of valence fix iteration to apply

1
remove_singleton bool

Whether adjust_singleton should be applied

False
largest_only bool

Whether only the largest fragment should be kept

False
inplace bool

Whether to return a copy of the mol or perform in place operation

False

Returns:

Type Description
Optional[Mol]

Fixed molecule.

Source code in datamol/mol.py
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
def fix_mol(
    mol: Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Mol]:
    """Fix error in molecule using a greedy approach.

    The molecule is sanitized (falling back to the unsanitized input when
    sanitization fails), its dummy atoms are removed, then `fix_valence` is
    applied `n_iter` times. Singleton removal and largest-fragment selection
    are optional last steps.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule, or None when the input is None or when a valence fix
        iteration fails.
    """

    if not inplace:
        mol = copy.copy(mol)

    m = sanitize_mol(mol) or mol  # fall back to mol when the fixer fails
    if m is None:
        return None

    m = remove_dummies(m)

    for _ in range(n_iter):
        m = fix_valence(m)
        if m is None:
            # `fix_valence` reports a failure with None; the steps below cannot
            # take None as input, so stop here.
            return None

    if remove_singleton:
        m = adjust_singleton(m)

    if largest_only:
        m = rdMolStandardize.FragmentParent(m, skipStandardize=True)

    return m

fix_valence(mol, inplace=False, allow_ring_break=False)

Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.

Parameters:

Name Type Description Default
mol Mol

input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy

False
allow_ring_break bool

Whether bond removal involving ring is allowed.

False

Returns:

Type Description
Optional[Mol]

Fixed potential valence issue in molecule or original molecule when nothing is broken

Optional[Mol]

or None if the fix failed.

Source code in datamol/mol.py
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
def fix_valence(mol: Mol, inplace: bool = False, allow_ring_break: bool = False) -> Optional[Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    The fix runs in two passes: hydrogens are first taken away from atoms with
    an incorrect valence, then the order of the bonds touching a still
    incorrect atom is lowered (or the bond is removed).

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The molecule (or its shallow copy when `inplace` is False) when the
        validation reports no issue, the fixed molecule when issues were found,
        or None if an exception was raised while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        # NOTE(review): the result of `remove_hs` is what gets edited and returned
        # below; presumably a new object, so `inplace=True` may not modify the
        # caller's molecule on this path. Confirm.
        m = remove_hs(
            mol,
            implicit_only=False,
            update_explicit_count=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # First pass: while an atom has an incorrect valence and still carries
        # hydrogens, drop one hydrogen and lower its formal charge by one
        # (the charge is never taken below 0).
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
            atom.UpdatePropertyCache(False)

        # Second pass: work on an editable molecule and only look at the bonds
        # having at least one end atom with an incorrect valence.
        em = RWMol(m)
        bonds = em.GetBonds()
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-check: an earlier bond edit may already have fixed these atoms.
            if incorrect_valence(a1) or incorrect_valence(a2):
                # `mbond` is the lowered bond type, or None for a single bond.
                # NOTE(review): for other bond types `decrease_bond` returns the
                # bond itself, which is then passed to `AddBond` below. Confirm
                # this is intended.
                mbond = decrease_bond(bond)
                # A bond is only removed outright (mbond is None) when it is not
                # in a ring, unless ring breaking is allowed.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Any failure during the fix is reported as None.
        return None

    return m

fix_valence_charge(mol, inplace=False)

Fix valence issues that are due to incorrect charges.

Parameters:

Name Type Description Default
mol Mol

Input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy.

False

Returns:

Type Description
Optional[Mol]

Fixed molecule via charge correction or original molecule if failed.

Source code in datamol/mol.py
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
def fix_valence_charge(mol: Mol, inplace: bool = False) -> Optional[Mol]:
    """Repair valence issues caused by incorrect formal charges.

    When the RDKit validation reports at least one issue, the formal charge of
    every atom is set to its total (implicit + explicit) valence minus the
    default valence of its element.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The input molecule when the validation reports nothing, otherwise the
        molecule (or its shallow copy when `inplace` is False) with its
        formal charges reassigned.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)
    for atom in mol.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return mol

incorrect_valence(a, update=False)

Check whether the valence of an atom, or of any atom of a molecule, is invalid.

Parameters:

Name Type Description Default
a Union[Mol, Chem.rdchem.Atom]

atom or molecule to check for valence issue.

required
update bool

Update owning molecule property cache first.

False

Returns:

Type Description
bool

Whether the input atom (or molecule) has a valence issue.

Source code in datamol/mol.py
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
def incorrect_valence(a: Union[Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Tell whether an atom, or a whole molecule, has a valence issue.

    A molecule is checked with the RDKit validation. An atom is flagged when it
    has no implicit valence and its explicit valence exceeds the largest
    valence listed for its element.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first (atoms only).

    Returns:
        True when the valence is incorrect, False otherwise.
    """
    if isinstance(a, Mol):
        a.UpdatePropertyCache(False)
        issues = rdMolStandardize.RDKitValidation().validate(a)
        return len(issues) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    # An atom that still has implicit valence is never flagged.
    if a.GetImplicitValence() != 0:
        return False

    max_allowed = max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))
    return a.GetExplicitValence() > max_allowed

is_transition_metal(at)

Check if atom is a transition metal.

Parameters:

Name Type Description Default
at Chem.rdchem.Atom

an atom.

required
Source code in datamol/mol.py
685
686
687
688
689
690
691
692
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Tell whether an atom is treated as a transition metal.

    The check is based on the atomic number, which must lie in one of the
    inclusive ranges 22-29, 40-47 or 72-79.

    Args:
        at: an atom.
    """
    atomic_num = at.GetAtomicNum()
    metal_ranges = ((22, 29), (40, 47), (72, 79))
    return any(low <= atomic_num <= high for low, high in metal_ranges)

keep_largest_fragment(mol)

Only keep largest fragment of each molecule.

Source code in datamol/mol.py
676
677
678
679
680
681
682
def keep_largest_fragment(mol: Mol) -> Optional[Mol]:
    """Return the fragment of the molecule that has the most atoms.

    The input molecule itself is returned when `GetMolFrags` yields no fragment.
    """
    fragments = GetMolFrags(mol, asMols=True)
    return max(fragments, key=lambda frag: frag.GetNumAtoms(), default=mol)

make_scaffold_generic(mol, include_bonds=False)

Make the atom in a scaffold or molecule generic.

Parameters:

Name Type Description Default
mol Mol

A molecule or a scaffold.

required
include_bonds bool

Whether we should also update bond order or keep as is.

False
Source code in datamol/mol.py
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
def make_scaffold_generic(mol: Mol, include_bonds: bool = False):
    """Turn the atoms of a scaffold or molecule into generic atoms.

    The input molecule is modified directly: every atom whose atomic number is
    not 1 gets atomic number 0, and the charge, chirality and explicit hydrogen
    information of all atoms is reset.

    Args:
        mol: A molecule or a scaffold.
        include_bonds: Whether we should also update bond order or keep as is.

    Returns:
        The same molecule object, once made generic.
    """

    for atom in mol.GetAtoms():
        is_hydrogen = atom.GetAtomicNum() == 1
        if not is_hydrogen:
            atom.SetAtomicNum(0)

        atom.SetFormalCharge(0)
        atom.SetChiralTag(rdchem.ChiralType.CHI_UNSPECIFIED)
        atom.SetNoImplicit(0)
        atom.SetNumExplicitHs(0)

    # Bonds are only visited when their type must be reset too.
    bonds_to_reset = mol.GetBonds() if include_bonds else ()
    for bond in bonds_to_reset:
        bond.SetBondType(UNSPECIFIED_BOND)

    mol.UpdatePropertyCache()
    Chem.GetSymmSSSR(mol)  # type: ignore

    return mol

protect_atoms(mol, substruct=None, atoms=None, in_place=False)

Protect a list of atoms or substruct in a molecule.

The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions where "protected" atoms are disallowed from taking part in reactions.

Parameters:

Name Type Description Default
mol Mol

input molecule to protect

required
substruct Optional[Mol]

optional substructure query to identify atoms to protect

None
atoms Optional[Union[List[int], int]]

optional list of atom indices to protect

None
in_place bool

whether to perform the protection in place or return a copy of the molecule

False
Source code in datamol/mol.py
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
def protect_atoms(
    mol: Mol,
    substruct: Optional[Mol] = None,
    atoms: Optional[Union[List[int], int]] = None,
    in_place: bool = False,
) -> Mol:
    """Protect a list of atoms or substruct in a molecule.

    The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions
    where "protected" atoms are disallowed from taking part in reactions.

    Args:
        mol: input molecule to protect
        substruct: optional substructure query to identify atoms to protect
        atoms: optional atom index, or list/tuple of atom indices, to protect.
            The given sequence is never modified. `None` entries are skipped.
        in_place: whether to perform the protection in place or return a copy of the molecule

    Returns:
        The protected molecule: `mol` itself when `in_place` is True, a copy otherwise.
    """
    # Always work on a private list: extending the caller's list would leak the
    # substructure matches back to the caller, and a tuple cannot be extended.
    if atoms is None:
        atom_indices = []
    elif isinstance(atoms, (tuple, list)):
        atom_indices = list(atoms)
    else:
        atom_indices = [atoms]

    # Only copy the molecule when the caller did not ask for in-place protection.
    target = mol if in_place else copy_mol(mol)

    if substruct is not None:
        matches = target.GetSubstructMatches(substruct)
        atom_indices.extend(itertools.chain.from_iterable(matches))

    for atom_idx in atom_indices:
        if atom_idx is None:
            continue
        target.GetAtomWithIdx(atom_idx).SetProp("_protected", "1")

    return target

randomize_atoms(mol)

Randomize the position of the atoms in a mol.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
def randomize_atoms(mol: Mol) -> Optional[Mol]:
    """Shuffle the order of the atoms of a molecule.

    Args:
        mol: a molecule.

    Returns:
        mol: the input molecule when it has no atom, otherwise the molecule
            returned by `RenumberAtoms` for a random atom order.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    shuffled_order = list(range(n_atoms))
    random.shuffle(shuffled_order)
    return RenumberAtoms(mol, shuffled_order)

remove_dummies(mol, dummy='*')

Remove dummy atoms from molecules.

Source code in datamol/mol.py
599
600
601
602
603
604
605
606
607
608
609
610
def remove_dummies(mol: Mol, dummy: str = "*") -> Optional[Mol]:
    """Strip the dummy atoms out of a molecule.

    The dummies are first replaced by hydrogens, which are then removed. If
    that fails for any reason, the dummies are deleted directly instead.

    Args:
        mol: a molecule.
        dummy: representation of the dummy atom, parsed with `to_mol`.
    """
    dummy_query = to_mol(dummy)

    try:
        with_hydrogens = ReplaceSubstructs(mol, dummy_query, to_mol("[H]"), True)[0]
        return remove_hs(with_hydrogens)
    except Exception:
        # Fallback: drop the dummy atoms without replacing them.
        return DeleteSubstructs(mol, dummy_query)

remove_hs(mol, implicit_only=False, update_explicit_count=False, sanitize=True)

Removes hydrogens from a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
implicit_only bool

whether to only remove implicit hydrogens.

False
update_explicit_count bool

whether to update the explicit hydrogen count.

False
sanitize bool

whether to sanitize the molecule after the hydrogens are removed.

True
Source code in datamol/mol.py
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
def remove_hs(
    mol: Mol,
    implicit_only: bool = False,
    update_explicit_count: bool = False,
    sanitize: bool = True,
):
    """Remove hydrogens from a molecule.

    Thin wrapper that forwards every argument to `RemoveHs` under that
    function's own keyword names.

    Args:
        mol: a molecule.
        implicit_only: whether to only remove implicit hydrogens.
        update_explicit_count: whether to update the explicit hydrogen count.
        sanitize: whether to sanitize the molecule after the hydrogens are removed.

    Returns:
        The molecule returned by `RemoveHs`.
    """
    return RemoveHs(
        mol,
        implicitOnly=implicit_only,
        updateExplicitCount=update_explicit_count,
        sanitize=sanitize,
    )

reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)

Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
break_ties bool

Force breaking of ranked ties.

True
include_chirality bool

Use chiral information when computing rank.

True
include_isotopes bool

Use isotope information when computing rank.

True

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
def reorder_atoms(
    mol: Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Mol]:
    """Renumber the atoms of a molecule following their canonical rank, so that
    the same molecule gets a single atom order regardless of its original representation.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: the input molecule when it has no atom, otherwise the renumbered molecule.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = CanonicalRankAtoms(
        mol,
        breakTies=break_ties,
        includeChirality=include_chirality,
        includeIsotopes=include_isotopes,
    )

    # Sort the atoms by rank; the atom index serves as the tie-breaker.
    rank_index_pairs = sorted((rank, atom_idx) for atom_idx, rank in enumerate(ranks))
    atom_order = [atom_idx for _, atom_idx in rank_index_pairs]
    return RenumberAtoms(mol, atom_order)

replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)

Replace dummy atoms in a molecule with another atom.

Parameters:

Name Type Description Default
mol Mol

molecule with dummies

required
atom str

replacement atom, default is carbon

'C'
dummy str

dummy atom representation

'*'
replace_all bool

Whether to replace all dummies

True

Returns:

Name Type Description
mol Optional[Mol]

Molecule with dummy replaced

Source code in datamol/mol.py
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
def replace_dummies_atoms(
    mol: Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Mol]:
    """Replace the dummy atoms of a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom, default is carbon
        dummy: dummy atom representation
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced (first result of `ReplaceSubstructs`).
    """
    dummy_query = to_mol(dummy)
    new_atom = to_mol(atom)
    candidates = ReplaceSubstructs(mol, dummy_query, new_atom, replaceAll=replace_all)
    return candidates[0]

same_mol(mol1, mol2, use_non_standard_inchikey=False)

Check two molecules are the same by comparing their InChiKey.

Invalid molecules (None) are always considered as not the same.

Parameters:

Name Type Description Default
mol1 Optional[Mol]

A molecule.

required
mol2 Optional[Mol]

A molecule.

required
use_non_standard_inchikey bool

Whether to use the standard or non-standard InChiKey.

False
Source code in datamol/mol.py
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
def same_mol(
    mol1: Optional[Mol],
    mol2: Optional[Mol],
    use_non_standard_inchikey: bool = False,
) -> bool:
    """Tell whether two molecules are the same, based on their InChiKey.

    Invalid molecules (None) are always considered as not the same.

    Args:
        mol1: A molecule.
        mol2: A molecule.
        use_non_standard_inchikey: Whether to use the standard or non-standard InChiKey.

    Returns:
        True when both molecules are valid and share the same key.
    """

    if mol1 is None or mol2 is None:
        return False

    # Pick the key function once, then apply it to both molecules.
    key_of = to_inchikey_non_standard if use_non_standard_inchikey else to_inchikey
    return key_of(mol1) == key_of(mol2)

sanitize_first(mols, charge_neutral=False, sanifix=True)

Sanitize a list of molecules and return the first valid molecule seen in the list.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True

Returns:

Name Type Description
mol Mol

a molecule.

Source code in datamol/mol.py
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
def sanitize_first(
    mols: List[Mol], charge_neutral: bool = False, sanifix: bool = True
) -> Optional[Mol]:
    """Sanitize a list of molecules and return the first valid molecule seen in the list.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that try to adjust aromatic nitrogens.

    Returns:
        mol: the first molecule that `sanitize_mol` could sanitize, or None
            when the list is empty or no molecule could be sanitized.
    """
    for candidate in mols:
        sanitized = sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        # `sanitize_mol` signals a failure with None.
        if sanitized is not None:
            return sanitized
    return None

sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)

An augmented version of RDKit sanitize=True. It uses a mol-SMILES-mol conversion to catch potential aromaticity errors and try to fix aromatic nitrogen (using the popular sanifix4 script). Optionally, it can neutralize the charge of the molecule.

Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.

Note #2: The molecule's properties will be preserved but the atom's properties will be lost.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True
verbose bool

Whether displaying a warning about multiple conformers.

True
add_hs bool

Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens.

False

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
def sanitize_mol(
    mol: Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Mol]:
    """An augmented version of RDKit `sanitize=True`. The molecule goes through
    a mol-SMILES-mol conversion to catch potential aromaticity errors, and
    aromatic nitrogens can be adjusted first (using the popular sanifix4 script).
    Optionally, the charge of the molecule can be neutralized.

    Note #1: Only the first conformer (if present) will be preserved and
    a warning will be displayed if more than one conformer is detected.

    Note #2: The molecule's properties will be preserved but the atom's
    properties will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that try to adjust aromatic nitrogens.
        verbose: Whether displaying a warning about multiple conformers.
        add_hs: Add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        mol: the sanitized molecule, or None when the input is None or when
            sanitization fails.
    """
    if mol is None:
        return mol

    # Save the properties from a copy, before any of the steps below runs.
    saved_props = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    if mol is None:
        return None

    # Detect multiple conformers
    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            "The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # Guard against occasional aromaticity errors
    try:
        # `cxsmiles` is used here to preserve the first conformer.
        round_trip_smiles = to_smiles(mol, cxsmiles=True)
        mol = to_mol(round_trip_smiles, sanitize=True, add_hs=add_hs)
    except Exception:
        return None

    if mol is None:
        return None

    # Put the saved properties back.
    return set_mol_props(mol, saved_props)

sanitize_smiles(smiles, isomeric=True)

Takes SMILES string and returns its sanitized version.

Parameters:

Name Type Description Default
smiles Optional[str]

smiles to be sanitized.

required
isomeric bool

Whether to include information about stereochemistry in the SMILES.

True

Returns:

Type Description
Optional[str]

sanitized smiles.

Source code in datamol/mol.py
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
def sanitize_smiles(smiles: Optional[str], isomeric: bool = True) -> Optional[str]:
    """Takes a SMILES string and returns its sanitized version.

    The SMILES is parsed without RDKit sanitization, passed through
    `sanitize_mol` and finally converted back to a SMILES string. This is a
    best-effort helper: any failure along the way yields `None` instead of
    an exception.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        sanitized smiles, or `None` when the input cannot be parsed, sanitized
        or converted back to SMILES.
    """

    try:
        mol = to_mol(smiles, sanitize=False)
        mol = sanitize_mol(mol, False)
    except Exception:
        return None

    if mol is None:
        return None

    try:
        return to_smiles(mol, isomeric=isomeric)
    except Exception:
        # Still best-effort, but unlike a bare `except:` this lets
        # `KeyboardInterrupt` and `SystemExit` propagate.
        return None

set_dative_bonds(mol, from_atoms=(7, 8))

Replaces some single bonds between metals and atoms with atomic numbers in from_atoms with dative bonds. The replacement is only done if the atom has "too many" bonds.

Parameters:

Name Type Description Default
mol Mol

molecule with bond to modify

required
from_atoms Tuple[int, int]

List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered.

(7, 8)

Returns:

Type Description
Optional[Mol]

The modified molecule.

Source code in datamol/mol.py
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
def set_dative_bonds(mol: Mol, from_atoms: Tuple[int, int] = (7, 8)) -> Optional[Mol]:
    """Turn over-valent single bonds to transition metals into dative bonds.

    For every transition metal atom, each neighbor whose atomic number or
    symbol is listed in `from_atoms` is inspected. When that neighbor's explicit
    valence exceeds its default valence and its bond to the metal is a single
    bond, the bond is removed and re-added as a dative bond going from the
    neighbor to the metal.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule, as an editable copy of the input (the input
        itself is left untouched).
    """
    editable = RWMol(mol)
    editable.UpdatePropertyCache(strict=False)

    # Collect the metals up front so the bond edits below do not interfere
    # with the atom iteration.
    transition_metals = [atom for atom in editable.GetAtoms() if is_transition_metal(atom)]

    for metal in transition_metals:
        metal_idx = metal.GetIdx()

        for neighbor in metal.GetNeighbors():
            # Only the requested elements are candidates for a dative bond.
            if not (
                neighbor.GetAtomicNum() in from_atoms or neighbor.GetSymbol() in from_atoms
            ):
                continue

            # Skip neighbors that do not carry "too many" bonds.
            if neighbor.GetExplicitValence() <= PERIODIC_TABLE.GetDefaultValence(
                neighbor.GetAtomicNum()
            ):
                continue

            neighbor_idx = neighbor.GetIdx()
            bond = editable.GetBondBetweenAtoms(neighbor_idx, metal_idx)
            if bond.GetBondType() != SINGLE_BOND:
                continue

            # Swap the single bond for a dative one (neighbor -> metal).
            editable.RemoveBond(neighbor_idx, metal_idx)
            editable.AddBond(neighbor_idx, metal_idx, DATIVE_BOND)

    return editable

set_mol_props(mol, props, copy=False)

Set properties to a mol from a dict.

Parameters:

Name Type Description Default
mol Mol

the mol where to copy the props.

required
props Dict[str, Any]

the props to copy.

required
copy bool

whether to copy the provided mol

False
Source code in datamol/mol.py
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
def set_mol_props(
    mol: Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Mol:
    """Write the entries of a dict onto a molecule as properties.

    The setter is chosen from the Python type of each value: `bool` goes to
    `SetBoolProp`, `int` to `SetIntProp` (falling back to `SetDoubleProp` on
    overflow), `float` to `SetDoubleProp`, and anything else is converted
    with `str()` and stored through `SetProp`.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to copy the provided mol. Only the literal `True`
            triggers the copy.

    Returns:
        The molecule carrying the properties: the input itself, or a copy of
        it when `copy` is `True`.
    """

    target = copy_mol(mol) if copy is True else mol

    for name, value in props.items():
        # `bool` is a subclass of `int`, so it must be tested first.
        if isinstance(value, bool):
            target.SetBoolProp(name, value)
            continue

        if isinstance(value, float):
            target.SetDoubleProp(name, value)
            continue

        if not isinstance(value, int):
            target.SetProp(name, str(value))
            continue

        # NOTE: Python integers are unbounded, and `SetIntProp` may raise
        # `OverflowError` on large values. Such values are silently stored
        # as doubles instead.
        try:
            target.SetIntProp(name, value)
        except OverflowError:
            target.SetDoubleProp(name, value)

    return target

standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)

This function returns a standardized version of the given molecule. It relies on the RDKit rdMolStandardize module, which is largely inspired by MolVS.

Parameters:

Name Type Description Default
mol Mol

A molecule to standardize.

required
disconnect_metals bool

Disconnect metals that are defined as covalently bonded to non-metal. Depending on the source of the database, some compounds may be reported in salt form or associated to metallic ions (e.g. the sodium salt of a carboxylic compound). In most cases, these counter-ions are not relevant so the use of this function is required before further utilization of the dataset. In summary the process is the following:

  • Break covalent bonds between metals and organic atoms under certain conditions.
  • First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
  • For every bond broken, adjust the charges of the begin and end atoms accordingly.
False
normalize bool

Applies a series of standard transformations to correct functional groups and recombine charges. It corrects drawing errors and standardizes functional groups in the molecule as well as ensuring the overall proper charge of the compound. It includes:

  • Uncharge-separate sulfones
  • Charge-separate nitro groups
  • Charge-separate pyridine oxide
  • Charge-separate azide
  • Charge-separate diazo and azo groups
  • Charge-separate sulfoxides
  • Hydrazine-diazonium system
True
reionize bool

If one or more acidic functionalities are present in the molecule, this option ensures the correct neutral/ionized state for such functional groups. Molecules are uncharged by adding and/or removing hydrogens. For zwitterions, hydrogens are moved to eliminate charges where possible. However, in cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge. The algorithm works as follows:

  • Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
  • If the ionized acid is weaker than the protonated acid, swap proton and repeat.
True
uncharge bool

This option neutralizes the molecule by reversing the protonation state of protonated and deprotonated groups, if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid). In cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge to ensure a net zero charge.

False
stereo bool

Stereochemical information is corrected and/or added if missing using built-in RDKit functionality to force a clean recalculation of stereochemistry (AssignStereochemistry).

True

Returns:

Name Type Description
mol Mol

A standardized molecule.

Source code in datamol/mol.py
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
def standardize_mol(
    mol: Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
) -> Mol:
    r"""
    Return a standardized copy of the given molecule. The input is never modified.

    The enabled steps run in a fixed order: metal disconnection, normalization,
    reionization, uncharging and stereochemistry assignment. The implementation relies on the
    RDKit [`rdMolStandardize` module](https://www.rdkit.org/docs/source/rdkit.Chem.MolStandardize.rdMolStandardize.html)
    which is largely inspired by [MolVS](https://github.com/mcs07/MolVS).

    Arguments:
        mol: A molecule to standardize.

        disconnect_metals: Disconnect metals that are defined as covalently bonded to non-metal.
            Depending on the source of the database, some compounds may be reported in salt form
            or associated to metallic ions (e.g. the sodium salt of a carboxylic compound).
            In most cases, these counter-ions are not relevant so the use of this function is required
            before further utilization of the dataset. In summary the process is the following:

            - Break covalent bonds between metals and organic atoms under certain conditions.
            - First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
            - For every bond broken, adjust the charges of the begin and end atoms accordingly.

        normalize: Applies a series of standard transformations to correct functional groups and recombine charges.
            It corrects drawing errors and standardizes functional groups in the molecule as well as ensuring the
            overall proper charge of the compound. It includes:

            - Uncharge-separate sulfones
            - Charge-separate nitro groups
            - Charge-separate pyridine oxide
            - Charge-separate azide
            - Charge-separate diazo and azo groups
            - Charge-separate sulfoxides
            - Hydrazine-diazonium system

        reionize: If one or more acidic functionalities are present in the molecule, this option ensures the correct
            neutral/ionized state for such functional groups. Molecules are uncharged by adding and/or removing hydrogens.
            For zwitterions, hydrogens are moved to eliminate charges where possible. However, in cases where there is a
            positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge.
            The algorithm works as follows:

            - Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
            - If the ionized acid is weaker than the protonated acid, swap proton and repeat.

        uncharge: This option neutralizes the molecule by reversing the protonation state of protonated and deprotonated groups,
            if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid).
            In cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the
            corresponding negative charge to ensure a net zero charge.

        stereo: Stereochemical information is corrected and/or added if missing using the built-in RDKit
            `AssignStereochemistry` function, called with `cleanIt=True` and `force=False`.

    Returns:
        mol: A standardized molecule.
    """
    # Work on a copy so the caller's molecule stays untouched.
    standardized = copy_mol(mol)

    if disconnect_metals:
        standardized = rdMolStandardize.MetalDisconnector().Disconnect(standardized)

    if normalize:
        standardized = rdMolStandardize.Normalize(standardized)

    if reionize:
        standardized = rdMolStandardize.Reionizer().reionize(standardized)

    if uncharge:
        standardized = rdMolStandardize.Uncharger().uncharge(standardized)

    if stereo:
        # Operates in place on the copy.
        AssignStereochemistry(standardized, force=False, cleanIt=True)

    return standardized

standardize_smiles(smiles, tautomer=False)

Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.

Parameters:

Name Type Description Default
smiles str

Smiles to standardize

required
tautomer bool

Whether to canonicalize tautomers

False

Returns:

Name Type Description
standard_smiles str

the standardized smiles

Source code in datamol/mol.py
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
def standardize_smiles(smiles: str, tautomer: bool = False) -> str:
    """Standardize a SMILES string.

    This is a convenience wrapper around the RDKit SMILES standardizer
    (`rdMolStandardize.StandardizeSmiles`), optionally followed by a
    tautomeric canonicalization of the standardized SMILES.

    Args:
        smiles: Smiles to standardize
        tautomer: Whether to canonicalize tautomers

    Returns:
        standard_smiles: the standardized smiles
    """

    standard_smiles = rdMolStandardize.StandardizeSmiles(smiles)

    if not tautomer:
        return standard_smiles

    return canonicalize_tautomer_smiles(standard_smiles)

strip_mol_to_core(mol, bond_cutter=None)

Strip a molecule to its core, i.e. remove all atoms not in the core. This method guesses the molecular core by finding the ring system.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
bond_cutter Mol

A molecule used to cut the bonds.

None
Source code in datamol/mol.py
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
def strip_mol_to_core(mol: Mol, bond_cutter: Mol = None):
    """Strip a molecule to its core, i.e. remove all atoms not in the core.

    The core is guessed from the ring system. The Murcko scaffold of the
    molecule is the default answer. If `bond_cutter` matches at least one bond,
    the molecule is instead fragmented on the matched bonds, the dummy atoms
    are removed, the Murcko scaffold of the fragments is taken and only the
    largest fragment is kept.

    Args:
        mol: A molecule.
        bond_cutter: A molecule used to cut the bonds. Each of its matches
            must be a pair of atoms joined by a bond in `mol`. When `None`,
            a built-in SMARTS pattern is used.

    Returns:
        The molecular core.
    """

    cutter = bond_cutter
    if cutter is None:
        cutter = from_smarts("[R;!$(*=,#[!#6])]!@!=!#[*;$([A;!R][A;!R])]")

    with without_rdkit_log():

        core = MurckoScaffold.GetScaffoldForMol(mol)

        # Translate every matched atom pair into the index of the bond joining them.
        bonds_to_cut = []
        for begin_idx, end_idx in mol.GetSubstructMatches(cutter):
            bonds_to_cut.append(mol.GetBondBetweenAtoms(begin_idx, end_idx).GetIdx())

        if bonds_to_cut:
            pieces = rdmolops.FragmentOnBonds(mol, bonds_to_cut)
            pieces = remove_dummies(pieces)
            pieces = to_scaffold_murcko(pieces)
            core = keep_largest_fragment(pieces)

    return core

substructure_matching_bonds(mol, query, **kwargs)

Perform a substructure match using GetSubstructMatches but instead of returning only the atom indices also return the bond indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
query Mol

A molecule used as a query to match against.

required
**kwargs Any

Any other arguments to pass to mol.GetSubstructMatches().

{}

Returns:

Name Type Description
atom_matches list

A list of lists of atom indices.

bond_matches list

A list of lists of bond indices.

Source code in datamol/mol.py
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
def substructure_matching_bonds(mol: Mol, query: Mol, **kwargs: Any) -> Tuple[list, list]:
    """Run a substructure search and report matched bonds as well as matched atoms.

    This wraps `GetSubstructMatches`: on top of the atom indices of each match,
    the indices of the bonds of `mol` that correspond to the bonds of `query`
    are returned.

    Args:
        mol: A molecule.
        query: A molecule used as a query to match against.
        **kwargs: Any other arguments to pass to `mol.GetSubstructMatches()`.
            `uniquify` defaults to `True` when not provided.

    Returns:
        atom_matches: A list of lists of atom indices.
        bond_matches: A list of lists of bond indices, one list per entry of
            `atom_matches` and in the same order.
    """

    # NOTE(hadim): If more substructure functions are added here, consider moving it to
    # a dedicated `substructure` module.

    # Default arguments.
    kwargs.setdefault("uniquify", True)

    atom_matches = list(mol.GetSubstructMatches(query, **kwargs))

    # Bonds of the query, each described by the indices of its two end atoms.
    query_bonds = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in query.GetBonds()]

    # Atom indices of the query, in iteration order (aligned with each match).
    query_atoms = [atom.GetIdx() for atom in query.GetAtoms()]

    bond_matches = []

    for match in atom_matches:

        # Query atom index -> index of the atom of `mol` it was matched to.
        query_to_mol = dict(zip(query_atoms, match))

        # Project every query bond onto `mol` and fetch the index of that bond.
        bond_matches.append(
            [
                mol.GetBondBetweenAtoms(query_to_mol[begin], query_to_mol[end]).GetIdx()
                for begin, end in query_bonds
            ]
        )

    return atom_matches, bond_matches

to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True)

Convert an input molecule (smiles representation) into a Mol.

Parameters:

Name Type Description Default
mol Union[str, Mol]

A SMILES or a molecule.

required
add_hs bool

Whether hydrogens should be added to the molecule.

False
explicit_only bool

Whether to add only explicit hydrogens or both (implicit and explicit) when add_hs is set to True.

False
ordered bool

Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation.

False
kekulize bool

Whether to perform kekulization of the input molecules.

False
sanitize bool

Whether to apply rdkit sanitization when input is a SMILES.

True

Returns:

Name Type Description
mol Optional[Mol]

the molecule if some conversion have been made. If the conversion fails

Optional[Mol]

None is returned so make sure that you handle this case on your own.

Source code in datamol/mol.py
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def to_mol(
    mol: Union[str, Mol],
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
) -> Optional[Mol]:
    """Convert an input molecule (SMILES representation) into a `Mol`.

    A string is parsed as a SMILES; a `Mol` is used as is. The optional
    post-processing steps are then applied in this order: hydrogen addition,
    atom reordering, kekulization.

    Args:
        mol: A SMILES or a molecule.
        add_hs: Whether hydrogens should be added to the molecule.
        explicit_only: Whether to add only explicit hydrogens or both
            (implicit and explicit) when `add_hs` is set to True.
        ordered: Whether the atom should be ordered. This option is
            important if you want to ensure that the features returned will always maintain
            a single atom order for the same molecule, regardless of its original SMILES representation.
        kekulize: Whether to perform kekulization of the input molecules.
        sanitize: Whether to apply rdkit sanitization when input is a SMILES.

    Returns:
        mol: the resulting molecule. If the conversion fails, None is returned
        so make sure that you handle this case on your own.

    Raises:
        ValueError: if the input is neither a string nor a `Mol`.
    """

    if not isinstance(mol, (str, Mol)):
        raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'")

    if isinstance(mol, str):
        converted = MolFromSmiles(mol, sanitize=sanitize)

        # Without sanitization the property cache is refreshed in non-strict mode.
        if converted is not None and not sanitize:
            converted.UpdatePropertyCache(False)
    else:
        converted = mol

    # Nothing to post-process when the parsing failed.
    if converted is None:
        return None

    if add_hs:
        converted = AddHs(converted, explicitOnly=explicit_only, addCoords=True)

    if ordered and converted is not None:
        converted = reorder_atoms(converted)

    if kekulize and converted is not None:
        # Operates in place.
        Kekulize(converted, clearAromaticFlags=False)

    return converted

to_neutral(mol)

Neutralize the charge of a molecule.

Parameters:

Name Type Description Default
mol Optional[Mol]

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
def to_neutral(mol: Optional[Mol]) -> Optional[Mol]:
    """Neutralize the charge of a molecule.

    The formal charge of an atom is reset to zero when it is negative, or when
    it is positive and the explicit valence of the atom is at least the default
    valence of its element. The atoms are edited in place and the very same
    molecule object is returned.

    Args:
        mol: a molecule.

    Returns:
        mol: a molecule (`None` when the input is `None`).
    """
    if mol is None:
        return mol

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()

        if charge >= 0:
            # Non-negative charges are only reset on atoms that are positively
            # charged and have reached their default valence.
            valence_reached = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (valence_reached and charge > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol

to_scaffold_murcko(mol, make_generic=False)

Extract the Murcko scaffold from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
make_generic bool

Whether to make the scaffold generic.

False
Source code in datamol/mol.py
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
def to_scaffold_murcko(mol: Mol, make_generic: bool = False):
    """Extract the Murcko scaffold from a molecule.

    Args:
        mol: A molecule.
        make_generic: Whether to make the scaffold generic. When set, the
            scaffold goes through `make_scaffold_generic` and then `to_mol`.

    Returns:
        The Murcko scaffold (or its generic version).
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)

    # NOTE(hadim): `GetScaffoldForMol` already does this,
    # so these two calls may not be needed here.
    scaffold.UpdatePropertyCache()
    Chem.GetSymmSSSR(scaffold)  # type: ignore

    if not make_generic:
        return scaffold

    return to_mol(make_scaffold_generic(scaffold))

unique_id(mol)

A datamol unique molecule ID.

The ID is an MD5 hash of the non-standard InChiKey provided by dm.to_inchikey_non_standard(). It guarantees uniqueness for different tautomeric forms of the same molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
Source code in datamol/mol.py
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
def unique_id(mol: Mol) -> Optional[str]:
    """A datamol unique molecule ID.

    The ID is the hexadecimal MD5 digest of the UTF-8 encoded non-standard
    InChiKey provided by `dm.to_inchikey_non_standard()`. It guarantees
    uniqueness for different tautomeric forms of the same molecule.

    Args:
        mol: A molecule.

    Returns:
        The ID as an hexadecimal string, or `None` when no InChiKey could be
        computed for the molecule.
    """
    inchikey = to_inchikey_non_standard(mol)

    # No key, no ID.
    if inchikey is None:
        return None

    digest = hashlib.md5(inchikey.encode("utf-8"))
    return digest.hexdigest()