Skip to content

datamol.mol

add_hs(mol, explicit_only=False, add_coords=False, only_on_atoms=None, add_residue_info=False)

Adds hydrogens to the molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
explicit_only bool

whether to only add explicit hydrogens.

False
add_coords bool

whether to add 3D coordinates to the hydrogens.

False
only_on_atoms Optional[List[int]]

a list of atoms to add hydrogens only on.

None
add_residue_info bool

whether to add residue information to the hydrogens. Useful for PDB files.

False
Source code in datamol/mol.py
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
def add_hs(
    mol: Mol,
    explicit_only: bool = False,
    add_coords: bool = False,
    only_on_atoms: Optional[List[int]] = None,
    add_residue_info: bool = False,
):
    """Return the result of adding hydrogens to a molecule.

    This is a thin wrapper that forwards every option to `rdmolops.AddHs`.

    Args:
        mol: a molecule.
        explicit_only: whether to only add explicit hydrogens.
        add_coords: whether to add 3D coordinates to the hydrogens.
        only_on_atoms: a list of atoms to add hydrogens only on.
        add_residue_info: whether to add residue information to the hydrogens.
            Useful for PDB files.

    Returns:
        The molecule returned by `rdmolops.AddHs`.
    """
    # Translate the snake_case arguments to the RDKit keyword names.
    rdkit_options = dict(
        explicitOnly=explicit_only,
        addCoords=add_coords,
        onlyOnAtoms=only_on_atoms,
        addResidueInfo=add_residue_info,
    )
    return rdmolops.AddHs(mol, **rdkit_options)

adjust_singleton(mol)

Remove all atoms that are essentially disconnected singleton nodes in the molecular graph. For example, the chlorine atom and the methane fragment will be removed from "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but not the ethane fragment.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
Source code in datamol/mol.py
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
def adjust_singleton(mol: Mol) -> Optional[Mol]:
    """Drop every atom that is a disconnected singleton node of the molecular graph.

    An atom is treated as a singleton when its explicit valence is 0.
    For example, the chlorine atom and the methane fragment are removed from
    "Cl.[N:1]1=CC(O)=CC2CCCCC12.CC.C", but the ethane fragment is kept.

    Args:
        mol: a molecule.

    Returns:
        A new molecule without the singleton atoms.
    """
    editable = rdchem.RWMol(mol)

    # Remove from the highest index down so that the remaining indices stay valid.
    singleton_indices = sorted(
        (atom.GetIdx() for atom in mol.GetAtoms() if atom.GetExplicitValence() == 0),
        reverse=True,
    )
    for atom_idx in singleton_indices:
        editable.RemoveAtom(atom_idx)

    return editable.GetMol()

atom_indices_to_mol(mol, copy=True)

Add the molAtomMapNumber property to each atom.

Parameters:

Name Type Description Default
mol Mol

a molecule

required
copy bool

Whether to copy the molecule.

True
Source code in datamol/mol.py
911
912
913
914
915
916
917
918
919
920
921
922
923
924
def atom_indices_to_mol(mol: Mol, copy: bool = True):
    """Store each atom's index in its `molAtomMapNumber` property.

    Args:
        mol: a molecule
        copy: Whether to copy the molecule. The copy is only made when this
            is exactly `True`; otherwise the input molecule is modified.

    Returns:
        The annotated molecule (the copy, or the input molecule itself).
    """
    target = copy_mol(mol) if copy is True else mol

    for atom in target.GetAtoms():
        # The property is stored as a string.
        atom.SetProp("molAtomMapNumber", str(atom.GetIdx()))

    return target

atom_list_to_bond(mol, atom_indices, bond_as_idx=False)

Return a list of the existing bonds (as bond objects, or as bond indices when bond_as_idx is True) between the atoms of a list of atom indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
atom_indices List[int]

A list of atom indices.

required
Source code in datamol/mol.py
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
def atom_list_to_bond(
    mol: Mol,
    atom_indices: List[int],
    bond_as_idx: bool = False,
):
    """Return the existing bonds of `mol` whose two end atoms are both in
    `atom_indices`.

    Args:
        mol: A molecule.
        atom_indices: A list of atom indices.
        bond_as_idx: If `True`, return the bond indices (`Bond.GetIdx()`).
            Otherwise return the bond objects of `mol`.

    Returns:
        A list of bond indices or bond objects, depending on `bond_as_idx`.
    """

    # Build an atom map (original atom index -> submol atom index).
    # NOTE(review): RDKit documents the `path` argument of `PathToSubmol` as
    # bond indices; confirm that passing atom indices here is intended.
    atom_map = {}
    submol = rdmolops.PathToSubmol(mol, atom_indices, useQuery=True, atomMap=atom_map)
    atom_map_reversed = {v: k for k, v in atom_map.items()}

    # Hoisted so each membership test is O(1) instead of scanning the list.
    selected_atoms = set(atom_indices)

    bonds = []

    for bond in submol.GetBonds():
        # Translate the submol atom indices back to indices in `mol`.
        ori_a1 = atom_map_reversed[bond.GetBeginAtomIdx()]
        ori_a2 = atom_map_reversed[bond.GetEndAtomIdx()]

        if ori_a1 in selected_atoms and ori_a2 in selected_atoms:
            ori_bond = mol.GetBondBetweenAtoms(ori_a1, ori_a2)
            bonds.append(ori_bond.GetIdx() if bond_as_idx else ori_bond)

    return bonds

clear_atom_map_number(mol, copy=True)

Clear the molAtomMapNumber property of the atom's molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
copy bool

Whether to copy the molecule.

True
Source code in datamol/mol.py
901
902
903
904
905
906
907
908
def clear_atom_map_number(mol: Mol, copy: bool = True):
    """Remove the `molAtomMapNumber` property from the atoms of a molecule.

    Delegates to `clear_atom_props` restricted to that single property.

    Args:
        mol: A molecule.
        copy: Whether to copy the molecule.

    Returns:
        The molecule returned by `clear_atom_props`.
    """
    atom_map_property = "molAtomMapNumber"
    return clear_atom_props(mol, property_keys=atom_map_property, copy=copy)

clear_atom_props(mol, property_keys=None, copy=True, include_private=False, include_computed=False)

Clear atom properties from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
property_keys Optional[Union[List[str], str]]

If set, only the given properties will be cleared. If not set, all the properties are cleared.

None
copy bool

Whether to copy the molecule.

True
include_private bool

Whether to also clean the private properties.

False
include_computed bool

Whether to also clean the computed properties.

False
Source code in datamol/mol.py
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
def clear_atom_props(
    mol: Mol,
    property_keys: Optional[Union[List[str], str]] = None,
    copy: bool = True,
    include_private: bool = False,
    include_computed: bool = False,
) -> Mol:
    """Clear atom properties from a molecule.

    Args:
        mol: A molecule.
        property_keys: If set, only the given properties will be cleared.
            If not set, all the properties of each atom are cleared.
        copy: Whether to copy the molecule.
        include_private: Whether to also clean the private properties.
            Only used when `property_keys` is not set.
        include_computed: Whether to also clean the computed properties.
            Only used when `property_keys` is not set.

    Returns:
        The molecule with the atom properties cleared (a copy when `copy`
        is truthy, otherwise the input molecule itself).
    """

    if copy:
        mol = copy_mol(mol)

    if isinstance(property_keys, str):
        property_keys = [property_keys]

    for atom in mol.GetAtoms():
        if property_keys is None:
            # Collect the keys for *this* atom. Reusing the keys of the first
            # atom would leave properties that only exist on later atoms.
            props = atom.GetPropsAsDict(
                includePrivate=include_private, includeComputed=include_computed
            )
            keys_to_clear = list(props)
        else:
            keys_to_clear = property_keys

        for key in keys_to_clear:
            atom.ClearProp(key)

    return mol

clear_mol_props(mol, property_keys=None, copy=True, include_private=False, include_computed=False)

Clear properties from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
property_keys Optional[Union[List[str], str]]

If set, only the given properties will be cleared. If not set, all the properties are cleared.

None
copy bool

Whether to copy the molecule.

True
include_private bool

Whether to also clean the private properties.

False
include_computed bool

Whether to also clean the computed properties.

False
Source code in datamol/mol.py
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
def clear_mol_props(
    mol: Mol,
    property_keys: Optional[Union[List[str], str]] = None,
    copy: bool = True,
    include_private: bool = False,
    include_computed: bool = False,
) -> Mol:
    """Remove molecule-level properties.

    Args:
        mol: A molecule.
        property_keys: A single property name or a list of names to clear.
            If not set, all the properties are cleared.
        copy: Whether to copy the molecule.
        include_private: Whether to also clean the private properties.
            Only used when `property_keys` is not set.
        include_computed: Whether to also clean the computed properties.
            Only used when `property_keys` is not set.

    Returns:
        The molecule with the properties cleared (a copy when `copy` is
        truthy, otherwise the input molecule itself).
    """
    target = copy_mol(mol) if copy else mol

    if isinstance(property_keys, str):
        keys = [property_keys]
    elif property_keys is None:
        # No explicit selection: clear everything the molecule reports.
        all_props = target.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        )
        keys = list(all_props.keys())
    else:
        keys = property_keys

    for name in keys:
        target.ClearProp(name)

    return target

compute_ring_system(mol, include_spiro=True)

Compute the list of ring system in a molecule. This is based on RDKit's cookbook: https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

Parameters:

Name Type Description Default
mol Mol

input molecule

required
include_spiro bool

whether to include spiro rings.

True

Returns:

Name Type Description
ring_system List[Set[int]]

list of ring system (atom indices).

Source code in datamol/mol.py
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
def compute_ring_system(mol: Mol, include_spiro: bool = True) -> List[Set[int]]:
    """Group the rings of a molecule into ring systems.

    Rings sharing atoms are merged into a single system. This is based on
    RDKit's cookbook:
    https://www.rdkit.org/docs/Cookbook.html#rings-aromaticity-and-kekulization

    Args:
        mol: input molecule
        include_spiro: whether rings sharing exactly one atom (spiro rings)
            are merged as well. When `False`, at least two shared atoms are
            required to merge.

    Returns:
        ring_system: list of ring system (atom indices).
    """
    ring_systems: List[Set[int]] = []

    for ring in mol.GetRingInfo().AtomRings():
        merged = set(ring)
        untouched = []

        for existing in ring_systems:
            n_shared = len(merged & existing)
            if n_shared > 0 and (include_spiro or n_shared > 1):
                # Absorb the overlapping system into the growing one.
                merged = merged | existing
            else:
                untouched.append(existing)

        # The (possibly enlarged) current ring always goes last.
        ring_systems = untouched + [merged]

    return ring_systems

copy_mol(mol)

Copy a molecule and return a new one.

Parameters:

Name Type Description Default
mol Mol

a molecule to copy.

required
Source code in datamol/mol.py
51
52
53
54
55
56
57
def copy_mol(mol: Mol) -> Mol:
    """Return an independent deep copy of a molecule.

    Args:
        mol: a molecule to copy.

    Returns:
        A new object produced by `copy.deepcopy`.
    """
    duplicate = copy.deepcopy(mol)
    return duplicate

copy_mol_props(source, destination, include_private=False, include_computed=False)

Copy properties from one source molecule to another destination molecule.

Parameters:

Name Type Description Default
source Mol

a molecule to copy from.

required
destination Mol

a molecule to copy to.

required
include_private bool

Include private properties.

False
include_computed bool

Include computed properties.

False
Source code in datamol/mol.py
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
def copy_mol_props(
    source: Mol,
    destination: Mol,
    include_private: bool = False,
    include_computed: bool = False,
):
    """Transfer the properties of a source molecule onto a destination molecule.

    The properties are read with `source.GetPropsAsDict` and written with
    `set_mol_props`. Nothing is returned.

    Args:
        source: a molecule to copy from.
        destination: a molecule to copy to.
        include_private: Include private properties.
        include_computed: Include computed properties.
    """
    set_mol_props(
        destination,
        source.GetPropsAsDict(
            includePrivate=include_private,
            includeComputed=include_computed,
        ),
    )

decrease_bond(bond)

Remove one single bond from the input bond. Note that you should first kekulize your molecules and remove non-standard bond.

Parameters:

Name Type Description Default
bond Bond

a bond.

required
Source code in datamol/mol.py
546
547
548
549
550
551
552
553
554
555
556
557
558
559
def decrease_bond(bond: Chem.rdchem.Bond) -> Optional[Union[list, Chem.rdchem.Bond]]:
    """Lower the order of a bond by one. Note that you should first kekulize
    your molecules and remove non-standard bonds.

    Args:
        bond: a bond.

    Returns:
        `DOUBLE_BOND` for a triple bond, `SINGLE_BOND` for a double bond and
        `None` for a single bond. For any other bond type the input bond
        object itself is returned.
    """
    bond_type = bond.GetBondType()

    if bond_type == TRIPLE_BOND:
        lowered = DOUBLE_BOND
    elif bond_type == DOUBLE_BOND:
        lowered = SINGLE_BOND
    elif bond_type == SINGLE_BOND:
        # Nothing is left once a single bond is removed.
        lowered = None
    else:
        lowered = bond

    return lowered

fix_mol(mol, n_iter=1, remove_singleton=False, largest_only=False, inplace=False)

Fix error in molecule using a greedy approach.

Parameters:

Name Type Description Default
mol Mol

input molecule to fix

required
n_iter int

Number of valence fix iteration to apply

1
remove_singleton bool

Whether adjust_singleton should be applied

False
largest_only bool

Whether only the largest fragment should be kept

False
inplace bool

Whether to return a copy of the mol or perform in place operation

False

Returns:

Type Description
Optional[Mol]

Fixed molecule.

Source code in datamol/mol.py
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
def fix_mol(
    mol: Mol,
    n_iter: int = 1,
    remove_singleton: bool = False,
    largest_only: bool = False,
    inplace: bool = False,
) -> Optional[Mol]:
    """Greedily repair errors in a molecule.

    The molecule is sanitized, its dummy atoms are removed, then
    `fix_valence` is applied `n_iter` times. Singleton removal and
    largest-fragment selection are optional last steps.

    Args:
        mol: input molecule to fix
        n_iter: Number of valence fix iteration to apply
        remove_singleton: Whether `adjust_singleton` should be applied
        largest_only: Whether only the largest fragment should be kept
        inplace: Whether to return a copy of the mol or perform in place operation

    Returns:
        Fixed molecule.
    """
    working = mol if inplace else copy.copy(mol)

    # Fall back to the unsanitized molecule when `sanitize_mol` gives back
    # a falsy result.
    m = sanitize_mol(working) or working
    if m is None:
        return m

    m = remove_dummies(m)

    for _ in range(n_iter):
        m = fix_valence(m)

    if remove_singleton:
        m = adjust_singleton(m)

    if largest_only:
        # The fragment parent is taken without standardizing the molecule.
        m = rdMolStandardize.FragmentParent(m, skipStandardize=True)

    return m

fix_valence(mol, inplace=False, allow_ring_break=False)

Identify and try to fix valence issues by removing any supplemental bond that should not be in the graph.

Parameters:

Name Type Description Default
mol Mol

input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy

False
allow_ring_break bool

Whether bond removal involving ring is allowed.

False

Returns:

Type Description
Optional[Mol]

Fixed potential valence issue in molecule or original molecule when nothing is broken

Optional[Mol]

or None if the fix failed.

Source code in datamol/mol.py
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
def fix_valence(mol: Mol, inplace: bool = False, allow_ring_break: bool = False) -> Optional[Mol]:
    """Identify and try to fix valence issues by removing any supplemental bond
    that should not be in the graph.

    The fix runs in two passes: first hydrogens (and formal charge) are
    decreased on atoms with an incorrect valence, then bonds attached to atoms
    that are still incorrect are lowered by one order or removed.

    Args:
        mol: input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy
        allow_ring_break: Whether bond removal involving ring is allowed.

    Returns:
        The molecule itself (or its copy) when the validation reports no
        issue, the fixed molecule otherwise, or `None` if any exception was
        raised while fixing.
    """
    if not inplace:
        mol = copy.copy(mol)

    vm = rdMolStandardize.RDKitValidation()
    if len(vm.validate(mol)) == 0:  # don't fix something that is not broken
        return mol

    try:
        # Remove the hydrogens without sanitizing, updating the explicit count.
        m = remove_hs(
            mol,
            implicit_only=False,
            update_explicit_count=True,
            sanitize=False,
        )
        m.UpdatePropertyCache(False)

        # First pass: while an atom has an incorrect valence and still carries
        # hydrogens, drop one hydrogen and lower the formal charge by one
        # (neither value goes below 0).
        for atom in m.GetAtoms():
            while incorrect_valence(atom) and atom.GetTotalNumHs() > 0:
                cur_hydrogen = atom.GetTotalNumHs()
                atom.SetNumExplicitHs(max(0, cur_hydrogen - 1))
                atom.SetFormalCharge(max(0, atom.GetFormalCharge() - 1))
                # atom.SetNumRadicalElectrons(0)
            atom.UpdatePropertyCache(False)

        # Second pass: work on an editable copy and collect the bonds having
        # at least one end atom with an incorrect valence.
        em = rdchem.RWMol(m)
        bonds = em.GetBonds()
        bonds = [
            bond
            for bond in bonds
            if any(
                [
                    incorrect_valence(bond.GetBeginAtom()),
                    incorrect_valence(bond.GetEndAtom()),
                ]
            )
        ]
        for bond in bonds:
            a1 = bond.GetBeginAtom()
            a2 = bond.GetEndAtom()
            # Re-check: an earlier iteration may already have fixed these atoms.
            if incorrect_valence(a1) or incorrect_valence(a2):
                mbond = decrease_bond(bond)
                # A bond that would be fully removed (`mbond` is None) and is in
                # a ring is only touched when `allow_ring_break` is set.
                if allow_ring_break or (mbond or not bond.IsInRing()):
                    em.RemoveBond(a1.GetIdx(), a2.GetIdx())
                    # Re-add the bond with its lowered order, if any is left.
                    if mbond is not None:
                        em.AddBond(a1.GetIdx(), a2.GetIdx(), mbond)
            a1.UpdatePropertyCache(False)
            a2.UpdatePropertyCache(False)
        m = em.GetMol()

    except Exception:
        # Any failure during the fix is reported as `None`.
        return None

    return m

fix_valence_charge(mol, inplace=False)

Fix valence issues that are due to incorrect charges.

Parameters:

Name Type Description Default
mol Mol

Input molecule with incorrect valence for some atoms

required
inplace bool

Whether to modify in place or make a copy.

False

Returns:

Type Description
Optional[Mol]

Fixed molecule via charge correction or original molecule if failed.

Source code in datamol/mol.py
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
def fix_valence_charge(mol: Mol, inplace: bool = False) -> Optional[Mol]:
    """Repair valence problems caused by incorrect formal charges.

    When the RDKit validation reports issues, the formal charge of every atom
    is set to its total valence (implicit + explicit) minus the default
    valence of its element.

    Args:
        mol: Input molecule with incorrect valence for some atoms
        inplace: Whether to modify in place or make a copy.

    Returns:
        The input molecule when the validation reports no issue, otherwise
        the molecule (or its copy) with the formal charges reassigned.
    """
    validator = rdMolStandardize.RDKitValidation()

    # Don't fix something that is not broken
    if len(validator.validate(mol)) == 0:
        return mol

    if not inplace:
        mol = copy.copy(mol)

    mol.UpdatePropertyCache(False)
    for atom in mol.GetAtoms():
        total_valence = atom.GetImplicitValence() + atom.GetExplicitValence()
        default_valence = PERIODIC_TABLE.GetDefaultValence(atom.GetSymbol())
        atom.SetFormalCharge(total_valence - default_valence)

    return mol

get_atom_positions(mol, conf_id=-1, reorder_to_atom_map_number=False)

Return the atom positions of a given molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
conf_id int

The conformer ID to set the conformer to.

-1
reorder_to_atom_map_number bool

Whether to reorder the positions to map the atom map numbers given by the molAtomMapNumber atom property.

False
Source code in datamol/mol.py
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
def get_atom_positions(
    mol: Mol,
    conf_id: int = -1,
    reorder_to_atom_map_number: bool = False,
) -> np.ndarray:
    """Return the atom positions of a given molecule.

    Args:
        mol: A molecule.
        conf_id: The ID of the conformer to read the positions from.
        reorder_to_atom_map_number: Whether to reorder the positions to map the
            atom map numbers given by the `molAtomMapNumber` atom property.

    Returns:
        The positions returned by the conformer's `GetPositions()`, optionally
        with its rows reordered.

    Raises:
        ValueError: If the molecule has no conformer, or if
            `reorder_to_atom_map_number` is set and at least one atom does not
            have the `molAtomMapNumber` property.
    """

    if mol.GetNumConformers() == 0:
        raise ValueError("This molecule does not have conformers.")

    if reorder_to_atom_map_number and not all(
        "molAtomMapNumber" in a.GetPropsAsDict() for a in mol.GetAtoms()
    ):
        raise ValueError(
            "The atoms of the input molecule do not all contain the `molAtomMapNumber` property. "
            "Set it before calling this function or set `reorder_to_atom_map_number` to `False`."
        )

    conformer = mol.GetConformer(id=conf_id)
    positions = conformer.GetPositions()

    if reorder_to_atom_map_number:
        # Remap the rows order in `positions` so it matches with the atom map
        # numbers. The numbers are treated as 1-based: each one is decremented
        # to obtain a row index.
        mapped_indices = np.array([a.GetAtomMapNum() for a in mol.GetAtoms()]) - 1
        positions = positions[mapped_indices, :]

    return positions

hash_mol(mol, hash_scheme='all')

Generate a unique hash code for a molecule based on chemistry. If two molecules are chemically “the same”, they should have the same hash.

Using molhash adds value beyond using SMILES because it:

  • Ignores SMILES features that are not chemically meaningful (e.g. atom map numbers).
  • Canonicalizes enhanced stereochemistry groups. For example C[C@H](O)CC |&1:1| and C[C@@H](O)CC |&1:1| have the same molhash.
  • Canonicalizes S group data (for example, polymer data).

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
hash_scheme str

There are three hash schemes: - "all": most strict hash scheme utilizing all layers. - "no_stereo": excludes stereo sensitive layers. - "no_tautomers": excludes tautomer sensitive layers.

'all'

Returns:

Type Description
str

The hash as a 40 chars string.

Source code in datamol/mol.py
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
def hash_mol(mol: Mol, hash_scheme: str = "all") -> str:
    """Generate a unique hash code for a molecule based on chemistry. Two molecules
    that are chemically "the same" should have the same hash.

    Using molhash adds value beyond using SMILES because it:

    - Ignores SMILES features that are not chemically meaningful (e.g. atom map numbers).
    - Canonicalizes enhanced stereochemistry groups. For example `C[C@H](O)CC |&1:1|`
    and `C[C@@H](O)CC |&1:1|` have the same molhash.
    - Canonicalizes S group data (for example, polymer data).

    Args:
        mol: A molecule.
        hash_scheme: There are three hash schemes:
            - "all": most strict hash scheme utilizing all layers.
            - "no_stereo": excludes stereo sensitive layers.
            - "no_tautomers": excludes tautomer sensitive layers.

    Returns:
        The hash as a 40 chars string.

    Raises:
        NotImplementedError: If the RDKit version check fails (RDKit>=2022.09
            is required).
        ValueError: If `hash_scheme` is not one of the supported names.
    """

    if is_lower_than_current_rdkit_version("2022.09"):
        raise NotImplementedError("`datamol.hash_mol() is only available with RDKit>=2022.09.")

    # Imported lazily: only reached once the version check has passed.
    from rdkit.Chem import RegistrationHash

    # Compute the mol layers
    layers = RegistrationHash.GetMolLayers(mol)

    # Resolve the scheme name to the matching `HashScheme` member.
    scheme_attributes = (
        ("all", "ALL_LAYERS"),
        ("no_stereo", "STEREO_INSENSITIVE_LAYERS"),
        ("no_tautomers", "TAUTOMER_INSENSITIVE_LAYERS"),
    )
    for scheme_name, attribute in scheme_attributes:
        if hash_scheme == scheme_name:
            selected_scheme = getattr(RegistrationHash.HashScheme, attribute)
            break
    else:
        raise ValueError(
            "`hash_scheme` is invalid. Please choose from: 'all', 'no_stereo' or 'no_tautomers'."
        )

    # Generate the hash
    return RegistrationHash.GetMolHash(all_layers=layers, hash_scheme=selected_scheme)

incorrect_valence(a, update=False)

Check if an atom connection is not valid or all the atom of a molecule.

Parameters:

Name Type Description Default
a Union[Mol, Atom]

atom or molecule to check for valence issue.

required
update bool

Update owning molecule property cache first.

False

Returns:

Type Description
bool

Whether the valence of the input atom (or of the input molecule) is incorrect.

Source code in datamol/mol.py
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
def incorrect_valence(a: Union[Mol, Chem.rdchem.Atom], update: bool = False) -> bool:
    """Tell whether an atom, or a whole molecule, has a valence issue.

    For a molecule, the RDKit validation is run after refreshing the property
    cache. For an atom, the valence is flagged when the atom has no implicit
    valence and its explicit valence exceeds the largest value of the
    element's valence list.

    Args:
        a: atom or molecule to check for valence issue.
        update: Update owning molecule property cache first. Only used when
            `a` is an atom.

    Returns:
        `True` when the valence is incorrect, `False` otherwise.
    """
    if isinstance(a, Mol):
        a.UpdatePropertyCache(False)
        issues = rdMolStandardize.RDKitValidation().validate(a)
        return len(issues) > 0

    if update:
        a.GetOwningMol().UpdatePropertyCache(False)

    # An atom that still has implicit valence is never flagged.
    if a.GetImplicitValence() != 0:
        return False

    return a.GetExplicitValence() > max(PERIODIC_TABLE.GetValenceList(a.GetSymbol()))

is_transition_metal(at)

Check if atom is a transition metal.

Parameters:

Name Type Description Default
at Atom

an atom.

required
Source code in datamol/mol.py
736
737
738
739
740
741
742
743
def is_transition_metal(at: Chem.rdchem.Atom) -> bool:
    """Tell whether an atom is considered a transition metal.

    The atom qualifies when its atomic number lies in one of the inclusive
    ranges 22-29, 40-47 or 72-79.

    Args:
        at: an atom.
    """
    atomic_num = at.GetAtomicNum()
    return 22 <= atomic_num <= 29 or 40 <= atomic_num <= 47 or 72 <= atomic_num <= 79

keep_largest_fragment(mol)

Only keep largest fragment of each molecule.

Source code in datamol/mol.py
727
728
729
730
731
732
733
def keep_largest_fragment(mol: Mol) -> Optional[Mol]:
    """Return the fragment of a molecule with the most atoms.

    The input molecule is returned when `GetMolFrags` yields no fragment.
    """
    fragments = rdmolops.GetMolFrags(mol, asMols=True)
    return max(fragments, key=lambda fragment: fragment.GetNumAtoms(), default=mol)

make_scaffold_generic(mol, include_bonds=False)

Make the atom in a scaffold or molecule generic.

Parameters:

Name Type Description Default
mol Mol

A molecule or a scaffold.

required
include_bonds bool

Whether we should also update bond order or keep as is.

False
Source code in datamol/mol.py
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
def make_scaffold_generic(mol: Mol, include_bonds: bool = False):
    """Turn the atoms of a scaffold or molecule into generic atoms.

    The molecule is modified in place and returned. Every atom whose atomic
    number is not 1 gets atomic number 0. On all atoms the formal charge,
    chiral tag, no-implicit flag and explicit hydrogen count are reset.

    Args:
        mol: A molecule or a scaffold.
        include_bonds: Whether we should also update bond order or keep as is.
            When set, every bond type becomes `UNSPECIFIED_BOND`.
    """
    unspecified_chirality = rdchem.ChiralType.CHI_UNSPECIFIED

    for atom in mol.GetAtoms():
        is_hydrogen = atom.GetAtomicNum() == 1
        if not is_hydrogen:
            atom.SetAtomicNum(0)

        atom.SetFormalCharge(0)
        atom.SetChiralTag(unspecified_chirality)
        atom.SetNoImplicit(0)
        atom.SetNumExplicitHs(0)

    if include_bonds:
        for bond in mol.GetBonds():
            bond.SetBondType(UNSPECIFIED_BOND)

    # Refresh the cached properties and the ring information after the edits.
    mol.UpdatePropertyCache()
    Chem.GetSymmSSSR(mol)  # type: ignore

    return mol

protect_atoms(mol, substruct=None, atoms=None, in_place=False)

Protect a list of atoms or substruct in a molecule.

The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions where "protected" atoms are disallowed from taking part in reactions.

Parameters:

Name Type Description Default
mol Mol

input molecule to protect

required
substruct Optional[Mol]

optional substructure query to identify atoms to protect

None
atoms Optional[Union[List[int], int]]

optional list of atom indices to protect

None
in_place bool

whether to perform the protection in place or return a copy of the molecule

False
Source code in datamol/mol.py
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
def protect_atoms(
    mol: Mol,
    substruct: Optional[Mol] = None,
    atoms: Optional[Union[List[int], int]] = None,
    in_place: bool = False,
) -> Mol:
    """Protect a list of atoms or substruct in a molecule.

    The _protected attributes of a molecule is used by RDKit in several functions, especially for reactions
    where "protected" atoms are disallowed from taking part in reactions.

    Args:
        mol: input molecule to protect
        substruct: optional substructure query to identify atoms to protect
        atoms: optional atom index, or list/tuple of atom indices, to protect.
            The given sequence is never modified. `None` entries are skipped.
        in_place: whether to perform the protection in place or return a copy of the molecule

    Returns:
        The molecule (the input itself when `in_place` is set, a copy
        otherwise) whose selected atoms carry the `_protected` property.
    """
    if atoms is None:
        atom_ids = []
    elif isinstance(atoms, (tuple, list)):
        # Work on a private list: the caller's sequence must not be mutated by
        # the `extend` below, and tuples do not support `extend` at all.
        atom_ids = list(atoms)
    else:
        atom_ids = [atoms]

    # Only copy when the protection must not be performed in place.
    mol_copy = mol if in_place else copy_mol(mol)

    if substruct is not None:
        matches = mol_copy.GetSubstructMatches(substruct)
        atom_ids.extend(itertools.chain.from_iterable(matches))

    for atom_id in atom_ids:
        if atom_id is None:
            continue
        mol_copy.GetAtomWithIdx(atom_id).SetProp("_protected", "1")

    return mol_copy

randomize_atoms(mol)

Randomize the position of the atoms in a mol.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
def randomize_atoms(mol: Mol) -> Optional[Mol]:
    """Shuffle the order of the atoms of a molecule.

    Args:
        mol: a molecule.

    Returns:
        mol: the input molecule itself when it has no atom, otherwise the
            molecule returned by `rdmolops.RenumberAtoms` for a random order.
    """
    n_atoms = mol.GetNumAtoms()
    if n_atoms == 0:
        return mol

    new_order = list(range(n_atoms))
    random.shuffle(new_order)
    return rdmolops.RenumberAtoms(mol, new_order)

remove_dummies(mol, dummy='*')

Remove dummy atoms from molecules.

Source code in datamol/mol.py
650
651
652
653
654
655
656
657
658
659
660
661
def remove_dummies(mol: Mol, dummy: str = "*") -> Optional[Mol]:
    """Remove dummy atoms from molecules.

    The dummy atoms are first replaced by hydrogens which are then removed.
    If that raises, the dummy substructure is deleted instead.

    Args:
        mol: a molecule.
        dummy: the string passed to `to_mol` to build the dummy pattern.
    """
    dummy_pattern = to_mol(dummy)

    try:
        with_hydrogens = rdmolops.ReplaceSubstructs(mol, dummy_pattern, to_mol("[H]"), True)[0]
        return remove_hs(with_hydrogens)
    except Exception:
        # Fallback: delete the dummy atoms directly.
        return rdmolops.DeleteSubstructs(mol, dummy_pattern)

remove_hs(mol, implicit_only=False, update_explicit_count=False, sanitize=True)

Removes hydrogens from a molecule.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
implicit_only bool

whether to only remove implicit hydrogens.

False
update_explicit_count bool

whether to update the explicit hydrogen count.

False
sanitize bool

whether to sanitize the molecule after the hydrogens are removed.

True
Source code in datamol/mol.py
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
def remove_hs(
    mol: Mol,
    implicit_only: bool = False,
    update_explicit_count: bool = False,
    sanitize: bool = True,
):
    """Strip the hydrogens from a molecule (wrapper around `rdmolops.RemoveHs`).

    Args:
        mol: a molecule.
        implicit_only: whether to only remove implicit hydrogens.
        update_explicit_count: whether to update the explicit hydrogen count.
        sanitize: whether to sanitize the molecule after the hydrogens are removed.

    Returns:
        The value returned by `rdmolops.RemoveHs`.
    """
    # Translate the snake_case arguments to the RDKit keyword names.
    options = {
        "implicitOnly": implicit_only,
        "updateExplicitCount": update_explicit_count,
        "sanitize": sanitize,
    }
    return rdmolops.RemoveHs(mol, **options)

remove_salts_solvents(mol, defn_data=None, defn_format='smarts', dont_remove_everything=False, sanitize=True)

Remove all salts and solvents from the molecule. In most cases when dealing with small drug-like molecules, the salt/solvent units are smaller than the parent molecule, and dm.mol.keep_largest_fragment can be applied in that scenario. However, in some cases the molecule of interest is smaller than the salt/solvent units; it is then recommended to define the salt/solvent units and apply remove_salts_solvents to remove the unwanted salts/solvents. A predefined list of salts and solvents is available in the file "datamol/data/salts_solvents.smi". Users can also define the salt/solvent units by passing a string to the arguments defn_data and defn_format.

Args:
    mol: A molecule.
    defn_data: A string to define salts and solvents. Use "\n" as separator for multiple units.
    defn_format: "smarts" or "smiles", the format used to define the above salt/solvent units.
    sanitize: Whether to sanitize the molecule after removing the salt/solvent units.
    dont_remove_everything: When set to True, the last salt/solvent will remain when the molecule consists only of salt/solvent units.

See Also:
    <rdkit.Chem.SaltRemover.SaltRemover>
    <datamol.mol.keep_largest_fragment>
Source code in datamol/mol.py
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
def remove_salts_solvents(
    mol: Mol,
    defn_data: Optional[str] = None,
    defn_format: str = "smarts",
    dont_remove_everything: bool = False,
    sanitize: bool = True,
) -> Mol:
    """Remove all salts and solvents from the molecule.

    In most cases when dealing with small drug-like molecules, the salt/solvent units are smaller
    than the parent molecule and `dm.mol.keep_largest_fragment` can be applied in that scenario.
    However, in some cases the molecule of interest is smaller than the salt/solvent units; it is
    then recommended to define the salt/solvent units and apply `remove_salts_solvents` to remove
    the unwanted salts/solvents. A predefined list of salts and solvents is available in the file
    "datamol/data/salts_solvents.smi". Users can also define the salt/solvent units by passing a
    string to the arguments `defn_data` and `defn_format`.

    Args:
        mol: A molecule.
        defn_data: A string to define salts and solvents. Use "\\n" as separator for multiple units.
            When `None`, the module-level `SALT_SOLVENT_REMOVER` is used.
        defn_format: "smarts" or "smiles", the format used to define the above salt/solvent units.
        dont_remove_everything: When set to `True`, the last salt/solvent will remain when the
            molecule consists only of salt/solvent units.
        sanitize: Whether to sanitize the molecule after removing the salt/solvent units.

    Returns:
        The result of `SaltRemover.StripMol` applied on a copy of the input molecule.

    See Also:
        <rdkit.Chem.SaltRemover.SaltRemover>
        <datamol.mol.keep_largest_fragment>
    """
    # Work on a copy so the molecule given by the caller is left untouched.
    mol_copy = copy_mol(mol)

    if defn_data is None:
        # No custom definition: reuse the shared, predefined remover.
        remover = SALT_SOLVENT_REMOVER
    else:
        remover = SaltRemover(defnData=defn_data, defnFormat=defn_format)

    return remover.StripMol(
        mol_copy, dontRemoveEverything=dont_remove_everything, sanitize=sanitize
    )

reorder_atoms(mol, break_ties=True, include_chirality=True, include_isotopes=True)

Reorder the atoms in a mol. It ensures a single atom order for the same molecule, regardless of its original representation.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
break_ties bool

Force breaking of ranked ties.

True
include_chirality bool

Use chiral information when computing rank.

True
include_isotopes bool

Use isotope information when computing rank.

True

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
def reorder_atoms(
    mol: Mol,
    break_ties: bool = True,
    include_chirality: bool = True,
    include_isotopes: bool = True,
) -> Optional[Mol]:
    """Renumber the atoms of a molecule following their canonical rank, so the same
    molecule gets a single atom order whatever its original representation was.

    A molecule without any atom is handed back as-is.

    Args:
        mol: a molecule.
        break_ties: Force breaking of ranked ties.
        include_chirality: Use chiral information when computing rank.
        include_isotopes: Use isotope information when computing rank.

    Returns:
        mol: a molecule.
    """
    if mol.GetNumAtoms() == 0:
        return mol

    ranks = list(
        rdmolfiles.CanonicalRankAtoms(
            mol,
            breakTies=break_ties,
            includeChirality=include_chirality,
            includeIsotopes=include_isotopes,
        )
    )

    # Atom indices ordered by increasing rank. The sort is stable, so atoms sharing
    # a rank stay ordered by their original index.
    atom_order = sorted(range(len(ranks)), key=ranks.__getitem__)
    return rdmolops.RenumberAtoms(mol, atom_order)

replace_dummies_atoms(mol, atom='C', dummy='*', replace_all=True)

Replace the dummy atoms of a molecule with another atom.

Parameters:

Name Type Description Default
mol Mol

molecule with dummies

required
atom str

replacement atom, default is carbon

'C'
dummy str

dummy atom representation

'*'
replace_all bool

Whether to replace all dummies

True

Returns:

Name Type Description
mol Optional[Mol]

Molecule with dummy replaced

Source code in datamol/mol.py
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
def replace_dummies_atoms(
    mol: Mol,
    atom: str = "C",
    dummy: str = "*",
    replace_all: bool = True,
) -> Optional[Mol]:
    """Replace the dummy atoms of a molecule with another atom.

    Args:
        mol: molecule with dummies
        atom: replacement atom, default is carbon
        dummy: dummy atom representation
        replace_all: Whether to replace all dummies

    Returns:
        mol: Molecule with dummy replaced
    """
    dummy_query = to_mol(dummy)
    substitute = to_mol(atom)

    # `ReplaceSubstructs` returns a sequence of results; only the first one is kept.
    products = rdmolops.ReplaceSubstructs(
        mol,
        dummy_query,
        substitute,
        replaceAll=replace_all,
    )
    return products[0]

same_mol(mol1, mol2, use_non_standard_inchikey=False)

Check two molecules are the same by comparing their InChiKey.

Invalid molecules (None) are always considered as not the same.

Parameters:

Name Type Description Default
mol1 Optional[Mol]

A molecule.

required
mol2 Optional[Mol]

A molecule.

required
use_non_standard_inchikey bool

Whether to use the standard or non-standard InChiKey.

False
Source code in datamol/mol.py
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
def same_mol(
    mol1: Optional[Mol],
    mol2: Optional[Mol],
    use_non_standard_inchikey: bool = False,
) -> bool:
    """Tell whether two molecules are identical, based on their InChiKey.

    As soon as one of the molecules is invalid (None), the answer is `False`.

    Args:
        mol1: A molecule.
        mol2: A molecule.
        use_non_standard_inchikey: Whether to use the standard or non-standard InChiKey.

    Returns:
        `True` when both molecules are valid and share the same InChiKey.
    """
    if mol1 is None or mol2 is None:
        return False

    # Pick the key generator once, then compare both keys.
    make_key = to_inchikey_non_standard if use_non_standard_inchikey else to_inchikey
    return make_key(mol1) == make_key(mol2)

sanitize_first(mols, charge_neutral=False, sanifix=True)

Sanitize a list of molecules and return the first valid molecule seen in the list.

Parameters:

Name Type Description Default
mols List[Mol]

a list of molecules.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True

Returns:

Name Type Description
mol Mol

a molecule.

Source code in datamol/mol.py
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
def sanitize_first(
    mols: List[Mol], charge_neutral: bool = False, sanifix: bool = True
) -> Optional[Mol]:
    """Sanitize a list of molecules and return the first valid molecule seen in the list.

    Args:
        mols: a list of molecules.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that try to adjust aromatic nitrogens.

    Returns:
        mol: the first molecule that could be sanitized, or `None` when the list is
            empty or none of the molecules could be sanitized.
    """
    for candidate in mols:
        sanitized = sanitize_mol(candidate, charge_neutral=charge_neutral, sanifix=sanifix)
        # `sanitize_mol` signals a failure with `None`; test for it explicitly
        # instead of relying on the truthiness of the molecule object.
        if sanitized is not None:
            return sanitized
    return None

sanitize_mol(mol, charge_neutral=False, sanifix=True, verbose=True, add_hs=False)

An augmented version of RDKit sanitize=True. It uses a mol-SMILES-mol conversion to catch potential aromaticity errors and try to fix aromatic nitrogen (using the popular sanifix4 script). Optionally, it can neutralize the charge of the molecule.

Note #1: Only the first conformer (if present) will be preserved and a warning will be displayed if more than one conformer is detected.

Note #2: The molecule's properties will be preserved but the atom's properties will be lost.

Parameters:

Name Type Description Default
mol Mol

a molecule.

required
charge_neutral bool

whether charge neutralization should be applied.

False
sanifix bool

whether to run the sanifix from James Davidson (sanifix4.py) that try to adjust aromatic nitrogens.

True
verbose bool

Whether displaying a warning about multiple conformers.

True
add_hs bool

Add hydrogens to the returned molecule. Useful when the input molecule already contains hydrogens.

False

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
def sanitize_mol(
    mol: Mol,
    charge_neutral: bool = False,
    sanifix: bool = True,
    verbose: bool = True,
    add_hs: bool = False,
) -> Optional[Mol]:
    """Sanitize a molecule more thoroughly than RDKit `sanitize=True`.

    The molecule goes through a mol-SMILES-mol round trip in order to catch
    potential aromaticity errors, optionally after a pass of the popular
    sanifix4 script (which tries to fix aromatic nitrogens) and after an
    optional charge neutralization.

    Note #1: Only the first conformer (if present) will be preserved and
    a warning will be displayed if more than one conformer is detected.

    Note #2: The molecule's properties will be preserved but the atom's
    properties will be lost.

    Args:
        mol: a molecule.
        charge_neutral: whether charge neutralization should be applied.
        sanifix: whether to run the sanifix from James Davidson
            (sanifix4.py) that try to adjust aromatic nitrogens.
        verbose: Whether displaying a warning about multiple conformers.
        add_hs: Add hydrogens to the returned molecule. Useful when the input
            molecule already contains hydrogens.

    Returns:
        mol: the sanitized molecule, or `None` when the input is `None` or the
            sanitization failed.
    """
    if mol is None:
        return mol

    # Remember the molecule-level properties: they are re-attached at the very end.
    properties = copy_mol(mol).GetPropsAsDict()

    if charge_neutral:
        mol = to_neutral(mol)

    if sanifix:
        mol = _sanifix4.sanifix(mol)

    if mol is None:
        return None

    # Warn when conformers are about to be dropped.
    if verbose and mol.GetNumConformers() > 1:
        logger.warning(
            "The molecule contains multiple conformers. Only the first one will be preserved."
        )

    # The round trip may raise on occasional aromaticity errors: treat that as a failure.
    try:
        # `cxsmiles` is used here to preserve the first conformer.
        round_trip_smiles = to_smiles(mol, cxsmiles=True)
        mol = to_mol(round_trip_smiles, sanitize=True, add_hs=add_hs)
    except Exception:
        return None

    if mol is None:
        return None

    # Put the saved properties back on the new molecule.
    return set_mol_props(mol, properties)

sanitize_smiles(smiles, isomeric=True)

Takes SMILES string and returns its sanitized version.

Parameters:

Name Type Description Default
smiles Optional[str]

smiles to be sanitized.

required
isomeric bool

Whether to include information about stereochemistry in the SMILES.

True

Returns:

Type Description
Optional[str]

sanitized smiles.

Source code in datamol/mol.py
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
def sanitize_smiles(smiles: Optional[str], isomeric: bool = True) -> Optional[str]:
    """Sanitize a SMILES string.

    The SMILES is parsed without sanitization, passed through `sanitize_mol`
    and written back as a SMILES.

    Args:
        smiles: smiles to be sanitized.
        isomeric: Whether to include information about stereochemistry in the SMILES.

    Returns:
        sanitized smiles, or `None` when any of the steps failed.
    """
    try:
        mol = sanitize_mol(to_mol(smiles, sanitize=False), False)
        if mol is None:
            return None
        return to_smiles(mol, isomeric=isomeric)
    except Exception:
        # Any parsing, sanitization or writing error is reported as `None`.
        return None

set_atom_positions(mol, positions, conf_id=0, copy=True, use_atom_map_numbers=True, remove_previous_conformers=True)

Add a conformer to a molecule given the atom's positions.

The conformer is flagged as 3D unless all the z coordinates are 0, in which case it is flagged as 2D.

Example:

The below example is common when you want to reconstruct a molecule object from its SMILES and its held out atomic positions. The position array is ordered according to the atom number seen in the SMILES. This is a common data structure when working with Quantum Mechanics dataset.

import datamol as dm
import numpy as np

# We start with a SMILES where every atoms is mapped to a specific number
smiles = "[H:14][c:5]1[c:3]([c:7]([c:4]([c:6]([c:8]1[N:10]([H:18])[C:2](=[N+:11]([H:19])[H:20])[N:9]([H:16])[H:17])[H:15])[H:13])[F:1])[H:12]"

# Every atom position below is mapped to the atom number in the SMILES above
positions = [
    [1.7, -6.67, 3.15],
    [0.2, 4.72, 0.78],
    [3.54, -2.64, 2.88],
    [0.43, -3.87, -0.09],
    [3.44, -0.2, 1.8],
    [0.02, -1.5, -1.0],
    [2.12, -4.54, 1.9],
    [1.5, 0.48, 0.02],
    [0.53, 7.24, 0.25],
    [1.17, 2.91, -0.85],
    [-1.22, 4.15, 2.71],
    [4.64, -3.24, 4.55],
    [-0.89, -5.43, -0.78],
    [4.52, 1.43, 2.45],
    [-1.45, -1.02, -2.48],
    [-0.15, 8.68, 1.38],
    [1.65, 7.88, -1.21],
    [2.24, 3.64, -2.15],
    [-1.96, 2.4, 3.0],
    [-2.02, 5.59, 3.71],
]

# We build the mol object by setting `remove_hs` to `False`.
# This is important so the hydrogens and their atom number are preserved.
mol = dm.to_mol(smiles, remove_hs=False)

# If you plot the molecule with `dm.to_image(mol)`, you'll notice
# the atom numbers are added to the drawing.

# Now we set the atom positions to the newly constructed `mol` object.
# Here it's important to set `use_atom_map_numbers` to `True`, so the atom numbers
# from the SMILES are used to match the input positions array.
# Under the hood, RDKit has set the `molAtomMapNumber` property to all the atoms in the
# molecule.
new_mol = dm.set_atom_positions(
    mol=mol,
    positions=positions,
    conf_id=0,
    use_atom_map_numbers=True,
)

# The newly set conformer now had the correct 3D positions.
# You can visualize the molecule with `dm.to_image(new_mol)` in 2D
# or `dm.viz.conformers(new_mol)`.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
positions ArrayLike

An array or a list of atomic positions. Shape of [n_atoms, 3].

required
conf_id int

The conformer ID to set the conformer to.

0
copy bool

Whether to copy the molecule.

True
use_atom_map_numbers bool

Whether the input positions are ordered according to the atom map numbers stored in the molAtomMapNumber atom property. If set to False, the default atom indices order is assumed.

True
remove_previous_conformers bool

Whether to remove the previous conformers if any in the input molecule.

True
Source code in datamol/mol.py
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
def set_atom_positions(
    mol: Mol,
    positions: npt.ArrayLike,
    conf_id: int = 0,
    copy: bool = True,
    use_atom_map_numbers: bool = True,
    remove_previous_conformers: bool = True,
) -> Mol:
    """Add a conformer to a molecule given the atom's positions.

    The conformer is flagged as 3D unless all the z coordinates are 0, in which case
    it is flagged as 2D.

    **Example:**

    The below example is common when you want to reconstruct a molecule object
    from its SMILES and its held out atomic positions. The position array
    is ordered according to the atom number seen in the SMILES. This is a common
    data structure when working with Quantum Mechanics dataset.

    ```python

    import datamol as dm
    import numpy as np

    # We start with a SMILES where every atoms is mapped to a specific number
    smiles = "[H:14][c:5]1[c:3]([c:7]([c:4]([c:6]([c:8]1[N:10]([H:18])[C:2](=[N+:11]([H:19])[H:20])[N:9]([H:16])[H:17])[H:15])[H:13])[F:1])[H:12]"

    # Every atom position below is mapped to the atom number in the SMILES above
    positions = [
        [1.7, -6.67, 3.15],
        [0.2, 4.72, 0.78],
        [3.54, -2.64, 2.88],
        [0.43, -3.87, -0.09],
        [3.44, -0.2, 1.8],
        [0.02, -1.5, -1.0],
        [2.12, -4.54, 1.9],
        [1.5, 0.48, 0.02],
        [0.53, 7.24, 0.25],
        [1.17, 2.91, -0.85],
        [-1.22, 4.15, 2.71],
        [4.64, -3.24, 4.55],
        [-0.89, -5.43, -0.78],
        [4.52, 1.43, 2.45],
        [-1.45, -1.02, -2.48],
        [-0.15, 8.68, 1.38],
        [1.65, 7.88, -1.21],
        [2.24, 3.64, -2.15],
        [-1.96, 2.4, 3.0],
        [-2.02, 5.59, 3.71],
    ]

    # We build the mol object by setting `remove_hs` to `False`.
    # This is important so the hydrogens and their atom number are preserved.
    mol = dm.to_mol(smiles, remove_hs=False)

    # If you plot the molecule with `dm.to_image(mol)`, you'll notice
    # the atom numbers are added to the drawing.

    # Now we set the atom positions to the newly constructed `mol` object.
    # Here it's important to set `use_atom_map_numbers` to `True`, so the atom numbers
    # from the SMILES are used to match the input positions array.
    # Under the hood, RDKit has set the `molAtomMapNumber` property to all the atoms in the
    # molecule.
    new_mol = dm.set_atom_positions(
        mol=mol,
        positions=positions,
        conf_id=0,
        use_atom_map_numbers=True,
    )

    # The newly set conformer now had the correct 3D positions.
    # You can visualize the molecule with `dm.to_image(new_mol)` in 2D
    # or `dm.viz.conformers(new_mol)`.
    ```

    Args:
        mol: A molecule.
        positions: An array or a list of atomic positions. Shape of `[n_atoms, 3]`.
        conf_id: The conformer ID to set the conformer to.
        copy: Whether to copy the molecule.
        use_atom_map_numbers: Whether the input positions are ordered according to the atom
            map numbers stored in the `molAtomMapNumber` atom property. If set to False,
            the default atom indices order is assumed.
        remove_previous_conformers: Whether to remove the previous conformers if any in the input molecule.

    Returns:
        The molecule (a copy of the input when `copy` is `True`) holding the new conformer.

    Raises:
        ValueError: if `positions` is not a 2D array of shape `[n_atoms, 3]`, or if
            `use_atom_map_numbers` is `True` while some atoms miss the
            `molAtomMapNumber` property.
    """

    if copy:
        mol = copy_mol(mol)

    # Convert to an array
    positions = np.array(positions)

    # Perform a few sanity checks

    if positions.ndim != 2:
        raise ValueError(f"The positions array is not of dimension 2. Found: {positions.ndim}.")

    expected_shape = (mol.GetNumAtoms(), 3)
    if positions.shape != expected_shape:
        raise ValueError(
            f"The shape of `positions` is {positions.shape} instead of {expected_shape}."
        )

    if use_atom_map_numbers and not all(
        "molAtomMapNumber" in a.GetPropsAsDict() for a in mol.GetAtoms()
    ):
        raise ValueError(
            "The atoms of the input molecule does not contain the molAtomMapNumber property. "
            "Set it before calling this function or set `use_atom_map_numbers` to `False`."
        )

    # Remove previous conformers
    if remove_previous_conformers:
        mol.RemoveAllConformers()

    # Remap the rows order in `positions` so it matches
    # with the atom map numbers (1-based, hence the `- 1`).
    if use_atom_map_numbers:
        mapped_indices = np.array([a.GetAtomMapNum() for a in mol.GetAtoms()]) - 1
        positions = positions[mapped_indices, :]

    # Check if it's 3D or 2D coords: any non-zero z coordinate means 3D.
    is_3d = not np.all(positions[:, 2] == 0)

    # Create the conformer object
    conf = rdchem.Conformer()
    conf.Set3D(is_3d)
    conf.SetId(conf_id)

    # Set the positions
    for i, xyz in enumerate(positions):
        conf.SetAtomPosition(i, rdGeometry.Point3D(*xyz.tolist()))

    # Add the conformer to the molecule
    mol.AddConformer(conf)

    return mol

set_dative_bonds(mol, from_atoms=(7, 8))

Replaces some single bonds between metals and atoms with atomic numbers in fromAtoms with dative bonds. The replacement is only done if the atom has "too many" bonds.

Parameters:

Name Type Description Default
mol Mol

molecule with bond to modify

required
from_atoms Tuple[int, int]

List of atoms (symbol or atomic number) to consider for bond replacement. By default, only Nitrogen (7) and Oxygen (8) are considered.

(7, 8)

Returns:

Type Description
Optional[Mol]

The modified molecule.

Source code in datamol/mol.py
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
def set_dative_bonds(mol: Mol, from_atoms: Tuple[int, int] = (7, 8)) -> Optional[Mol]:
    """Turn some single bonds between a transition metal and its neighbors into dative bonds.

    A neighbor is only considered when its atomic number or its symbol is listed in
    `from_atoms`, and the bond is only replaced when that neighbor has "too many" bonds,
    i.e. its explicit valence exceeds its default valence.

    Arguments:
        mol: molecule with bond to modify
        from_atoms: List of atoms  (symbol or atomic number) to consider for bond replacement.
            By default, only Nitrogen (7) and Oxygen (8) are considered.

    Returns:
        The modified molecule.
    """
    rwmol = rdchem.RWMol(mol)
    rwmol.UpdatePropertyCache(strict=False)

    # Collect the metals up front, before any bond is edited.
    transition_metals = [atom for atom in rwmol.GetAtoms() if is_transition_metal(atom)]

    for metal in transition_metals:
        for nbr in metal.GetNeighbors():
            # Skip the neighbors that are not donor candidates.
            if nbr.GetAtomicNum() not in from_atoms and nbr.GetSymbol() not in from_atoms:
                continue

            # Skip the neighbors whose valence is not exceeded.
            default_valence = PERIODIC_TABLE.GetDefaultValence(nbr.GetAtomicNum())
            if not nbr.GetExplicitValence() > default_valence:
                continue

            nbr_idx = nbr.GetIdx()
            metal_idx = metal.GetIdx()
            bond_type = rwmol.GetBondBetweenAtoms(nbr_idx, metal_idx).GetBondType()
            if bond_type == SINGLE_BOND:
                # Swap the single bond for a dative bond going from the neighbor to the metal.
                rwmol.RemoveBond(nbr_idx, metal_idx)
                rwmol.AddBond(nbr_idx, metal_idx, DATIVE_BOND)

    return rwmol

set_mol_props(mol, props, copy=False)

Set properties to a mol from a dict.

Parameters:

Name Type Description Default
mol Mol

the mol where to copy the props.

required
props Dict[str, Any]

the props to copy.

required
copy bool

whether to copy the provided mol

False
Source code in datamol/mol.py
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
def set_mol_props(
    mol: Mol,
    props: Dict[str, Any],
    copy: bool = False,
) -> Mol:
    """Store the content of a dict as properties of a molecule.

    The setter is picked from the Python type of each value: bool, int, float,
    and `str(value)` for anything else.

    Args:
        mol: the mol where to copy the props.
        props: the props to copy.
        copy: whether to copy the provided mol

    Returns:
        The molecule holding the properties (a copy of the input when `copy` is `True`).
    """

    if copy is True:
        mol = copy_mol(mol)

    for name, value in props.items():
        # `bool` is tested first because it is a subclass of `int`.
        if isinstance(value, bool):
            mol.SetBoolProp(name, value)
            continue

        if isinstance(value, float):
            mol.SetDoubleProp(name, value)
            continue

        if not isinstance(value, int):
            # Anything else is stored through its string representation.
            mol.SetProp(name, str(value))
            continue

        # NOTE(hadim): `SetIntProp` can raise an `OverflowError` on large
        # integers. When that happens, the value is silently stored with
        # `SetDoubleProp` instead.
        try:
            mol.SetIntProp(name, value)
        except OverflowError:
            mol.SetDoubleProp(name, value)

    return mol

standardize_mol(mol, disconnect_metals=False, normalize=True, reionize=True, uncharge=False, stereo=True)

This function returns a standardized version the given molecule. It relies on the RDKit rdMolStandardize module which is largely inspired from MolVS.

Parameters:

Name Type Description Default
mol Mol

A molecule to standardize.

required
disconnect_metals bool

Disconnect metals that are defined as covalently bonded to non-metal. Depending on the source of the database, some compounds may be reported in salt form or associated to metallic ions (e.g. the sodium salt of a carboxylic compound). In most cases, these counter-ions are not relevant so the use of this function is required before further utilization of the dataset. In summary the process is the following:

  • Break covalent bonds between metals and organic atoms under certain conditions.
  • First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
  • For every bond broken, adjust the charges of the begin and end atoms accordingly.
False
normalize bool

Applies a series of standard transformations to correct functional groups and recombine charges. It corrects drawing errors and standardizes functional groups in the molecule as well as ensuring the overall proper charge of the compound. It includes:

  • Uncharge-separate sulfones
  • Charge-separate nitro groups
  • Charge-separate pyridine oxide
  • Charge-separate azide
  • Charge-separate diazo and azo groups
  • Charge-separate sulfoxides
  • Hydrazine-diazonium system
True
reionize bool

If one or more acidic functionalities are present in the molecule, this option ensures the correct neutral/ionized state for such functional groups. Molecules are uncharged by adding and/or removing hydrogens. For zwitterions, hydrogens are moved to eliminate charges where possible. However, in cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge The algorithm works as follows:

  • Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
  • If the ionized acid is weaker than the protonated acid, swap proton and repeat.
True
uncharge bool

This option neutralize the molecule by reversing the protonation state of protonated and deprotonated groups, if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid). In cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge to ensure a net zero charge.

False
stereo bool

Stereochemical information is corrected and/or added if missing using built-in RDKit functionality to force a clean recalculation of stereochemistry (AssignStereochemistry).

True

Returns:

Name Type Description
mol Mol

A standardized molecule.

Source code in datamol/mol.py
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
def standardize_mol(
    mol: Mol,
    disconnect_metals: bool = False,
    normalize: bool = True,
    reionize: bool = True,
    uncharge: bool = False,
    stereo: bool = True,
) -> Mol:
    r"""
    Build a standardized version of the given molecule. The input molecule is copied
    first and every enabled step is applied in the order the arguments are listed. The steps rely on the
    RDKit [`rdMolStandardize` module](https://www.rdkit.org/docs/source/rdkit.Chem.MolStandardize.rdMolStandardize.html)
    which is largely inspired from [MolVS](https://github.com/mcs07/MolVS).

    Arguments:
        mol: A molecule to standardize.

        disconnect_metals: Disconnect metals that are defined as covalently bonded to non-metal.
            Depending on the source of the database, some compounds may be reported in salt form
            or associated to metallic ions (e.g. the sodium salt of a carboxylic compound).
            In most cases, these counter-ions are not relevant so the use of this function is required
            before further utilization of the dataset. In summary the process is the following:

            - Break covalent bonds between metals and organic atoms under certain conditions.
            - First, disconnect N, O, F from any metal. Then disconnect other non-metals from transition metals (with exceptions).
            - For every bond broken, adjust the charges of the begin and end atoms accordingly.

        normalize: Applies a series of standard transformations to correct functional groups and recombine charges.
            It corrects drawing errors and standardizes functional groups in the molecule as well as ensuring the
            overall proper charge of the compound. It includes:

            - Uncharge-separate sulfones
            - Charge-separate nitro groups
            - Charge-separate pyridine oxide
            - Charge-separate azide
            - Charge-separate diazo and azo groups
            - Charge-separate sulfoxides
            - Hydrazine-diazonium system

        reionize: If one or more acidic functionalities are present in the molecule, this option ensures the correct
            neutral/ionized state for such functional groups. Molecules are uncharged by adding and/or removing hydrogens.
            For zwitterions, hydrogens are moved to eliminate charges where possible. However, in cases where there is a
            positive charge that is not neutralizable, an attempt is made to also preserve the corresponding negative charge
            The algorithm works as follows:

            - Use SMARTS to find the strongest protonated acid and the weakest ionized acid.
            - If the ionized acid is weaker than the protonated acid, swap proton and repeat.

        uncharge: This option neutralize the molecule by reversing the protonation state of protonated and deprotonated groups,
            if present (e.g. a carboxylate is re-protonated to the corresponding carboxylic acid).
            In cases where there is a positive charge that is not neutralizable, an attempt is made to also preserve the
            corresponding negative charge to ensure a net zero charge.

        stereo: Stereochemical information is cleaned and assigned with the built-in RDKit
            `AssignStereochemistry` function (called with `cleanIt=True` and `force=False`).

    Returns:
        mol: A standardized molecule.
    """
    # Never touch the molecule given by the caller.
    mol = copy_mol(mol)

    if disconnect_metals:
        mol = rdMolStandardize.MetalDisconnector().Disconnect(mol)

    if normalize:
        mol = rdMolStandardize.Normalize(mol)

    if reionize:
        mol = rdMolStandardize.Reionizer().reionize(mol)

    if uncharge:
        mol = rdMolStandardize.Uncharger().uncharge(mol)

    if stereo:
        # Its return value is not used.
        rdmolops.AssignStereochemistry(mol, force=False, cleanIt=True)

    return mol

standardize_smiles(smiles)

Apply the SMILES standardization procedure. This is a convenience function wrapped around the RDKit SMILES standardizer and tautomeric canonicalization.

Parameters:

Name Type Description Default
smiles str

Smiles to standardize

required

Returns:

Name Type Description
standard_smiles str

the standardized smiles

Source code in datamol/mol.py
397
398
399
400
401
402
403
404
405
406
407
408
409
410
def standardize_smiles(smiles: str) -> str:
    r"""
    Standardize a SMILES string.

    Thin convenience wrapper: the input is handed to RDKit's
    `rdMolStandardize.StandardizeSmiles` and its result is returned unchanged.

    Args:
        smiles: The SMILES to standardize.

    Returns:
        standard_smiles: The standardized SMILES.
    """

    return rdMolStandardize.StandardizeSmiles(smiles)

strip_mol_to_core(mol, bond_cutter=None)

Strip a molecule to its core, i.e. remove all atoms not in the core. This method guesses the molecular core by finding the ring system.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
bond_cutter Mol

A molecule used to cut the bonds.

None
Source code in datamol/mol.py
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
def strip_mol_to_core(mol: Mol, bond_cutter: Mol = None):
    """Reduce a molecule to its core by cutting the bonds matched by `bond_cutter`.

    The Murcko scaffold of `mol` is computed first and is the result when
    `bond_cutter` matches nothing. Otherwise the molecule is fragmented on the
    matched bonds, the dummy atoms are removed, the Murcko scaffold of the
    fragments is taken and its largest fragment is returned.

    Args:
        mol: A molecule.
        bond_cutter: A query molecule whose matches designate the bonds to cut.
            When `None`, a built-in SMARTS pattern is used.

    Returns:
        The core of the molecule.
    """

    cutter = bond_cutter
    if cutter is None:
        cutter = from_smarts("[R;!$(*=,#[!#6])]!@!=!#[*;$([A;!R][A;!R])]")

    with without_rdkit_log():
        # Default answer: the plain Murcko scaffold of the input.
        core = MurckoScaffold.GetScaffoldForMol(mol)

        # Each match is a pair of atom indices; translate it to the bond joining them.
        cut_bonds = [
            mol.GetBondBetweenAtoms(begin, end).GetIdx()
            for begin, end in mol.GetSubstructMatches(cutter)
        ]

        if cut_bonds:
            pieces = rdmolops.FragmentOnBonds(mol, cut_bonds)
            pieces = remove_dummies(pieces)
            pieces = to_scaffold_murcko(pieces)
            core = keep_largest_fragment(pieces)

    return core

substructure_matching_bonds(mol, query, **kwargs)

Perform a substructure match using GetSubstructMatches but instead of returning only the atom indices also return the bond indices.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
query Mol

A molecule used as a query to match against.

required
**kwargs Any

Any other arguments to pass to mol.GetSubstructMatches().

{}

Returns:

Name Type Description
atom_matches list

A list of lists of atom indices.

bond_matches list

A list of lists of bond indices.

Source code in datamol/mol.py
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
def substructure_matching_bonds(mol: Mol, query: Mol, **kwargs: Any) -> Tuple[list, list]:
    """Run `GetSubstructMatches` and report the matched bonds alongside the matched atoms.

    Args:
        mol: A molecule.
        query: A molecule used as a query to match against.
        **kwargs: Extra arguments forwarded to `mol.GetSubstructMatches()`.
            `uniquify` defaults to `True` when not provided.

    Returns:
        atom_matches: One entry per match, holding the matched atom indices of `mol`.
        bond_matches: One entry per match, holding the indices of the bonds of `mol`
            that correspond to the bonds of `query`.
    """

    # NOTE(hadim): If more substructure functions are added here, consider moving it to
    # a dedicated `substructure` module.

    # Only fill in `uniquify` when the caller did not choose a value.
    kwargs.setdefault("uniquify", True)

    atom_matches = list(mol.GetSubstructMatches(query, **kwargs))

    # Every query bond, described by the query indices of its two end atoms.
    query_bonds = [(bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()) for bond in query.GetBonds()]

    # Query atom indices, in the order the query yields its atoms.
    query_atoms = [atom.GetIdx() for atom in query.GetAtoms()]

    bond_matches = []
    for match in atom_matches:
        # Query atom index -> index of the atom of `mol` it was matched to.
        query_to_target = dict(zip(query_atoms, match))

        # Translate each query bond into the index of the corresponding bond in `mol`.
        bond_matches.append(
            [
                mol.GetBondBetweenAtoms(query_to_target[begin], query_to_target[end]).GetIdx()
                for begin, end in query_bonds
            ]
        )

    return atom_matches, bond_matches

to_mol(mol, add_hs=False, explicit_only=False, ordered=False, kekulize=False, sanitize=True, allow_cxsmiles=True, parse_name=True, remove_hs=True, strict_cxsmiles=True)

Convert an input molecule (smiles representation) into a Mol.

Parameters:

Name Type Description Default
mol Union[str, Mol]

A SMILES or a molecule.

required
add_hs bool

Whether hydrogens should be added to the molecule after the SMILES has been parsed.

False
explicit_only bool

Whether to only add explicit hydrogens or both (implicit and explicit) when add_hs is set to True.

False
ordered bool

Whether the atom should be ordered. This option is important if you want to ensure that the features returned will always maintain a single atom order for the same molecule, regardless of its original SMILES representation.

False
kekulize bool

Whether to perform kekulization of the input molecules.

False
sanitize bool

Whether to apply rdkit sanitization when input is a SMILES.

True
allow_cxsmiles bool

Recognize and parse CXSMILES.

True
parse_name bool

Parse (and set) the molecule name as well.

True
remove_hs bool

Whether to remove the hydrogens from the input SMILES.

True
strict_cxsmiles bool

Throw an exception if the CXSMILES parsing fails.

True

Returns:

Name Type Description
mol Optional[Mol]

the molecule if a conversion has been made. If the conversion fails

Optional[Mol]

None is returned so make sure that you handle this case on your own.

Source code in datamol/mol.py
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
def to_mol(
    mol: Union[str, Mol],
    add_hs: bool = False,
    explicit_only: bool = False,
    ordered: bool = False,
    kekulize: bool = False,
    sanitize: bool = True,
    allow_cxsmiles: bool = True,
    parse_name: bool = True,
    remove_hs: bool = True,
    strict_cxsmiles: bool = True,
) -> Optional[Mol]:
    """Build a `Mol` from a SMILES string, or post-process an existing `Mol`.

    A string is parsed with `rdmolfiles.MolFromSmiles`; a `Mol` is used as given
    (it is not copied). The optional hydrogen addition, atom reordering and
    kekulization steps are then applied in that order.

    Args:
        mol: A SMILES or a molecule.
        add_hs: Whether hydrogens should be added to the molecule after the SMILES
            has been parsed. Coordinates are requested for the added hydrogens.
        explicit_only: Whether to only add explicit hydrogens or both
            (implicit and explicit) when `add_hs` is set to True.
        ordered: Whether the atoms should be reordered. This option is
            important if you want to ensure that the features returned will always maintain
            a single atom order for the same molecule, regardless of its original SMILES representation.
        kekulize: Whether to kekulize the molecule (aromatic flags are not cleared).
        sanitize: Whether to apply rdkit sanitization when input is a SMILES.
        allow_cxsmiles: Recognize and parse CXSMILES.
        parse_name: Parse (and set) the molecule name as well.
        remove_hs: Whether to remove the hydrogens from the input SMILES.
        strict_cxsmiles: Throw an exception if the CXSMILES parsing fails.

    Returns:
        mol: the resulting molecule, or None when the SMILES could not be parsed,
        so make sure that you handle this case on your own.

    Raises:
        ValueError: if `mol` is neither a string nor a `Mol`.
    """

    if isinstance(mol, str):
        parser_params = rdmolfiles.SmilesParserParams()
        parser_params.sanitize = sanitize
        parser_params.allowCXSMILES = allow_cxsmiles
        parser_params.parseName = parse_name
        parser_params.removeHs = remove_hs
        parser_params.strictCXSMILES = strict_cxsmiles

        parsed = rdmolfiles.MolFromSmiles(mol, params=parser_params)

        # Sanitization was skipped, so refresh the property cache in non-strict mode.
        if parsed is not None and not sanitize:
            parsed.UpdatePropertyCache(False)
    elif isinstance(mol, Mol):
        parsed = mol
    else:
        raise ValueError(f"Input should be a Mol or a string instead of '{type(mol)}'")

    # Nothing to post-process when parsing failed.
    if parsed is None:
        return None

    # Add hydrogens
    if add_hs:
        parsed = rdmolops.AddHs(parsed, explicitOnly=explicit_only, addCoords=True)

    # Reorder atoms
    if ordered and parsed is not None:
        parsed = reorder_atoms(parsed)

    # Kekulize; the call is made for its effect on `parsed`, its result is not used.
    if kekulize and parsed is not None:
        rdmolops.Kekulize(parsed, clearAromaticFlags=False)

    return parsed

to_neutral(mol)

Neutralize the charge of a molecule.

Parameters:

Name Type Description Default
mol Optional[Mol]

a molecule.

required

Returns:

Name Type Description
mol Optional[Mol]

a molecule.

Source code in datamol/mol.py
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
def to_neutral(mol: Optional[Mol]) -> Optional[Mol]:
    """Reset the formal charge of the charged atoms of a molecule.

    An atom's formal charge is set to 0 when it is negative, or when it is positive
    and the atom's explicit valence is at least the default valence of its element.
    The atoms of the given molecule are modified directly and that same molecule
    is returned.

    Args:
        mol: a molecule, or `None`.

    Returns:
        mol: the given molecule, or `None` when `None` was given.
    """
    if mol is None:
        return None

    for atom in mol.GetAtoms():
        charge = atom.GetFormalCharge()

        if charge >= 0:
            # Non-negative charge: only reset it when the atom is positively charged
            # and its explicit valence reaches the element's default valence.
            reaches_default = atom.GetExplicitValence() >= PERIODIC_TABLE.GetDefaultValence(
                atom.GetSymbol()
            )
            if not (reaches_default and charge > 0):
                continue

        atom.SetFormalCharge(0)
        atom.UpdatePropertyCache(False)

    return mol

to_scaffold_murcko(mol, make_generic=False)

Extract the Murcko scaffold from a molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
make_generic bool

Whether to make the scaffold generic.

False
Source code in datamol/mol.py
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
def to_scaffold_murcko(mol: Mol, make_generic: bool = False):
    """Compute the Murcko scaffold of a molecule.

    Args:
        mol: A molecule.
        make_generic: Whether to additionally turn the scaffold into its generic form.

    Returns:
        The scaffold. When `make_generic` is set, it is the generic scaffold after
        being passed through `to_mol`.
    """
    scaffold = MurckoScaffold.GetScaffoldForMol(mol)

    # NOTE(hadim): `GetScaffoldForMol` already does this,
    # so these two calls may be redundant here.
    scaffold.UpdatePropertyCache()
    Chem.GetSymmSSSR(scaffold)  # type: ignore

    if not make_generic:
        return scaffold

    generic = make_scaffold_generic(scaffold)
    return to_mol(generic)

unique_id(mol)

A datamol unique molecule ID.

The ID is an MD5 hash of the non-standard InChiKey provided by dm.to_inchikey_non_standard(). It guarantees uniqueness for different tautomeric forms of the same molecule.

Parameters:

Name Type Description Default
mol Mol

A molecule.

required
Source code in datamol/mol.py
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
def unique_id(mol: Mol) -> Optional[str]:
    """Compute the datamol unique ID of a molecule.

    The ID is the hexadecimal MD5 digest of the non-standard InChiKey returned
    by `dm.to_inchikey_non_standard()`. That key is meant to keep different
    tautomeric forms of the same molecule distinct.

    Args:
        mol: A molecule.

    Returns:
        The ID, or `None` when no InChiKey could be obtained for the molecule.
    """
    inchikey = to_inchikey_non_standard(mol)

    # No key means no ID; otherwise hash the UTF-8 encoded key.
    return None if inchikey is None else hashlib.md5(inchikey.encode("utf-8")).hexdigest()