`datamol.io`¶

`open_df(path, **kwargs)` ¶

Open a dataframe file whatever its filetype from csv, excel, parquet, json, sdf.

Parameters:

Name	Type	Description	Default
`path`	`str`	path to the file.	required
`**kwargs`	`Any`	keyword arguments to pass to the underlying reader.	`{}`

Source code in datamol/io.py

def open_df(path: str, **kwargs: Any) -> pd.DataFrame:
    """Load a tabular file, choosing the reader from the guessed file type.

    The file type is resolved with `_guess_filetype(path)` and must be one of
    `csv, excel, parquet, json, sdf`.

    Args:
        path: path to the file.
        **kwargs: keyword arguments forwarded to the underlying reader
            (`pd.read_csv`, `pd.read_excel`, `pd.read_parquet`, `pd.read_json`
            or `dm.read_sdf`).

    Returns:
        Whatever the selected reader returns, typed as a `pandas.DataFrame`.

    Raises:
        ValueError: if the guessed file type is not one of the supported ones.
    """

    kind = _guess_filetype(path)

    if kind == "sdf":
        # SDF goes through datamol's own reader: request a dataframe unless
        # the caller explicitly passed `as_df`.
        kwargs.setdefault("as_df", True)
        return cast(pd.DataFrame, dm.read_sdf(path, **kwargs))

    pandas_readers = {
        "csv": pd.read_csv,
        "excel": pd.read_excel,
        "parquet": pd.read_parquet,
        "json": pd.read_json,
    }

    reader = pandas_readers.get(kind)
    if reader is None:
        raise ValueError(f"The file type of {path} is not supported.")

    return cast(pd.DataFrame, reader(path, **kwargs))

`read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)` ¶

Read a CSV file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`smiles_column`	`Optional[str]`	Use this column to build a mol column.	`None`
`mol_column`	`str`	Name to give to the mol column. If not None a mol column will be built. Avoid when loading a very large file.	`'mol'`
`**kwargs`	`Any`	Arguments to pass to `pd.read_csv()`.	`{}`

Returns:

Name	Type	Description
`df`	`DataFrame`	a `pandas.DataFrame`

Source code in datamol/io.py

def read_csv(
    urlpath: Union[str, os.PathLike, IO],
    smiles_column: Optional[str] = None,
    mol_column: str = "mol",
    **kwargs: Any,
) -> pd.DataFrame:
    """Load a CSV file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Name of the column holding SMILES. When given, a molecule
            column is built from it. Avoid when loading a very large file.
        mol_column: Name given to the molecule column built from `smiles_column`.
        **kwargs: Arguments to pass to `pd.read_csv()`.

    Returns:
        df: a `pandas.DataFrame`
    """

    frame = cast(pd.DataFrame, pd.read_csv(urlpath, **kwargs))

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return frame

    # The molecule column is added to `frame` in place.
    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame

`read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)` ¶

Read an excel file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`sheet_name`	`Optional[Union[str, int, list]]`	see `pandas.read_excel()` doc.	`0`
`smiles_column`	`Optional[str]`	Use this column to build a mol column. Avoid when loading a very large file.	`None`
`mol_column`	`str`	Name to give to the mol column.	`'mol'`
`**kwargs`	`Any`	Arguments to pass to `pd.read_excel()`.	`{}`

Returns:

Name	Type	Description
`df`	`DataFrame`	a `pandas.DataFrame`

Source code in datamol/io.py

def read_excel(
    urlpath: Union[str, os.PathLike, IO],
    sheet_name: Optional[Union[str, int, list]] = 0,
    smiles_column: Optional[str] = None,
    mol_column: str = "mol",
    **kwargs: Any,
) -> pd.DataFrame:
    """Load an Excel file into a dataframe, optionally adding a molecule column.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sheet_name: see `pandas.read_excel()` doc.
        smiles_column: Name of the column holding SMILES. When given, a molecule
            column is built from it. Avoid when loading a very large file.
        mol_column: Name given to the molecule column built from `smiles_column`.
        **kwargs: Arguments to pass to `pd.read_excel()`.

    Returns:
        df: a `pandas.DataFrame`
    """

    frame = cast(
        pd.DataFrame,
        pd.read_excel(urlpath, sheet_name=sheet_name, **kwargs),
    )

    # Nothing more to do when no SMILES column was requested.
    if smiles_column is None:
        return frame

    # The molecule column is added to `frame` in place.
    PandasTools.AddMoleculeColumnToFrame(frame, smiles_column, mol_column)
    return frame

`read_mol2file(urlpath, sanitize=True, cleanup_substructures=True, remove_hs=True, fail_if_invalid=False)` ¶

Read a Mol2 File

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`remove_hs`	`bool`	Whether to remove the existing hydrogens in the Mol2 file.	`True`
`cleanup_substructures`	`bool`	Whether to clean up substructure in the Mol2 Files.	`True`
`fail_if_invalid`	`bool`	If set to true, the parser will raise an exception if the molecule is invalid instead of returning None.	`False`

Source code in datamol/io.py

def read_mol2file(
    urlpath: Union[str, os.PathLike, IO],
    sanitize: bool = True,
    cleanup_substructures: bool = True,
    remove_hs: bool = True,
    fail_if_invalid: bool = False,
) -> List[Mol]:
    """Read every molecule of a Mol2 file.

    Lines containing `#` are skipped, and each `@<TRIPOS>MOLECULE` record is
    parsed separately with `rdmolfiles.MolFromMol2Block`.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sanitize: Whether to sanitize the molecules.
        remove_hs: Whether to remove the existing hydrogens in the Mol2 file.
        cleanup_substructures: Whether to clean up substructure in the Mol2 Files.
        fail_if_invalid: If set to true, the parser will raise an exception if the molecule is invalid
            instead of returning None.

    Returns:
        The molecules in file order. A record that could not be parsed is kept
        as `None` unless `fail_if_invalid` is set.

    Raises:
        ValueError: if a record cannot be parsed and `fail_if_invalid` is True.
    """

    with fsspec.open(urlpath, compression="infer") as f:
        f = cast(IO, f)
        lines = f.readlines()

    mols = []
    pending = []

    # A Mol2 record has no unambiguous end line, so walk the file backwards:
    # reaching the `@<TRIPOS>MOLECULE` tag then means the record is complete.
    for line in reversed(lines):
        # ignores any header info
        if b"#" not in line:
            pending.append(str(line, "utf-8"))

        if b"@<TRIPOS>MOLECULE" in line:
            # Restore the original line order. The lines are concatenated as-is
            # so that commas present in the file (e.g. in molecule names) survive.
            mol2block = "".join(reversed(pending))
            mol = rdmolfiles.MolFromMol2Block(
                mol2block,
                sanitize=sanitize,
                removeHs=remove_hs,
                cleanupSubstructures=cleanup_substructures,
            )
            if mol is None and fail_if_invalid:
                raise ValueError(f"Invalid molecule: {mol2block}")
            mols.append(mol)
            pending = []

    # Records were collected last-to-first.
    mols.reverse()
    return mols

`read_molblock(molblock, sanitize=True, strict_parsing=True, remove_hs=True, fail_if_invalid=False)` ¶

Read a Mol block.

Note that potential molecule properties are not read.

Parameters:

Name	Type	Description	Default
`molblock`	`str`	String containing the Mol block.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`strict_parsing`	`bool`	If set to false, the parser is more lax about correctness of the contents.	`True`
`remove_hs`	`bool`	Whether to remove the existing hydrogens in the Mol block.	`True`
`fail_if_invalid`	`bool`	If set to true, the parser will raise an exception if the molecule is invalid instead of returning None.	`False`

Source code in datamol/io.py

def read_molblock(
    molblock: str,
    sanitize: bool = True,
    strict_parsing: bool = True,
    remove_hs: bool = True,
    fail_if_invalid: bool = False,
) -> Optional[dm.Mol]:
    """Parse a Mol block string into a molecule.

    Note that potential molecule properties are **not** read.

    Args:
        molblock: String containing the Mol block.
        sanitize: Whether to sanitize the molecule.
        strict_parsing: If set to false, the parser is more lax about correctness of the contents.
        remove_hs: Whether to remove the existing hydrogens of the Mol block.
        fail_if_invalid: If set to true, the parser will raise an exception if the molecule is invalid
            instead of returning None.

    Returns:
        The parsed molecule, or `None` when parsing failed and
        `fail_if_invalid` is False.

    Raises:
        ValueError: if parsing failed and `fail_if_invalid` is True.
    """

    parsed = rdmolfiles.MolFromMolBlock(
        molblock,
        sanitize=sanitize,
        removeHs=remove_hs,
        strictParsing=strict_parsing,
    )

    if fail_if_invalid and parsed is None:
        raise ValueError(f"Invalid molecule: {molblock}")

    return parsed

`read_pdbblock(molblock, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True)` ¶

Read a PDB string block.

Parameters:

Name	Type	Description	Default
`molblock`	`str`	String containing the PDB block.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`remove_hs`	`bool`	Whether to remove the existing hydrogens in the PDB block.	`True`
`flavor`	`int`	RDKit flavor options.	`0`
`proximity_bonding`	`bool`	Whether to enable automatic proximity bonding.	`True`

Source code in datamol/io.py

def read_pdbblock(
    molblock: str,
    sanitize: bool = True,
    remove_hs: bool = True,
    flavor: int = 0,
    proximity_bonding: bool = True,
) -> dm.Mol:
    """Parse a PDB string block into a molecule.

    Args:
        molblock: String containing the PDB block.
        sanitize: Whether to sanitize the molecule.
        remove_hs: Whether to remove the existing hydrogens of the PDB block.
        flavor: RDKit flavor options.
        proximity_bonding: Whether to enable automatic proximity bonding.

    Returns:
        The object returned by `rdmolfiles.MolFromPDBBlock`.
    """

    # Translate the snake_case arguments into RDKit's keyword names.
    parser_options = {
        "sanitize": sanitize,
        "removeHs": remove_hs,
        "flavor": flavor,
        "proximityBonding": proximity_bonding,
    }
    return rdmolfiles.MolFromPDBBlock(molblock, **parser_options)

`read_pdbfile(urlpath, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True)` ¶

Read a PDB file.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, PathLike]`	Path to a file or a file-like object. Path can be remote or local.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`remove_hs`	`bool`	Whether to remove the existing hydrogens in the PDB file.	`True`
`flavor`	`int`	RDKit flavor options.	`0`
`proximity_bonding`	`bool`	Whether to enable automatic proximity bonding.	`True`

Returns:

Name	Type	Description
`mol`	`Mol`	a molecule

Source code in datamol/io.py

def read_pdbfile(
    urlpath: Union[str, os.PathLike],
    sanitize: bool = True,
    remove_hs: bool = True,
    flavor: int = 0,
    proximity_bonding: bool = True,
) -> Mol:
    """Read a molecule from a PDB file.

    The file is read as text in one go and handed to `read_pdbblock`.

    Args:
        urlpath: Path to the file. Path can be remote or local.
        sanitize: Whether to sanitize the molecule.
        remove_hs: Whether to remove the existing hydrogens of the PDB file.
        flavor: RDKit flavor options.
        proximity_bonding: Whether to enable automatic proximity bonding.

    Returns:
        mol: a molecule
    """

    with fsspec.open(urlpath, "r") as handle:
        pdb_text = cast(IO, handle).read()

    return read_pdbblock(
        pdb_text,
        sanitize=sanitize,
        remove_hs=remove_hs,
        flavor=flavor,
        proximity_bonding=proximity_bonding,
    )

`read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True, remove_hs=True, max_num_mols=None, discard_invalid=True, n_jobs=1)` ¶

Read an SDF file.

Note: This function is meant to be used with datasets that fit in memory. For more advanced usage, we suggest using Chem.ForwardSDMolSupplier directly.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`sanitize`	`bool`	Whether to sanitize the molecules.	`True`
`as_df`	`bool`	Whether to return a list of mols or a pandas DataFrame.	`False`
`smiles_column`	`Optional[str]`	Name of the SMILES column. Only relevant if `as_df` is True.	`'smiles'`
`mol_column`	`Optional[str]`	Name of the mol column. Only relevant if `as_df` is True.	`None`
`include_private`	`bool`	Include private properties in the columns. Only relevant if `as_df` is True.	`False`
`include_computed`	`bool`	Include computed properties in the columns. Only relevant if `as_df` is True.	`False`
`strict_parsing`	`bool`	If set to false, the parser is more lax about correctness of the contents.	`True`
`remove_hs`	`bool`	Whether to remove the existing hydrogens in the SDF files.	`True`
`max_num_mols`	`Optional[int]`	Maximum number of molecules to read from the SDF file. Read all by default when set to `None`.	`None`
`discard_invalid`	`bool`	Discard the molecules that failed to be read correctly. Otherwise, invalid molecules will be loaded as `None`.	`True`
`n_jobs`	`Optional[int]`	Optional number of jobs for parallelization of `to_df`. Leave to 1 for no parallelization. Set to -1 to use all available cores. Only relevant if `as_df` is True.	`1`

Source code in datamol/io.py

def read_sdf(
    urlpath: Union[str, os.PathLike, IO],
    sanitize: bool = True,
    as_df: bool = False,
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
    include_private: bool = False,
    include_computed: bool = False,
    strict_parsing: bool = True,
    remove_hs: bool = True,
    max_num_mols: Optional[int] = None,
    discard_invalid: bool = True,
    n_jobs: Optional[int] = 1,
) -> Union[List[Mol], pd.DataFrame]:
    """Read the molecules of an SDF file.

    Note: This function is meant to be used with datasets that fit _in-memory_.
    For more advanced usage we suggest using `Chem.ForwardSDMolSupplier` directly.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        sanitize: Whether to sanitize the molecules.
        as_df: Whether to return a list of mols or a pandas DataFrame.
        smiles_column: Name of the SMILES column. Only relevant if `as_df` is True.
        mol_column: Name of the mol column. Only relevant if `as_df` is True.
        include_private: Include private properties in the columns. Only relevant if
            `as_df` is True.
        include_computed: Include computed properties in the columns. Only relevant if
            `as_df` is True.
        strict_parsing: If set to false, the parser is more lax about correctness of the contents.
        remove_hs: Whether to remove the existing hydrogens in the SDF files.
        max_num_mols: Maximum number of molecules to read from the SDF file. Read all by default when set
            to `None`.
        discard_invalid: Discard the molecules that failed to be read correctly. Otherwise,
            invalid molecules will be loaded as `None`.
        n_jobs: Optional number of jobs for parallelization of `to_df`. Leave to 1 for no
            parallelization. Set to -1 to use all available cores. Only relevant if `as_df` is True.

    Returns:
        A list of molecules, or the result of `dm.to_df` when `as_df` is True.
    """

    def _load(handle):
        # Build the supplier on an open handle and pull the molecules out of it.
        supplier = rdmolfiles.ForwardSDMolSupplier(
            handle,
            sanitize=sanitize,
            strictParsing=strict_parsing,
            removeHs=remove_hs,
        )
        return _get_supplier_mols(supplier, max_num_mols)

    if isinstance(urlpath, io.IOBase):
        # Already a file-like object: hand it to the supplier directly.
        mols = _load(urlpath)
    else:
        # Local or remote path: the molecules must be read while the file is open.
        with fsspec.open(urlpath, compression="infer") as f:
            mols = _load(f)

    if discard_invalid:
        mols = [mol for mol in mols if mol is not None]

    if not as_df:
        return mols

    return dm.to_df(
        mols,
        smiles_column=smiles_column,
        mol_column=mol_column,
        include_private=include_private,
        include_computed=include_computed,
        n_jobs=n_jobs,
    )  # type: ignore

`read_smi(urlpath)` ¶

Read a list of smiles from an .smi file.

Note: We strongly recommend using dm.read_csv or pandas.read_csv instead of dm.read_smi, since .smi files are a CSV-like format. The only differences are in the default settings:

The default separator is a space instead of a comma ,.
The headers of the column are not included.

By modifying the args of dm.read_csv(), you will be able to read .smi files.

Parameters:

Name	Type	Description	Default
`urlpath`	`Union[str, Path, IOBase, OpenFile]`	Path to a file or a file-like object. Path can be remote or local.	required

Source code in datamol/io.py

def read_smi(
    urlpath: Union[str, pathlib.Path, io.IOBase, fsspec.core.OpenFile],
) -> Sequence[Mol]:
    """Read a list of smiles from an `.smi` file.

    Note: We **strongly** recommend using `dm.read_csv` or `pandas.read_csv` instead
    of `dm.read_smi`, since `.smi` files are a CSV-like format. The only differences are
    in the default settings:

    - The default separator is a space ` ` instead of a comma `,`.
    - The headers of the column are not included.

    By modifying the args of `dm.read_csv()`, you will be able to read `.smi` files.

    Args:
        urlpath: Path to a file or a file-like object. Path can be remote or local.

    Returns:
        The molecules read from the file. Entries the supplier returned as
        `None` are dropped.
    """

    # NOTE(hadim): the temporary local file copy
    # is because `SmilesMolSupplier` does not support
    # using file-like object, only path.
    needs_local_copy = not fsspec.utils.can_be_local(str(urlpath))

    active_path = urlpath
    if needs_local_copy:
        fd, tmp_name = tempfile.mkstemp()
        # Only the path is needed; close the descriptor returned by `mkstemp`
        # so it is not leaked.
        os.close(fd)
        active_path = pathlib.Path(tmp_name)

    try:
        # Copy to the local temporary path if the path is a remote one.
        if needs_local_copy:
            dm.utils.fs.copy_file(urlpath, active_path, force=True)

        # Read the molecules
        supplier = rdmolfiles.SmilesMolSupplier(str(active_path), titleLine=0)
        mols = [mol for mol in supplier if mol is not None]
    finally:
        # Delete the local temporary path, even when copying or parsing failed.
        if needs_local_copy:
            pathlib.Path(str(active_path)).unlink()

    return mols

`save_df(data, path, **kwargs)` ¶

Save a dataframe file whatever its filetype from csv, excel, parquet, json, sdf.

Parameters:

Name	Type	Description	Default
`data`	`DataFrame`	dataframe to save.	required
`path`	`str`	path to save the file.	required
`**kwargs`	`Any`	additional arguments to pass that are specific to the file save type.	`{}`

Source code in datamol/io.py

def save_df(
    data: pd.DataFrame,
    path: str,
    **kwargs: Any,
):
    """Write a dataframe to a file, choosing the writer from the guessed file type.

    The file type is resolved with `_guess_filetype(path)` and must be one of
    `csv, excel, parquet, json, sdf`.

    Args:
        data: dataframe to save.
        path: path to save the file.
        **kwargs: additional arguments to pass that are specific to the file save type.

    Raises:
        ValueError: if the guessed file type is not one of the supported ones.
    """

    kind = _guess_filetype(path)

    if kind == "sdf":
        # SDF is written by datamol rather than by a dataframe method.
        dm.to_sdf(data, path, **kwargs)
        return

    # Dataframe method handling each remaining file type.
    writer_names = {
        "csv": "to_csv",
        "excel": "to_excel",
        "parquet": "to_parquet",
        "json": "to_json",
    }
    if kind not in writer_names:
        raise ValueError(f"The file type of {path} is not supported.")

    # The index is left out of csv/excel output unless the caller asks for it.
    if kind in ("csv", "excel"):
        kwargs.setdefault("index", False)

    getattr(data, writer_names[kind])(path, **kwargs)

`to_molblock(mol, include_stereo=True, conf_id=-1, kekulize=True, force_V3000=False)` ¶

Convert a molecule to a mol block string.

Note that any molecule properties are lost.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	A molecule.	required
`include_stereo`	`bool`	Toggles inclusion of stereochemical information in the output.	`True`
`conf_id`	`int`	Selects which conformation to output.	`-1`
`kekulize`	`bool`	Triggers kekulization of the molecule before it's written, as suggested by the MDL spec.	`True`
`force_V3000`	`bool`	Force generation of a V3000 mol block (happens automatically with more than 999 atoms or bonds).	`False`

Source code in datamol/io.py

def to_molblock(
    mol: Mol,
    include_stereo: bool = True,
    conf_id: int = -1,
    kekulize: bool = True,
    force_V3000: bool = False,
):
    """Serialize a molecule as a mol block string.

    Note that any molecule properties are lost.

    Args:
        mol: A molecule.
        include_stereo: Toggles inclusion of stereochemical information in the output.
        conf_id: Selects which conformation to output.
        kekulize: Triggers kekulization of the molecule before it's written,
            as suggested by the MDL spec.
        force_V3000: Force generation of a V3000 mol block (happens automatically
            with more than 999 atoms or bonds).

    Returns:
        The mol block produced by `rdmolfiles.MolToMolBlock`.
    """

    # Translate the snake_case arguments into RDKit's keyword names.
    writer_options = {
        "includeStereo": include_stereo,
        "confId": conf_id,
        "kekulize": kekulize,
        "forceV3000": force_V3000,
    }
    return rdmolfiles.MolToMolBlock(mol, **writer_options)

`to_pdbblock(mol, conf_id=-1)` ¶

Convert a molecule to a PDB string block.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	A molecule.	required
`conf_id`	`int`	Selects which conformation to use.	`-1`

Source code in datamol/io.py

def to_pdbblock(mol: Mol, conf_id: int = -1) -> str:
    """Serialize a molecule as a PDB string block.

    Args:
        mol: A molecule.
        conf_id: Selects which conformation to use.

    Returns:
        The PDB block produced by `rdmolfiles.MolToPDBBlock`.
    """
    return rdmolfiles.MolToPDBBlock(mol, confId=conf_id)

`to_pdbfile(mol, urlpath, conf_id=-1)` ¶

Save a molecule to a PDB file.

Parameters:

Name	Type	Description	Default
`mol`	`Mol`	A molecule.	required
`urlpath`	`Union[str, PathLike]`	Path to a file or a file-like object. Path can be remote or local.	required
`conf_id`	`int`	Selects which conformation to use.	`-1`

Source code in datamol/io.py

def to_pdbfile(
    mol: Mol,
    urlpath: Union[str, os.PathLike],
    conf_id: int = -1,
):
    """Write a molecule to a PDB file.

    Args:
        mol: A molecule.
        urlpath: Path of the file to write. Path can be remote or local.
        conf_id: Selects which conformation to use.
    """
    # Serialize first so the file is only opened once the PDB text exists.
    pdb_text = to_pdbblock(mol, conf_id=conf_id)

    with fsspec.open(urlpath, "w") as handle:
        cast(IO, handle).write(pdb_text)

`to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)` ¶

Write molecules to a file.

Parameters:

Name	Type	Description	Default
`mols`	`Union[Mol, Sequence[Mol], DataFrame]`	a dataframe, a molecule or a list of molecule.	required
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`smiles_column`	`Optional[str]`	Column name to extract the molecule.	`'smiles'`
`mol_column`	`Optional[str]`	Column name to extract the molecule. It takes precedence over `smiles_column`.	`None`

Source code in datamol/io.py

def to_sdf(
    mols: Union[Mol, Sequence[Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike, IO],
    smiles_column: Optional[str] = "smiles",
    mol_column: Optional[str] = None,
):
    """Write molecules to an SDF file.

    Args:
        mols: a dataframe, a molecule or a list of molecules. `None` entries
            are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        smiles_column: Column name to extract the molecule. Only used when
            `mols` is a dataframe.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. Only used when `mols` is a dataframe.
    """

    if isinstance(mols, pd.DataFrame):
        mols = dm.from_df(mols, smiles_column=smiles_column, mol_column=mol_column)

    elif isinstance(mols, Mol):
        mols = [mols]

    # Filter out None values
    mols = [mol for mol in mols if mol is not None]

    def _write_all(handle, molecules):
        # Write every molecule to `handle`, closing the writer even when a
        # write fails.
        writer = rdmolfiles.SDWriter(handle)
        try:
            for mol in molecules:
                writer.write(mol)
        finally:
            writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _write_all(urlpath, mols)

    # Regular local or remote paths
    else:
        with fsspec.open(urlpath, mode="w") as f:
            _write_all(f, mols)

`to_smi(mols, urlpath, error_if_empty=False)` ¶

Save a list of molecules in an .smi file.

Note: We strongly recommend using dm.to_csv instead of dm.to_smi, since .smi files are a CSV-like format. The only differences are in the default settings:

The default separator is a space instead of a comma ,.
The headers of the column are not included.

By modifying the args of dm.to_csv(), you will be able to save a SMI compatible file.

Parameters:

Name	Type	Description	Default
`mols`	`Sequence[Mol]`	a list of molecules.	required
`urlpath`	`Union[str, PathLike, IO]`	Path to a file or a file-like object. Path can be remote or local.	required
`error_if_empty`	`bool`	whether to raise an error if the input list is empty.	`False`

Source code in datamol/io.py

def to_smi(
    mols: Sequence[Mol],
    urlpath: Union[str, os.PathLike, IO],
    error_if_empty: bool = False,
):
    """Save a list of molecules in an `.smi` file.

    Note: We **strongly** recommend using `dm.to_csv` instead
    of `dm.to_smi`, since `.smi` files are a CSV-like format. The only differences are
    in the default settings:

    - The default separator is a space ` ` instead of a comma `,`.
    - The headers of the column are not included.

    By modifying the args of `dm.to_csv()`, you will be able to save a SMI compatible file.

    Args:
        mols: a list of molecules. `None` entries are skipped.
        urlpath: Path to a file or a file-like object. Path can be remote or local.
        error_if_empty: whether to raise an error if the input list is empty.

    Raises:
        ValueError: if `mols` is empty and `error_if_empty` is True.
    """

    # The emptiness check is done on the input as given, before `None`
    # entries are filtered out.
    if len(mols) == 0 and error_if_empty:
        raise ValueError("The list of mols/smiles provided is empty.")

    # Filter out None values
    mols = [mol for mol in mols if mol is not None]

    def _write_all(handle, molecules):
        # Write every molecule to `handle`, closing the writer even when a
        # write fails.
        writer = rdmolfiles.SmilesWriter(handle, includeHeader=False, nameHeader="")
        try:
            for mol in molecules:
                writer.write(mol)
        finally:
            writer.close()

    # File-like object
    if isinstance(urlpath, io.IOBase):
        _write_all(urlpath, mols)

    # Regular local or remote paths
    else:
        with fsspec.open(urlpath, "w") as f:
            _write_all(f, mols)

`to_xlsx(mols, urlpath, smiles_column='smiles', mol_column='mol', mol_size=[300, 300])` ¶

Write molecules to an Excel file with a molecule column as an RDKit rendered image.

Parameters:

Name	Type	Description	Default
`mols`	`Union[Mol, Sequence[Mol], DataFrame]`	a dataframe, a molecule or a list of molecule.	required
`urlpath`	`Union[str, PathLike]`	Path to a file or a file-like object. Path can be remote or local.	required
`smiles_column`	`Optional[str]`	Column name to extract the molecule.	`'smiles'`
`mol_column`	`str`	Column name to extract the molecule. It takes precedence over `smiles_column`. It is also the column name used to write the RDKit rendered image; if none, the molecule images are not written.	`'mol'`

Source code in datamol/io.py

def to_xlsx(
    mols: Union[Mol, Sequence[Mol], pd.DataFrame],
    urlpath: Union[str, os.PathLike],
    smiles_column: Optional[str] = "smiles",
    mol_column: str = "mol",
    mol_size: List[int] = [300, 300],
):
    """Write molecules to an Excel file with a molecule column as an RDKit rendered
    image.

    Args:
        mols: a dataframe, a molecule or a list of molecules. `None` entries of
            a list are skipped.
        urlpath: Path of the file to write. Path can be remote or local.
        smiles_column: Column name to extract the molecule.
        mol_column: Column name to extract the molecule. It takes
            precedence over `smiles_column`. It is also the column
            rendered as RDKit images in the Excel file.
        mol_size: Size of the rendered molecule images, forwarded as `size`
            to `PandasTools.SaveXlsxFromFrame`.

    Raises:
        ValueError: if there is no molecule to write.
    """

    if isinstance(mols, Mol):
        mols = [mols]

    if isinstance(mols, Sequence):
        mols = [mol for mol in mols if mol is not None]
        mols = dm.to_df(mols, smiles_column=smiles_column, mol_column=mol_column)

    if mols is None or mols.empty:  # type: ignore
        raise ValueError("No molecules to write")

    # Hand over a copy so the shared default list can never be altered by the callee.
    image_size = list(mol_size)

    with fsspec.open(urlpath, mode="wb") as f:
        PandasTools.SaveXlsxFromFrame(mols, f, molCol=mol_column, size=image_size)

datamol.io¶

open_df(path, **kwargs) ¶

read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs) ¶

read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs) ¶

read_mol2file(urlpath, sanitize=True, cleanup_substructures=True, remove_hs=True, fail_if_invalid=False) ¶

read_molblock(molblock, sanitize=True, strict_parsing=True, remove_hs=True, fail_if_invalid=False) ¶

read_pdbblock(molblock, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True) ¶

read_pdbfile(urlpath, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True) ¶

read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True, remove_hs=True, max_num_mols=None, discard_invalid=True, n_jobs=1) ¶

read_smi(urlpath) ¶

save_df(data, path, **kwargs) ¶

to_molblock(mol, include_stereo=True, conf_id=-1, kekulize=True, force_V3000=False) ¶

to_pdbblock(mol, conf_id=-1) ¶

to_pdbfile(mol, urlpath, conf_id=-1) ¶

to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None) ¶

to_smi(mols, urlpath, error_if_empty=False) ¶

to_xlsx(mols, urlpath, smiles_column='smiles', mol_column='mol', mol_size=[300, 300]) ¶

`datamol.io`¶

`open_df(path, **kwargs)` ¶

`read_csv(urlpath, smiles_column=None, mol_column='mol', **kwargs)` ¶

`read_excel(urlpath, sheet_name=0, smiles_column=None, mol_column='mol', **kwargs)` ¶

`read_mol2file(urlpath, sanitize=True, cleanup_substructures=True, remove_hs=True, fail_if_invalid=False)` ¶

`read_molblock(molblock, sanitize=True, strict_parsing=True, remove_hs=True, fail_if_invalid=False)` ¶

`read_pdbblock(molblock, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True)` ¶

`read_pdbfile(urlpath, sanitize=True, remove_hs=True, flavor=0, proximity_bonding=True)` ¶

`read_sdf(urlpath, sanitize=True, as_df=False, smiles_column='smiles', mol_column=None, include_private=False, include_computed=False, strict_parsing=True, remove_hs=True, max_num_mols=None, discard_invalid=True, n_jobs=1)` ¶

`read_smi(urlpath)` ¶

`save_df(data, path, **kwargs)` ¶

`to_molblock(mol, include_stereo=True, conf_id=-1, kekulize=True, force_V3000=False)` ¶

`to_pdbblock(mol, conf_id=-1)` ¶

`to_pdbfile(mol, urlpath, conf_id=-1)` ¶

`to_sdf(mols, urlpath, smiles_column='smiles', mol_column=None)` ¶

`to_smi(mols, urlpath, error_if_empty=False)` ¶

`to_xlsx(mols, urlpath, smiles_column='smiles', mol_column='mol', mol_size=[300, 300])` ¶