Hugo Evers
06/29/2023, 8:23 AMdata_dir
as input, instead of filepath
, it took me roughly an hour of debugging to figure out why loading the dataset was now dependent on the current working directory, and just wouldn't load if I gave it a relative path (data/01_raw/..) instead of workspace/project_name/data/01_raw/….
Anyway, the issue was that filepath has a (buried) custom resolver in the AbstractDataSet base class.
So would it be a good idea to add to the docs for custom datasets that filepath
has that behaviour, and maybe we could add an example of how to make a FolderDataset, since all the current datasets in kedro-datasets point to specific files, but I’d wager there are folks out there who would want to read an entire folder’s worth of data.datajoely
06/29/2023, 8:24 AMPartitionedDataSet
as your base class?kedro-datasets
if you’re able to? cc @Juan LuisJuan Luis
06/29/2023, 8:39 AMfilepath
property specifically, sorry you had a rough experience @Hugo EversHugo Evers
06/29/2023, 8:42 AMdatajoely
06/29/2023, 8:43 AMHugo Evers
06/29/2023, 8:43 AMdatajoely
06/29/2023, 8:43 AMHugo Evers
06/29/2023, 8:44 AMfrom copy import deepcopy
from pathlib import Path, PurePosixPath
from typing import Any, Dict
import fsspec
import numpy as np
from datasets import load_dataset
from datasets.arrow_dataset import Dataset
from kedro.io import AbstractDataSet
from kedro.io.core import get_filepath_str, get_protocol_and_path, DataSetError
class AudioFolderDataSet(AbstractDataSet[Dict[str, Any], Dataset]):
    """``AudioFolderDataSet`` loads audio data from the Hugging Face AudioFolder dataset.

    https://huggingface.co/docs/datasets/audio_dataset#audiofolder

    The dataset is read-only: ``_save`` always raises ``NotImplementedError``.

    Example:
    ::

        >>> AudioFolderDataSet(filepath='/path/to/data')
    """

    DEFAULT_LOAD_ARGS: Dict[str, Any] = {}
    DEFAULT_SAVE_ARGS: Dict[str, Any] = {}

    def __init__(
        self,
        filepath: str,
        load_args: Dict[str, Any] = None,
        save_args: Dict[str, Any] = None,
        credentials: Dict[str, Any] = None,
        fs_args: Dict[str, Any] = None,
        metadata: Dict[str, Any] = None,
    ):
        """Creates a new instance of AudioFolderDataSet to load audio data
        from the Hugging Face AudioFolder dataset.

        Args:
            filepath: The location of the AudioFolder dataset (a folder). It is
                split into a protocol and a path with ``get_protocol_and_path``.
            load_args: Extra load options, merged over ``DEFAULT_LOAD_ARGS`` and
                stored on ``self._load_args``. NOTE: ``_load`` does not
                currently forward them to ``load_dataset``.
            save_args: Extra save options, merged over ``DEFAULT_SAVE_ARGS`` and
                stored on ``self._save_args``. Saving is not supported, so they
                are kept only for interface parity.
            credentials: Keyword arguments passed to ``fsspec.filesystem``.
            fs_args: Additional keyword arguments for ``fsspec.filesystem``.
                The keys ``open_args_load`` and ``open_args_save`` are removed
                from it and stored separately.
            metadata: Arbitrary metadata, stored as-is on ``self.metadata``.
        """
        protocol, self.path = get_protocol_and_path(filepath)
        self._protocol = protocol

        # Work on copies so the caller's dictionaries are never mutated.
        _fs_args = deepcopy(fs_args) or {}
        _fs_open_args_load = _fs_args.pop("open_args_load", {})
        _fs_open_args_save = _fs_args.pop("open_args_save", {})
        _credentials = deepcopy(credentials) or {}

        if protocol == "file":
            _fs_args.setdefault("auto_mkdir", True)

        self._fs = fsspec.filesystem(self._protocol, **_credentials, **_fs_args)
        self.metadata = metadata

        super().__init__()

        # Handle default load and save arguments
        self._load_args = deepcopy(self.DEFAULT_LOAD_ARGS)
        if load_args is not None:
            self._load_args.update(load_args)
        self._save_args = deepcopy(self.DEFAULT_SAVE_ARGS)
        if save_args is not None:
            self._save_args.update(save_args)

        _fs_open_args_save.setdefault("mode", "wb")
        self._fs_open_args_load = _fs_open_args_load
        self._fs_open_args_save = _fs_open_args_save

        # The string form of the location is what `_load`, `_describe` and
        # `_exists` operate on.
        self._filepath = get_filepath_str(PurePosixPath(self.path), self._protocol)

    def _load(self) -> Dict[str, Any]:
        """Loads data from the AudioFolder dataset.

        Returns:
            Whatever ``load_dataset("audiofolder", data_dir=...)`` returns for
            the configured location.
        """
        return load_dataset("audiofolder", data_dir=self._filepath)

    def _save(self, data: Dict[str, Any]) -> None:
        """Saving is not supported.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError("AudioFolderDataSet does not support saving data.")

    def _describe(self) -> Dict[str, Any]:
        """Returns a dict that describes the attributes of the dataset."""
        return dict(filepath=self._filepath)

    def _exists(self) -> bool:
        """Returns whether the configured location exists on the filesystem."""
        return self._fs.exists(self._filepath)
datajoely
06/29/2023, 8:46 AMHugo Evers
06/29/2023, 8:47 AMdatajoely
06/29/2023, 8:49 AMJuan Luis
08/17/2023, 3:09 PMdatajoely
08/17/2023, 3:10 PMHugo Evers
08/18/2023, 3:24 PMdatajoely
08/18/2023, 3:25 PMHugo Evers
08/18/2023, 3:25 PMdatajoely
08/18/2023, 3:32 PMHugo Evers
08/18/2023, 3:33 PMdatajoely
08/18/2023, 3:34 PMHugo Evers
08/18/2023, 3:35 PMdatajoely
08/18/2023, 3:35 PMHugo Evers
08/18/2023, 3:35 PMdatajoely
08/18/2023, 3:35 PMHugo Evers
08/18/2023, 3:35 PMdatajoely
08/18/2023, 3:36 PMHugo Evers
08/18/2023, 3:36 PMdatajoely
08/18/2023, 3:43 PMJuan Luis
08/18/2023, 3:43 PMdatajoely
08/18/2023, 3:43 PMHugo Evers
08/18/2023, 4:26 PMfrom tempfile import TemporaryDirectory
from cloudpathlib import S3Path
from transformers import AutoModel
def load_from_s3(s3_path: S3Path) -> AutoModel:
    """Download ``s3_path`` into a temporary directory and load a model from it.

    Args:
        s3_path: Cloud location holding the files written by ``save_pretrained``.

    Returns:
        The result of ``AutoModel.from_pretrained`` on the downloaded copy.
    """
    with TemporaryDirectory() as scratch_dir:
        s3_path.download_to(scratch_dir)
        # Load while the temporary directory still exists; it is removed on exit.
        loaded_model = AutoModel.from_pretrained(scratch_dir)
    return loaded_model
def save_to_s3(model: AutoModel, s3_path: S3Path):
    """Write ``model`` to a temporary directory and upload that directory to ``s3_path``.

    Args:
        model: Object exposing ``save_pretrained``.
        s3_path: Cloud destination; its ``upload_from`` receives the local directory.
    """
    with TemporaryDirectory() as staging_dir:
        # Serialize locally first, then push the directory to the cloud path.
        model.save_pretrained(staging_dir)
        s3_path.upload_from(staging_dir)
from tempfile import TemporaryDirectory
from cloudpathlib import CloudPath
from transformers import AutoModel
from kedro.io import AbstractDataSet
class HFTransformersDataset(AbstractDataSet):
    """Dataset that loads and saves a Hugging Face Transformers model via a ``CloudPath``.

    The model is staged through a local temporary directory in both directions.
    """

    def __init__(self, path: str):
        """Wrap ``path`` in a ``CloudPath`` and keep it on ``self.cloud_path``.

        Args:
            path: Location of the model directory.
        """
        self.cloud_path = CloudPath(path)

    def _load(self) -> AutoModel:
        """
        Downloads the cloud path to a temporary directory and loads the model from it.

        Returns:
            AutoModel: The loaded Hugging Face Transformers model.
        """
        with TemporaryDirectory() as scratch_dir:
            self.cloud_path.download_to(scratch_dir)
            # Load before the temporary directory is cleaned up on exit.
            loaded_model = AutoModel.from_pretrained(scratch_dir)
        return loaded_model

    def _save(self, model: AutoModel) -> None:
        """
        Saves the model to a temporary directory and uploads it to the cloud path.

        Args:
            model (AutoModel): The Hugging Face Transformers model to save.
        """
        with TemporaryDirectory() as staging_dir:
            model.save_pretrained(staging_dir)
            self.cloud_path.upload_from(staging_dir)

    def _describe(self) -> Dict[str, Any]:
        """Returns a dict describing the dataset: its cloud path as a string."""
        return {"cloud_path": str(self.cloud_path)}