Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[data] Add from_huggingface #24464

Merged
merged 7 commits into from
May 6, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions doc/source/data/dataset.rst
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,9 @@ Supported Input Formats
* - Arrow Table Objects
- :func:`ray.data.from_arrow()`
- ✅
* - 🤗 (Hugging Face) Dataset
- :func:`ray.data.from_huggingface()`
- ✅
* - Custom Datasource
- :func:`ray.data.read_datasource()`
- ✅
Expand Down
1 change: 1 addition & 0 deletions doc/source/data/package-ref.rst
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ Creating Datasets
.. autofunction:: ray.data.from_items
.. autofunction:: ray.data.from_arrow
.. autofunction:: ray.data.from_arrow_refs
.. autofunction:: ray.data.from_huggingface
.. autofunction:: ray.data.from_spark
.. autofunction:: ray.data.from_dask
.. autofunction:: ray.data.from_modin
Expand Down
2 changes: 2 additions & 0 deletions python/ray/data/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
from_arrow,
from_arrow_refs,
from_spark,
from_huggingface,
read_datasource,
read_numpy,
read_text,
Expand Down Expand Up @@ -50,6 +51,7 @@
"from_pandas",
"from_pandas_refs",
"from_spark",
"from_huggingface",
"range",
"range_arrow",
"range_tensor",
Expand Down
25 changes: 25 additions & 0 deletions python/ray/data/read_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import mars
import modin
import pyspark
import datasets

import ray
from ray.types import ObjectRef
Expand Down Expand Up @@ -962,6 +963,30 @@ def from_spark(
return raydp.spark.spark_dataframe_to_ray_dataset(df, parallelism)


@PublicAPI
def from_huggingface(
dataset: Union["datasets.Dataset", "datasets.DatasetDict"],
) -> Union[Dataset[ArrowRow], Dict[str, Dataset[ArrowRow]]]:
"""Create a dataset from a Hugging Face Datasets Dataset.

Args:
dataset: A Hugging Face ``Dataset``, or ``DatasetDict``.
``IterableDataset`` is not supported.

Returns:
Dataset holding Arrow records from the Dataset, or a
dict of datasets in case ``dataset`` is a ``DatasetDict``.
"""
import datasets

def convert(ds: "datasets.Dataset") -> Dataset[ArrowRow]:
return from_arrow(ds.data.table)

if isinstance(dataset, datasets.DatasetDict):
return {k: convert(ds) for k, ds in dataset.items()}
return convert(dataset)


def _df_to_block(df: "pandas.DataFrame") -> Block[ArrowRow]:
stats = BlockExecStats.builder()
import pyarrow as pa
Expand Down
35 changes: 35 additions & 0 deletions python/ray/data/tests/test_huggingface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
import pytest
import ray
import datasets


@pytest.fixture(scope="module")
def ray_start_regular(request): # pragma: no cover
try:
yield ray.init(num_cpus=16)
finally:
ray.shutdown()


def test_huggingface(ray_start_regular):
data = datasets.load_dataset("emotion")

assert isinstance(data, datasets.DatasetDict)

ray_datasets = ray.data.from_huggingface(data)
assert isinstance(ray_datasets, dict)

assert ray.get(ray_datasets["train"].to_arrow_refs())[0].equals(
data["train"].data.table
)

ray_dataset = ray.data.from_huggingface(data["train"])
assert isinstance(ray_dataset, ray.data.Dataset)

assert ray.get(ray_dataset.to_arrow_refs())[0].equals(data["train"].data.table)


if __name__ == "__main__":
import sys

sys.exit(pytest.main(["-v", __file__]))