From b3c14fda362f755526d33a0dfaeedb277eda0367 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Fri, 5 Aug 2022 17:10:17 -0700 Subject: [PATCH 1/7] [data] update datasets API structure Signed-off-by: Matthew Deng --- doc/source/_toc.yml | 1 + doc/source/data/api/api.rst | 16 +++ doc/source/data/api/data_representations.rst | 47 +++++++ doc/source/data/api/dataset.rst | 31 +++++ doc/source/data/api/dataset_context.rst | 6 + doc/source/data/api/dataset_pipeline.rst | 23 ++++ doc/source/data/api/grouped_dataset.rst | 34 +++++ doc/source/data/api/input_output.rst | 55 ++++++++ doc/source/data/api/random_access_dataset.rst | 6 + doc/source/data/api/utility.rst | 5 + doc/source/data/package-ref.rst | 128 +----------------- 11 files changed, 226 insertions(+), 126 deletions(-) create mode 100644 doc/source/data/api/api.rst create mode 100644 doc/source/data/api/data_representations.rst create mode 100644 doc/source/data/api/dataset.rst create mode 100644 doc/source/data/api/dataset_context.rst create mode 100644 doc/source/data/api/dataset_pipeline.rst create mode 100644 doc/source/data/api/grouped_dataset.rst create mode 100644 doc/source/data/api/input_output.rst create mode 100644 doc/source/data/api/random_access_dataset.rst create mode 100644 doc/source/data/api/utility.rst diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 17eb61568b7f..7509d5d32bdc 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -60,6 +60,7 @@ parts: - file: data/advanced-pipelines - file: data/random-access - file: data/faq + - file: data/api/api - file: data/package-ref - file: data/integrations diff --git a/doc/source/data/api/api.rst b/doc/source/data/api/api.rst new file mode 100644 index 000000000000..233633774c44 --- /dev/null +++ b/doc/source/data/api/api.rst @@ -0,0 +1,16 @@ +.. _data_api: + +Ray Datasets API +================ + +.. 
toctree:: + :maxdepth: 2 + + input_output.rst + dataset.rst + dataset_pipeline.rst + grouped_dataset.rst + dataset_context.rst + data_representations.rst + random_access_dataset.rst + utility.rst diff --git a/doc/source/data/api/data_representations.rst b/doc/source/data/api/data_representations.rst new file mode 100644 index 000000000000..6159a6b13103 --- /dev/null +++ b/doc/source/data/api/data_representations.rst @@ -0,0 +1,47 @@ + +Data Representations +==================== + +Block API +--------- + +.. autoclass:: ray.data.block.Block + +.. autoclass:: ray.data.block.BlockExecStats + :members: + +.. autoclass:: ray.data.block.BlockMetadata + :members: + +.. autoclass:: ray.data.block.BlockAccessor + :members: + + +Batch API +--------- + +.. autoclass:: ray.data.block.DataBatch + +Row API +-------- + +.. autoclass:: ray.data.row.TableRow + :members: + + +.. _dataset-tensor-extension-api: + +Tensor Column Extension API +--------------------------- + +.. autoclass:: ray.data.extensions.tensor_extension.TensorDtype + :members: + +.. autoclass:: ray.data.extensions.tensor_extension.TensorArray + :members: + +.. autoclass:: ray.data.extensions.tensor_extension.ArrowTensorType + :members: + +.. autoclass:: ray.data.extensions.tensor_extension.ArrowTensorArray + :members: \ No newline at end of file diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst new file mode 100644 index 000000000000..2d4390d92c25 --- /dev/null +++ b/doc/source/data/api/dataset.rst @@ -0,0 +1,31 @@ +.. _dataset-api: + +Dataset API +=========== + + +.. autoclass:: ray.data.Dataset + :members: + +.. 
TODO split methods into sections + +Basic Transformations +--------------------- + +Sorting, Shuffling, Repartitioning +---------------------------------- + +Splitting and Merging Datasets +------------------------------ + +Grouped and Global Aggregations +------------------------------- + +Converting to Pipeline +---------------------- + +Accessing Datasets +------------------ + +I/O and Conversion +------------------ diff --git a/doc/source/data/api/dataset_context.rst b/doc/source/data/api/dataset_context.rst new file mode 100644 index 000000000000..3960f2986bcc --- /dev/null +++ b/doc/source/data/api/dataset_context.rst @@ -0,0 +1,6 @@ + +DatasetContext API +================== + +.. autoclass:: ray.data.context.DatasetContext + :members: diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst new file mode 100644 index 000000000000..304e409d904f --- /dev/null +++ b/doc/source/data/api/dataset_pipeline.rst @@ -0,0 +1,23 @@ + +.. _dataset-pipeline-api: + +DatasetPipeline API +=================== + + +.. autoclass:: ray.data.dataset_pipeline.DatasetPipeline + :members: + +.. TODO split methods into sections + +Map transformations +------------------- + +Shuffling transformations +------------------------- + +Splitting and Merging DatasetPipelines +-------------------------------------- + +Accessing DatasetPipelines +-------------------------- diff --git a/doc/source/data/api/grouped_dataset.rst b/doc/source/data/api/grouped_dataset.rst new file mode 100644 index 000000000000..047316122d3e --- /dev/null +++ b/doc/source/data/api/grouped_dataset.rst @@ -0,0 +1,34 @@ + +GroupedDataset API +================== + +.. autoclass:: ray.data.grouped_dataset.GroupedDataset + :members: + +Aggregations +------------ + +.. autoclass:: ray.data.aggregate.AggregateFn + :members: + +.. autoclass:: ray.data.aggregate.Count + :members: + +.. autoclass:: ray.data.aggregate.Sum + :members: + +.. 
autoclass:: ray.data.aggregate.Max + :members: + +.. autoclass:: ray.data.aggregate.Mean + :members: + +.. autoclass:: ray.data.aggregate.Std + :members: + +.. autoclass:: ray.data.aggregate.AbsMax + :members: + + +Map transformations +------------------- \ No newline at end of file diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst new file mode 100644 index 000000000000..0628a5153b3a --- /dev/null +++ b/doc/source/data/api/input_output.rst @@ -0,0 +1,55 @@ + +Input/Output +============ + +Tabular data +------------ + +Tensor data +----------- + +Text data +--------- + +Binary data +----------- + + +Datasource API +-------------- + +.. autoclass:: ray.data.Datasource + :members: + +.. autoclass:: ray.data.ReadTask + :members: + + + +Partitioning API +---------------- + +.. autoclass:: ray.data.datasource.PartitionStyle + :members: + +.. autoclass:: ray.data.datasource.PathPartitionScheme + :members: + +.. autoclass:: ray.data.datasource.PathPartitionEncoder + :members: + +.. autoclass:: ray.data.datasource.PathPartitionParser + :members: + +.. autoclass:: ray.data.datasource.PathPartitionFilter + + + +MetadataProvider API +-------------------- + + +.. autoclass:: ray.data.datasource.FileMetadataProvider + :members: + +.. TODO fill the rest \ No newline at end of file diff --git a/doc/source/data/api/random_access_dataset.rst b/doc/source/data/api/random_access_dataset.rst new file mode 100644 index 000000000000..d238c513942a --- /dev/null +++ b/doc/source/data/api/random_access_dataset.rst @@ -0,0 +1,6 @@ + +(Experimental) RandomAccessDataset API +====================================== + +.. autoclass:: ray.data.random_access_dataset.RandomAccessDataset + :members: \ No newline at end of file diff --git a/doc/source/data/api/utility.rst b/doc/source/data/api/utility.rst new file mode 100644 index 000000000000..f73b06aac1ca --- /dev/null +++ b/doc/source/data/api/utility.rst @@ -0,0 +1,5 @@ + +Utility +======= + +.. 
autofunction:: ray.data.set_progress_bars \ No newline at end of file diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst index b7714424b574..c0659d62c94b 100644 --- a/doc/source/data/package-ref.rst +++ b/doc/source/data/package-ref.rst @@ -1,6 +1,4 @@ -.. _data_api: - -Ray Datasets API +Ray Datasets API ================ Creating Datasets @@ -30,118 +28,6 @@ Creating Datasets .. autofunction:: ray.data.from_numpy .. autofunction:: ray.data.from_numpy_refs -.. _dataset-api: - -Dataset API ------------ - -.. autoclass:: ray.data.Dataset - :members: - -.. _dataset-pipeline-api: - -Block API ---------- - -.. autoclass:: ray.data.block.BlockExecStats - :members: - -.. autoclass:: ray.data.block.BlockMetadata - :members: - -.. autoclass:: ray.data.block.BlockAccessor - :members: - -DatasetContext API ------------------- - -.. autoclass:: ray.data.context.DatasetContext - :members: - -DatasetPipeline API -------------------- - -.. autoclass:: ray.data.dataset_pipeline.DatasetPipeline - :members: - -GroupedDataset API ------------------- - -.. autoclass:: ray.data.grouped_dataset.GroupedDataset - :members: - -Aggregate API -------------- - -.. autoclass:: ray.data.aggregate.AggregateFn - :members: - -.. autoclass:: ray.data.aggregate.Count - :members: - -.. autoclass:: ray.data.aggregate.Sum - :members: - -.. autoclass:: ray.data.aggregate.Max - :members: - -.. autoclass:: ray.data.aggregate.Mean - :members: - -.. autoclass:: ray.data.aggregate.Std - :members: - -.. autoclass:: ray.data.aggregate.AbsMax - :members: - -RandomAccessDataset API ------------------------ - -.. autoclass:: ray.data.random_access_dataset.RandomAccessDataset - :members: - -.. _dataset-tensor-extension-api: - -Tensor Column Extension API ---------------------------- - -.. autoclass:: ray.data.extensions.tensor_extension.TensorDtype - :members: - -.. autoclass:: ray.data.extensions.tensor_extension.TensorArray - :members: - -.. 
autoclass:: ray.data.extensions.tensor_extension.ArrowTensorType - :members: - -.. autoclass:: ray.data.extensions.tensor_extension.ArrowTensorArray - :members: - -Custom Datasource API ---------------------- - -.. autoclass:: ray.data.Datasource - :members: - -.. autoclass:: ray.data.ReadTask - :members: - -Datasource Partitioning API ---------------------------- - -.. autoclass:: ray.data.datasource.PartitionStyle - :members: - -.. autoclass:: ray.data.datasource.PathPartitionScheme - :members: - -.. autoclass:: ray.data.datasource.PathPartitionEncoder - :members: - -.. autoclass:: ray.data.datasource.PathPartitionParser - :members: - -.. autoclass:: ray.data.datasource.PathPartitionFilter Built-in Datasources -------------------- @@ -174,14 +60,4 @@ Built-in Datasources :members: .. autoclass:: ray.data.datasource.SimpleTorchDatasource - :members: - -Table Row API ---------------------- - -.. autoclass:: ray.data.row.TableRow - :members: - -Utility -------- -.. autofunction:: ray.data.set_progress_bars + :members: \ No newline at end of file From a5ee6327561f2befed7b481aac34b4c716882935 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Fri, 5 Aug 2022 17:41:59 -0700 Subject: [PATCH 2/7] split dataset/pipeline methods Signed-off-by: Matthew Deng --- doc/source/data/api/dataset.rst | 136 ++++++++++++++++++++++- doc/source/data/api/dataset_pipeline.rst | 84 +++++++++++++- 2 files changed, 211 insertions(+), 9 deletions(-) diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index 2d4390d92c25..ea0613a20d94 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -3,29 +3,157 @@ Dataset API =========== - .. autoclass:: ray.data.Dataset - :members: - -.. TODO split methods into sections Basic Transformations --------------------- +.. automethod:: ray.data.Dataset.map + +.. automethod:: ray.data.Dataset.map_batches + +.. automethod:: ray.data.Dataset.flat_map + +.. automethod:: ray.data.Dataset.filter + +.. 
automethod:: ray.data.Dataset.add_column + +.. automethod:: ray.data.Dataset.drop_columns + +.. automethod:: ray.data.Dataset.random_sample + +.. automethod:: ray.data.Dataset.limit + Sorting, Shuffling, Repartitioning ---------------------------------- +.. automethod:: ray.data.Dataset.sort + +.. automethod:: ray.data.Dataset.random_shuffle + +.. automethod:: ray.data.Dataset.randomize_block_order + +.. automethod:: ray.data.Dataset.repartition + Splitting and Merging Datasets ------------------------------ +.. automethod:: ray.data.Dataset.split + +.. automethod:: ray.data.Dataset.split_at_indices + +.. automethod:: ray.data.Dataset.split_proportionately + +.. automethod:: ray.data.Dataset.train_test_split + +.. automethod:: ray.data.Dataset.union + +.. automethod:: ray.data.Dataset.zip + Grouped and Global Aggregations ------------------------------- +.. automethod:: ray.data.Dataset.groupby + +.. automethod:: ray.data.Dataset.aggregate + +.. automethod:: ray.data.Dataset.sum + +.. automethod:: ray.data.Dataset.min + +.. automethod:: ray.data.Dataset.max + +.. automethod:: ray.data.Dataset.mean + +.. automethod:: ray.data.Dataset.std + Converting to Pipeline ---------------------- +.. automethod:: ray.data.Dataset.repeat + +.. automethod:: ray.data.Dataset.window + Accessing Datasets ------------------ +.. automethod:: ray.data.Dataset.iter_rows + +.. automethod:: ray.data.Dataset.iter_batches + +.. automethod:: ray.data.Dataset.iter_torch_batches + +.. automethod:: ray.data.Dataset.iter_tf_batches + I/O and Conversion ------------------ + +.. automethod:: ray.data.Dataset.write_parquet + +.. automethod:: ray.data.Dataset.write_json + +.. automethod:: ray.data.Dataset.write_csv + +.. automethod:: ray.data.Dataset.write_numpy + +.. automethod:: ray.data.Dataset.write_datasource + +.. automethod:: ray.data.Dataset.to_torch + +.. automethod:: ray.data.Dataset.to_tf + +.. automethod:: ray.data.Dataset.to_dask + +.. automethod:: ray.data.Dataset.to_mars + +.. 
automethod:: ray.data.Dataset.to_modin + +.. automethod:: ray.data.Dataset.to_spark + +.. automethod:: ray.data.Dataset.to_pandas + +.. automethod:: ray.data.Dataset.to_pandas_refs + +.. automethod:: ray.data.Dataset.to_numpy_refs + +.. automethod:: ray.data.Dataset.to_arrow_refs + +.. automethod:: ray.data.Dataset.to_random_access_dataset + + +Other +----- + +.. TODO put these in the right section. + +.. automethod:: ray.data.Dataset.take + +.. automethod:: ray.data.Dataset.take_all + +.. automethod:: ray.data.Dataset.show + +.. automethod:: ray.data.Dataset.count + +.. automethod:: ray.data.Dataset.schema + +.. automethod:: ray.data.Dataset.num_blocks + +.. automethod:: ray.data.Dataset.size_bytes + +.. automethod:: ray.data.Dataset.input_files + +.. automethod:: ray.data.Dataset.fully_executed + +.. automethod:: ray.data.Dataset.is_fully_executed + +.. automethod:: ray.data.Dataset.stats + +.. automethod:: ray.data.Dataset.get_internal_block_refs + +.. automethod:: ray.data.Dataset.lazy + +.. automethod:: ray.data.Dataset.has_serializable_lineage + +.. automethod:: ray.data.Dataset.serialize_lineage + +.. automethod:: ray.data.Dataset.deserialize_lineage \ No newline at end of file diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst index 304e409d904f..a2d3c3dfd245 100644 --- a/doc/source/data/api/dataset_pipeline.rst +++ b/doc/source/data/api/dataset_pipeline.rst @@ -1,23 +1,97 @@ - .. _dataset-pipeline-api: DatasetPipeline API =================== - .. autoclass:: ray.data.dataset_pipeline.DatasetPipeline - :members: - -.. TODO split methods into sections Map transformations ------------------- +.. automethod:: ray.data.DatasetPipeline.map + +.. automethod:: ray.data.DatasetPipeline.map_batches + +.. automethod:: ray.data.DatasetPipeline.flat_map + +.. automethod:: ray.data.DatasetPipeline.filter + +.. automethod:: ray.data.DatasetPipeline.add_column + +.. 
automethod:: ray.data.DatasetPipeline.drop_columns + Shuffling transformations ------------------------- +.. automethod:: ray.data.DatasetPipeline.randomize_block_order_each_window + +.. automethod:: ray.data.DatasetPipeline.random_shuffle_each_window + Splitting and Merging DatasetPipelines -------------------------------------- +.. automethod:: ray.data.DatasetPipeline.split + +.. automethod:: ray.data.DatasetPipeline.split_at_indices + Accessing DatasetPipelines -------------------------- + +.. automethod:: ray.data.DatasetPipeline.iter_rows + +.. automethod:: ray.data.DatasetPipeline.iter_batches + +.. automethod:: ray.data.DatasetPipeline.iter_epochs + +.. automethod:: ray.data.DatasetPipeline.iter_tf_batches + +.. automethod:: ray.data.DatasetPipeline.iter_torch_batches + +.. automethod:: ray.data.DatasetPipeline.iter_datasets + + + +Other +----- + +.. TODO put these in the right section. + +.. automethod:: ray.data.DatasetPipeline.rewindow + +.. automethod:: ray.data.DatasetPipeline.repeat + +.. automethod:: ray.data.DatasetPipeline.schema + +.. automethod:: ray.data.DatasetPipeline.count + +.. automethod:: ray.data.DatasetPipeline.sum + +.. automethod:: ray.data.DatasetPipeline.show_windows + +.. automethod:: ray.data.DatasetPipeline.repartition_each_window + +.. automethod:: ray.data.DatasetPipeline.sort_each_window + +.. automethod:: ray.data.DatasetPipeline.write_json + +.. automethod:: ray.data.DatasetPipeline.write_csv + +.. automethod:: ray.data.DatasetPipeline.write_parquet + +.. automethod:: ray.data.DatasetPipeline.write_datasource + +.. automethod:: ray.data.DatasetPipeline.take + +.. automethod:: ray.data.DatasetPipeline.take_all + +.. automethod:: ray.data.DatasetPipeline.show + +.. automethod:: ray.data.DatasetPipeline.to_tf + +.. automethod:: ray.data.DatasetPipeline.to_torch + +.. automethod:: ray.data.DatasetPipeline.foreach_window + +.. automethod:: ray.data.DatasetPipeline.stats + +.. 
automethod:: ray.data.DatasetPipeline.from_iterable \ No newline at end of file From 31827388aed9ba02db53f38faf3ff432473e846e Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Sun, 7 Aug 2022 18:57:40 -0700 Subject: [PATCH 3/7] address TODOs Signed-off-by: Matthew Deng --- doc/source/_toc.yml | 1 - doc/source/conf.py | 1 + doc/source/data/api/dataset.rst | 149 +++++++++++++++-- doc/source/data/api/dataset_pipeline.rst | 139 ++++++++++++---- doc/source/data/api/input_output.rst | 195 +++++++++++++++++++++-- doc/source/data/package-ref.rst | 63 -------- doc/source/ray-references/api.rst | 2 +- 7 files changed, 425 insertions(+), 125 deletions(-) delete mode 100644 doc/source/data/package-ref.rst diff --git a/doc/source/_toc.yml b/doc/source/_toc.yml index 7509d5d32bdc..1a2008f496e8 100644 --- a/doc/source/_toc.yml +++ b/doc/source/_toc.yml @@ -61,7 +61,6 @@ parts: - file: data/random-access - file: data/faq - file: data/api/api - - file: data/package-ref - file: data/integrations - file: train/train diff --git a/doc/source/conf.py b/doc/source/conf.py index 2395dcd9b507..149f0c81160c 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -46,6 +46,7 @@ "myst_nb", "sphinx.ext.doctest", "sphinx.ext.coverage", + "sphinx.ext.autosummary", "sphinx_external_toc", "sphinx_thebe", "sphinxcontrib.autodoc_pydantic", diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index ea0613a20d94..7ca7488028c4 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -5,6 +5,120 @@ Dataset API .. autoclass:: ray.data.Dataset +.. tabbed:: Basic Transformations + + .. autosummary:: + + ray.data.Dataset.map + ray.data.Dataset.map_batches + ray.data.Dataset.flat_map + ray.data.Dataset.filter + ray.data.Dataset.add_column + ray.data.Dataset.drop_columns + ray.data.Dataset.random_sample + ray.data.Dataset.limit + +.. tabbed:: Sorting, Shuffling, Repartitioning + + .. 
autosummary:: + + ray.data.Dataset.sort + ray.data.Dataset.random_shuffle + ray.data.Dataset.randomize_block_order + ray.data.Dataset.repartition + +.. tabbed:: Splitting and Merging Datasets + + .. autosummary:: + + ray.data.Dataset.split + ray.data.Dataset.split_at_indices + ray.data.Dataset.split_proportionately + ray.data.Dataset.train_test_split + ray.data.Dataset.union + ray.data.Dataset.zip + + +.. tabbed:: Grouped and Global Aggregations + + .. autosummary:: + + ray.data.Dataset.groupby + ray.data.Dataset.aggregate + ray.data.Dataset.sum + ray.data.Dataset.min + ray.data.Dataset.max + ray.data.Dataset.mean + ray.data.Dataset.std + +.. tabbed:: Converting to Pipeline + + .. autosummary:: + + ray.data.Dataset.repeat + ray.data.Dataset.window + +.. tabbed:: Consuming Datasets + + .. autosummary:: + + ray.data.Dataset.show + ray.data.Dataset.take + ray.data.Dataset.take_all + ray.data.Dataset.iter_rows + ray.data.Dataset.iter_batches + ray.data.Dataset.iter_torch_batches + ray.data.Dataset.iter_tf_batches + +.. tabbed:: I/O and Conversion + + .. autosummary:: + + ray.data.Dataset.write_parquet + ray.data.Dataset.write_json + ray.data.Dataset.write_csv + ray.data.Dataset.write_numpy + ray.data.Dataset.write_datasource + ray.data.Dataset.to_torch + ray.data.Dataset.to_tf + ray.data.Dataset.to_dask + ray.data.Dataset.to_mars + ray.data.Dataset.to_modin + ray.data.Dataset.to_spark + ray.data.Dataset.to_pandas + ray.data.Dataset.to_pandas_refs + ray.data.Dataset.to_numpy_refs + ray.data.Dataset.to_arrow_refs + ray.data.Dataset.to_random_access_dataset + +.. tabbed:: Inspecting Metadata + + .. autosummary:: + + ray.data.Dataset.count + ray.data.Dataset.schema + ray.data.Dataset.num_blocks + ray.data.Dataset.size_bytes + ray.data.Dataset.input_files + ray.data.Dataset.stats + ray.data.Dataset.get_internal_block_refs + +.. tabbed:: Execution + + .. autosummary:: + + ray.data.Dataset.fully_executed + ray.data.Dataset.is_fully_executed + ray.data.Dataset.lazy + +.. 
tabbed:: Serialization + + .. autosummary:: + + ray.data.Dataset.has_serializable_lineage + ray.data.Dataset.serialize_lineage + ray.data.Dataset.deserialize_lineage + Basic Transformations --------------------- @@ -74,9 +188,15 @@ Converting to Pipeline .. automethod:: ray.data.Dataset.window -Accessing Datasets +Consuming Datasets ------------------ +.. automethod:: ray.data.Dataset.show + +.. automethod:: ray.data.Dataset.take + +.. automethod:: ray.data.Dataset.take_all + .. automethod:: ray.data.Dataset.iter_rows .. automethod:: ray.data.Dataset.iter_batches @@ -120,17 +240,8 @@ I/O and Conversion .. automethod:: ray.data.Dataset.to_random_access_dataset - -Other ------ - -.. TODO put these in the right section. - -.. automethod:: ray.data.Dataset.take - -.. automethod:: ray.data.Dataset.take_all - -.. automethod:: ray.data.Dataset.show +Inspecting Metadata +------------------- .. automethod:: ray.data.Dataset.count @@ -141,16 +252,22 @@ Other .. automethod:: ray.data.Dataset.size_bytes .. automethod:: ray.data.Dataset.input_files + +.. automethod:: ray.data.Dataset.stats + +.. automethod:: ray.data.Dataset.get_internal_block_refs + +Execution +--------- .. automethod:: ray.data.Dataset.fully_executed .. automethod:: ray.data.Dataset.is_fully_executed -.. automethod:: ray.data.Dataset.stats - -.. automethod:: ray.data.Dataset.get_internal_block_refs - .. automethod:: ray.data.Dataset.lazy + +Serialization +------------- .. automethod:: ray.data.Dataset.has_serializable_lineage diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst index a2d3c3dfd245..a47c20ad7a19 100644 --- a/doc/source/data/api/dataset_pipeline.rst +++ b/doc/source/data/api/dataset_pipeline.rst @@ -5,8 +5,77 @@ DatasetPipeline API .. autoclass:: ray.data.dataset_pipeline.DatasetPipeline -Map transformations -------------------- +.. tabbed:: Basic Transformations + + .. 
autosummary:: + + ray.data.DatasetPipeline.map + ray.data.DatasetPipeline.map_batches + ray.data.DatasetPipeline.flat_map + ray.data.DatasetPipeline.foreach_window + ray.data.DatasetPipeline.filter + ray.data.DatasetPipeline.add_column + ray.data.DatasetPipeline.drop_columns + +.. tabbed:: Sorting, Shuffling, Repartitioning + + .. autosummary:: + + ray.data.DatasetPipeline.sort_each_window + ray.data.DatasetPipeline.random_shuffle_each_window + ray.data.DatasetPipeline.randomize_block_order_each_window + ray.data.DatasetPipeline.repartition_each_window + +.. tabbed:: Splitting and Merging DatasetPipelines + + .. autosummary:: + + ray.data.DatasetPipeline.split + ray.data.DatasetPipeline.split_at_indices + +.. tabbed:: Creating DatasetPipelines + + .. autosummary:: + + ray.data.DatasetPipeline.repeat + ray.data.DatasetPipeline.rewindow + ray.data.DatasetPipeline.from_iterable + +.. tabbed:: Consuming DatasetPipelines + + .. autosummary:: + + ray.data.DatasetPipeline.show + ray.data.DatasetPipeline.show_windows + ray.data.DatasetPipeline.take + ray.data.DatasetPipeline.take_all + ray.data.DatasetPipeline.iter_rows + ray.data.DatasetPipeline.iter_batches + ray.data.DatasetPipeline.iter_torch_batches + ray.data.DatasetPipeline.iter_tf_batches + +.. tabbed:: I/O and Conversion + + .. autosummary:: + + ray.data.DatasetPipeline.write_json + ray.data.DatasetPipeline.write_csv + ray.data.DatasetPipeline.write_parquet + ray.data.DatasetPipeline.write_datasource + ray.data.DatasetPipeline.to_tf + ray.data.DatasetPipeline.to_torch + +.. tabbed:: Inspecting Metadata + + .. autosummary:: + + ray.data.DatasetPipeline.schema + ray.data.DatasetPipeline.count + ray.data.DatasetPipeline.stats + ray.data.DatasetPipeline.sum + +Basic transformations +--------------------- .. automethod:: ray.data.DatasetPipeline.map @@ -14,19 +83,25 @@ Map transformations .. automethod:: ray.data.DatasetPipeline.flat_map +.. automethod:: ray.data.DatasetPipeline.foreach_window + .. 
automethod:: ray.data.DatasetPipeline.filter .. automethod:: ray.data.DatasetPipeline.add_column .. automethod:: ray.data.DatasetPipeline.drop_columns -Shuffling transformations -------------------------- +Sorting, Shuffling, Repartitioning +---------------------------------- -.. automethod:: ray.data.DatasetPipeline.randomize_block_order_each_window +.. automethod:: ray.data.DatasetPipeline.sort_each_window .. automethod:: ray.data.DatasetPipeline.random_shuffle_each_window +.. automethod:: ray.data.DatasetPipeline.randomize_block_order_each_window + +.. automethod:: ray.data.DatasetPipeline.repartition_each_window + Splitting and Merging DatasetPipelines -------------------------------------- @@ -34,43 +109,41 @@ Splitting and Merging DatasetPipelines .. automethod:: ray.data.DatasetPipeline.split_at_indices -Accessing DatasetPipelines --------------------------- - -.. automethod:: ray.data.DatasetPipeline.iter_rows - -.. automethod:: ray.data.DatasetPipeline.iter_batches +Creating DatasetPipelines +------------------------- -.. automethod:: ray.data.DatasetPipeline.iter_epochs +.. automethod:: ray.data.DatasetPipeline.repeat -.. automethod:: ray.data.DatasetPipeline.iter_tf_batches +.. automethod:: ray.data.DatasetPipeline.rewindow -.. automethod:: ray.data.DatasetPipeline.iter_torch_batches +.. automethod:: ray.data.DatasetPipeline.from_iterable -.. automethod:: ray.data.DatasetPipeline.iter_datasets +Consuming DatasetPipelines +-------------------------- +.. automethod:: ray.data.DatasetPipeline.show +.. automethod:: ray.data.DatasetPipeline.show_windows -Other ------ +.. automethod:: ray.data.DatasetPipeline.take -.. TODO put these in the right section. +.. automethod:: ray.data.DatasetPipeline.take_all -.. automethod:: ray.data.DatasetPipeline.rewindow +.. automethod:: ray.data.DatasetPipeline.iter_rows -.. automethod:: ray.data.DatasetPipeline.repeat +.. automethod:: ray.data.DatasetPipeline.iter_batches -.. automethod:: ray.data.DatasetPipeline.schema +.. 
automethod:: ray.data.DatasetPipeline.iter_epochs -.. automethod:: ray.data.DatasetPipeline.count +.. automethod:: ray.data.DatasetPipeline.iter_tf_batches -.. automethod:: ray.data.DatasetPipeline.sum +.. automethod:: ray.data.DatasetPipeline.iter_torch_batches -.. automethod:: ray.data.DatasetPipeline.show_windows +.. automethod:: ray.data.DatasetPipeline.iter_datasets -.. automethod:: ray.data.DatasetPipeline.repartition_each_window -.. automethod:: ray.data.DatasetPipeline.sort_each_window +I/O and Conversion +------------------ .. automethod:: ray.data.DatasetPipeline.write_json @@ -80,18 +153,18 @@ Other .. automethod:: ray.data.DatasetPipeline.write_datasource -.. automethod:: ray.data.DatasetPipeline.take +.. automethod:: ray.data.DatasetPipeline.to_tf -.. automethod:: ray.data.DatasetPipeline.take_all +.. automethod:: ray.data.DatasetPipeline.to_torch -.. automethod:: ray.data.DatasetPipeline.show -.. automethod:: ray.data.DatasetPipeline.to_tf +Inspecting Metadata +------------------- -.. automethod:: ray.data.DatasetPipeline.to_torch +.. automethod:: ray.data.DatasetPipeline.schema -.. automethod:: ray.data.DatasetPipeline.foreach_window +.. automethod:: ray.data.DatasetPipeline.count .. automethod:: ray.data.DatasetPipeline.stats -.. automethod:: ray.data.DatasetPipeline.from_iterable \ No newline at end of file +.. automethod:: ray.data.DatasetPipeline.sum \ No newline at end of file diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index 0628a5153b3a..b08fa36ab946 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -2,22 +2,151 @@ Input/Output ============ -Tabular data ------------- +Synthetic Data +-------------- + +.. autofunction:: ray.data.range + +.. autofunction:: ray.data.range_table + +.. autofunction:: ray.data.range_tensor + +Python Objects +-------------- + +.. autofunction:: ray.data.from_items + +CSV +--- + +.. autofunction:: ray.data.read_csv + +.. 
automethod:: ray.data.Dataset.write_csv + :noindex: + +JSON +---- + +.. autofunction:: ray.data.read_json + +.. automethod:: ray.data.Dataset.write_json + :noindex: + +Pandas +------ + +.. autofunction:: ray.data.from_pandas + +.. autofunction:: ray.data.from_pandas_refs + +.. automethod:: ray.data.Dataset.to_pandas + :noindex: + +.. automethod:: ray.data.Dataset.to_pandas_refs + :noindex: + +NumPy +----- + +.. autofunction:: ray.data.read_numpy + +.. autofunction:: ray.data.from_numpy + +.. autofunction:: ray.data.from_numpy_refs + +.. automethod:: ray.data.Dataset.write_numpy + :noindex: + +.. automethod:: ray.data.Dataset.to_numpy_refs + :noindex: + +Parquet +------- + +.. autofunction:: ray.data.read_parquet + +.. autofunction:: ray.data.read_parquet_bulk + +.. automethod:: ray.data.Dataset.write_parquet + :noindex: + +Arrow +----- + +.. autofunction:: ray.data.from_arrow + +.. autofunction:: ray.data.from_arrow_refs + +.. automethod:: ray.data.Dataset.to_arrow_refs + :noindex: + +Text +---- + +.. autofunction:: ray.data.read_text + +Binary +------ + +.. autofunction:: ray.data.read_binary_files + +Dask +---- + +.. autofunction:: ray.data.from_dask + +.. automethod:: ray.data.Dataset.to_dask + :noindex: + +Spark +----- + +.. autofunction:: ray.data.from_spark + +.. automethod:: ray.data.Dataset.to_spark + :noindex: -Tensor data ------------ +Modin +----- -Text data ---------- +.. autofunction:: ray.data.from_modin -Binary data ------------ +.. automethod:: ray.data.Dataset.to_modin + :noindex: +Mars +---- + +.. autofunction:: ray.data.from_mars + +.. automethod:: ray.data.Dataset.to_mars + :noindex: + +Torch +----- + +.. automethod:: ray.data.Dataset.to_torch + :noindex: + +TensorFlow +---------- + +.. automethod:: ray.data.Dataset.to_tf + :noindex: + +HuggingFace +------------ + +.. autofunction:: ray.data.from_huggingface Datasource API -------------- +.. autofunction:: ray.data.read_datasource + +.. 
automethod:: ray.data.Dataset.write_datasource + :noindex: + .. autoclass:: ray.data.Datasource :members: @@ -25,6 +154,38 @@ Datasource API :members: +Built-in Datasources +#################### + +.. autoclass:: ray.data.datasource.BinaryDatasource + :members: + +.. autoclass:: ray.data.datasource.CSVDatasource + :members: + +.. autoclass:: ray.data.datasource.FileBasedDatasource + :members: + +.. autoclass:: ray.data.datasource.ImageFolderDatasource + :members: + +.. autoclass:: ray.data.datasource.JSONDatasource + :members: + +.. autoclass:: ray.data.datasource.NumpyDatasource + :members: + +.. autoclass:: ray.data.datasource.ParquetDatasource + :members: + +.. autoclass:: ray.data.datasource.RangeDatasource + :members: + +.. autoclass:: ray.data.datasource.SimpleTensorFlowDatasource + :members: + +.. autoclass:: ray.data.datasource.SimpleTorchDatasource + :members: Partitioning API ---------------- @@ -44,12 +205,24 @@ Partitioning API .. autoclass:: ray.data.datasource.PathPartitionFilter - MetadataProvider API -------------------- - .. autoclass:: ray.data.datasource.FileMetadataProvider :members: -.. TODO fill the rest \ No newline at end of file + +.. autoclass:: ray.data.datasource.BaseFileMetadataProvider + :members: + +.. autoclass:: ray.data.datasource.ParquetMetadataProvider + :members: + +.. autoclass:: ray.data.datasource.DefaultFileMetadataProvider + :members: + +.. autoclass:: ray.data.datasource.DefaultParquetMetadataProvider + :members: + +.. autoclass:: ray.data.datasource.FastFileMetadataProvider + :members: \ No newline at end of file diff --git a/doc/source/data/package-ref.rst b/doc/source/data/package-ref.rst deleted file mode 100644 index c0659d62c94b..000000000000 --- a/doc/source/data/package-ref.rst +++ /dev/null @@ -1,63 +0,0 @@ -Ray Datasets API -================ - -Creating Datasets ------------------ - -.. autofunction:: ray.data.range -.. autofunction:: ray.data.range_table -.. autofunction:: ray.data.range_tensor -.. 
autofunction:: ray.data.read_csv -.. autofunction:: ray.data.read_json -.. autofunction:: ray.data.read_parquet -.. autofunction:: ray.data.read_parquet_bulk -.. autofunction:: ray.data.read_numpy -.. autofunction:: ray.data.read_text -.. autofunction:: ray.data.read_binary_files -.. autofunction:: ray.data.read_datasource -.. autofunction:: ray.data.from_items -.. autofunction:: ray.data.from_arrow -.. autofunction:: ray.data.from_arrow_refs -.. autofunction:: ray.data.from_huggingface -.. autofunction:: ray.data.from_spark -.. autofunction:: ray.data.from_dask -.. autofunction:: ray.data.from_modin -.. autofunction:: ray.data.from_mars -.. autofunction:: ray.data.from_pandas -.. autofunction:: ray.data.from_pandas_refs -.. autofunction:: ray.data.from_numpy -.. autofunction:: ray.data.from_numpy_refs - - -Built-in Datasources --------------------- - -.. autoclass:: ray.data.datasource.BinaryDatasource - :members: - -.. autoclass:: ray.data.datasource.CSVDatasource - :members: - -.. autoclass:: ray.data.datasource.FileBasedDatasource - :members: - -.. autoclass:: ray.data.datasource.ImageFolderDatasource - :members: - -.. autoclass:: ray.data.datasource.JSONDatasource - :members: - -.. autoclass:: ray.data.datasource.NumpyDatasource - :members: - -.. autoclass:: ray.data.datasource.ParquetDatasource - :members: - -.. autoclass:: ray.data.datasource.RangeDatasource - :members: - -.. autoclass:: ray.data.datasource.SimpleTensorFlowDatasource - :members: - -.. 
autoclass:: ray.data.datasource.SimpleTorchDatasource - :members: \ No newline at end of file diff --git a/doc/source/ray-references/api.rst b/doc/source/ray-references/api.rst index 2dfbc57bc0ef..f93ccfb7753f 100644 --- a/doc/source/ray-references/api.rst +++ b/doc/source/ray-references/api.rst @@ -5,7 +5,7 @@ API References :maxdepth: 2 :caption: Ray API References - ../data/package-ref.rst + ../data/api/api.rst ../train/api.rst ../tune/api_docs/overview.rst ../serve/package-ref.rst From f207f95079ebe6a5407e00544cc584c0533de572 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Sun, 7 Aug 2022 19:11:26 -0700 Subject: [PATCH 4/7] remove torch and tf from io Signed-off-by: Matthew Deng --- doc/source/data/api/input_output.rst | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index b08fa36ab946..6c4797e05ca1 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -122,18 +122,6 @@ Mars .. automethod:: ray.data.Dataset.to_mars :noindex: -Torch ------ - -.. automethod:: ray.data.Dataset.to_torch - :noindex: - -TensorFlow ----------- - -.. automethod:: ray.data.Dataset.to_tf - :noindex: - HuggingFace ------------ From 7357baee7eaf429cc47f1db78ff0a172db72c318 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 8 Aug 2022 17:24:37 -0700 Subject: [PATCH 5/7] address comments Signed-off-by: Matthew Deng --- doc/source/data/api/dataset.rst | 167 +++++++++++------------ doc/source/data/api/dataset_pipeline.rst | 100 +++++++------- 2 files changed, 133 insertions(+), 134 deletions(-) diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index 7ca7488028c4..b059b84ab719 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -5,119 +5,118 @@ Dataset API .. autoclass:: ray.data.Dataset -.. tabbed:: Basic Transformations +**Basic Transformations** - .. autosummary:: +.. 
autosummary:: - ray.data.Dataset.map - ray.data.Dataset.map_batches - ray.data.Dataset.flat_map - ray.data.Dataset.filter - ray.data.Dataset.add_column - ray.data.Dataset.drop_columns - ray.data.Dataset.random_sample - ray.data.Dataset.limit + ray.data.Dataset.map + ray.data.Dataset.map_batches + ray.data.Dataset.flat_map + ray.data.Dataset.filter + ray.data.Dataset.add_column + ray.data.Dataset.drop_columns + ray.data.Dataset.random_sample + ray.data.Dataset.limit -.. tabbed:: Sorting, Shuffling, Repartitioning +**Sorting, Shuffling, Repartitioning** - .. autosummary:: +.. autosummary:: - ray.data.Dataset.sort - ray.data.Dataset.random_shuffle - ray.data.Dataset.randomize_block_order - ray.data.Dataset.repartition + ray.data.Dataset.sort + ray.data.Dataset.random_shuffle + ray.data.Dataset.randomize_block_order + ray.data.Dataset.repartition -.. tabbed:: Splitting and Merging Datasets +**Splitting and Merging Datasets** - .. autosummary:: +.. autosummary:: - ray.data.Dataset.split - ray.data.Dataset.split_at_indices - ray.data.Dataset.split_proportionately - ray.data.Dataset.train_test_split - ray.data.Dataset.union - ray.data.Dataset.zip + ray.data.Dataset.split + ray.data.Dataset.split_at_indices + ray.data.Dataset.split_proportionately + ray.data.Dataset.train_test_split + ray.data.Dataset.union + ray.data.Dataset.zip +**Grouped and Global Aggregations** -.. tabbed:: Grouped and Global Aggregations +.. autosummary:: - .. autosummary:: + ray.data.Dataset.groupby + ray.data.Dataset.aggregate + ray.data.Dataset.sum + ray.data.Dataset.min + ray.data.Dataset.max + ray.data.Dataset.mean + ray.data.Dataset.std - ray.data.Dataset.groupby - ray.data.Dataset.aggregate - ray.data.Dataset.sum - ray.data.Dataset.min - ray.data.Dataset.max - ray.data.Dataset.mean - ray.data.Dataset.std +**Converting to Pipeline** -.. tabbed:: Converting to Pipeline +.. autosummary:: - .. 
autosummary:: + ray.data.Dataset.repeat + ray.data.Dataset.window - ray.data.Dataset.repeat - ray.data.Dataset.window +**Consuming Datasets** -.. tabbed:: Consuming Datasets +.. autosummary:: - .. autosummary:: + ray.data.Dataset.show + ray.data.Dataset.take + ray.data.Dataset.take_all + ray.data.Dataset.iter_rows + ray.data.Dataset.iter_batches + ray.data.Dataset.iter_torch_batches + ray.data.Dataset.iter_tf_batches - ray.data.Dataset.show - ray.data.Dataset.take - ray.data.Dataset.take_all - ray.data.Dataset.iter_rows - ray.data.Dataset.iter_batches - ray.data.Dataset.iter_torch_batches - ray.data.Dataset.iter_tf_batches +**I/O and Conversion** -.. tabbed:: I/O and Conversion +.. autosummary:: - .. autosummary:: + ray.data.Dataset.write_parquet + ray.data.Dataset.write_json + ray.data.Dataset.write_csv + ray.data.Dataset.write_numpy + ray.data.Dataset.write_datasource + ray.data.Dataset.to_torch + ray.data.Dataset.to_tf + ray.data.Dataset.to_dask + ray.data.Dataset.to_mars + ray.data.Dataset.to_modin + ray.data.Dataset.to_spark + ray.data.Dataset.to_pandas + ray.data.Dataset.to_pandas_refs + ray.data.Dataset.to_numpy_refs + ray.data.Dataset.to_arrow_refs + ray.data.Dataset.to_random_access_dataset - ray.data.Dataset.write_parquet - ray.data.Dataset.write_json - ray.data.Dataset.write_csv - ray.data.Dataset.write_numpy - ray.data.Dataset.write_datasource - ray.data.Dataset.to_torch - ray.data.Dataset.to_tf - ray.data.Dataset.to_dask - ray.data.Dataset.to_mars - ray.data.Dataset.to_modin - ray.data.Dataset.to_spark - ray.data.Dataset.to_pandas - ray.data.Dataset.to_pandas_refs - ray.data.Dataset.to_numpy_refs - ray.data.Dataset.to_arrow_refs - ray.data.Dataset.to_random_access_dataset +**Inspecting Metadata** -.. tabbed:: Inspecting Metadata +.. autosummary:: - .. 
autosummary:: + ray.data.Dataset.count + ray.data.Dataset.schema + ray.data.Dataset.num_blocks + ray.data.Dataset.size_bytes + ray.data.Dataset.input_files + ray.data.Dataset.stats + ray.data.Dataset.get_internal_block_refs - ray.data.Dataset.count - ray.data.Dataset.schema - ray.data.Dataset.num_blocks - ray.data.Dataset.size_bytes - ray.data.Dataset.input_files - ray.data.Dataset.stats - ray.data.Dataset.get_internal_block_refs +**Execution** -.. tabbed:: Execution +.. autosummary:: - .. autosummary:: + ray.data.Dataset.fully_executed + ray.data.Dataset.is_fully_executed + ray.data.Dataset.lazy - ray.data.Dataset.fully_executed - ray.data.Dataset.is_fully_executed - ray.data.Dataset.lazy +**Serialization** -.. tabbed:: Serialization +.. autosummary:: - .. autosummary:: - - ray.data.Dataset.has_serializable_lineage - ray.data.Dataset.serialize_lineage - ray.data.Dataset.deserialize_lineage + ray.data.Dataset.has_serializable_lineage + ray.data.Dataset.serialize_lineage + ray.data.Dataset.deserialize_lineage Basic Transformations --------------------- diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst index a47c20ad7a19..7bf0c2932824 100644 --- a/doc/source/data/api/dataset_pipeline.rst +++ b/doc/source/data/api/dataset_pipeline.rst @@ -5,74 +5,74 @@ DatasetPipeline API .. autoclass:: ray.data.dataset_pipeline.DatasetPipeline -.. tabbed:: Basic Transformations +**Basic Transformations** - .. autosummary:: +.. 
autosummary:: - ray.data.DatasetPipeline.map - ray.data.DatasetPipeline.map_batches - ray.data.DatasetPipeline.flat_map - ray.data.DatasetPipeline.foreach_window - ray.data.DatasetPipeline.filter - ray.data.DatasetPipeline.add_column - ray.data.DatasetPipeline.drop_columns + ray.data.DatasetPipeline.map + ray.data.DatasetPipeline.map_batches + ray.data.DatasetPipeline.flat_map + ray.data.DatasetPipeline.foreach_window + ray.data.DatasetPipeline.filter + ray.data.DatasetPipeline.add_column + ray.data.DatasetPipeline.drop_columns -.. tabbed:: Sorting, Shuffling, Repartitioning +**Sorting, Shuffling, Repartitioning** - .. autosummary:: +.. autosummary:: - ray.data.DatasetPipeline.sort_each_window - ray.data.DatasetPipeline.random_shuffle_each_window - ray.data.DatasetPipeline.randomize_block_order_each_window - ray.data.DatasetPipeline.repartition_each_window + ray.data.DatasetPipeline.sort_each_window + ray.data.DatasetPipeline.random_shuffle_each_window + ray.data.DatasetPipeline.randomize_block_order_each_window + ray.data.DatasetPipeline.repartition_each_window -.. tabbed:: Splitting and Merging DatasetPipelines +**Splitting DatasetPipelines** - .. autosummary:: +.. autosummary:: - ray.data.DatasetPipeline.split - ray.data.DatasetPipeline.split_at_indices + ray.data.DatasetPipeline.split + ray.data.DatasetPipeline.split_at_indices -.. tabbed:: Creating DatasetPipelines +**Creating DatasetPipelines** - .. autosummary:: +.. autosummary:: - ray.data.DatasetPipeline.repeat - ray.data.DatasetPipeline.rewindow - ray.data.DatasetPipeline.from_iterable + ray.data.DatasetPipeline.repeat + ray.data.DatasetPipeline.rewindow + ray.data.DatasetPipeline.from_iterable -.. tabbed:: Consuming DatasetPipelines +**Consuming DatasetPipelines** - .. autosummary:: +.. 
autosummary:: - ray.data.DatasetPipeline.show - ray.data.DatasetPipeline.show_windows - ray.data.DatasetPipeline.take - ray.data.DatasetPipeline.take_all - ray.data.DatasetPipeline.iter_rows - ray.data.DatasetPipeline.iter_batches - ray.data.DatasetPipeline.iter_torch_batches - ray.data.DatasetPipeline.iter_tf_batches + ray.data.DatasetPipeline.show + ray.data.DatasetPipeline.show_windows + ray.data.DatasetPipeline.take + ray.data.DatasetPipeline.take_all + ray.data.DatasetPipeline.iter_rows + ray.data.DatasetPipeline.iter_batches + ray.data.DatasetPipeline.iter_torch_batches + ray.data.DatasetPipeline.iter_tf_batches -.. tabbed:: I/O and Conversion +**I/O and Conversion** - .. autosummary:: +.. autosummary:: - ray.data.DatasetPipeline.write_json - ray.data.DatasetPipeline.write_csv - ray.data.DatasetPipeline.write_parquet - ray.data.DatasetPipeline.write_datasource - ray.data.DatasetPipeline.to_tf - ray.data.DatasetPipeline.to_torch + ray.data.DatasetPipeline.write_json + ray.data.DatasetPipeline.write_csv + ray.data.DatasetPipeline.write_parquet + ray.data.DatasetPipeline.write_datasource + ray.data.DatasetPipeline.to_tf + ray.data.DatasetPipeline.to_torch -.. tabbed:: Inspecting Metadata +**Inspecting Metadata** - .. autosummary:: +.. autosummary:: - ray.data.DatasetPipeline.schema - ray.data.DatasetPipeline.count - ray.data.DatasetPipeline.stats - ray.data.DatasetPipeline.sum + ray.data.DatasetPipeline.schema + ray.data.DatasetPipeline.count + ray.data.DatasetPipeline.stats + ray.data.DatasetPipeline.sum Basic transformations --------------------- @@ -102,8 +102,8 @@ Sorting, Shuffling, Repartitioning .. automethod:: ray.data.DatasetPipeline.repartition_each_window -Splitting and Merging DatasetPipelines --------------------------------------- +Splitting DatasetPipelines +-------------------------- .. 
automethod:: ray.data.DatasetPipeline.split From ab87b6297aabd44e450d71c80dbc5fac19ce7e7b Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Mon, 8 Aug 2022 17:37:11 -0700 Subject: [PATCH 6/7] remove signatures Signed-off-by: Matthew Deng --- doc/source/data/api/dataset.rst | 10 ++++++++++ doc/source/data/api/dataset_pipeline.rst | 7 +++++++ 2 files changed, 17 insertions(+) diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index b059b84ab719..875d3da8702e 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -8,6 +8,7 @@ Dataset API **Basic Transformations** .. autosummary:: + :nosignatures: ray.data.Dataset.map ray.data.Dataset.map_batches @@ -21,6 +22,7 @@ Dataset API **Sorting, Shuffling, Repartitioning** .. autosummary:: + :nosignatures: ray.data.Dataset.sort ray.data.Dataset.random_shuffle @@ -30,6 +32,7 @@ Dataset API **Splitting and Merging Datasets** .. autosummary:: + :nosignatures: ray.data.Dataset.split ray.data.Dataset.split_at_indices @@ -41,6 +44,7 @@ Dataset API **Grouped and Global Aggregations** .. autosummary:: + :nosignatures: ray.data.Dataset.groupby ray.data.Dataset.aggregate @@ -53,6 +57,7 @@ Dataset API **Converting to Pipeline** .. autosummary:: + :nosignatures: ray.data.Dataset.repeat ray.data.Dataset.window @@ -60,6 +65,7 @@ Dataset API **Consuming Datasets** .. autosummary:: + :nosignatures: ray.data.Dataset.show ray.data.Dataset.take @@ -72,6 +78,7 @@ Dataset API **I/O and Conversion** .. autosummary:: + :nosignatures: ray.data.Dataset.write_parquet ray.data.Dataset.write_json @@ -93,6 +100,7 @@ Dataset API **Inspecting Metadata** .. autosummary:: + :nosignatures: ray.data.Dataset.count ray.data.Dataset.schema @@ -105,6 +113,7 @@ Dataset API **Execution** .. autosummary:: + :nosignatures: ray.data.Dataset.fully_executed ray.data.Dataset.is_fully_executed @@ -113,6 +122,7 @@ Dataset API **Serialization** .. 
autosummary:: + :nosignatures: ray.data.Dataset.has_serializable_lineage ray.data.Dataset.serialize_lineage diff --git a/doc/source/data/api/dataset_pipeline.rst b/doc/source/data/api/dataset_pipeline.rst index 7bf0c2932824..a82bd488aac7 100644 --- a/doc/source/data/api/dataset_pipeline.rst +++ b/doc/source/data/api/dataset_pipeline.rst @@ -8,6 +8,7 @@ DatasetPipeline API **Basic Transformations** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.map ray.data.DatasetPipeline.map_batches @@ -20,6 +21,7 @@ DatasetPipeline API **Sorting, Shuffling, Repartitioning** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.sort_each_window ray.data.DatasetPipeline.random_shuffle_each_window @@ -29,6 +31,7 @@ DatasetPipeline API **Splitting DatasetPipelines** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.split ray.data.DatasetPipeline.split_at_indices @@ -36,6 +39,7 @@ DatasetPipeline API **Creating DatasetPipelines** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.repeat ray.data.DatasetPipeline.rewindow @@ -44,6 +48,7 @@ DatasetPipeline API **Consuming DatasetPipelines** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.show ray.data.DatasetPipeline.show_windows @@ -57,6 +62,7 @@ DatasetPipeline API **I/O and Conversion** .. autosummary:: + :nosignatures: ray.data.DatasetPipeline.write_json ray.data.DatasetPipeline.write_csv @@ -68,6 +74,7 @@ DatasetPipeline API **Inspecting Metadata** .. 
autosummary:: + :nosignatures: ray.data.DatasetPipeline.schema ray.data.DatasetPipeline.count From 2fdcd31f2bb7839db9a34d11a34fb1de00cab1c0 Mon Sep 17 00:00:00 2001 From: Matthew Deng Date: Tue, 9 Aug 2022 15:46:02 -0700 Subject: [PATCH 7/7] address comments Signed-off-by: Matthew Deng --- doc/source/data/api/api.rst | 2 +- doc/source/data/api/data_representations.rst | 1 + doc/source/data/api/dataset.rst | 2 +- doc/source/data/api/dataset_context.rst | 1 + doc/source/data/api/grouped_dataset.rst | 5 +-- doc/source/data/api/input_output.rst | 43 ++++++++++--------- doc/source/data/api/random_access_dataset.rst | 1 + doc/source/data/api/utility.rst | 1 + doc/source/data/dataset.rst | 4 +- 9 files changed, 31 insertions(+), 29 deletions(-) diff --git a/doc/source/data/api/api.rst b/doc/source/data/api/api.rst index 233633774c44..a65dd1423cef 100644 --- a/doc/source/data/api/api.rst +++ b/doc/source/data/api/api.rst @@ -1,4 +1,4 @@ -.. _data_api: +.. _data-api: Ray Datasets API ================ diff --git a/doc/source/data/api/data_representations.rst b/doc/source/data/api/data_representations.rst index 6159a6b13103..cb064ae9f655 100644 --- a/doc/source/data/api/data_representations.rst +++ b/doc/source/data/api/data_representations.rst @@ -1,3 +1,4 @@ +.. _data-representations: Data Representations ==================== diff --git a/doc/source/data/api/dataset.rst b/doc/source/data/api/dataset.rst index 875d3da8702e..e5622063b528 100644 --- a/doc/source/data/api/dataset.rst +++ b/doc/source/data/api/dataset.rst @@ -54,7 +54,7 @@ Dataset API ray.data.Dataset.mean ray.data.Dataset.std -**Converting to Pipeline** +**Converting to Pipelines** .. autosummary:: :nosignatures: diff --git a/doc/source/data/api/dataset_context.rst b/doc/source/data/api/dataset_context.rst index 3960f2986bcc..5686a36a0195 100644 --- a/doc/source/data/api/dataset_context.rst +++ b/doc/source/data/api/dataset_context.rst @@ -1,3 +1,4 @@ +.. 
_dataset-context-api: DatasetContext API ================== diff --git a/doc/source/data/api/grouped_dataset.rst b/doc/source/data/api/grouped_dataset.rst index 047316122d3e..def8b6c79c34 100644 --- a/doc/source/data/api/grouped_dataset.rst +++ b/doc/source/data/api/grouped_dataset.rst @@ -1,3 +1,4 @@ +.. _grouped-dataset-api: GroupedDataset API ================== @@ -28,7 +29,3 @@ Aggregations .. autoclass:: ray.data.aggregate.AbsMax :members: - - -Map transformations -------------------- \ No newline at end of file diff --git a/doc/source/data/api/input_output.rst b/doc/source/data/api/input_output.rst index 6c4797e05ca1..3882ceecefb5 100644 --- a/doc/source/data/api/input_output.rst +++ b/doc/source/data/api/input_output.rst @@ -1,3 +1,4 @@ +.. _input-output: Input/Output ============ @@ -16,6 +17,16 @@ Python Objects .. autofunction:: ray.data.from_items +Parquet +------- + +.. autofunction:: ray.data.read_parquet + +.. autofunction:: ray.data.read_parquet_bulk + +.. automethod:: ray.data.Dataset.write_parquet + :noindex: + CSV --- @@ -32,6 +43,16 @@ JSON .. automethod:: ray.data.Dataset.write_json :noindex: +Text +---- + +.. autofunction:: ray.data.read_text + +Binary +------ + +.. autofunction:: ray.data.read_binary_files + Pandas ------ @@ -60,16 +81,6 @@ NumPy .. automethod:: ray.data.Dataset.to_numpy_refs :noindex: -Parquet -------- - -.. autofunction:: ray.data.read_parquet - -.. autofunction:: ray.data.read_parquet_bulk - -.. automethod:: ray.data.Dataset.write_parquet - :noindex: - Arrow ----- @@ -77,19 +88,9 @@ Arrow .. autofunction:: ray.data.from_arrow_refs -.. automethod:: ray.data.Dataset.to_numpy_refs +.. automethod:: ray.data.Dataset.to_arrow_refs :noindex: -Text ----- - -.. autofunction:: ray.data.read_text - -Binary ------- - -.. 
autofunction:: ray.data.read_binary_files - Dask ---- diff --git a/doc/source/data/api/random_access_dataset.rst b/doc/source/data/api/random_access_dataset.rst index d238c513942a..d1b41b1f1222 100644 --- a/doc/source/data/api/random_access_dataset.rst +++ b/doc/source/data/api/random_access_dataset.rst @@ -1,3 +1,4 @@ +.. _random-access-dataset-api: (Experimental) RandomAccessDataset API ====================================== diff --git a/doc/source/data/api/utility.rst b/doc/source/data/api/utility.rst index f73b06aac1ca..379ae29c1933 100644 --- a/doc/source/data/api/utility.rst +++ b/doc/source/data/api/utility.rst @@ -1,3 +1,4 @@ +.. _data-utility: Utility ======= diff --git a/doc/source/data/dataset.rst b/doc/source/data/dataset.rst index cfddc6e472cb..78bbd3db8a5f 100644 --- a/doc/source/data/dataset.rst +++ b/doc/source/data/dataset.rst @@ -60,7 +60,7 @@ Where to Go from Here? As new user of Ray Datasets, you may want to start with our :ref:`Getting Started guide`. If you've run your first examples already, you might want to dive into Ray Datasets' :ref:`key concepts ` or our :ref:`User Guide ` instead. -Advanced users can refer directly to the Ray Datasets :ref:`API reference <data_api>` for their projects. +Advanced users can refer directly to the Ray Datasets :ref:`API reference <data-api>` for their projects. .. panels:: :container: text-center @@ -142,7 +142,7 @@ Advanced users can refer directly to the Ray Datasets :ref:`API reference