apache · jackye1995 · Mar 23, 2023 · Mar 18, 2023 · Mar 20, 2023 · Mar 20, 2023
diff --git a/python/Makefile b/python/Makefile
@@ -17,7 +17,7 @@
 
 install:
 	pip install poetry
-	poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb
+	poetry install -E pyarrow -E hive -E s3fs -E glue -E adlfs -E duckdb -E ray
 
 check-license:
 	./dev/check-license
@@ -36,7 +36,7 @@ test-integration:
 	docker-compose -f dev/docker-compose-integration.yml kill
 	docker-compose -f dev/docker-compose-integration.yml build
 	docker-compose -f dev/docker-compose-integration.yml up -d
-	sleep 20
+	sleep 30
 	poetry run pytest tests/ -m integration ${PYTEST_ARGS}
 
 test-adlfs:

diff --git a/python/dev/provision.py b/python/dev/provision.py
@@ -17,6 +17,7 @@
 import time
 
 from pyspark.sql import SparkSession
+from pyspark.sql.functions import current_date, date_add, expr
 
 spark = SparkSession.builder.getOrCreate()
 
@@ -56,6 +57,12 @@
 """
 )
 
+# Drop any table left over from an earlier run before it is created again
+# below (presumably so that re-running this script does not fail on an
+# already-existing table).
+spark.sql(
+    """
+  DROP TABLE IF EXISTS test_null_nan_rewritten;
+"""
+)
+
 spark.sql(
     """
   CREATE TABLE test_null_nan_rewritten
@@ -94,5 +101,28 @@
 """
 )
 
+# Build a small DataFrame (ids 0-4, requested as 5 partitions) holding one
+# column per type. Later columns are derived from earlier ones (e.g. dateCol
+# feeds timestampCol/stringCol, decimalCol feeds mapCol), so the order of the
+# withColumn calls matters.
+all_types_dataframe = (
+    spark.range(0, 5, 1, 5)
+    .withColumnRenamed("id", "longCol")
+    .withColumn("intCol", expr("CAST(longCol AS INT)"))
+    .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+    .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+    # The date is relative to the day the script runs, so the stored values differ between runs.
+    .withColumn("dateCol", date_add(current_date(), 1))
+    .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+    .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
+    # NOTE(review): longCol only takes the values 0-4 here, so this is False for every row - confirm that is intended.
+    .withColumn("booleanCol", expr("longCol > 5"))
+    .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
+    .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
+    .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
+    .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
+    .withColumn("mapCol", expr("MAP(longCol, decimalCol)"))
+    .withColumn("arrayCol", expr("ARRAY(longCol)"))
+    .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)"))
+)
+
+# Write the DataFrame to default.test_all_types with the "format-version"
+# table property set to "2", partitioned by intCol, creating the table or
+# replacing an existing one.
+all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
+    "intCol"
+).createOrReplace()
+
 while True:
     time.sleep(1)
diff --git a/python/mkdocs/docs/api.md b/python/mkdocs/docs/api.md
@@ -369,3 +369,51 @@ print(
     (datetime.timedelta(seconds=1581),),
 ]
 ```
+
+### Ray
+
+!!! note "Requirements"
+    This requires [Ray to be installed](index.md).
+
+A table scan can also be converted into a Ray dataset:
+
+```python
+ray_dataset = table.scan(
+    row_filter=GreaterThanOrEqual("trip_distance", 10.0),
+    selected_fields=("VendorID", "tpep_pickup_datetime", "tpep_dropoff_datetime"),
+).to_ray()
+```
+
+This will return a Ray dataset:
+
+```
+Dataset(
+    num_blocks=1,
+    num_rows=1168798,
+    schema={
+        VendorID: int64,
+        tpep_pickup_datetime: timestamp[us, tz=UTC],
+        tpep_dropoff_datetime: timestamp[us, tz=UTC]
+    }
+)
+```
+
+Use the [Ray Dataset API](https://docs.ray.io/en/latest/data/api/dataset.html) to interact with the dataset:
+
+```python
+print(
+    ray_dataset.take(2)
+)
+[
+    {
+        'VendorID': 2,
+        'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 23, 50, tzinfo=<UTC>),
+        'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 0, 34, 31, tzinfo=<UTC>)
+    },
+    {
+        'VendorID': 2,
+        'tpep_pickup_datetime': datetime.datetime(2008, 12, 31, 23, 5, 3, tzinfo=<UTC>),
+        'tpep_dropoff_datetime': datetime.datetime(2009, 1, 1, 16, 10, 18, tzinfo=<UTC>)
+    }
+]
+```
diff --git a/python/mkdocs/docs/index.md b/python/mkdocs/docs/index.md
@@ -49,7 +49,9 @@ You can mix and match optional dependencies depending on your needs:
 | glue     | Support for AWS Glue                                                 |
 | dynamodb | Support for AWS DynamoDB                                             |
 | pyarrow  | PyArrow as a FileIO implementation to interact with the object store |
+| pandas   | Installs both PyArrow and Pandas                                     |
 | duckdb   | Installs both PyArrow and DuckDB                                     |
+| ray      | Installs PyArrow, Pandas, and Ray                                    |
 | s3fs     | S3FS as a FileIO implementation to interact with the object store    |
 | adlfs    | ADLFS as a FileIO implementation to interact with the object store   |
 | snappy   | Support for snappy Avro compression                                  |

diff --git a/python/poetry.lock b/python/poetry.lock
diff --git a/python/pyiceberg/table/__init__.py b/python/pyiceberg/table/__init__.py
@@ -66,6 +66,7 @@
 if TYPE_CHECKING:
     import pandas as pd
     import pyarrow as pa
+    import ray
     from duckdb import DuckDBPyConnection
 
 
@@ -415,3 +416,8 @@ def to_duckdb(self, table_name: str, connection: Optional[DuckDBPyConnection] =
         con.register(table_name, self.to_arrow())
 
         return con
+
+    def to_ray(self) -> ray.data.dataset.Dataset:
+        import ray
+
+        return ray.data.from_arrow(self.to_arrow())
diff --git a/python/pyproject.toml b/python/pyproject.toml
@@ -68,6 +68,8 @@ pandas = { version = ">=1.4.4,<=1.5.3", optional = true }
 
 duckdb = { version = ">=0.6.0,<=0.7.1", optional = true }
 
+ray = { version = ">=2.0.0,<=2.3.0", optional = true }
+
 python-snappy = { version = "0.6.1", optional = true }
 
 thrift = { version = "0.16.0", optional = true }
@@ -100,6 +102,7 @@ build-backend = "poetry.core.masonry.api"
 pyarrow = ["pyarrow"]
 pandas = ["pandas", "pyarrow"]
 duckdb = ["duckdb", "pyarrow"]
+ray = ["ray", "pyarrow", "pandas"]
 snappy = ["python-snappy"]
 hive = ["thrift"]
 s3fs = ["s3fs"]
@@ -237,6 +240,10 @@ ignore_missing_imports = true
 module = "duckdb.*"
 ignore_missing_imports = true
 
+[[tool.mypy.overrides]]
+module = "ray.*"
+ignore_missing_imports = true
+
 [[tool.mypy.overrides]]
 module = "pyparsing.*"
 ignore_missing_imports = true

diff --git a/python/tests/test_integration.py b/python/tests/test_integration.py
@@ -49,6 +49,11 @@ def table_test_null_nan_rewritten(catalog: Catalog) -> Table:
     return catalog.load_table("default.test_null_nan_rewritten")
 
 
+@pytest.fixture()
+def table_test_all_types(catalog: Catalog) -> Table:
+    return catalog.load_table("default.test_all_types")
+
+
 @pytest.mark.integration
 def test_pyarrow_nan(table_test_null_nan: Table) -> None:
     arrow_table = table_test_null_nan.scan(row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric")).to_arrow()
@@ -80,3 +85,36 @@ def test_duckdb_nan(table_test_null_nan_rewritten: Table) -> None:
     result = con.query("SELECT idx, col_numeric FROM table_test_null_nan WHERE isnan(col_numeric)").fetchone()
     assert result[0] == 1
     assert math.isnan(result[1])
+
+
+@pytest.mark.integration
+def test_ray_nan(table_test_null_nan_rewritten: Table) -> None:
+    ray_dataset = table_test_null_nan_rewritten.scan().to_ray()
+    assert ray_dataset.count() == 3
+    assert math.isnan(ray_dataset.take()[0]["col_numeric"])
+
+
+@pytest.mark.integration
+def test_ray_nan_rewritten(table_test_null_nan_rewritten: Table) -> None:
+    ray_dataset = table_test_null_nan_rewritten.scan(
+        row_filter=IsNaN("col_numeric"), selected_fields=("idx", "col_numeric")
+    ).to_ray()
+    assert ray_dataset.count() == 1
+    assert ray_dataset.take()[0]["idx"] == 1
+    assert math.isnan(ray_dataset.take()[0]["col_numeric"])
+
+
+@pytest.mark.integration
+@pytest.mark.skip(reason="Fixing issues with NaN's: https://github.com/apache/arrow/issues/34162")
+def test_ray_not_nan_count(table_test_null_nan_rewritten: Table) -> None:
+    ray_dataset = table_test_null_nan_rewritten.scan(row_filter=NotNaN("col_numeric"), selected_fields=("idx",)).to_ray()
+    print(ray_dataset.take())
+    assert ray_dataset.count() == 2
+
+
+@pytest.mark.integration
+def test_ray_all_types(table_test_all_types: Table) -> None:
+    ray_dataset = table_test_all_types.scan().to_ray()
+    pandas_dataframe = table_test_all_types.scan().to_pandas()
+    assert ray_dataset.count() == pandas_dataframe.shape[0]
+    assert pandas_dataframe.equals(ray_dataset.to_pandas())