Skip to content

Commit e61b8d8

Browse files
chriso and gemini-code-assist[bot]
authored and committed
[Data] Support serializing zero-length numpy arrays (#57858)
## Description Ray data can't serialize zero (byte) length numpy arrays: ```python3 import numpy as np import ray.data array = np.empty((2, 0), dtype=np.int8) ds = ray.data.from_items([{"array": array}]) for batch in ds.iter_batches(batch_size=1): print(batch) ``` What I expect to see: ``` {'array': array([], shape=(1, 2, 0), dtype=int8)} ``` What I see: ``` /Users/chris.ohara/Downloads/.venv/lib/python3.12/site-packages/ray/air/util/tensor_extensions/arrow.py:736: RuntimeWarning: invalid value encountered in scalar divide offsets = np.arange( 2025-10-17 17:18:09,499 WARNING arrow.py:189 -- Failed to convert column 'array' into pyarrow array due to: Error converting data to Arrow: column: 'array', shape: (1, 2, 0), dtype: int8, data: []; falling back to serialize as pickled python objects Traceback (most recent call last): File "/Users/chris.ohara/Downloads/.venv/lib/python3.12/site-packages/ray/air/util/tensor_extensions/arrow.py", line 672, in from_numpy return cls._from_numpy(arr) ^^^^^^^^^^^^^^^^^^^^ File "/Users/chris.ohara/Downloads/.venv/lib/python3.12/site-packages/ray/air/util/tensor_extensions/arrow.py", line 736, in _from_numpy offsets = np.arange( ^^^^^^^^^^ ValueError: arange: cannot compute length The above exception was the direct cause of the following exception: Traceback (most recent call last): File "/Users/chris.ohara/Downloads/.venv/lib/python3.12/site-packages/ray/air/util/tensor_extensions/arrow.py", line 141, in convert_to_pyarrow_array return ArrowTensorArray.from_numpy( ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ File "/Users/chris.ohara/Downloads/.venv/lib/python3.12/site-packages/ray/air/util/tensor_extensions/arrow.py", line 678, in from_numpy raise ArrowConversionError(data_str) from e ray.air.util.tensor_extensions.arrow.ArrowConversionError: Error converting data to Arrow: column: 'array', shape: (1, 2, 0), dtype: int8, data: [] 2025-10-17 17:18:09,789 INFO logging.py:293 -- Registered dataset logger for dataset dataset_0_0 2025-10-17 17:18:09,815 
WARNING resource_manager.py:134 -- ⚠️ Ray's object store is configured to use only 33.5% of available memory (2.0GiB out of 6.0GiB total). For optimal Ray Data performance, we recommend setting the object store to at least 50% of available memory. You can do this by setting the 'object_store_memory' parameter when calling ray.init() or by setting the RAY_DEFAULT_OBJECT_STORE_MEMORY_PROPORTION environment variable. {'array': array([array([], shape=(2, 0), dtype=int8)], dtype=object)} ``` This PR fixes the issue so that zero-length arrays are serialized correctly, and the shape and dtype is preserved. ## Additional information This is `ray==2.50.0`. --------- Signed-off-by: Chris O'Hara <[email protected]> Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com> Signed-off-by: Kamil Kaczmarek <[email protected]>
1 parent adef7b5 commit e61b8d8

File tree

2 files changed: +25 −6 lines

python/ray/air/util/tensor_extensions/arrow.py

Lines changed: 11 additions & 6 deletions
```diff
@@ -732,13 +732,18 @@ def _from_numpy(
         else:
             pa_type_ = ArrowTensorType(element_shape, scalar_dtype)

+        offset_dtype = pa_type_.OFFSET_DTYPE.to_pandas_dtype()
+
         # Create offsets buffer
-        offsets = np.arange(
-            0,
-            (outer_len + 1) * num_items_per_element,
-            num_items_per_element,
-            dtype=pa_type_.OFFSET_DTYPE.to_pandas_dtype(),
-        )
+        if num_items_per_element == 0:
+            offsets = np.zeros(outer_len + 1, dtype=offset_dtype)
+        else:
+            offsets = np.arange(
+                0,
+                (outer_len + 1) * num_items_per_element,
+                num_items_per_element,
+                dtype=offset_dtype,
+            )
         offset_buffer = pa.py_buffer(offsets)

         storage = pa.Array.from_buffers(
```

python/ray/data/tests/test_tensor_extension.py

Lines changed: 14 additions & 0 deletions
```diff
@@ -507,6 +507,20 @@ def test_tensor_array_reductions(restore_data_context, tensor_format):
         np.testing.assert_equal(df["two"].agg(name), reducer(arr, axis=0, **np_kwargs))


+@pytest.mark.parametrize("tensor_format", ["v1", "v2"])
+@pytest.mark.parametrize("shape", [(2, 0), (2, 5, 0), (0, 5), (0, 0)])
+def test_zero_length_arrow_tensor_array_roundtrip(
+    restore_data_context, tensor_format, shape
+):
+    DataContext.get_current().use_arrow_tensor_v2 = tensor_format == "v2"
+
+    arr = np.empty(shape, dtype=np.int8)
+    t_arr = ArrowTensorArray.from_numpy(arr)
+    assert len(t_arr) == len(arr)
+    out = t_arr.to_numpy()
+    np.testing.assert_array_equal(out, arr)
+
+
 @pytest.mark.parametrize("tensor_format", ["v1", "v2"])
 @pytest.mark.parametrize("chunked", [False, True])
 def test_arrow_tensor_array_getitem(chunked, restore_data_context, tensor_format):
```

0 commit comments

Comments (0)