Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 28 additions & 0 deletions bigframes/core/blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -721,6 +721,34 @@ def to_pandas_batches(
for col in itertools.chain(self.value_columns, self.index_columns)
}
)
series_map = {}
for col in itertools.chain(self.value_columns, self.index_columns):
dtype = self.expr.get_column_type(col)
if bigframes.dtypes.contains_db_dtypes_json_dtype(dtype):
# Due to a limitation in Apache Arrow (#45262), JSON columns are not
# natively supported by the to_pandas_batches() method, which is
# used by the anywidget backend.
# Workaround for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
# PyArrow doesn't support creating an empty array with db_dtypes.JSONArrowType,
# especially when nested.
# Create with string type and then cast.

# MyPy doesn't automatically narrow the type of 'dtype' here,
# so we add an explicit check.
if isinstance(dtype, pd.ArrowDtype):
safe_pa_type = bigframes.dtypes._replace_json_arrow_with_string(
dtype.pyarrow_dtype
)
safe_dtype = pd.ArrowDtype(safe_pa_type)
series_map[col] = pd.Series([], dtype=safe_dtype).astype(dtype)
else:
# This branch should ideally not be reached if
# contains_db_dtypes_json_dtype is accurate,
# but it's here for MyPy's sake.
series_map[col] = pd.Series([], dtype=dtype)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chelsea-lin I assume we have similar code that does this, right? Maybe there's something that could be reused here?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, we have something similar in the loader component but they're slightly different.

def _validate_dtype_can_load(name: str, column_type: bigframes.dtypes.Dtype):

Also, I agree that we can simplify the logic a little bit, for example:

dtype = pd.ArrowDtype(pa.list_(pa.struct([("key", db_dtypes.JSONArrowType())])))
try:
    s = pd.Series([], dtype=dtype)
except pa.ArrowNotImplementedError as e:
    s = pd.Series([], dtype=pd.ArrowDtype(_replace_json_arrow_with_string(dtype.pyarrow_dtype))).astype(dtype)

Copy link
Contributor Author

@shuoweil shuoweil Oct 31, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The logic has been simplified

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks! The new logic looks even better!

else:
series_map[col] = pd.Series([], dtype=dtype)
empty_val = pd.DataFrame(series_map)
dfs = map(
lambda a: a[0],
itertools.zip_longest(
Expand Down
15 changes: 15 additions & 0 deletions bigframes/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,21 @@ def contains_db_dtypes_json_dtype(dtype):
return contains_db_dtypes_json_arrow_type(dtype.pyarrow_dtype)


def _replace_json_arrow_with_string(pa_type: pa.DataType) -> pa.DataType:
    """Recursively replace JSONArrowType with string type.

    Walks the (possibly nested) Arrow type and returns an equivalent type in
    which every ``db_dtypes.JSONArrowType`` leaf has been swapped for a plain
    ``pa.string()``. List and struct containers are rebuilt around their
    converted children; any other type is returned unchanged.
    """
    if isinstance(pa_type, db_dtypes.JSONArrowType):
        # JSON leaf: substitute a plain Arrow string.
        return pa.string()
    elif isinstance(pa_type, pa.ListType):
        # Rebuild the list around the converted element type.
        return pa.list_(_replace_json_arrow_with_string(pa_type.value_type))
    elif isinstance(pa_type, pa.StructType):
        # Rebuild every field (names and order preserved) with its type converted.
        converted_fields = [
            struct_field.with_type(_replace_json_arrow_with_string(struct_field.type))
            for struct_field in pa_type
        ]
        return pa.struct(converted_fields)
    else:
        # Non-JSON, non-container types pass through untouched.
        return pa_type


def warn_on_db_dtypes_json_dtype(dtypes):
"""Warn that the JSON dtype is changing.

Expand Down
71 changes: 71 additions & 0 deletions tests/system/small/test_dataframe_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -376,6 +376,77 @@ def test_to_pandas_batches_w_empty_dataframe(session):
pandas.testing.assert_series_equal(results[0].dtypes, empty.dtypes)


def test_to_pandas_batches_w_empty_dataframe_json_in_list(session):
    """Tests to_pandas_batches() with an empty DataFrame containing a list of JSON.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    import db_dtypes

    # An Arrow list whose element type is the db-dtypes JSON extension type.
    expected_dtype = pd.ArrowDtype(pa.list_(db_dtypes.JSONArrowType()))
    frame = bpd.DataFrame(
        {
            "idx": pd.Series([], dtype="Int64"),
            "json_list_col": pd.Series([], dtype=expected_dtype),
        },
        session=session,
    ).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one (empty) batch comes back, with the JSON-list dtype intact.
    assert len(batches) == 1
    first_batch = batches[0]
    assert list(first_batch.columns) == ["json_list_col"]
    assert first_batch.dtypes["json_list_col"] == expected_dtype
    assert len(first_batch) == 0


def test_to_pandas_batches_w_empty_dataframe_json_in_struct(session):
    """Tests to_pandas_batches() with an empty DataFrame containing a struct of JSON.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    import db_dtypes

    # A struct with a single JSON-typed field, using the db-dtypes extension type.
    expected_dtype = pd.ArrowDtype(
        pa.struct([("json_field", db_dtypes.JSONArrowType())])
    )
    frame = bpd.DataFrame(
        {
            "idx": pd.Series([], dtype="Int64"),
            "json_struct_col": pd.Series([], dtype=expected_dtype),
        },
        session=session,
    ).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one (empty) batch comes back, with the JSON-struct dtype intact.
    assert len(batches) == 1
    first_batch = batches[0]
    assert list(first_batch.columns) == ["json_struct_col"]
    assert first_batch.dtypes["json_struct_col"] == expected_dtype
    assert len(first_batch) == 0


def test_to_pandas_batches_w_empty_dataframe_simple_json(session):
    """Tests to_pandas_batches() with an empty DataFrame containing a simple JSON column.

    Regression test for https://github.com/googleapis/python-bigquery-dataframes/issues/1273
    """
    frame = bpd.DataFrame(
        {
            "idx": pd.Series([], dtype="Int64"),
            "json_col": pd.Series([], dtype=dtypes.JSON_DTYPE),
        },
        session=session,
    ).set_index("idx", drop=True)

    batches = list(frame.to_pandas_batches())

    # Exactly one (empty) batch comes back, with the top-level JSON dtype intact.
    assert len(batches) == 1
    first_batch = batches[0]
    assert list(first_batch.columns) == ["json_col"]
    assert first_batch.dtypes["json_col"] == dtypes.JSON_DTYPE
    assert len(first_batch) == 0


@pytest.mark.parametrize("allow_large_results", (True, False))
def test_to_pandas_batches_w_page_size_and_max_results(session, allow_large_results):
"""Verify to_pandas_batches() APIs returns the expected page size.
Expand Down
Loading