Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions bigframes/bigquery/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@
json_extract,
json_extract_array,
json_extract_string_array,
json_keys,
json_query,
json_query_array,
json_set,
Expand Down Expand Up @@ -138,6 +139,7 @@
"json_extract",
"json_extract_array",
"json_extract_string_array",
"json_keys",
"json_query",
"json_query_array",
"json_set",
Expand Down
29 changes: 29 additions & 0 deletions bigframes/bigquery/_operations/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -421,6 +421,35 @@ def json_value_array(
return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))


def json_keys(
    input: series.Series,
    max_depth: Optional[int] = None,
) -> series.Series:
    """Returns the unique keys of each JSON object as an ARRAY of STRINGs.

    By default keys are searched recursively at every level of nesting, and
    nested keys are reported in dotted form (e.g. ``"b.c"`` in the example
    below). Use ``max_depth`` to limit how deep the search goes.

    **Examples:**

    >>> import bigframes.pandas as bpd
    >>> import bigframes.bigquery as bbq

    >>> s = bpd.Series(['{"b": {"c": 2}, "a": 1}'], dtype="json")
    >>> bbq.json_keys(s)
    0 ['a' 'b' 'b.c']
    dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data.
        max_depth (int, optional):
            Specifies the maximum depth of nested fields to search for keys. If
            not provided, keys at all levels are searched.

    Returns:
        bigframes.series.Series: A new Series containing arrays of keys from the input JSON.
    """
    return input._apply_unary_op(ops.JSONKeys(max_depth=max_depth))


def to_json(
input: series.Series,
) -> series.Series:
Expand Down
13 changes: 13 additions & 0 deletions bigframes/core/compile/ibis_compiler/scalar_op_registry.py
Original file line number Diff line number Diff line change
Expand Up @@ -1234,6 +1234,11 @@ def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
return json_value_array(json_obj=x, json_path=op.json_path)


@scalar_op_compiler.register_unary_op(ops.JSONKeys, pass_op=True)
def json_keys_op_impl(x: ibis_types.Value, op: ops.JSONKeys):
    """Compile JSONKeys by delegating to the JSON_KEYS builtin binding."""
    # NOTE(review): op.max_depth may be None, which compiles to a NULL argument;
    # confirm BigQuery's JSON_KEYS treats NULL max_depth as "all levels" rather
    # than propagating NULL.
    return json_keys(json_obj=x, max_depth=op.max_depth)


# Blob Ops
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
Expand Down Expand Up @@ -2059,6 +2064,14 @@ def to_json_string(value) -> ibis_dtypes.String: # type: ignore[empty-body]
"""Convert value to JSON-formatted string."""


@ibis_udf.scalar.builtin(name="json_keys")
def json_keys(  # type: ignore[empty-body]
    json_obj: ibis_dtypes.JSON,
    max_depth: ibis_dtypes.Int64,
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Extracts unique JSON keys from a JSON expression.

    Binding for the BigQuery ``JSON_KEYS`` builtin; the body is intentionally
    empty because ibis compiles calls directly to the named SQL function.
    """


@ibis_udf.scalar.builtin(name="json_value")
def json_value( # type: ignore[empty-body]
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String
Expand Down
5 changes: 5 additions & 0 deletions bigframes/core/compile/sqlglot/expressions/json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ def _(expr: TypedExpr, op: ops.JSONExtractStringArray) -> sge.Expression:
return sge.func("JSON_EXTRACT_STRING_ARRAY", expr.expr, sge.convert(op.json_path))


@register_unary_op(ops.JSONKeys, pass_op=True)
def _(expr: TypedExpr, op: ops.JSONKeys) -> sge.Expression:
    # Compiles JSONKeys to BigQuery's JSON_KEYS(json_expr, max_depth).
    # NOTE(review): when op.max_depth is None this emits JSON_KEYS(col, NULL)
    # (see the committed snapshot); confirm BigQuery interprets a NULL
    # max_depth as "search all levels" rather than returning NULL — otherwise
    # the second argument should be omitted when max_depth is None.
    return sge.func("JSON_KEYS", expr.expr, sge.convert(op.max_depth))


@register_unary_op(ops.JSONQuery, pass_op=True)
def _(expr: TypedExpr, op: ops.JSONQuery) -> sge.Expression:
return sge.func("JSON_QUERY", expr.expr, sge.convert(op.json_path))
Expand Down
2 changes: 2 additions & 0 deletions bigframes/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@
JSONExtract,
JSONExtractArray,
JSONExtractStringArray,
JSONKeys,
JSONQuery,
JSONQueryArray,
JSONSet,
Expand Down Expand Up @@ -381,6 +382,7 @@
"JSONExtract",
"JSONExtractArray",
"JSONExtractStringArray",
"JSONKeys",
"JSONQuery",
"JSONQueryArray",
"JSONSet",
Expand Down
17 changes: 17 additions & 0 deletions bigframes/operations/json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,23 @@ def output_type(self, *input_types):
return input_type


@dataclasses.dataclass(frozen=True)
class JSONKeys(base_ops.UnaryOp):
    """Scalar op extracting the unique keys of a JSON value as list<string>."""

    name: typing.ClassVar[str] = "json_keys"
    # Maximum nesting depth to search for keys; None searches every level.
    max_depth: typing.Optional[int] = None

    def output_type(self, *input_types):
        """Return list<string> dtype for JSON input; reject any other dtype."""
        if input_types[0] != dtypes.JSON_DTYPE:
            raise TypeError(
                "Input type must be a valid JSON object or JSON-formatted string type."
                + f" Received type: {input_types[0]}"
            )
        element_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
        return pd.ArrowDtype(pa.list_(element_type))


@dataclasses.dataclass(frozen=True)
class JSONDecode(base_ops.UnaryOp):
name: typing.ClassVar[str] = "json_decode"
Expand Down
50 changes: 50 additions & 0 deletions tests/system/small/bigquery/test_json.py
Original file line number Diff line number Diff line change
Expand Up @@ -434,3 +434,53 @@ def test_to_json_string_from_struct():
)

pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())


def test_json_keys():
    """json_keys returns the sorted top-level keys of each JSON value."""
    series = bpd.Series(
        [
            '{"name": "Alice", "age": 30}',
            '{"city": "New York", "country": "USA", "active": true}',
            "{}",
            '{"items": [1, 2, 3]}',
        ],
        dtype=dtypes.JSON_DTYPE,
    )

    result = bbq.json_keys(series)

    expected = bpd.Series(
        [
            ["age", "name"],
            ["active", "city", "country"],
            [],
            ["items"],
        ],
        dtype=pd.ArrowDtype(pa.list_(pa.string())),
    )
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


def test_json_keys_with_max_depth():
    """max_depth=2 limits key discovery to two levels of nesting."""
    series = bpd.Series(
        [
            '{"user": {"name": "Bob", "details": {"id": 123, "status": "approved"}}}',
            '{"user": {"name": "Charlie"}}',
        ],
        dtype=dtypes.JSON_DTYPE,
    )

    result = bbq.json_keys(series, max_depth=2)

    expected = bpd.Series(
        [
            ["user", "user.details", "user.name"],
            ["user", "user.name"],
        ],
        dtype=pd.ArrowDtype(pa.list_(pa.string())),
    )
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())


def test_json_keys_from_string_error():
    """json_keys rejects plain-string Series; only the JSON dtype is accepted."""
    string_series = bpd.Series(['{"a": 1, "b": 2}', '{"c": 3}'])

    with pytest.raises(TypeError):
        bbq.json_keys(string_series)
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
WITH `bfcte_0` AS (
SELECT
`json_col`
FROM `bigframes-dev`.`sqlglot_test`.`json_types`
), `bfcte_1` AS (
SELECT
*,
JSON_KEYS(`json_col`, NULL) AS `bfcol_1`,
JSON_KEYS(`json_col`, 2) AS `bfcol_2`
FROM `bfcte_0`
)
SELECT
`bfcol_1` AS `json_keys`,
`bfcol_2` AS `json_keys_w_max_depth`
FROM `bfcte_1`
13 changes: 13 additions & 0 deletions tests/unit/core/compile/sqlglot/expressions/test_json_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,19 @@ def test_json_extract_string_array(json_types_df: bpd.DataFrame, snapshot):
snapshot.assert_match(sql, "out.sql")


def test_json_keys(json_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL emitted for JSONKeys with and without max_depth."""
    bf_df = json_types_df[["json_col"]]

    exprs = {
        "json_keys": ops.JSONKeys().as_expr("json_col"),
        "json_keys_w_max_depth": ops.JSONKeys(max_depth=2).as_expr("json_col"),
    }

    sql = utils._apply_ops_to_sql(bf_df, list(exprs.values()), list(exprs.keys()))
    snapshot.assert_match(sql, "out.sql")


def test_json_query(json_types_df: bpd.DataFrame, snapshot):
col_name = "json_col"
bf_df = json_types_df[[col_name]]
Expand Down