Skip to content

Commit b487cf1

Browse files
authored
feat: add bigquery.json_keys (#2286)
1 parent 0cb5217 commit b487cf1

File tree

9 files changed

+146
-0
lines changed

9 files changed

+146
-0
lines changed

bigframes/bigquery/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,7 @@
4747
json_extract,
4848
json_extract_array,
4949
json_extract_string_array,
50+
json_keys,
5051
json_query,
5152
json_query_array,
5253
json_set,
@@ -138,6 +139,7 @@
138139
"json_extract",
139140
"json_extract_array",
140141
"json_extract_string_array",
142+
"json_keys",
141143
"json_query",
142144
"json_query_array",
143145
"json_set",

bigframes/bigquery/_operations/json.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -421,6 +421,35 @@ def json_value_array(
421421
return input._apply_unary_op(ops.JSONValueArray(json_path=json_path))
422422

423423

424+
def json_keys(
    input: series.Series,
    max_depth: Optional[int] = None,
) -> series.Series:
    """Extracts the unique keys of a JSON value as an ARRAY of STRINGs.

    Nested keys are included and are reported as dot-separated paths
    (for example ``b.c``), as the example below shows.

    **Examples:**

        >>> import bigframes.pandas as bpd
        >>> import bigframes.bigquery as bbq

        >>> s = bpd.Series(['{"b": {"c": 2}, "a": 1}'], dtype="json")
        >>> bbq.json_keys(s)
        0    ['a' 'b' 'b.c']
        dtype: list<item: string>[pyarrow]

    Args:
        input (bigframes.series.Series):
            The Series containing JSON data.
        max_depth (int, optional):
            Specifies the maximum depth of nested fields to search for keys.
            Must be greater than zero when given. If not provided, keys at
            all levels are searched.

    Returns:
        bigframes.series.Series: A new Series containing arrays of keys from the input JSON.

    Raises:
        ValueError: If ``max_depth`` is provided and is not greater than zero.
    """
    # BigQuery's JSON_KEYS rejects a non-positive max_depth at query time;
    # fail fast here so the mistake surfaces before any query is built.
    if max_depth is not None and max_depth <= 0:
        raise ValueError(f"max_depth must be greater than 0. Received: {max_depth}")
    return input._apply_unary_op(ops.JSONKeys(max_depth=max_depth))
451+
452+
424453
def to_json(
425454
input: series.Series,
426455
) -> series.Series:

bigframes/core/compile/ibis_compiler/scalar_op_registry.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1234,6 +1234,11 @@ def json_value_array_op_impl(x: ibis_types.Value, op: ops.JSONValueArray):
12341234
return json_value_array(json_obj=x, json_path=op.json_path)
12351235

12361236

1237+
@scalar_op_compiler.register_unary_op(ops.JSONKeys, pass_op=True)
def json_keys_op_impl(x: ibis_types.Value, op: ops.JSONKeys):
    """Compile ``ops.JSONKeys`` by delegating to the ``json_keys`` builtin."""
    depth_limit = op.max_depth
    return json_keys(x, depth_limit)
1240+
1241+
12371242
# Blob Ops
12381243
@scalar_op_compiler.register_unary_op(ops.obj_fetch_metadata_op)
12391244
def obj_fetch_metadata_op_impl(obj_ref: ibis_types.Value):
@@ -2059,6 +2064,14 @@ def to_json_string(value) -> ibis_dtypes.String: # type: ignore[empty-body]
20592064
"""Convert value to JSON-formatted string."""
20602065

20612066

2067+
@ibis_udf.scalar.builtin(name="json_keys")
def json_keys(  # type: ignore[empty-body]
    json_obj: ibis_dtypes.JSON,
    max_depth: ibis_dtypes.Int64,
) -> ibis_dtypes.Array[ibis_dtypes.String]:
    """Return the distinct keys found in a JSON expression.

    Declaration-only stub: the decorator registers it under the builtin
    name ``json_keys``, so the body is intentionally empty.
    """
2073+
2074+
20622075
@ibis_udf.scalar.builtin(name="json_value")
20632076
def json_value( # type: ignore[empty-body]
20642077
json_obj: ibis_dtypes.JSON, json_path: ibis_dtypes.String

bigframes/core/compile/sqlglot/expressions/json_ops.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,11 @@ def _(expr: TypedExpr, op: ops.JSONExtractStringArray) -> sge.Expression:
3939
return sge.func("JSON_EXTRACT_STRING_ARRAY", expr.expr, sge.convert(op.json_path))
4040

4141

42+
@register_unary_op(ops.JSONKeys, pass_op=True)
def _(expr: TypedExpr, op: ops.JSONKeys) -> sge.Expression:
    # The depth argument is always emitted; a max_depth of None is rendered
    # as a SQL NULL literal (e.g. JSON_KEYS(`json_col`, NULL)).
    depth_arg = sge.convert(op.max_depth)
    return sge.func("JSON_KEYS", expr.expr, depth_arg)
45+
46+
4247
@register_unary_op(ops.JSONQuery, pass_op=True)
4348
def _(expr: TypedExpr, op: ops.JSONQuery) -> sge.Expression:
4449
return sge.func("JSON_QUERY", expr.expr, sge.convert(op.json_path))

bigframes/operations/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,6 +128,7 @@
128128
JSONExtract,
129129
JSONExtractArray,
130130
JSONExtractStringArray,
131+
JSONKeys,
131132
JSONQuery,
132133
JSONQueryArray,
133134
JSONSet,
@@ -381,6 +382,7 @@
381382
"JSONExtract",
382383
"JSONExtractArray",
383384
"JSONExtractStringArray",
385+
"JSONKeys",
384386
"JSONQuery",
385387
"JSONQueryArray",
386388
"JSONSet",

bigframes/operations/json_ops.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,23 @@ def output_type(self, *input_types):
199199
return input_type
200200

201201

202+
@dataclasses.dataclass(frozen=True)
class JSONKeys(base_ops.UnaryOp):
    """Unary op that lists the keys of a JSON value as an array of strings."""

    name: typing.ClassVar[str] = "json_keys"
    # Optional limit on how deep nested keys are searched; defaults to None.
    max_depth: typing.Optional[int] = None

    def output_type(self, *input_types):
        """Return the list-of-string result dtype for a JSON operand.

        Raises:
            TypeError: If the first input type is not ``dtypes.JSON_DTYPE``.
        """
        operand_type = input_types[0]
        if operand_type != dtypes.JSON_DTYPE:
            message = (
                "Input type must be a valid JSON object or JSON-formatted string type."
                f" Received type: {operand_type}"
            )
            raise TypeError(message)

        element_type = dtypes.bigframes_dtype_to_arrow_dtype(dtypes.STRING_DTYPE)
        return pd.ArrowDtype(pa.list_(element_type))
217+
218+
202219
@dataclasses.dataclass(frozen=True)
203220
class JSONDecode(base_ops.UnaryOp):
204221
name: typing.ClassVar[str] = "json_decode"

tests/system/small/bigquery/test_json.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -434,3 +434,53 @@ def test_to_json_string_from_struct():
434434
)
435435

436436
pd.testing.assert_series_equal(actual.to_pandas(), expected.to_pandas())
437+
438+
439+
def test_json_keys():
    """json_keys lists each object's keys alphabetically; ``{}`` gives an empty list."""
    list_of_strings = pd.ArrowDtype(pa.list_(pa.string()))

    documents = bpd.Series(
        [
            '{"name": "Alice", "age": 30}',
            '{"city": "New York", "country": "USA", "active": true}',
            "{}",
            '{"items": [1, 2, 3]}',
        ],
        dtype=dtypes.JSON_DTYPE,
    )
    result = bbq.json_keys(documents)

    expected = bpd.Series(
        [
            ["age", "name"],
            ["active", "city", "country"],
            [],
            ["items"],
        ],
        dtype=list_of_strings,
    )
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
463+
464+
465+
def test_json_keys_with_max_depth():
    """With max_depth=2, keys nested deeper than two levels are not reported."""
    list_of_strings = pd.ArrowDtype(pa.list_(pa.string()))

    documents = bpd.Series(
        [
            '{"user": {"name": "Bob", "details": {"id": 123, "status": "approved"}}}',
            '{"user": {"name": "Charlie"}}',
        ],
        dtype=dtypes.JSON_DTYPE,
    )
    result = bbq.json_keys(documents, max_depth=2)

    expected = bpd.Series(
        [
            ["user", "user.details", "user.name"],
            ["user", "user.name"],
        ],
        dtype=list_of_strings,
    )
    pd.testing.assert_series_equal(result.to_pandas(), expected.to_pandas())
481+
482+
483+
def test_json_keys_from_string_error():
    """A Series built without the JSON dtype is rejected with TypeError."""
    plain_strings = bpd.Series(['{"a": 1, "b": 2}', '{"c": 3}'])

    with pytest.raises(TypeError):
        bbq.json_keys(plain_strings)
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
WITH `bfcte_0` AS (
2+
SELECT
3+
`json_col`
4+
FROM `bigframes-dev`.`sqlglot_test`.`json_types`
5+
), `bfcte_1` AS (
6+
SELECT
7+
*,
8+
JSON_KEYS(`json_col`, NULL) AS `bfcol_1`,
9+
JSON_KEYS(`json_col`, 2) AS `bfcol_2`
10+
FROM `bfcte_0`
11+
)
12+
SELECT
13+
`bfcol_1` AS `json_keys`,
14+
`bfcol_2` AS `json_keys_w_max_depth`
15+
FROM `bfcte_1`

tests/unit/core/compile/sqlglot/expressions/test_json_ops.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,19 @@ def test_json_extract_string_array(json_types_df: bpd.DataFrame, snapshot):
5252
snapshot.assert_match(sql, "out.sql")
5353

5454

55+
def test_json_keys(json_types_df: bpd.DataFrame, snapshot):
    """Snapshot the SQL generated for JSONKeys with and without max_depth."""
    column = "json_col"
    frame = json_types_df[[column]]

    # (output label, expression) pairs, kept in the order they are compiled.
    named_exprs = [
        ("json_keys", ops.JSONKeys().as_expr(column)),
        ("json_keys_w_max_depth", ops.JSONKeys(max_depth=2).as_expr(column)),
    ]
    exprs = [expression for _, expression in named_exprs]
    labels = [label for label, _ in named_exprs]

    sql = utils._apply_ops_to_sql(frame, exprs, labels)
    snapshot.assert_match(sql, "out.sql")
66+
67+
5568
def test_json_query(json_types_df: bpd.DataFrame, snapshot):
5669
col_name = "json_col"
5770
bf_df = json_types_df[[col_name]]

0 commit comments

Comments
 (0)