diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index fb51fa743975c..aba755cec8990 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -539,6 +539,7 @@ object FunctionRegistry { expression[JsonToStructs]("from_json"), expression[SchemaOfJson]("schema_of_json"), expression[LengthOfJsonArray]("json_array_length"), + expression[JsonObjectKeys]("json_object_keys"), // cast expression[Cast]("cast"), diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala index fbb11de6a310e..98068360183ff 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/expressions/jsonExpressions.scala @@ -19,6 +19,7 @@ package org.apache.spark.sql.catalyst.expressions import java.io._ +import scala.collection.mutable.ArrayBuffer import scala.util.parsing.combinator.RegexParsers import com.fasterxml.jackson.core._ @@ -864,3 +865,71 @@ case class LengthOfJsonArray(child: Expression) extends UnaryExpression length } } + +/** + * A function which returns all the keys of the outmost JSON object. + */ +@ExpressionDescription( + usage = "_FUNC_(json_object) - Returns all the keys of the outmost JSON object as an array.", + arguments = """ + Arguments: + * json_object - A JSON object. If a valid JSON object is given, all the keys of the outmost + object will be returned as an array. If it is any other valid JSON string, an invalid JSON + string or an empty string, the function returns null. + """, + examples = """ + Examples: + > Select _FUNC_('{}'); + [] + > Select _FUNC_('{"key": "value"}'); + ["key"] + > Select _FUNC_('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}'); + ["f1","f2"] + """, + since = "3.1.0" +) +case class JsonObjectKeys(child: Expression) extends UnaryExpression with CodegenFallback + with ExpectsInputTypes { + + override def inputTypes: Seq[DataType] = Seq(StringType) + override def dataType: DataType = ArrayType(StringType) + override def nullable: Boolean = true + override def prettyName: String = "json_object_keys" + + override def eval(input: InternalRow): Any = { + val json = child.eval(input).asInstanceOf[UTF8String] + // return null for `NULL` input + if(json == null) { + return null + } + + try { + Utils.tryWithResource(CreateJacksonParser.utf8String(SharedFactory.jsonFactory, json)) { + parser => { + // return null if an empty string or any other valid JSON string is encountered + if (parser.nextToken() == null || parser.currentToken() != JsonToken.START_OBJECT) { + return null + } + // Parse the JSON string to get all the keys of outmost JSON object + getJsonKeys(parser, input) + } + } + } catch { + case _: JsonProcessingException | _: IOException => null + } + } + + private def getJsonKeys(parser: JsonParser, input: InternalRow): GenericArrayData = { + var arrayBufferOfKeys = ArrayBuffer.empty[UTF8String] + + // traverse until the end of input and ensure it returns valid key + while(parser.nextValue() != null && parser.currentName() != null) { + // add current fieldName to the ArrayBuffer + arrayBufferOfKeys += UTF8String.fromString(parser.getCurrentName) + + // skip all the children of inner object or array + parser.skipChildren() + } + new GenericArrayData(arrayBufferOfKeys.toArray) + } +} diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala index 2536d28b7d6cf..6f062dcc9a1ce 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/expressions/JsonExpressionsSuite.scala @@ -831,4 +831,26 @@ class JsonExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper with checkEvaluation(LengthOfJsonArray(Literal(literal)), expectedValue) } } + + test("json_object_keys") { + Seq( + // Invalid inputs + ("", null), + ("[]", null), + ("""[{"key": "JSON"}]""", null), + ("""{"key": 45, "random_string"}""", null), + ("""{[1, 2, {"Key": "Invalid JSON"}]}""", null), + // JSON objects + ("{}", Seq.empty[UTF8String]), + ("""{"key": 1}""", Seq("key")), + ("""{"key": "value", "key2": 2}""", Seq("key", "key2")), + ("""{"arrayKey": [1, 2, 3]}""", Seq("arrayKey")), + ("""{"key":[1,2,3,{"key":"value"},[1,2,3]]}""", Seq("key")), + ("""{"f1":"abc","f2":{"f3":"a", "f4":"b"}}""", Seq("f1", "f2")), + ("""{"k1": [1, 2, {"key": 5}], "k2": {"key2": [1, 2]}}""", Seq("k1", "k2")) + ).foreach { + case (input, expected) => + checkEvaluation(JsonObjectKeys(Literal(input)), expected) + } + } } diff --git a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql index 06de7982efce8..131890fddb0db 100644 --- a/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql +++ b/sql/core/src/test/resources/sql-tests/inputs/json-functions.sql @@ -72,5 +72,21 @@ select json_array_length('[1,2,3,[33,44],{"key":[2,3,4]}]'); select json_array_length('{"key":"not a json array"}'); select json_array_length('[1,2,3,4,5'); +-- json_object_keys +select json_object_keys(); +select json_object_keys(null); +select json_object_keys(200); +select json_object_keys(''); +select json_object_keys('{}'); +select json_object_keys('{"key": 1}'); +select json_object_keys('{"key": "value", "key2": 2}'); +select json_object_keys('{"arrayKey": [1, 2, 3]}'); +select json_object_keys('{"key":[1,2,3,{"key":"value"},[1,2,3]]}'); +select json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}'); +select json_object_keys('{"k1": [1, 2, {"key": 5}], "k2": {"key2": [1, 2]}}'); +select json_object_keys('{[1,2]}'); +select json_object_keys('{"key": 45, "random_string"}'); +select json_object_keys('[1, 2, 3]'); + -- Clean up DROP VIEW IF EXISTS jsonTable; diff --git a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out index 135b18cd29801..866fd1245d0ed 100644 --- a/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out +++ b/sql/core/src/test/resources/sql-tests/results/json-functions.sql.out @@ -1,5 +1,5 @@ -- Automatically generated by SQLQueryTestSuite --- Number of queries: 53 +-- Number of queries: 67 -- !query @@ -436,6 +436,120 @@ struct NULL +-- !query +select json_object_keys() +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +Invalid number of arguments for function json_object_keys. Expected: 1; Found: 0; line 1 pos 7 + + +-- !query +select json_object_keys(null) +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select json_object_keys(200) +-- !query schema +struct<> +-- !query output +org.apache.spark.sql.AnalysisException +cannot resolve 'json_object_keys(200)' due to data type mismatch: argument 1 requires string type, however, '200' is of int type.; line 1 pos 7 + + +-- !query +select json_object_keys('') +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select json_object_keys('{}') +-- !query schema +struct> +-- !query output +[] + + +-- !query +select json_object_keys('{"key": 1}') +-- !query schema +struct> +-- !query output +["key"] + + +-- !query +select json_object_keys('{"key": "value", "key2": 2}') +-- !query schema +struct> +-- !query output +["key","key2"] + + +-- !query +select json_object_keys('{"arrayKey": [1, 2, 3]}') +-- !query schema +struct> +-- !query output +["arrayKey"] + + +-- !query +select json_object_keys('{"key":[1,2,3,{"key":"value"},[1,2,3]]}') +-- !query schema +struct> +-- !query output +["key"] + + +-- !query +select json_object_keys('{"f1":"abc","f2":{"f3":"a", "f4":"b"}}') +-- !query schema +struct> +-- !query output +["f1","f2"] + + +-- !query +select json_object_keys('{"k1": [1, 2, {"key": 5}], "k2": {"key2": [1, 2]}}') +-- !query schema +struct> +-- !query output +["k1","k2"] + + +-- !query +select json_object_keys('{[1,2]}') +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select json_object_keys('{"key": 45, "random_string"}') +-- !query schema +struct> +-- !query output +NULL + + +-- !query +select json_object_keys('[1, 2, 3]') +-- !query schema +struct> +-- !query output +NULL + + -- !query DROP VIEW IF EXISTS jsonTable -- !query schema