18 changes: 14 additions & 4 deletions R/pkg/R/functions.R
@@ -2430,33 +2430,43 @@ setMethod("date_format", signature(y = "Column", x = "character"),
column(jc)
})

setClassUnion("characterOrstructType", c("character", "structType"))
@HyukjinKwon (Member Author) commented on Mar 6, 2017:
Can I use a class union here in this file?


#' from_json
#'
#' Parses a column containing a JSON string into a Column of \code{structType} with the specified
#' \code{schema}. If the string is unparseable, the Column will contain the value NA.
#'
#' @param x Column containing the JSON string.
#' @param schema a structType object to use as the schema to use when parsing the JSON string.
#' @param schema a structType object, or a data type string (in the format accepted by
#'               structField) representing an array or struct type, to use as the schema
#'               when parsing the JSON string.
#' @param ... additional named properties to control how the json is parsed, accepts the same
#' options as the JSON data source.
#'
#' @family normal_funcs
#' @rdname from_json
#' @name from_json
#' @aliases from_json,Column,structType-method
#' @aliases from_json,Column,characterOrstructType-method
#' @export
#' @examples
#' \dontrun{
#' schema <- structType(structField("name", "string"))
#' select(df, from_json(df$value, schema, dateFormat = "dd/MM/yyyy"))
#'}
#' @note from_json since 2.2.0
setMethod("from_json", signature(x = "Column", schema = "structType"),
setMethod("from_json", signature(x = "Column", schema = "characterOrstructType"),
function(x, schema, ...) {
if (is.character(schema)) {
jschema <- callJStatic("org.apache.spark.sql.api.r.SQLUtils",
"getSQLDataType",
schema)
} else {
jschema <- schema$jobj
}
options <- varargsToStrEnv(...)
jc <- callJStatic("org.apache.spark.sql.functions",
"from_json",
x@jc, schema$jobj, options)
x@jc, jschema, options)
column(jc)
})
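
To illustrate the updated method, here is a minimal usage sketch of the two schema forms it now accepts. This is a sketch rather than part of the patch: the data frame and column names are assumed, and an active SparkR session is presumed.

library(SparkR)
sparkR.session()

# A SparkDataFrame with a string column holding JSON, as in the tests below.
df <- as.DataFrame(list(list(json = "{\"age\":16,\"height\":176.5}")))

# Existing path: the schema is a structType object (schema$jobj is passed through).
schema_obj <- structType(structField("age", "integer"),
                         structField("height", "double"))
collect(select(df, from_json(df$json, schema_obj)))

# New path: the schema is a data type string, converted on the JVM side via
# SQLUtils.getSQLDataType before calling functions.from_json.
collect(select(df, from_json(df$json, "struct<age:integer,height:double>")))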

68 changes: 46 additions & 22 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
@@ -1342,28 +1342,52 @@ test_that("column functions", {
df <- read.json(mapTypeJsonPath)
j <- collect(select(df, alias(to_json(df$info), "json")))
expect_equal(j[order(j$json), ][1], "{\"age\":16,\"height\":176.5}")
df <- as.DataFrame(j)
schema <- structType(structField("age", "integer"),
structField("height", "double"))
s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
expect_equal(ncol(s), 1)
expect_equal(nrow(s), 3)
expect_is(s[[1]][[1]], "struct")
expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))

# passing option
df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
schema2 <- structType(structField("date", "date"))
expect_error(tryCatch(collect(select(df, from_json(df$col, schema2))),
error = function(e) { stop(e) }),
paste0(".*(java.lang.NumberFormatException: For input string:).*"))
s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
expect_is(s[[1]][[1]]$date, "Date")
expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")

# check for unparseable
df <- as.DataFrame(list(list("a" = "")))
expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)

schemas <- list(structType(structField("age", "integer"), structField("height", "double")),
"struct<age:integer,height:double>")
Member:
I wonder if this is too loosely typed, and whether the format would be hard to explain/illustrate to an R user:
struct<age:integer,height:double>

@HyukjinKwon (Member Author) commented on Mar 7, 2017:
Yes, I persuaded myself that it is okay because it is a valid type string for structField. If you prefer the optional-parameter approach, I could try that. Otherwise, let me close this for now if you are not sure about either way :).

Member:
it might be ok if this is well documented and well checked.
checkType in schema.R is strangely largely unused or untested though.

Member Author:
Let me give it my best shot.

@felixcheung (Member) commented on Mar 7, 2017:
Oh, to clarify:

"it might be ok if this is well documented and well checked." -> if it's something we could formally document, it might be OK.

"checkType in schema.R is strangely largely unused or untested though." -> I don't know; the string specification may actually be accidental - I see no use of it outside of checkType and it's not referenced anywhere. It's definitely not being tested either.

Actually, on second thought, I recall there's a JIRA on accepting the JSON string as a schema, which is supported on the Scala side. That might be a better way to go if we are to take a string, instead of inventing our own format. But that could be a bigger change.

@HyukjinKwon (Member Author) commented on Mar 8, 2017:
Hm, @felixcheung, I think this resembles the catalog string; maybe we could reuse CatalystSqlParser.parseDataType to make this more formal and avoid duplicating the effort of defining and documenting a format. This might be a big change, but if this is what we want in the future, I would argue that we should keep it this way.

For a JSON string schema, there is an overloaded version of from_json that takes that schema string. If we are going to expose it, I think it can be done easily.

However, I think you meant it is a bigger change because we need to provide a way to produce this JSON string from types. To my knowledge, we can only manually specify the type via this catalog string in R. Is this true? If so, I don't have a good idea for supporting this right now, and I would rather close this if you feel the same.

Member:
If it is a format that is documented and globally defined in Spark, it shouldn't be a problem to use it. If a variant of from_json already takes a schema string, we should be able to call it directly from R without having to make it public, although it might make sense to do so if, say, read.format("json") supports it too.

I'm not too worried about having a way to produce a JSON schema string for the R user - we would just accept such a JSON schema string and assume the user can create it.

But yeah, overall this feels very heavyweight just to support multiple JSON objects in a column value. How about just taking a bool flag like we started with? :)

Member Author:
Doh, I meant there is a public from_json that takes a JSON string schema -

def from_json(e: Column, schema: String, options: java.util.Map[String, String]): Column =
  from_json(e, DataType.fromJson(schema), options)

Yup, let me take a shot at providing an option for it first and see if it looks okay to you.

Member:
Yep, that's what I thought. Sounds good!
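
As a concrete illustration of the format discussed above, here is a rough sketch; the field and column names are illustrative assumptions, an active SparkR session is presumed, and the claim that structField accepts this string comes from the comment thread above.

# Per the discussion above, the string form is already a valid structField type string:
field <- structField("info", "struct<age:integer,height:double>")

# With this change, from_json accepts that same string directly as its schema,
# instead of requiring a structType object to be assembled first:
df <- as.DataFrame(list(list(json = "{\"age\":16,\"height\":176.5}")))
collect(select(df, from_json(df$json, "struct<age:integer,height:double>")))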

for (schema in schemas) {
@HyukjinKwon (Member Author) commented on Mar 6, 2017:
I reused the existing test code with the loop below:

- df <- as.DataFrame(j)
- schema <- structType(structField("age", "integer"),
-                      structField("height", "double"))
+ schemas <- list(structType(structField("age", "integer"), structField("height", "double")),
+                 "struct<age:integer,height:double>")
+ for (schema in schemas) {
+   df <- as.DataFrame(j)
  ...
+ }

df <- as.DataFrame(j)
s <- collect(select(df, alias(from_json(df$json, schema), "structcol")))
expect_equal(ncol(s), 1)
expect_equal(nrow(s), 3)
expect_is(s[[1]][[1]], "struct")
expect_true(any(apply(s, 1, function(x) { x[[1]]$age == 16 } )))

# passing option
df <- as.DataFrame(list(list("col" = "{\"date\":\"21/10/2014\"}")))
schema2 <- structType(structField("date", "date"))
expect_error(tryCatch(collect(select(df, from_json(df$col, schema2))),
error = function(e) { stop(e) }),
paste0(".*(java.lang.NumberFormatException: For input string:).*"))
s <- collect(select(df, from_json(df$col, schema2, dateFormat = "dd/MM/yyyy")))
expect_is(s[[1]][[1]]$date, "Date")
expect_equal(as.character(s[[1]][[1]]$date), "2014-10-21")

# check for unparseable
df <- as.DataFrame(list(list("a" = "")))
expect_equal(collect(select(df, from_json(df$a, schema)))[[1]][[1]], NA)
}

# check if array type in string is correctly supported.
jsonArr <- "[{\"name\":\"Bob\"}, {\"name\":\"Alice\"}]"
df <- as.DataFrame(list(list("people" = jsonArr)))
arr <- collect(select(df, alias(from_json(df$people, "array<struct<name:string>>"), "arrcol")))
Member Author:
(Just in case: from_json takes both a DataType and the JSON string produced by DataType.json. In that sense, I thought it would be nicer if it took what structField takes in R.)

expect_equal(ncol(arr), 1)
expect_equal(nrow(arr), 1)
expect_is(arr[[1]][[1]], "list")
expect_equal(length(arr$arrcol[[1]]), 2)
expect_equal(arr$arrcol[[1]][[1]]$name, "Bob")
expect_equal(arr$arrcol[[1]][[2]]$name, "Alice")

# check for unparseable data type
expect_error(tryCatch(collect(select(df, from_json(df$people, "unknown"))),
error = function(e) { stop(e) }),
paste0(".*(Invalid type unknown).*"))

# check for incorrect data type
expect_error(tryCatch(collect(select(df, from_json(df$people, "integer"))),
error = function(e) { stop(e) }),
paste0(".*(data type mismatch: Input schema int must be a struct or an array of structs).*"))
})

test_that("column binary mathfunctions", {