
Commit 896edb5

Sun Rui authored and shivaram committed
[SPARK-10050] [SPARKR] Support collecting data of MapType in DataFrame.
1. Support collecting data of MapType from DataFrame.
2. Support data of MapType in createDataFrame.

Author: Sun Rui <[email protected]>

Closes #8711 from sun-rui/SPARK-10050.
1 parent 5dbaf3d commit 896edb5
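
Taken together, the two halves of the change let R environments round-trip as Spark SQL maps. A minimal sketch of the new behavior, condensed from the tests in this commit (assumes a SparkR session with a `sqlContext`):

    # (2) MapType in createDataFrame: an R environment becomes a map column.
    e <- new.env()
    assign("n", 3L, envir = e)
    df <- createDataFrame(sqlContext, list(list(e)), c("c"))
    dtypes(df)   # list(c("c", "map<string,int>"))

    # (1) Collecting MapType: each map cell comes back as an R environment.
    ldf <- collect(df)
    env <- ldf$c[[1]]
    env$n        # 3L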

File tree: 6 files changed, +123 −23 lines changed

R/pkg/R/SQLContext.R
Lines changed: 1 addition & 4 deletions

@@ -41,10 +41,7 @@ infer_type <- function(x) {
   if (type == "map") {
     stopifnot(length(x) > 0)
     key <- ls(x)[[1]]
-    list(type = "map",
-         keyType = "string",
-         valueType = infer_type(get(key, x)),
-         valueContainsNull = TRUE)
+    paste0("map<string,", infer_type(get(key, x)), ">")
   } else if (type == "array") {
     stopifnot(length(x) > 0)
     names <- names(x)
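
`infer_type` now reports maps in the same compact string notation already used for arrays, rather than a nested list descriptor. For example, per the updated tests (inferred map keys are always strings, and the value type is inferred from one arbitrary entry):

    e <- new.env()
    assign("a", 1L, envir = e)
    infer_type(e)   # "map<string,integer>"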

R/pkg/R/deserialize.R
Lines changed: 14 additions & 0 deletions

@@ -50,6 +50,7 @@ readTypedObject <- function(con, type) {
     "t" = readTime(con),
     "a" = readArray(con),
     "l" = readList(con),
+    "e" = readEnv(con),
     "n" = NULL,
     "j" = getJobj(readString(con)),
     stop(paste("Unsupported type for deserialization", type)))
@@ -121,6 +122,19 @@ readList <- function(con) {
   }
 }
 
+readEnv <- function(con) {
+  env <- new.env()
+  len <- readInt(con)
+  if (len > 0) {
+    for (i in 1:len) {
+      key <- readString(con)
+      value <- readObject(con)
+      env[[key]] <- value
+    }
+  }
+  env
+}
+
 readRaw <- function(con) {
   dataLen <- readInt(con)
   readBin(con, raw(), as.integer(dataLen), endian = "big")
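
The new `readEnv` mirrors the map format written by the JVM side (see SerDe.scala below): an element count, then one string key plus one serialized value per entry, rebuilt into an R environment. A collected map cell therefore behaves like this sketch (values borrowed from the tests in this commit):

    bob <- new.env()           # stands in for a collected map cell such as ldf$info[[1]]
    bob[["age"]] <- 16
    bob[["height"]] <- 176.5

    class(bob)                 # "environment"
    ls(bob)                    # c("age", "height")
    bob$age                    # 16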

R/pkg/R/schema.R
Lines changed: 27 additions & 7 deletions

@@ -131,13 +131,33 @@ checkType <- function(type) {
   if (type %in% primtiveTypes) {
     return()
   } else {
-    m <- regexec("^array<(.*)>$", type)
-    matchedStrings <- regmatches(type, m)
-    if (length(matchedStrings[[1]]) >= 2) {
-      elemType <- matchedStrings[[1]][2]
-      checkType(elemType)
-      return()
-    }
+    # Check complex types
+    firstChar <- substr(type, 1, 1)
+    switch (firstChar,
+            a = {
+              # Array type
+              m <- regexec("^array<(.*)>$", type)
+              matchedStrings <- regmatches(type, m)
+              if (length(matchedStrings[[1]]) >= 2) {
+                elemType <- matchedStrings[[1]][2]
+                checkType(elemType)
+                return()
+              }
+            },
+            m = {
+              # Map type
+              m <- regexec("^map<(.*),(.*)>$", type)
+              matchedStrings <- regmatches(type, m)
+              if (length(matchedStrings[[1]]) >= 3) {
+                keyType <- matchedStrings[[1]][2]
+                if (keyType != "string" && keyType != "character") {
+                  stop("Key type in a map must be string or character")
+                }
+                valueType <- matchedStrings[[1]][3]
+                checkType(valueType)
+                return()
+              }
+            })
   }
 
   stop(paste("Unsupported type for Dataframe:", type))
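
`checkType` now dispatches on the first character of the type string, validating each complex type with a single regex match; map keys are additionally restricted to strings. Roughly, per the tests:

    checkType("map<string,double>")    # accepted; the value type is checked recursively
    checkType("map<integer,integer>")  # error: "Key type in a map must be string or character"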

R/pkg/inst/tests/test_sparkSQL.R
Lines changed: 44 additions & 12 deletions

@@ -57,7 +57,7 @@ mockLinesComplexType <-
 complexTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp")
 writeLines(mockLinesComplexType, complexTypeJsonPath)
 
-test_that("infer types", {
+test_that("infer types and check types", {
   expect_equal(infer_type(1L), "integer")
   expect_equal(infer_type(1.0), "double")
   expect_equal(infer_type("abc"), "string")
@@ -72,9 +72,9 @@ test_that("infer types", {
   checkStructField(testStruct$fields()[[2]], "b", "StringType", TRUE)
   e <- new.env()
   assign("a", 1L, envir = e)
-  expect_equal(infer_type(e),
-               list(type = "map", keyType = "string", valueType = "integer",
-                    valueContainsNull = TRUE))
+  expect_equal(infer_type(e), "map<string,integer>")
+
+  expect_error(checkType("map<integer,integer>"), "Key type in a map must be string or character")
 })
 
 test_that("structType and structField", {
@@ -242,7 +242,7 @@ test_that("create DataFrame with different data types", {
   expect_equal(collect(df), data.frame(l, stringsAsFactors = FALSE))
 })
 
-test_that("create DataFrame with nested array and struct", {
+test_that("create DataFrame with nested array and map", {
   # e <- new.env()
   # assign("n", 3L, envir = e)
   # l <- list(1:10, list("a", "b"), e, list(a="aa", b=3L))
@@ -253,21 +253,35 @@ test_that("create DataFrame with nested array and struct", {
   # ldf <- collect(df)
   # expect_equal(ldf[1,], l[[1]])
 
+  # ArrayType and MapType
+  e <- new.env()
+  assign("n", 3L, envir = e)
 
-  # ArrayType only for now
-  l <- list(as.list(1:10), list("a", "b"))
-  df <- createDataFrame(sqlContext, list(l), c("a", "b"))
-  expect_equal(dtypes(df), list(c("a", "array<int>"), c("b", "array<string>")))
+  l <- list(as.list(1:10), list("a", "b"), e)
+  df <- createDataFrame(sqlContext, list(l), c("a", "b", "c"))
+  expect_equal(dtypes(df), list(c("a", "array<int>"),
+                                c("b", "array<string>"),
+                                c("c", "map<string,int>")))
   expect_equal(count(df), 1)
   ldf <- collect(df)
-  expect_equal(names(ldf), c("a", "b"))
+  expect_equal(names(ldf), c("a", "b", "c"))
   expect_equal(ldf[1, 1][[1]], l[[1]])
   expect_equal(ldf[1, 2][[1]], l[[2]])
+  e <- ldf$c[[1]]
+  expect_equal(class(e), "environment")
+  expect_equal(ls(e), "n")
+  expect_equal(e$n, 3L)
 })
 
+# For test map type in DataFrame
+mockLinesMapType <- c("{\"name\":\"Bob\",\"info\":{\"age\":16,\"height\":176.5}}",
+                      "{\"name\":\"Alice\",\"info\":{\"age\":20,\"height\":164.3}}",
+                      "{\"name\":\"David\",\"info\":{\"age\":60,\"height\":180}}")
+mapTypeJsonPath <- tempfile(pattern="sparkr-test", fileext=".tmp")
+writeLines(mockLinesMapType, mapTypeJsonPath)
+
 test_that("Collect DataFrame with complex types", {
-  # only ArrayType now
-  # TODO: tests for StructType and MapType after they are supported
+  # ArrayType
   df <- jsonFile(sqlContext, complexTypeJsonPath)
 
   ldf <- collect(df)
@@ -277,6 +291,24 @@ test_that("Collect DataFrame with complex types", {
   expect_equal(ldf$c1, list(list(1, 2, 3), list(4, 5, 6), list (7, 8, 9)))
   expect_equal(ldf$c2, list(list("a", "b", "c"), list("d", "e", "f"), list ("g", "h", "i")))
   expect_equal(ldf$c3, list(list(1.0, 2.0, 3.0), list(4.0, 5.0, 6.0), list (7.0, 8.0, 9.0)))
+
+  # MapType
+  schema <- structType(structField("name", "string"),
+                       structField("info", "map<string,double>"))
+  df <- read.df(sqlContext, mapTypeJsonPath, "json", schema)
+  expect_equal(dtypes(df), list(c("name", "string"),
+                                c("info", "map<string,double>")))
+  ldf <- collect(df)
+  expect_equal(nrow(ldf), 3)
+  expect_equal(ncol(ldf), 2)
+  expect_equal(names(ldf), c("name", "info"))
+  expect_equal(ldf$name, c("Bob", "Alice", "David"))
+  bob <- ldf$info[[1]]
+  expect_equal(class(bob), "environment")
+  expect_equal(bob$age, 16)
+  expect_equal(bob$height, 176.5)
+
+  # TODO: tests for StructType after it is supported
 })
 
 test_that("jsonFile() on a local file returns a DataFrame", {

core/src/main/scala/org/apache/spark/api/r/SerDe.scala
Lines changed: 31 additions & 0 deletions

@@ -209,11 +209,23 @@ private[spark] object SerDe {
       case "array" => dos.writeByte('a')
       // Array of objects
       case "list" => dos.writeByte('l')
+      case "map" => dos.writeByte('e')
       case "jobj" => dos.writeByte('j')
       case _ => throw new IllegalArgumentException(s"Invalid type $typeStr")
     }
   }
 
+  private def writeKeyValue(dos: DataOutputStream, key: Object, value: Object): Unit = {
+    if (key == null) {
+      throw new IllegalArgumentException("Key in map can't be null.")
+    } else if (!key.isInstanceOf[String]) {
+      throw new IllegalArgumentException(s"Invalid map key type: ${key.getClass.getName}")
+    }
+
+    writeString(dos, key.asInstanceOf[String])
+    writeObject(dos, value)
+  }
+
   def writeObject(dos: DataOutputStream, obj: Object): Unit = {
     if (obj == null) {
       writeType(dos, "void")
@@ -306,6 +318,25 @@ private[spark] object SerDe {
         writeInt(dos, v.length)
         v.foreach(elem => writeObject(dos, elem))
 
+      // Handle map
+      case v: java.util.Map[_, _] =>
+        writeType(dos, "map")
+        writeInt(dos, v.size)
+        val iter = v.entrySet.iterator
+        while(iter.hasNext) {
+          val entry = iter.next
+          val key = entry.getKey
+          val value = entry.getValue
+
+          writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object])
+        }
+      case v: scala.collection.Map[_, _] =>
+        writeType(dos, "map")
+        writeInt(dos, v.size)
+        v.foreach { case (key, value) =>
+          writeKeyValue(dos, key.asInstanceOf[Object], value.asInstanceOf[Object])
+        }
+
       case _ =>
         writeType(dos, "jobj")
         writeJObj(dos, value)
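
Both `java.util.Map` and `scala.collection.Map` are written with the same 'e' type tag, and keys are rejected unless they are strings, presumably because the R side stores the entries in an environment, which only supports character keys. A quick R illustration of that constraint:

    env <- new.env()
    env[["age"]] <- 16    # character keys work
    # env[[1L]] <- 16     # non-character keys are an error for environments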

sql/core/src/main/scala/org/apache/spark/sql/api/r/SQLUtils.scala
Lines changed: 6 additions & 0 deletions

@@ -64,6 +64,12 @@ private[r] object SQLUtils {
     case r"\Aarray<(.*)${elemType}>\Z" => {
       org.apache.spark.sql.types.ArrayType(getSQLDataType(elemType))
     }
+    case r"\Amap<(.*)${keyType},(.*)${valueType}>\Z" => {
+      if (keyType != "string" && keyType != "character") {
+        throw new IllegalArgumentException("Key type of a map must be string or character")
+      }
+      org.apache.spark.sql.types.MapType(getSQLDataType(keyType), getSQLDataType(valueType))
+    }
     case _ => throw new IllegalArgumentException(s"Invaid type $dataType")
   }
 }
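
This is the JVM-side counterpart of the R type-string notation: a field declared in R as `map<string,double>` parses into `MapType(StringType, DoubleType)`, with the same string-key restriction enforced in schema.R's `checkType`. From the tests, such a field is declared as:

    schema <- structType(structField("name", "string"),
                         structField("info", "map<string,double>"))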
