Skip to content
Closed
5 changes: 5 additions & 0 deletions R/pkg/R/DataFrame.R
Original file line number Diff line number Diff line change
Expand Up @@ -1138,6 +1138,11 @@ setMethod("collect",
if (!is.null(PRIMITIVE_TYPES[[colType]]) && colType != "binary") {
vec <- do.call(c, col)
stopifnot(class(vec) != "list")
class(vec) <-
if (colType == "timestamp")
c("POSIXct", "POSIXt")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why should the class be c("POSIXct", "POSIXt") in this case?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because PRIMITIVE_TYPES[["timestamp"]] is POSIXct, it usually comes with POSIXt together. POSIXt is virtual class used to allow operations such as subtraction to mix the two classes POSIXct and POSIXlt.
The previous convertion will also convert timestamp to c("POSIXct", "POSIXt").

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks better if it won't affect other methods. I will try it. Thanks for the advice.

else
PRIMITIVE_TYPES[[colType]]
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

by setting these instead of having it inferred - does this break any existing behavior? does any type differ because of this line of change?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Currently all tests are passed, except for the two modified tests with NA types as discussed before. The followings are the all type convertions from SparkDataframe to R data.frame, which have been tested in the existing tests in test_sparkSQL.R.

PRIMITIVE_TYPES <- as.environment(list(
  "tinyint" = "integer",
  "smallint" = "integer",
  "int" = "integer",
  "bigint" = "numeric",
  "float" = "numeric",
  "double" = "numeric",
  "decimal" = "numeric",
  "string" = "character",
  "binary" = "raw",
  "boolean" = "logical",
  "timestamp" = "POSIXct",
  "date" = "Date",
  # following types are not SQL types returned by dtypes(). They are listed here for usage
  # by checkType() in schema.R.
  # TODO: refactor checkType() in schema.R.
  "byte" = "integer",
  "integer" = "integer"
  ))

df[[colIndex]] <- vec
} else {
df[[colIndex]] <- col
Expand Down
42 changes: 40 additions & 2 deletions R/pkg/inst/tests/testthat/test_sparkSQL.R
Original file line number Diff line number Diff line change
Expand Up @@ -1277,9 +1277,9 @@ test_that("column functions", {

# Test first(), last()
df <- read.json(jsonPath)
expect_equal(collect(select(df, first(df$age)))[[1]], NA)
expect_equal(collect(select(df, first(df$age)))[[1]], NA_real_)
expect_equal(collect(select(df, first(df$age, TRUE)))[[1]], 30)
expect_equal(collect(select(df, first("age")))[[1]], NA)
expect_equal(collect(select(df, first("age")))[[1]], NA_real_)
expect_equal(collect(select(df, first("age", TRUE)))[[1]], 30)
expect_equal(collect(select(df, last(df$age)))[[1]], 19)
expect_equal(collect(select(df, last(df$age, TRUE)))[[1]], 19)
Expand Down Expand Up @@ -2748,6 +2748,44 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume
"Unnamed arguments ignored: 2, 3, a.")
})

test_that("Collect on DataFrame when NAs exists at the top of a timestamp column", {
ldf <- data.frame(col1 = c(0, 1, 2),
col2 = c(as.POSIXct("2017-01-01 00:00:01"),
NA,
as.POSIXct("2017-01-01 12:00:01")),
col3 = c(as.POSIXlt("2016-01-01 00:59:59"),
NA,
as.POSIXlt("2016-01-01 12:01:01")))
sdf1 <- createDataFrame(ldf)
ldf1 <- collect(sdf1)
expect_equal(dtypes(sdf1), list(c("col1", "double"),
c("col2", "timestamp"),
c("col3", "timestamp")))
expect_equal(class(ldf1$col1), "numeric")
expect_equal(class(ldf1$col2), c("POSIXct", "POSIXt"))
expect_equal(class(ldf1$col3), c("POSIXct", "POSIXt"))

# Columns with NAs at the top
sdf2 <- filter(sdf1, "col1 > 1")
ldf2 <- collect(sdf2)
expect_equal(dtypes(sdf2), list(c("col1", "double"),
c("col2", "timestamp"),
c("col3", "timestamp")))
expect_equal(class(ldf2$col1), "numeric")
expect_equal(class(ldf2$col2), c("POSIXct", "POSIXt"))
expect_equal(class(ldf2$col3), c("POSIXct", "POSIXt"))

# Columns with only NAs, the type will also be cast to PRIMITIVE_TYPE
sdf3 <- filter(sdf1, "col1 == 0")
ldf3 <- collect(sdf3)
expect_equal(dtypes(sdf3), list(c("col1", "double"),
c("col2", "timestamp"),
c("col3", "timestamp")))
expect_equal(class(ldf3$col1), "numeric")
expect_equal(class(ldf3$col2), c("POSIXct", "POSIXt"))
expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt"))
})

unlink(parquetPath)
unlink(orcPath)
unlink(jsonPath)
Expand Down