From 7afcd3054239f135a04f92c025ad21c2fb1784c2 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:10:37 -0500 Subject: [PATCH 01/27] Don't trim ws in repair_names. Closes #47 --- R/repair-names.R | 13 +++++++------ man/repair_names.Rd | 6 +++--- tests/testthat/test-repair_names.R | 2 -- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/R/repair-names.R b/R/repair-names.R index 9edaa5294..228c17804 100644 --- a/R/repair-names.R +++ b/R/repair-names.R @@ -1,12 +1,12 @@ #' Repair object names. #' #' \code{repair_names} ensures its input has non-missing and -#' unique names. It also strips any leading or trailing spaces. -#' Valid names are left as is. +#' unique names (duplicated names get a numeric suffix). Valid names are +#' left as is. #' #' @param x A named vector. #' @param prefix A string, the prefix to use for new column names. -#' @param sep A string, inserted between the column name and de-duplicating +#' @param sep A string inserted between the column name and de-duplicating #' number. #' @return \code{x} with valid names. #' @export @@ -41,8 +41,9 @@ repair_names <- function(x, prefix = "V", sep = "") { init_names <- function(x) { xnames <- names(x) - if (is.null(xnames)) + if (is.null(xnames)) { rep("", length(x)) - else - ifelse(is.na(xnames), "", trim_ws(xnames)) + } else { + ifelse(is.na(xnames), "", xnames) + } } diff --git a/man/repair_names.Rd b/man/repair_names.Rd index 7ef434a5f..bae3e9814 100644 --- a/man/repair_names.Rd +++ b/man/repair_names.Rd @@ -11,7 +11,7 @@ repair_names(x, prefix = "V", sep = "") \item{prefix}{A string, the prefix to use for new column names.} -\item{sep}{A string, inserted between the column name and de-duplicating +\item{sep}{A string inserted between the column name and de-duplicating number.} } \value{ @@ -19,8 +19,8 @@ number.} } \description{ \code{repair_names} ensures its input has non-missing and -unique names. It also strips any leading or trailing spaces. -Valid names are left as is. 
+unique names (duplicated names get a numeric suffix). Valid names are +left as is. } \examples{ repair_names(list(3, 4, 5)) # works for lists, too diff --git a/tests/testthat/test-repair_names.R b/tests/testthat/test-repair_names.R index 199917676..ba288fc32 100644 --- a/tests/testthat/test-repair_names.R +++ b/tests/testthat/test-repair_names.R @@ -45,8 +45,6 @@ test_that("repair various name problems", { # ensure we start with a "bad" state old_names <- colnames(dat) - if (!is.null(old_names)) - old_names <- trim_ws(old_names) expect_true(is.null(old_names) || any(table(old_names) > 1) || any(old_names == '' | is.na(old_names)) || From 84c3fa093c9484f2b5b2ae5d0f53627a6e1d5dff Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:16:15 -0500 Subject: [PATCH 02/27] Combine names2 and init_names --- R/repair-names.R | 10 +--------- R/utils.r | 7 ++++++- 2 files changed, 7 insertions(+), 10 deletions(-) diff --git a/R/repair-names.R b/R/repair-names.R index 228c17804..109e44f44 100644 --- a/R/repair-names.R +++ b/R/repair-names.R @@ -19,7 +19,7 @@ repair_names <- function(x, prefix = "V", sep = "") { if (length(x) == 0) return(x) - xnames <- init_names(x) + xnames <- names2(x) blanks <- xnames == "" # The order vector defines the order in which make.unique() should process the @@ -39,11 +39,3 @@ repair_names <- function(x, prefix = "V", sep = "") { x } -init_names <- function(x) { - xnames <- names(x) - if (is.null(xnames)) { - rep("", length(x)) - } else { - ifelse(is.na(xnames), "", xnames) - } -} diff --git a/R/utils.r b/R/utils.r index acf7b1f1d..5b098001f 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,5 +1,10 @@ names2 <- function(x) { - names(x) %||% rep("", length(x)) + xnames <- names(x) + if (is.null(xnames)) { + rep("", length(x)) + } else { + ifelse(is.na(xnames), "", xnames) + } } "%||%" <- function(x, y) { From f56bb439e54c7a5716a4249a1c9d6ef2b7b11f6a Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:16:59 -0500 Subject: [PATCH 
03/27] Simpler strategy for repair_names --- R/repair-names.R | 20 +++++++------------- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/R/repair-names.R b/R/repair-names.R index 109e44f44..7b989dc85 100644 --- a/R/repair-names.R +++ b/R/repair-names.R @@ -20,20 +20,14 @@ repair_names <- function(x, prefix = "V", sep = "") { return(x) xnames <- names2(x) - blanks <- xnames == "" + blank <- xnames == "" - # The order vector defines the order in which make.unique() should process the - # entries. Blanks are initialized with the prefix. The index of the first - # blank entry appears twice in this vector if there's no column named like the - # prefix, to make sure that blank columns always start with V1 (or a higher - # index if appropriate). See also the "pathological cases" test. - order <- c( - which(!blanks), - if (all(xnames[!blanks] != prefix) && any(blanks)) - which.max(blanks), - which(blanks)) - xnames[blanks] <- prefix - xnames[order] <- make.unique(xnames[order], sep = sep) + # Ensure existing names are unique + xnames[!blank] <- make.unique(xnames[!blank], sep = sep) + + # Replace blank names + new_vars <- setdiff(paste(prefix, seq_along(x), sep = sep), xnames) + xnames[blank] <- new_vars[seq_len(sum(blank))] names(x) <- xnames x From 17f36944a1b12e29b27e1cafb8e9d15bf3142dc9 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:18:36 -0500 Subject: [PATCH 04/27] Remove unneeded trim_ws --- R/utils-format.r | 3 --- tests/testthat/test-repair_names.R | 6 ------ 2 files changed, 9 deletions(-) diff --git a/R/utils-format.r b/R/utils-format.r index bbbd7d7d9..5cfd4a89d 100644 --- a/R/utils-format.r +++ b/R/utils-format.r @@ -194,6 +194,3 @@ big_mark <- function(x, ...) { mark <- if (identical(getOption("OutDec"), ",")) "." else "," formatC(x, big.mark = mark, ...) 
} - -# trimws() is not available in R 3.1.3 -trim_ws <- function(x) gsub("^ *(|(.*[^ ])) *$", "\\1", x) diff --git a/tests/testthat/test-repair_names.R b/tests/testthat/test-repair_names.R index ba288fc32..f7f174d3a 100644 --- a/tests/testthat/test-repair_names.R +++ b/tests/testthat/test-repair_names.R @@ -1,11 +1,5 @@ context("repair_names") -test_that("trim_ws", { - expect_equal(trim_ws(" a"), "a") - expect_equal(trim_ws("a "), "a") - expect_equal(trim_ws(" a "), "a") -}) - test_that("repair missing column names", { dat <- data.frame(a = 1, b = 2, c = 3) colnames(dat)[2] <- NA From 9700106fb84c24e39d863f1f8e0a2fd84150d055 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:29:08 -0500 Subject: [PATCH 05/27] Extract out make_unique and simplify tests --- R/repair-names.R | 20 ++++--- tests/testthat/test-repair_names.R | 91 +++++++++--------------------- 2 files changed, 39 insertions(+), 72 deletions(-) diff --git a/R/repair-names.R b/R/repair-names.R index 7b989dc85..830058aaf 100644 --- a/R/repair-names.R +++ b/R/repair-names.R @@ -16,20 +16,24 @@ #' tbl <- as_data_frame(structure(list(3, 4, 5), class = "data.frame")) #' repair_names(tbl) repair_names <- function(x, prefix = "V", sep = "") { - if (length(x) == 0) + if (length(x) == 0) { + names(x) <- character() return(x) + } - xnames <- names2(x) - blank <- xnames == "" + new_names <- make_unique(names2(x), prefix = prefix, sep = sep) + setNames(x, new_names) +} + +make_unique <- function(x, prefix = "V", sep = "") { + blank <- x == "" # Ensure existing names are unique - xnames[!blank] <- make.unique(xnames[!blank], sep = sep) + x[!blank] <- make.unique(x[!blank], sep = sep) # Replace blank names - new_vars <- setdiff(paste(prefix, seq_along(x), sep = sep), xnames) - xnames[blank] <- new_vars[seq_len(sum(blank))] + new_vars <- setdiff(paste(prefix, seq_along(x), sep = sep), x) + x[blank] <- new_vars[seq_len(sum(blank))] - names(x) <- xnames x } - diff --git a/tests/testthat/test-repair_names.R 
b/tests/testthat/test-repair_names.R index f7f174d3a..49efdd289 100644 --- a/tests/testthat/test-repair_names.R +++ b/tests/testthat/test-repair_names.R @@ -1,79 +1,42 @@ context("repair_names") -test_that("repair missing column names", { - dat <- data.frame(a = 1, b = 2, c = 3) - colnames(dat)[2] <- NA +test_that("zero-length inputs given character names", { + out <- repair_names(character()) + expect_equal(names(out), character()) +}) - # ensure we start with a "bad" state - expect_true(any(is.na(colnames(dat)))) +test_that("unnamed input gives uniquely named output", { + out <- repair_names(1:3) + expect_equal(names(out), c("V1", "V2", "V3")) +}) - fixed_dat <- repair_names(dat) - fixed_names <- colnames(fixed_dat) - # no repeats - expect_false(any(table(fixed_names) > 1)) +# make_unique ------------------------------------------------------------- - # ensure all valid column names are retained - expect_equal(length(setdiff(Filter(function(a) ! (is.na(a) | a == ''), - colnames(dat)), - fixed_names)), 0) +test_that("duplicates are de-deduped", { + expect_equal(make_unique(c("x", "x")), c("x", "x1")) }) -test_that("repair various name problems", { - combos <- list(Null = NULL, - Empty = c('', '', ''), - Spaces = c('a', 'b', ' '), - EmptyWithNA = c('', NA, NA), - Dup1 = c('a', 'a', 'b'), - Evil1 = c('a', 'a ', 'a1'), - OneNA = c('a', 'b', NA), - Missing2 = c('', '', 'b'), - Vnames1 = c('V1', '', ''), - Vnames2 = c('V2', ' ', ''), - Vnames3 = c('V1', '', 'a'), - VnamesDup1 = c('V1', ' V1 ', 'c'), - VnamesDup2 = c(' V1', 'V1', '') - ) - for (combo_name in names(combos)) { - dat <- data.frame(a = 1, b = 2, c = 3) - colnames(dat) <- combos[[ combo_name ]] +test_that("blanks get prefix + numeric id", { + expect_equal(make_unique(c("", "")), c("V1", "V2")) +}) - # ensure we start with a "bad" state - old_names <- colnames(dat) - expect_true(is.null(old_names) || - any(table(old_names) > 1) || - any(old_names == '' | is.na(old_names)) || - any(grepl('^ +| +$', 
old_names)), - info = combo_name) +test_that("blanks skip existing names", { + expect_equal(make_unique(c("", "V1")), c("V2", "V1")) +}) - fixed_dat <- repair_names(dat) - fixed_names <- colnames(fixed_dat) +test_that("blanks skip names created when de-duping", { + expect_equal(make_unique(c("", "V", "V")), c("V2", "V", "V1")) +}) - # no repeats - expect_false(any(table(fixed_names) > 1), info = combo_name) +# names2 ------------------------------------------------------------------ - # ensure all valid column names are retained - if (! is.null(old_names)) { - valid <- ! is.na(old_names) & old_names != '' & - ! duplicated(old_names) - expect_equal(fixed_names[valid], old_names[valid]) - } - } +test_that("names2 returns character vector even if names NULL", { + expect_equal(names2(1:3), rep("", 3)) }) -test_that("check pathological cases", { - df <- data.frame() - expect_identical(repair_names(df), df) - df <- data.frame(row.names = 1:3) - expect_identical(repair_names(df), df) - l <- list(3, 4, 5) - expect_identical(repair_names(l), setNames(l, paste0("V", 1:3))) - l <- list(V = 3, W = 4, 5) - expect_identical(repair_names(l), setNames(l, c("V", "W", "V1"))) -}) +test_that("names2 replaces missing value with blanks", { + x <- 1:3 + names(x) <- c("a", "b", NA) -test_that("check object class", { - expect_equal(class(iris), class(repair_names(iris))) - expect_equal(class(tbl_df(iris)), class(repair_names(tbl_df(iris)))) - expect_equal(class(repair_names(1:10)), "integer") - expect_error(repair_names(cat), "non-vector") + expect_equal(names2(x), c("a", "b", "")) }) From 1badc4a5d1019c27faf51eadf5edbe9209217fe5 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:36:52 -0500 Subject: [PATCH 06/27] Improve frame_data docs And a few minor code tweaks. 
--- R/frame-data.R | 38 +++++++++++++++++++++++++++----------- man/frame_data.Rd | 20 +++++++++++++++++--- 2 files changed, 44 insertions(+), 14 deletions(-) diff --git a/R/frame-data.R b/R/frame-data.R index 2a465abd1..33dc8ceaf 100644 --- a/R/frame-data.R +++ b/R/frame-data.R @@ -1,9 +1,12 @@ -#' Row-wise data_frame creation +#' Row-wise tibble creation #' -#' Create a row-wise \code{\link{data_frame}}. +#' Create \code{\link{data_frame}}s laying out the data in rows, rather than +#' in columns. This is useful for small tables of data where readability is +#' important. #' #' @param ... Arguments specifying the structure of a \code{data_frame}. -#' +#' Variable names should be formulas, and may only appear before the data. +#' @return A \code{\link{tbl_df}}. #' @export #' @examples #' frame_data( @@ -12,6 +15,14 @@ #' "b", 2, #' "c", 3 #' ) +#' +#' # frame_data will create a list column if the value in each cell is +#' # not a scalar +#' frame_data( +#' ~x, ~y, +#' "a", 1:3, +#' "b", 4:6 +#' ) frame_data <- function(...) { dots <- list(...) @@ -32,12 +43,14 @@ frame_data <- function(...) { break if (length(el) != 2) { - stop("expected a column name with a single argument; e.g. '~ name'") + stop("expected a column name with a single argument; e.g. '~ name'", + call. = FALSE) } candidate <- el[[2]] if (!(is.symbol(candidate) || is.character(candidate))) { - stop("expected a symbol or string denoting a column name") + stop("expected a symbol or string denoting a column name", + call. = FALSE) } frame_names <- c(frame_names, as.character(el[[2]])) @@ -46,7 +59,7 @@ frame_data <- function(...) { } if (!length(frame_names)) { - stop("no column names detected in 'frame_data()' call") + stop("no column names detected in 'frame_data()' call", call. = FALSE) } frame_rest <- dots[i:length(dots)] @@ -57,11 +70,14 @@ frame_data <- function(...) { # structure. 
frame_ncol <- length(frame_names) if (n_elements %% frame_ncol != 0) { - stop(sprintf( - "invalid 'frame_data()' specification: had %s elements and %s columns", - n_elements, - frame_ncol - )) + stop( + sprintf( + "invalid 'frame_data()' specification: had %s elements and %s columns", + n_elements, + frame_ncol + ), + call. = FALSE + ) } frame_mat <- matrix(frame_rest, ncol = frame_ncol, byrow = TRUE) diff --git a/man/frame_data.Rd b/man/frame_data.Rd index 7e4ec1cf8..0b2a6fe45 100644 --- a/man/frame_data.Rd +++ b/man/frame_data.Rd @@ -3,17 +3,23 @@ \name{frame_data} \alias{frame_data} \alias{tibble} -\title{Row-wise data_frame creation} +\title{Row-wise tibble creation} \usage{ frame_data(...) tibble(...) } \arguments{ -\item{...}{Arguments specifying the structure of a \code{data_frame}.} +\item{...}{Arguments specifying the structure of a \code{data_frame}. +Variable names should be formulas, and may only appear before the data.} +} +\value{ +A \code{\link{tbl_df}}. } \description{ -Create a row-wise \code{\link{data_frame}}. +Create \code{\link{data_frame}}s laying out the data in rows, rather than +in columns. This is useful for small tables of data where readability is +important. } \examples{ frame_data( @@ -22,5 +28,13 @@ frame_data( "b", 2, "c", 3 ) + +# frame_data will create a list column if the value in each cell is +# not a scalar +frame_data( + ~x, ~y, + "a", 1:3, + "b", 4:6 +) } From 25ab19628793f92873ac9c7f1b5a0ec4908da2c2 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:38:55 -0500 Subject: [PATCH 07/27] Return 0-row but n-col data frame if no data --- R/frame-data.R | 4 +++- tests/testthat/test-frame-data.R | 7 +++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/R/frame-data.R b/R/frame-data.R index 33dc8ceaf..7748785b4 100644 --- a/R/frame-data.R +++ b/R/frame-data.R @@ -32,7 +32,9 @@ frame_data <- function(...) 
{ i <- 1 while (TRUE) { if (i > length(dots)) { - return(data_frame()) + out <- rep(list(logical()), length(frame_names)) + names(out) <- frame_names + return(as_data_frame(out)) } el <- dots[[i]] diff --git a/tests/testthat/test-frame-data.R b/tests/testthat/test-frame-data.R index 3febd649d..c5e79b290 100644 --- a/tests/testthat/test-frame-data.R +++ b/tests/testthat/test-frame-data.R @@ -11,8 +11,6 @@ test_that("frame_data() constructs 'data_frame' as expected", { compared <- data_frame(colA = c("a", "b"), colB = c(1, 2)) expect_equal(result, compared) - expect_identical(frame_data(~a, ~b), data_frame()) - ## wide wide <- frame_data( ~colA, ~colB, ~colC, ~colD, @@ -92,3 +90,8 @@ test_that("frame_data can have list columns", { expect_equal(df$x, c(1, 2)) expect_equal(df$y, list(list(a = 1), list(b = 2))) }) + +test_that("frame_data creates n-col empty data frame", { + df <- frame_data(~x, ~y) + expect_equal(names(df), c("x", "y")) +}) From fc84a86b504deee14054be71363987da379a334a Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:45:03 -0500 Subject: [PATCH 08/27] Tests for missing obj_type cases Remove unused method --- NAMESPACE | 1 - R/utils-format.r | 8 ++------ tests/testthat/test-trunc-mat.r | 20 ++++++++++++++++++-- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 9bf6556e1..5644ae92c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -18,7 +18,6 @@ S3method(glimpse,default) S3method(glimpse,tbl) S3method(obj_type,"NULL") S3method(obj_type,data.frame) -S3method(obj_type,data_frame) S3method(obj_type,default) S3method(print,tbl_df) S3method(print,trunc_mat) diff --git a/R/utils-format.r b/R/utils-format.r index 5cfd4a89d..2799e10e1 100644 --- a/R/utils-format.r +++ b/R/utils-format.r @@ -173,9 +173,9 @@ obj_type.default <- function(x) { if (!is.object(x)) { paste0("<", type_sum(x), if (!is.array(x)) paste0("[", length(x), "]"), ">") } else if (!isS4(x)) { - paste0("") + paste0("") } else { - paste0("") + 
paste0("") } } @@ -183,10 +183,6 @@ obj_type.default <- function(x) { obj_type.data.frame <- function(x) { paste0("<", class(x)[1], " [", paste0(dim(x), collapse = ","), "]", ">") } -#' @export -obj_type.data_frame <- function(x) { - paste0("") -} # function for the thousand separator, # returns "," unless it's used for the decimal point, in which case returns "." diff --git a/tests/testthat/test-trunc-mat.r b/tests/testthat/test-trunc-mat.r index 6efbd1dea..78c1af3da 100644 --- a/tests/testthat/test-trunc-mat.r +++ b/tests/testthat/test-trunc-mat.r @@ -103,7 +103,23 @@ test_that("trunc_mat output matches known output", { ) }) -test_that("obj_type shows only first class name for S4", { + +# obj_type ---------------------------------------------------------------- + +test_that("shows only first class name for S4", { A <- methods::setClass("A") - expect_equal(obj_type(A), "") + expect_equal(obj_type(A), "") +}) + +test_that("NULL handled specially", { + expect_equal(obj_type(NULL), "") +}) + +test_that("data frame includes rows and cols", { + expect_equal(obj_type(mtcars), "") +}) + +test_that("S3 others list all classes", { + x <- structure(list(), class = c("a", "b", "c")) + expect_equal(obj_type(x), "") }) From 3dccd9bdc1f896ad704d9f51601c6f2e46fc7dce Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 07:49:45 -0500 Subject: [PATCH 09/27] Special-case obj_type for common vector types --- R/utils-format.r | 12 +++++++++++- tests/testthat/test-trunc-mat.r | 6 ++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/R/utils-format.r b/R/utils-format.r index 2799e10e1..f9ffc0d61 100644 --- a/R/utils-format.r +++ b/R/utils-format.r @@ -171,7 +171,7 @@ obj_type.NULL <- function(x) "" #' @export obj_type.default <- function(x) { if (!is.object(x)) { - paste0("<", type_sum(x), if (!is.array(x)) paste0("[", length(x), "]"), ">") + obj_type_atomic(x) } else if (!isS4(x)) { paste0("") } else { @@ -179,6 +179,16 @@ obj_type.default <- function(x) { } } 
+obj_type_atomic <- function(x) { + paste0("<", type_sum(x), if (!is.array(x)) paste0("[", length(x), "]"), ">") +} +#' @export +obj_type.factor <- obj_type_atomic +#' @export +obj_type.Date <- obj_type_atomic +#' @export +obj_type.POSIXct <- obj_type_atomic + #' @export obj_type.data.frame <- function(x) { paste0("<", class(x)[1], " [", paste0(dim(x), collapse = ","), "]", ">") diff --git a/tests/testthat/test-trunc-mat.r b/tests/testthat/test-trunc-mat.r index 78c1af3da..8fb810127 100644 --- a/tests/testthat/test-trunc-mat.r +++ b/tests/testthat/test-trunc-mat.r @@ -123,3 +123,9 @@ test_that("S3 others list all classes", { x <- structure(list(), class = c("a", "b", "c")) expect_equal(obj_type(x), "") }) + +test_that("common data vectors treated as atomic", { + expect_equal(obj_type(factor(1:3)), "") + expect_equal(obj_type(Sys.Date() + 1:3), "") + expect_equal(obj_type(Sys.time() + 1:3), "") +}) From 17f855ee933da2e3266dc86908908d341318af21 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:02:38 -0500 Subject: [PATCH 10/27] Improve [.tbl_df error message --- R/tbl-df.r | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/tbl-df.r b/R/tbl-df.r index acbfe082d..51a4f6636 100644 --- a/R/tbl-df.r +++ b/R/tbl-df.r @@ -73,7 +73,7 @@ print.tbl_df <- function(x, ..., n = NULL, width = NULL) { #' @export `$.tbl_df` <- function(x, i) { if (is.character(i) && !(i %in% names(x))) - stop("Unknown name", call. = FALSE) + stop("Unknown column '", i, "'", call. 
= FALSE) .subset2(x, i) } From 0f62f981cd655ae2122a0567a95f32f119d89805 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:09:22 -0500 Subject: [PATCH 11/27] Update readme --- README.Rmd | 58 +++++++++++++++++++++----- README.md | 117 +++++++++++++++++++++++++++++++++++++---------------- 2 files changed, 131 insertions(+), 44 deletions(-) diff --git a/README.Rmd b/README.Rmd index 02b338b7f..179edba94 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,7 +1,5 @@ --- -output: - md_document: - variant: markdown_github +output: github_document --- @@ -14,14 +12,56 @@ knitr::opts_chunk$set( ) ``` -# tibble [![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) +# tibble -Data frames in `dplyr` style. +[![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) +[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) +[![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) + +tibble extracts the idea of a __data_frame__ (aka a tibble diff, or tibble for short) from dplyr. As the name suggests a __data_frame__ is a modern reimagining of a data.frame, keeping what time has proven to be effective, and throwing out what is not. In spoken language it's hard to tell the difference between a `data.frame` and `data_frame` so we call the new style tibble dfs (inspired by `dplyr::tbl_df()`), or just tibbles for short. 
+ +## Creating tibbles + +You can create a tibble from an existing object with `as_data_frame()`: ```{r} library(tibble) -tbl_df(iris) -glimpse(iris) -head(rownames_to_column(mtcars, "model")) -trunc_mat(iris) +as_data_frame(iris) +``` + +You can create a new tibble from vectors that represent the columns with `data_frame()`: + +```{r} +data_frame(x = 1:5, y = 1, z = x ^ 2 + y) +``` + +`data_frame()` does much less than `data.frame()`: it never changes the type of the inputs (e.g. it never converts strings to factors!), it never changes the names of variables, and it never creates `row.names()`. You can read more about these features in the vignette, `vignette("tibble")`. + +You can define a tibble row-by-row with `frame_data()`: + +```{r} +frame_data( + ~x, ~y, ~z, + "a", 2, 3.6, + "b", 1, 8.5 +) +``` + +## Tibbles vs data frames + +There are two main differences in the usage of a data frame vs a tibble: printing, and subsetting. + +Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. Each column gives both the name and its type. This makes it much easier to work with large data: + +```{r} +library(nycflights13) +flights +``` + +Tibbles are strict about subsetting. If you try and access a variable that does not exist, you'll get an error: + +```{r, error = TRUE} +flights$yea ``` + +Tibbles clearly delineate `[` and `[[`: `[` always returns another tibble, `[[` always returns a vector. 
diff --git a/README.md b/README.md index a36c07ab2..47c69bbf3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,20 @@ + -tibble [![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) -===================================================================================================================================================================================================================================================================================================================================================================================================================================== +tibble +====== + +[![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) + +tibble extracts the idea of a **data\_frame** (aka a tibble diff, or tibble for short) from dplyr. As the name suggests a **data\_frame** is a modern reimagining of a data.frame, keeping what time has proven to be effective, and throwing out what is not. In spoken language it's hard to tell the difference between a `data.frame` and `data_frame` so we call the new style tibble dfs (inspired by `dplyr::tbl_df()`), or just tibbles for short. + +Creating tibbles +---------------- -Data frames in `dplyr` style. 
+You can create a tibble from an existing object with `as_data_frame()`: ``` r library(tibble) -tbl_df(iris) +as_data_frame(iris) #> Source: local data frame [150 x 5] #> #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species @@ -22,36 +30,75 @@ tbl_df(iris) #> 9 4.4 2.9 1.4 0.2 setosa #> 10 4.9 3.1 1.5 0.1 setosa #> .. ... ... ... ... ... -glimpse(iris) -#> Observations: 150 -#> Variables: 5 -#> $ Sepal.Length (dbl) 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,... -#> $ Sepal.Width (dbl) 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,... -#> $ Petal.Length (dbl) 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,... -#> $ Petal.Width (dbl) 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,... -#> $ Species (fctr) setosa, setosa, setosa, setosa, setosa, setosa, ... -head(rownames_to_column(mtcars, "model")) -#> model mpg cyl disp hp drat wt qsec vs am gear carb -#> 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 -#> 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 -#> 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 -#> 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 -#> 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 -#> 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 -trunc_mat(iris) ``` -| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | -|:-------------|:------------|:-------------|:------------|:--------| -| (dbl) | (dbl) | (dbl) | (dbl) | (fctr) | -| 5.1 | 3.5 | 1.4 | 0.2 | setosa | -| 4.9 | 3.0 | 1.4 | 0.2 | setosa | -| 4.7 | 3.2 | 1.3 | 0.2 | setosa | -| 4.6 | 3.1 | 1.5 | 0.2 | setosa | -| 5.0 | 3.6 | 1.4 | 0.2 | setosa | -| 5.4 | 3.9 | 1.7 | 0.4 | setosa | -| 4.6 | 3.4 | 1.4 | 0.3 | setosa | -| 5.0 | 3.4 | 1.5 | 0.2 | setosa | -| 4.4 | 2.9 | 1.4 | 0.2 | setosa | -| 4.9 | 3.1 | 1.5 | 0.1 | setosa | -| ... | ... | ... | ... | ... 
| +You can create a new tibble from vectors that represent the columns with `data_frame()`: + +``` r +data_frame(x = 1:5, y = 1, z = x ^ 2 + y) +#> Source: local data frame [5 x 3] +#> +#> x y z +#> (int) (dbl) (dbl) +#> 1 1 1 2 +#> 2 2 1 5 +#> 3 3 1 10 +#> 4 4 1 17 +#> 5 5 1 26 +``` + +`data_frame()` does much less than `data.frame()`: it never changes the type of the inputs (e.g. it never converts strings to factors!), it never changes the names of variables, and it never creates `row.names()`. You can read more about these features in the vignette, `vignette("tibble")`. + +You can define a tibble row-by-row with `frame_data()`: + +``` r +frame_data( + ~x, ~y, ~z, + "a", 2, 3.6, + "b", 1, 8.5 +) +#> Source: local data frame [2 x 3] +#> +#> x y z +#> (chr) (dbl) (dbl) +#> 1 a 2 3.6 +#> 2 b 1 8.5 +``` + +Tibbles vs data frames +---------------------- + +There are two main differences in the usage of a data frame vs a tibble: printing, and subsetting. + +Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. Each column gives both the name and its type. This makes it much easier to work with large data: + +``` r +library(nycflights13) +flights +#> Source: local data frame [336,776 x 16] +#> +#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum +#> (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) +#> 1 2013 1 1 517 2 830 11 UA N14228 +#> 2 2013 1 1 533 4 850 20 UA N24211 +#> 3 2013 1 1 542 2 923 33 AA N619AA +#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB +#> 5 2013 1 1 554 -6 812 -25 DL N668DN +#> 6 2013 1 1 554 -4 740 12 UA N39463 +#> 7 2013 1 1 555 -5 913 19 B6 N516JB +#> 8 2013 1 1 557 -3 709 -14 EV N829AS +#> 9 2013 1 1 557 -3 838 -8 B6 N593JB +#> 10 2013 1 1 558 -2 753 8 AA N3ALAA +#> .. ... ... ... ... ... ... ... ... ... +#> Variables not shown: flight (int), origin (chr), dest (chr), air_time +#> (dbl), distance (dbl), hour (dbl), minute (dbl). 
+``` + +Tibbles are strict about subsetting. If you try and access a variable that does not exist, you'll get an error: + +``` r +flights$yea +#> Error: Unknown column 'yea' +``` + +Tibbles clearly delineate `[` and `[[`: `[` always returns another tibble, `[[` always returns a vector. From 16c9a0f2b23eadc9df73c3e1d0323b352f281fc0 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:12:20 -0500 Subject: [PATCH 12/27] Rename vignette to tibble --- vignettes/{data_frames.Rmd => tibble.Rmd} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vignettes/{data_frames.Rmd => tibble.Rmd} (100%) diff --git a/vignettes/data_frames.Rmd b/vignettes/tibble.Rmd similarity index 100% rename from vignettes/data_frames.Rmd rename to vignettes/tibble.Rmd From efc5934fe383778231af953b0ea1619c91281646 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:25:44 -0500 Subject: [PATCH 13/27] Update namespace --- NAMESPACE | 3 +++ 1 file changed, 3 insertions(+) diff --git a/NAMESPACE b/NAMESPACE index 5644ae92c..339b0db22 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -17,8 +17,11 @@ S3method(glimpse,data.frame) S3method(glimpse,default) S3method(glimpse,tbl) S3method(obj_type,"NULL") +S3method(obj_type,Date) +S3method(obj_type,POSIXct) S3method(obj_type,data.frame) S3method(obj_type,default) +S3method(obj_type,factor) S3method(print,tbl_df) S3method(print,trunc_mat) S3method(type_sum,Date) From 304ccc751e2afc6cfafb358c857f5fb76a35f868 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:26:00 -0500 Subject: [PATCH 14/27] Rewrite vignette to focus on tibbles --- DESCRIPTION | 3 +- vignettes/tibble.Rmd | 151 +++++++++++++++++-------------------------- 2 files changed, 61 insertions(+), 93 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index cc2a78641..8bf52edb0 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,8 +13,7 @@ URL: https://github.com/krlmlr/tibble BugReports: https://github.com/krlmlr/tibble/issues Depends: R (>= 3.1.2) Imports: methods, 
assertthat, utils, lazyeval (>= 0.1.10), Rcpp -Suggests: testthat, knitr, rmarkdown, Lahman (>= 3.0.1), magrittr, - microbenchmark +Suggests: testthat, knitr, rmarkdown, Lahman (>= 3.0.1), microbenchmark LinkingTo: Rcpp LazyData: yes License: MIT + file LICENSE diff --git a/vignettes/tibble.Rmd b/vignettes/tibble.Rmd index 116cac422..a18b97c4a 100644 --- a/vignettes/tibble.Rmd +++ b/vignettes/tibble.Rmd @@ -1,20 +1,21 @@ --- -title: "Data frames" +title: "Tibbles" date: "`r Sys.Date()`" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Data frames} + %\VignetteIndexEntry{Tibbles} %\VignetteEngine{knitr::rmarkdown} \usepackage[utf8]{inputenc} --- ```{r, echo = FALSE, message = FALSE} -knitr::opts_chunk$set(collapse = T, comment = "#>") +knitr::opts_chunk$set(collapse = TRUE, comment = "#>") options(tibble.print_min = 4L, tibble.print_max = 4L) -library(magrittr) library(tibble) ``` +Tibbles are a modern take on data frames. They keep the features that have stood the test of time, and drop the features that used to be convenient but are now frustrating (i.e. converting character vectors to factors). + ## Creating `data_frame()` is a nice way to create data frames. It encapsulates best practices for data frames: @@ -22,8 +23,7 @@ library(tibble) * It never changes an input's type (i.e., no more `stringsAsFactors = FALSE`!). 
```{r} - data.frame(x = letters) %>% sapply(class) - data_frame(x = letters) %>% sapply(class) + data_frame(x = letters) ``` This makes it easier to use with list-columns: @@ -38,8 +38,8 @@ library(tibble) * It never adjusts the names of variables: ```{r} - data.frame(`crazy name` = 1) %>% names() - data_frame(`crazy name` = 1) %>% names() + names(data.frame(`crazy name` = 1)) + names(data_frame(`crazy name` = 1)) ``` * It evaluates its arguments lazily and sequentially: @@ -48,28 +48,6 @@ library(tibble) data_frame(x = 1:5, y = x ^ 2) ``` - * It adds the `tbl_df()` class to the output so that if you accidentally print a large - data frame you only get the first few rows. - - ```{r} - data_frame(x = 1:5) %>% class() - ``` - - * It changes the behaviour of `[` to always return the same type of object: - subsetting using `[` always returns a `tbl_df()` object; subsetting using - `[[` always returns a column. - - You should be aware of one case where subsetting a `tbl_df()` object - will produce a different result than a `data.frame()` object: - - ```{r} - df <- data.frame(a = 1:2, b = 1:2) - str(df[, "a"]) - - tbldf <- tbl_df(df) - str(tbldf[, "a"]) - ``` - * It never uses `row.names()`. The whole point of tidy data is to store variables in a consistent way. So it never stores a variable as special attribute. @@ -79,79 +57,70 @@ library(tibble) ## Coercion -To complement `data_frame()`, dplyr provides `as_data_frame()` to coerce lists into data frames. It does two things: +To complement `data_frame()`, tibble provides `as_data_frame()` to coerce objects into tibbles. Generally, `as_data_frame()` methods are much simpler than `as.data.frame()` methods. It's hard to explain precisely what `as.data.frame()` does, but it's similar to `do.call(cbind, lapply(x, data.frame))` - i.e. it coerces each component to a data frame and then `cbinds()` them all together. -* It checks that the input list is valid for a data frame, i.e.
that each element - is named, is a 1d atomic vector or list, and all elements have the same - length. - -* It sets the class and attributes of the list to make it behave like a data frame. - This modification does not require a deep copy of the input list, so it's - very fast. - -This is much simpler than `as.data.frame()`. It's hard to explain precisely what `as.data.frame()` does, but it's similar to `do.call(cbind, lapply(x, data.frame))` - i.e. it coerces each component to a data frame and then `cbinds()` them all together. Consequently `as_data_frame()` is much faster than `as.data.frame()`: +`as_data_frame()` has been written with an eye for performance: ```{r} -l2 <- replicate(26, sample(100), simplify = FALSE) -names(l2) <- letters +l <- replicate(26, sample(100), simplify = FALSE) +names(l) <- letters + microbenchmark::microbenchmark( - as_data_frame(l2), - as.data.frame(l2) + as_data_frame(l), + as.data.frame(l) ) ``` The speed of `as.data.frame()` is not usually a bottleneck when used interactively, but can be a problem when combining thousands of messy inputs into one tidy data frame. -## tbl_dfs vs data.frames +## Tibbles vs data frames -There are three key differences between tbl_dfs and data.frames: +There are two key differences between tibbles and data frames: printing and subsetting. -* When you print a tbl_df, it only shows the first ten rows and all the - columns that fit on one screen. It also prints an abbreviated description - of the column type: - - ```{r} - data_frame(x = 1:1000) - ``` - - You can control the default appearance with options: - - * `options(tibble.print_max = n, tibble.print_min = m)`: if more than `n` - rows print `m` rows. Use `options(tibble.print_max = Inf)` to always - show all rows. - - * `options(tibble.width = Inf)` will always print all columns, regardless - of the width of the screen. +### Printing +When you print a tibble, it only shows the first ten rows and all the columns that fit on one screen. 
It also prints an abbreviated description of the column type: -* When you subset a tbl\_df with `[`, it always returns another tbl\_df. - Contrast this with a data frame: sometimes `[` returns a data frame and - sometimes it just returns a single column: - - ```{r} - df1 <- data.frame(x = 1:3, y = 3:1) - class(df1[, 1:2]) - class(df1[, 1]) - - df2 <- data_frame(x = 1:3, y = 3:1) - class(df2[, 1:2]) - class(df2[, 1]) - ``` - - To extract a single column it's use `[[` or `$`: - - ```{r} - class(df2[[1]]) - class(df2$x) - ``` +```{r} +data_frame(x = 1:1000) +``` -* When you extract a variable with `$`, tbl\_dfs never do partial - matching. They'll throw an error if the column doesn't exist: - - ```{r, error = TRUE} - df <- data.frame(abc = 1) - df$a +You can control the default appearance with options: + +* `options(tibble.print_max = n, tibble.print_min = m)`: if there are more + than `n` rows, print only the first `m` rows. Use + `options(tibble.print_max = Inf)` to always show all rows. + +* `options(tibble.width = Inf)` will always print all columns, regardless + of the width of the screen. + +### Subsetting + +Tibbles are quite strict about subsetting. `[` always returns another tibble. Contrast this with a data frame: sometimes `[` returns a data frame and sometimes it just returns a single column: - df2 <- data_frame(abc = 1) - df2$a - ``` +```{r} +df1 <- data.frame(x = 1:3, y = 3:1) +class(df1[, 1:2]) +class(df1[, 1]) + +df2 <- data_frame(x = 1:3, y = 3:1) +class(df2[, 1:2]) +class(df2[, 1]) +``` + +To extract a single column use `[[` or `$`: + +```{r} +class(df2[[1]]) +class(df2$x) +``` + +Tibbles are also stricter with `$`. 
Tibbles never do partial matching, and will throw an error if the column does not exist: + +```{r, error = TRUE} +df <- data.frame(abc = 1) +df$a + +df2 <- data_frame(abc = 1) +df2$a +``` From 7d15f0fce4a384d9d350e24016e59dbf5f5e3f35 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:28:22 -0500 Subject: [PATCH 15/27] Clean out .Rbuildignore --- .Rbuildignore | 3 --- 1 file changed, 3 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 65f611b99..ba6e4aa30 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,9 +2,6 @@ ^\.Rproj\.user$ ^\.travis\.yml$ .Rprofile -inst/db -man-roxygen -demo/pandas ^\.httr-oauth$ ^cran-comments\.md$ ^README\.Rmd$ From c019ebce4c6d69f26a2da7bbca7d5712632babea Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:31:19 -0500 Subject: [PATCH 16/27] Remove empty files --- R/src-local.r | 0 R/src.r | 0 2 files changed, 0 insertions(+), 0 deletions(-) delete mode 100644 R/src-local.r delete mode 100644 R/src.r diff --git a/R/src-local.r b/R/src-local.r deleted file mode 100644 index e69de29bb..000000000 diff --git a/R/src.r b/R/src.r deleted file mode 100644 index e69de29bb..000000000 From e0223ea2a61a40a364e4caedbad9f80455141ec6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kirill=20M=C3=BCller?= Date: Fri, 18 Mar 2016 14:33:11 +0100 Subject: [PATCH 17/27] Partially revert "bump version to 0.3-3" This reverts commit 046f3711a3b065f60ddf8545c34159bdd66c7108. --- DESCRIPTION | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 8bf52edb0..faecec4b8 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -3,17 +3,28 @@ Encoding: UTF-8 Version: 0.3-3 Title: Simple Data Frames Description: Provides a 'tbl_df' class that offers better checking and - printing capabilities than traditional data frames. 
-Authors@R: c( person("Hadley", "Wickham", , "hadley@rstudio.com", - "aut"), person("Romain", "Francois", , - "romain@r-enthusiasts.com", "aut"), person("Kirill", "Müller", - , "krlmlr+r@mailbox.org", c("aut", "cre")), person("RStudio", - role = "cph") ) + printing capabilities than traditional data frames. +Authors@R: c( + person("Hadley", "Wickham", , "hadley@rstudio.com", "aut"), + person("Romain", "Francois", , "romain@r-enthusiasts.com", "aut"), + person("Kirill", "Müller", , "krlmlr+r@mailbox.org", c("aut", "cre")), + person("RStudio", role = "cph") + ) URL: https://github.com/krlmlr/tibble BugReports: https://github.com/krlmlr/tibble/issues Depends: R (>= 3.1.2) -Imports: methods, assertthat, utils, lazyeval (>= 0.1.10), Rcpp -Suggests: testthat, knitr, rmarkdown, Lahman (>= 3.0.1), microbenchmark +Imports: + methods, + assertthat, + utils, + lazyeval (>= 0.1.10), + Rcpp +Suggests: + testthat, + knitr, + rmarkdown, + Lahman (>= 3.0.1), + microbenchmark LinkingTo: Rcpp LazyData: yes License: MIT + file LICENSE From f3f28fd1624653f331b0518be3dc4f12539ad6b9 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:41:28 -0500 Subject: [PATCH 18/27] Start clarifying role of tbl_df --- R/dataframe.R | 2 -- R/tbl-df.r | 18 ++++-------------- man/tbl_df.Rd | 15 ++------------- 3 files changed, 6 insertions(+), 29 deletions(-) diff --git a/R/dataframe.R b/R/dataframe.R index 641fb3611..6a096aa99 100644 --- a/R/dataframe.R +++ b/R/dataframe.R @@ -1,5 +1,3 @@ -methods::setOldClass(c("tbl_df", "tbl", "data.frame")) - #' Build a data frame or list. #' #' \code{data_frame} is trimmed down version of \code{\link{data.frame}} that: diff --git a/R/tbl-df.r b/R/tbl-df.r index 51a4f6636..0bb9f8df0 100644 --- a/R/tbl-df.r +++ b/R/tbl-df.r @@ -1,4 +1,4 @@ -#' Create a data frame tbl. +#' S3 class: tbl_df #' #' A data frame tbl wraps a local data frame. 
The main advantage to using #' a \code{tbl_df} over a regular data frame is the printing: @@ -16,25 +16,15 @@ #' \item{\code{[[}, \code{$}}{Calls \code{\link{.subset2}} directly, #' so is considerably faster. Throws error if column does not exist.} #' } -#' -#' #' @export #' @param data a data frame -#' @examples -#' ds <- tbl_df(mtcars) -#' ds -#' as.data.frame(ds) -#' -#' if (require("Lahman")) { -#' batting <- tbl_df(Batting) -#' dim(batting) -#' colnames(batting) -#' head(batting) -#' } +#' @keywords internal tbl_df <- function(data) { as_data_frame(data) } +methods::setOldClass(c("tbl_df", "tbl", "data.frame")) + # Standard data frame methods -------------------------------------------------- #' @export diff --git a/man/tbl_df.Rd b/man/tbl_df.Rd index 36dced2a2..9abd40fa3 100644 --- a/man/tbl_df.Rd +++ b/man/tbl_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tbl-df.r \name{tbl_df} \alias{tbl_df} -\title{Create a data frame tbl.} +\title{S3 class: tbl_df} \usage{ tbl_df(data) } @@ -28,16 +28,5 @@ screen, describing the rest of it as text. so is considerably faster. Throws error if column does not exist.} } } -\examples{ -ds <- tbl_df(mtcars) -ds -as.data.frame(ds) - -if (require("Lahman")) { -batting <- tbl_df(Batting) -dim(batting) -colnames(batting) -head(batting) -} -} +\keyword{internal} From 2bab2b922b21b682b6a4f29ab4c5de2cd3c228db Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:41:39 -0500 Subject: [PATCH 19/27] Improve rowname tools docs --- R/rownames.R | 46 +++++++++++++++++++++++----------------------- man/rownames.Rd | 29 ++++++++++++++--------------- 2 files changed, 37 insertions(+), 38 deletions(-) diff --git a/R/rownames.R b/R/rownames.R index f9b446b11..daa7ca310 100644 --- a/R/rownames.R +++ b/R/rownames.R @@ -1,39 +1,45 @@ -#' Row names +#' Tools for working with row names #' -#' \code{has_rownames} checks if a data frame has row names. 
-#' @param df Input data frame -#' @export -#' @rdname rownames +#' Generally, it is best to avoid row names, because they are basically a +#' character column with different semantics to every other column. These +#' functions allow you to detect if a data frame has row names +#' (\code{has_rownames}), remove them (\code{remove_rownames}), or convert +#' them back-and-forth between an explicit column (\code{rownames_to_column}, +#' and \code{column_to_rownames}). +#' +#' @param df A data frame +#' @param var Name of column to use for rownames. #' @examples #' has_rownames(mtcars) #' has_rownames(iris) +#' has_rownames(remove_rownames(mtcars)) +#' +#' head(rownames_to_column(mtcars)) +#' +#' mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) +#' mtcars_tbl +#' column_to_rownames(mtcars_tbl) +#' @name rownames +NULL + + +#' @export +#' @rdname rownames has_rownames <- function(df) { stopifnot(is.data.frame(df)) .row_names_info(df) > 0L } -#' \code{remove_rownames} removes all row names. #' @export #' @rdname rownames -#' @examples -#' rownames(remove_rownames(mtcars)) remove_rownames <- function(df) { stopifnot(is.data.frame(df)) rownames(df) <- NULL df } -#' \code{rownames_to_column} convert row names to an explicit variable. -#' -#' @param var Name of variable to use #' @export #' @rdname rownames -#' @importFrom stats setNames -#' @examples -#' rownames_to_column(mtcars) -#' -#' mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) -#' mtcars_tbl rownames_to_column <- function(df, var = "rowname") { stopifnot(is.data.frame(df)) @@ -53,14 +59,8 @@ rownames_to_column <- function(df, var = "rowname") { new_df } -#' \code{column_to_rownames} convert a column variable to row names. This is an -#' inverted operation of \code{rownames_to_column}.
-#' #' @rdname rownames #' @export -#' @examples -#' -#' column_to_rownames(mtcars_tbl) column_to_rownames <- function(df, var = "rowname") { stopifnot(is.data.frame(df)) diff --git a/man/rownames.Rd b/man/rownames.Rd index c4ae6180c..0dc9597cf 100644 --- a/man/rownames.Rd +++ b/man/rownames.Rd @@ -1,11 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rownames.R -\name{has_rownames} +\name{rownames} \alias{column_to_rownames} \alias{has_rownames} \alias{remove_rownames} +\alias{rownames} \alias{rownames_to_column} -\title{Row names} +\title{Tools for working with row names} \usage{ has_rownames(df) @@ -16,29 +17,27 @@ rownames_to_column(df, var = "rowname") column_to_rownames(df, var = "rowname") } \arguments{ -\item{df}{Input data frame} +\item{df}{A data frame} -\item{var}{Name of variable to use} +\item{var}{Name of column to use for rownames.} } \description{ -\code{has_rownames} checks if a data frame has row names. - -\code{remove_rownames} removes all row names. - -\code{rownames_to_column} convert row names to an explicit variable. - -\code{column_to_rownames} convert a column variable to row names. This is an -inverted operation of \code{rownames_to_column}. +Generally, it is best to avoid row names, because they are basically a +character column with different semantics to every other column. These +functions allow you to detect if a data frame has row names +(\code{has_rownames}), remove them (\code{remove_rownames}), or convert +them back-and-forth between an explicit column (\code{rownames_to_column}, +and \code{column_to_rownames}).
} \examples{ has_rownames(mtcars) has_rownames(iris) -rownames(remove_rownames(mtcars)) -rownames_to_column(mtcars) +has_rownames(remove_rownames(mtcars)) + +head(rownames_to_column(mtcars)) mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) mtcars_tbl - column_to_rownames(mtcars_tbl) } From 0478a7665f26268c98f4aba1aa066f01cd20e250 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 08:46:03 -0500 Subject: [PATCH 20/27] Test subsetting errors --- R/tbl-df.r | 8 +++++--- tests/testthat/test-tbl-df.r | 18 ++++++++++++++++++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/R/tbl-df.r b/R/tbl-df.r index 0bb9f8df0..3116a6c66 100644 --- a/R/tbl-df.r +++ b/R/tbl-df.r @@ -51,8 +51,9 @@ print.tbl_df <- function(x, ..., n = NULL, width = NULL) { #' @export `[[.tbl_df` <- function(x, i, exact = TRUE) { - if (is.character(i) && length(i) == 1L && !(i %in% names(x))) - stop("Unknown name", call. = FALSE) + if (is.character(i) && length(i) == 1L && !(i %in% names(x))) { + stop("Unknown column '", i, "'", call. = FALSE) + } if (!exact) { warning("exact ignored", call. = FALSE) } @@ -62,8 +63,9 @@ print.tbl_df <- function(x, ..., n = NULL, width = NULL) { #' @export `$.tbl_df` <- function(x, i) { - if (is.character(i) && !(i %in% names(x))) + if (is.character(i) && !(i %in% names(x))) { stop("Unknown column '", i, "'", call. 
= FALSE) + } .subset2(x, i) } diff --git a/tests/testthat/test-tbl-df.r b/tests/testthat/test-tbl-df.r index bb727a660..8dd2f158b 100644 --- a/tests/testthat/test-tbl-df.r +++ b/tests/testthat/test-tbl-df.r @@ -1,5 +1,8 @@ context("tbl_df") + +# [ ----------------------------------------------------------------------- + test_that("[ never drops", { mtcars2 <- tbl_df(mtcars) expect_is(mtcars2[, 1], "data.frame") @@ -32,6 +35,9 @@ test_that("[.tbl_df is careful about names (#1245)",{ expect_error( foo[, c("x", "y", "z") ] ) }) + +# [[ ---------------------------------------------------------------------- + test_that("[[.tbl_df ignores exact argument",{ foo <- data_frame(x = 1:10, y = 1:10) expect_warning(foo[["x"]], NA) @@ -44,3 +50,15 @@ test_that("can use recursive indexing with [[", { expect_equal(foo[[c(1, 1)]], 1:3) expect_equal(foo[[c("x", "y")]], 1:3) }) + +test_that("[[ throws error if name doesn't exist", { + df <- data_frame(x = 1) + expect_error(df[["y"]], "Unknown column 'y'") +}) + +# $ ----------------------------------------------------------------------- + +test_that("[[ throws error if name doesn't exist", { + df <- data_frame(x = 1) + expect_error(df$y, "Unknown column 'y'") +}) From f98fa48cbf56206fc0855912670476ae98081c58 Mon Sep 17 00:00:00 2001 From: hadley Date: Fri, 18 Mar 2016 09:00:38 -0500 Subject: [PATCH 21/27] Rename obj_type to obj_sum And better integrate with type_sum. 
--- NAMESPACE | 16 ++++---- R/type-sum.r | 72 ++++++++++++++++++++++++--------- R/utils-format.r | 29 +------------ R/utils.r | 4 ++ man/{type_sum.Rd => obj_sum.Rd} | 12 ++++-- tests/testthat/test-obj-sum.R | 27 +++++++++++++ tests/testthat/test-trunc-mat.r | 26 ------------ 7 files changed, 100 insertions(+), 86 deletions(-) rename man/{type_sum.Rd => obj_sum.Rd} (57%) create mode 100644 tests/testthat/test-obj-sum.R diff --git a/NAMESPACE b/NAMESPACE index 339b0db22..91b8d1fee 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,24 +16,21 @@ S3method(format_v,default) S3method(glimpse,data.frame) S3method(glimpse,default) S3method(glimpse,tbl) -S3method(obj_type,"NULL") -S3method(obj_type,Date) -S3method(obj_type,POSIXct) -S3method(obj_type,data.frame) -S3method(obj_type,default) -S3method(obj_type,factor) +S3method(obj_sum,"NULL") +S3method(obj_sum,Date) +S3method(obj_sum,POSIXct) +S3method(obj_sum,data.frame) +S3method(obj_sum,default) +S3method(obj_sum,factor) S3method(print,tbl_df) S3method(print,trunc_mat) S3method(type_sum,Date) S3method(type_sum,POSIXt) -S3method(type_sum,array) S3method(type_sum,character) -S3method(type_sum,data.frame) S3method(type_sum,default) S3method(type_sum,factor) S3method(type_sum,integer) S3method(type_sum,logical) -S3method(type_sum,matrix) S3method(type_sum,numeric) export(add_row) export(as_data_frame) @@ -47,6 +44,7 @@ export(has_rownames) export(knit_print.trunc_mat) export(lst) export(lst_) +export(obj_sum) export(remove_rownames) export(repair_names) export(rownames_to_column) diff --git a/R/type-sum.r b/R/type-sum.r index d67e347ee..2d65555ce 100644 --- a/R/type-sum.r +++ b/R/type-sum.r @@ -1,27 +1,53 @@ -#' Provide a succinct summary of a type +#' Provide a succinct summary of an object #' -#' All methods should return a string with four or less characters, suitable -#' for succinctly display column types. +#' \code{type_sum} gives a brief summary of object type. 
Objects that commonly +#' occur in a data frame should return a string with four or less characters. +#' \code{obj_sum} also includes the size of the object. #' #' @param x an object to summarise. Generally only methods of atomic vectors #' and variants have been implemented. #' @keywords internal -#' @export #' @examples -#' type_sum(1:10) -#' type_sum(matrix(1:10)) -#' type_sum(Sys.Date()) -#' type_sum(Sys.time()) -#' type_sum(mean) -type_sum <- function(x) UseMethod("type_sum") +#' obj_sum(1:10) +#' obj_sum(matrix(1:10)) +#' obj_sum(Sys.Date()) +#' obj_sum(Sys.time()) +#' obj_sum(mean) +#' @export +obj_sum <- function(x) UseMethod("obj_sum") +#' @export +obj_sum.default <- function(x) { + if (!is.object(x)) { + obj_sum_atomic(x) + } else if (!isS4(x)) { + paste0("") + } else { + paste0("") + } +} #' @export -type_sum.data.frame <- function(x) { - if (length(x) == 0) return(character(0)) +obj_sum.NULL <- function(x) "" + +obj_sum_atomic <- function(x) { + paste0("<", type_sum(x), size_sum(x), ">") +} +#' @export +obj_sum.factor <- obj_sum_atomic +#' @export +obj_sum.Date <- obj_sum_atomic +#' @export +obj_sum.POSIXct <- obj_sum_atomic - vapply(x, type_sum, character(1)) +#' @export +obj_sum.data.frame <- function(x) { + paste0("<", class(x)[1], size_sum(x), ">") } +#' @export +#' @rdname obj_sum +type_sum <- function(x) UseMethod("type_sum") + #' @export type_sum.numeric <- function(x) "dbl" #' @export @@ -39,11 +65,19 @@ type_sum.POSIXt <- function(x) "time" type_sum.Date <- function(x) "date" #' @export -type_sum.matrix <- function(x) { - paste0(NextMethod(), "[", paste0(dim(x), collapse = ","), "]") +type_sum.default <- function(x) { + if (!is.object(x)) { + typeof(x) + } else if (!isS4(x)) { + paste0("S3: ", paste0(class(x), collapse = "/")) + } else { + paste0("S4: ", methods::is(x)[[1]]) + } } -#' @export -type_sum.array <- type_sum.matrix -#' @export -type_sum.default <- function(x) unname(abbreviate(class(x)[1], 4)) +size_sum <- function(x) { + if 
(!is_vector(x)) return("") + + dim <- dim(x) %||% length(x) + paste0(" [", paste0(dim, collapse = ","), "]" ) +} diff --git a/R/utils-format.r b/R/utils-format.r index f9ffc0d61..131918fa1 100644 --- a/R/utils-format.r +++ b/R/utils-format.r @@ -76,7 +76,7 @@ shrink_mat <- function(df, width, n_extra, var_names, var_types, rows, n) { # List columns need special treatment because format can't be trusted classes <- paste0("(", vapply(df, type_sum, character(1)), ")") is_list <- vapply(df, is.list, logical(1)) - df[is_list] <- lapply(df[is_list], function(x) vapply(x, obj_type, character(1))) + df[is_list] <- lapply(df[is_list], function(x) vapply(x, obj_sum, character(1))) mat <- format(df, justify = "left") values <- c(format(rownames(mat))[[1]], unlist(mat[1, ])) @@ -165,34 +165,7 @@ wrap <- function(..., indent = 0, width) { paste0(wrapped, collapse = "\n") } -obj_type <- function(x) UseMethod("obj_type") -#' @export -obj_type.NULL <- function(x) "" -#' @export -obj_type.default <- function(x) { - if (!is.object(x)) { - obj_type_atomic(x) - } else if (!isS4(x)) { - paste0("") - } else { - paste0("") - } -} - -obj_type_atomic <- function(x) { - paste0("<", type_sum(x), if (!is.array(x)) paste0("[", length(x), "]"), ">") -} -#' @export -obj_type.factor <- obj_type_atomic -#' @export -obj_type.Date <- obj_type_atomic -#' @export -obj_type.POSIXct <- obj_type_atomic -#' @export -obj_type.data.frame <- function(x) { - paste0("<", class(x)[1], " [", paste0(dim(x), collapse = ","), "]", ">") -} # function for the thousand separator, # returns "," unless it's used for the decimal point, in which case returns "." 
diff --git a/R/utils.r b/R/utils.r index 5b098001f..53c2952f9 100644 --- a/R/utils.r +++ b/R/utils.r @@ -16,6 +16,10 @@ is_atomic <- function(x) { is.atomic(x) && !is.null(x) } +is_vector <- function(x) { + is_atomic(x) || is.list(x) +} + is_1d <- function(x) { # dimension check is for matrices and data.frames (is_atomic(x) || is.list(x)) && length(dim(x)) <= 1 diff --git a/man/type_sum.Rd b/man/obj_sum.Rd similarity index 57% rename from man/type_sum.Rd rename to man/obj_sum.Rd index b661322fc..4180c63c6 100644 --- a/man/type_sum.Rd +++ b/man/obj_sum.Rd @@ -1,9 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/type-sum.r -\name{type_sum} +\name{obj_sum} +\alias{obj_sum} \alias{type_sum} -\title{Provide a succinct summary of a type} +\title{Provide a succinct summary of an object} \usage{ +obj_sum(x) + type_sum(x) } \arguments{ @@ -11,8 +14,9 @@ type_sum(x) and variants have been implemented.} } \description{ -All methods should return a string with four or less characters, suitable -for succinctly display column types. +\code{type_sum} gives a brief summary of object type. Objects that commonly +occur in a data frame should return a string with four or less characters. +\code{obj_sum} also includes the size of the object. 
} \examples{ type_sum(1:10) diff --git a/tests/testthat/test-obj-sum.R b/tests/testthat/test-obj-sum.R new file mode 100644 index 000000000..2c501fa6b --- /dev/null +++ b/tests/testthat/test-obj-sum.R @@ -0,0 +1,27 @@ +context("obj_sum") + +# obj_sum ---------------------------------------------------------------- + +test_that("shows only first class name for S4", { + A <- methods::setClass("A") + expect_equal(obj_sum(A), "") +}) + +test_that("NULL handled specially", { + expect_equal(obj_sum(NULL), "") +}) + +test_that("data frame includes rows and cols", { + expect_equal(obj_sum(mtcars), "") +}) + +test_that("S3 others list all classes", { + x <- structure(list(), class = c("a", "b", "c")) + expect_equal(obj_sum(x), "") +}) + +test_that("common data vectors treated as atomic", { + expect_equal(obj_sum(factor(1:3)), "") + expect_equal(obj_sum(Sys.Date() + 1:3), "") + expect_equal(obj_sum(Sys.time() + 1:3), "