diff --git a/.Rbuildignore b/.Rbuildignore index 65f611b99..ba6e4aa30 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -2,9 +2,6 @@ ^\.Rproj\.user$ ^\.travis\.yml$ .Rprofile -inst/db -man-roxygen -demo/pandas ^\.httr-oauth$ ^cran-comments\.md$ ^README\.Rmd$ diff --git a/DESCRIPTION b/DESCRIPTION index cc2a78641..06164f3c9 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,20 +1,30 @@ Package: tibble Encoding: UTF-8 -Version: 0.3-3 +Version: 0.3-4 Title: Simple Data Frames Description: Provides a 'tbl_df' class that offers better checking and - printing capabilities than traditional data frames. -Authors@R: c( person("Hadley", "Wickham", , "hadley@rstudio.com", - "aut"), person("Romain", "Francois", , - "romain@r-enthusiasts.com", "aut"), person("Kirill", "Müller", - , "krlmlr+r@mailbox.org", c("aut", "cre")), person("RStudio", - role = "cph") ) + printing capabilities than traditional data frames. +Authors@R: c( + person("Hadley", "Wickham", , "hadley@rstudio.com", "aut"), + person("Romain", "Francois", , "romain@r-enthusiasts.com", "aut"), + person("Kirill", "Müller", , "krlmlr+r@mailbox.org", c("aut", "cre")), + person("RStudio", role = "cph") + ) URL: https://github.com/krlmlr/tibble BugReports: https://github.com/krlmlr/tibble/issues Depends: R (>= 3.1.2) -Imports: methods, assertthat, utils, lazyeval (>= 0.1.10), Rcpp -Suggests: testthat, knitr, rmarkdown, Lahman (>= 3.0.1), magrittr, - microbenchmark +Imports: + methods, + assertthat, + utils, + lazyeval (>= 0.1.10), + Rcpp +Suggests: + testthat, + knitr, + rmarkdown, + Lahman (>= 3.0.1), + microbenchmark LinkingTo: Rcpp LazyData: yes License: MIT + file LICENSE diff --git a/NAMESPACE b/NAMESPACE index 9bf6556e1..fca7e96bf 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -16,23 +16,19 @@ S3method(format_v,default) S3method(glimpse,data.frame) S3method(glimpse,default) S3method(glimpse,tbl) -S3method(obj_type,"NULL") -S3method(obj_type,data.frame) -S3method(obj_type,data_frame) -S3method(obj_type,default) 
+S3method(is_vector_s3,Date) +S3method(is_vector_s3,POSIXct) +S3method(is_vector_s3,data.frame) +S3method(is_vector_s3,default) +S3method(is_vector_s3,factor) +S3method(obj_sum,default) S3method(print,tbl_df) S3method(print,trunc_mat) S3method(type_sum,Date) S3method(type_sum,POSIXt) -S3method(type_sum,array) -S3method(type_sum,character) S3method(type_sum,data.frame) S3method(type_sum,default) S3method(type_sum,factor) -S3method(type_sum,integer) -S3method(type_sum,logical) -S3method(type_sum,matrix) -S3method(type_sum,numeric) export(add_row) export(as_data_frame) export(column_to_rownames) @@ -42,9 +38,11 @@ export(dim_desc) export(frame_data) export(glimpse) export(has_rownames) +export(is_vector_s3) export(knit_print.trunc_mat) export(lst) export(lst_) +export(obj_sum) export(remove_rownames) export(repair_names) export(rownames_to_column) diff --git a/NEWS.md b/NEWS.md index 1da92edf8..61a056e46 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,16 @@ +Version 0.3-4 (2016-03-18) +=== + +- Renamed `obj_type()` to `obj_sum()`, improvements, better integration with `type_sum()`. +- Add tests. +- Improve documentation and vignette. +- Internal cleanup. +- Improve `[.tbl_df()` error message. +- `frame_data()` returns 0-row but n-col data frame if no data. +- Further cleanup of `repair_names()`. +- Don't trim ws in `repair_names()` (#47). + + Version 0.3-3 (2016-03-18) === diff --git a/R/dataframe.R b/R/dataframe.R index 641fb3611..6a096aa99 100644 --- a/R/dataframe.R +++ b/R/dataframe.R @@ -1,5 +1,3 @@ -methods::setOldClass(c("tbl_df", "tbl", "data.frame")) - #' Build a data frame or list. #' #' \code{data_frame} is trimmed down version of \code{\link{data.frame}} that: diff --git a/R/frame-data.R b/R/frame-data.R index 2a465abd1..7748785b4 100644 --- a/R/frame-data.R +++ b/R/frame-data.R @@ -1,9 +1,12 @@ -#' Row-wise data_frame creation +#' Row-wise tibble creation #' -#' Create a row-wise \code{\link{data_frame}}. 
+#' Create \code{\link{data_frame}}s laying out the data in rows, rather than +#' in columns. This is useful for small tables of data where readability is +#' important. #' #' @param ... Arguments specifying the structure of a \code{data_frame}. -#' +#' Variable names should be formulas, and may only appear before the data. +#' @return A \code{\link{tbl_df}}. #' @export #' @examples #' frame_data( @@ -12,6 +15,14 @@ #' "b", 2, #' "c", 3 #' ) +#' +#' # frame_data will create a list column if the value in each cell is +#' # not a scalar +#' frame_data( +#' ~x, ~y, +#' "a", 1:3, +#' "b", 4:6 +#' ) frame_data <- function(...) { dots <- list(...) @@ -21,7 +32,9 @@ frame_data <- function(...) { i <- 1 while (TRUE) { if (i > length(dots)) { - return(data_frame()) + out <- rep(list(logical()), length(frame_names)) + names(out) <- frame_names + return(as_data_frame(out)) } el <- dots[[i]] @@ -32,12 +45,14 @@ frame_data <- function(...) { break if (length(el) != 2) { - stop("expected a column name with a single argument; e.g. '~ name'") + stop("expected a column name with a single argument; e.g. '~ name'", + call. = FALSE) } candidate <- el[[2]] if (!(is.symbol(candidate) || is.character(candidate))) { - stop("expected a symbol or string denoting a column name") + stop("expected a symbol or string denoting a column name", + call. = FALSE) } frame_names <- c(frame_names, as.character(el[[2]])) @@ -46,7 +61,7 @@ frame_data <- function(...) { } if (!length(frame_names)) { - stop("no column names detected in 'frame_data()' call") + stop("no column names detected in 'frame_data()' call", call. = FALSE) } frame_rest <- dots[i:length(dots)] @@ -57,11 +72,14 @@ frame_data <- function(...) { # structure. 
frame_ncol <- length(frame_names) if (n_elements %% frame_ncol != 0) { - stop(sprintf( - "invalid 'frame_data()' specification: had %s elements and %s columns", - n_elements, - frame_ncol - )) + stop( + sprintf( + "invalid 'frame_data()' specification: had %s elements and %s columns", + n_elements, + frame_ncol + ), + call. = FALSE + ) } frame_mat <- matrix(frame_rest, ncol = frame_ncol, byrow = TRUE) diff --git a/R/repair-names.R b/R/repair-names.R index 9edaa5294..830058aaf 100644 --- a/R/repair-names.R +++ b/R/repair-names.R @@ -1,12 +1,12 @@ #' Repair object names. #' #' \code{repair_names} ensures its input has non-missing and -#' unique names. It also strips any leading or trailing spaces. -#' Valid names are left as is. +#' unique names (duplicated names get a numeric suffix). Valid names are +#' left as is. #' #' @param x A named vector. #' @param prefix A string, the prefix to use for new column names. -#' @param sep A string, inserted between the column name and de-duplicating +#' @param sep A string inserted between the column name and de-duplicating #' number. #' @return \code{x} with valid names. #' @export @@ -16,33 +16,24 @@ #' tbl <- as_data_frame(structure(list(3, 4, 5), class = "data.frame")) #' repair_names(tbl) repair_names <- function(x, prefix = "V", sep = "") { - if (length(x) == 0) + if (length(x) == 0) { + names(x) <- character() return(x) + } - xnames <- init_names(x) - blanks <- xnames == "" + new_names <- make_unique(names2(x), prefix = prefix, sep = sep) + setNames(x, new_names) +} - # The order vector defines the order in which make.unique() should process the - # entries. Blanks are initialized with the prefix. The index of the first - # blank entry appears twice in this vector if there's no column named like the - # prefix, to make sure that blank columns always start with V1 (or a higher - # index if appropriate). See also the "pathological cases" test. 
- order <- c( - which(!blanks), - if (all(xnames[!blanks] != prefix) && any(blanks)) - which.max(blanks), - which(blanks)) - xnames[blanks] <- prefix - xnames[order] <- make.unique(xnames[order], sep = sep) +make_unique <- function(x, prefix = "V", sep = "") { + blank <- x == "" - names(x) <- xnames - x -} + # Ensure existing names are unique + x[!blank] <- make.unique(x[!blank], sep = sep) -init_names <- function(x) { - xnames <- names(x) - if (is.null(xnames)) - rep("", length(x)) - else - ifelse(is.na(xnames), "", trim_ws(xnames)) + # Replace blank names + new_vars <- setdiff(paste(prefix, seq_along(x), sep = sep), x) + x[blank] <- new_vars[seq_len(sum(blank))] + + x } diff --git a/R/rownames.R b/R/rownames.R index f9b446b11..daa7ca310 100644 --- a/R/rownames.R +++ b/R/rownames.R @@ -1,39 +1,45 @@ -#' Row names +#' Tools for working with row names #' -#' \code{has_rownames} checks if a data frame has row names. -#' @param df Input data frame -#' @export -#' @rdname rownames +#' Generally, it is best to avoid row names, because they are basically a +#' character column with different semantics to every other column. These +#' functions allow you to detect if a data frame has row names +#' (\code{has_rownames}), remove them (\code{remove_rownames}), or convert +#' them back-and-forth between an explicit column (\code{rownames_to_column}, +#' and \code{column_to_rownames}). +#' +#' @param df A data frame +#' @param var Name of column to use for rownames. #' @examples #' has_rownames(mtcars) #' has_rownames(iris) +#' has_rownames(remove_rownames(mtcars)) +#' +#' head(rownames_to_column(mtcars)) +#' +#' mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) +#' mtcars_tbl +#' column_to_rownames(mtcars_tbl) +#' @name rownames +NULL + + +#' @export +#' @rdname rownames has_rownames <- function(df) { stopifnot(is.data.frame(df)) .row_names_info(df) > 0L } -#' \code{remove_rownames} removes all row names.
#' @export #' @rdname rownames -#' @examples -#' rownames(remove_rownames(mtcars)) remove_rownames <- function(df) { stopifnot(is.data.frame(df)) rownames(df) <- NULL df } -#' \code{rownames_to_column} convert row names to an explicit variable. -#' -#' @param var Name of variable to use #' @export #' @rdname rownames -#' @importFrom stats setNames -#' @examples -#' rownames_to_column(mtcars) -#' -#' mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) -#' mtcars_tbl rownames_to_column <- function(df, var = "rowname") { stopifnot(is.data.frame(df)) @@ -53,14 +59,8 @@ rownames_to_column <- function(df, var = "rowname") { new_df } -#' \code{column_to_rownames} convert a column variable to row names. This is an -#' inverted operation of \code{rownames_to_column}. -#' #' @rdname rownames #' @export -#' @examples -#' -#' column_to_rownames(mtcars_tbl) column_to_rownames <- function(df, var = "rowname") { stopifnot(is.data.frame(df)) diff --git a/R/src-local.r b/R/src-local.r deleted file mode 100644 index e69de29bb..000000000 diff --git a/R/src.r b/R/src.r deleted file mode 100644 index e69de29bb..000000000 diff --git a/R/tbl-df.r b/R/tbl-df.r index acbfe082d..3116a6c66 100644 --- a/R/tbl-df.r +++ b/R/tbl-df.r @@ -1,4 +1,4 @@ -#' Create a data frame tbl. +#' S3 class: tbl_df #' #' A data frame tbl wraps a local data frame. The main advantage to using #' a \code{tbl_df} over a regular data frame is the printing: @@ -16,25 +16,15 @@ #' \item{\code{[[}, \code{$}}{Calls \code{\link{.subset2}} directly, #' so is considerably faster. 
Throws error if column does not exist.} #' } -#' -#' #' @export #' @param data a data frame -#' @examples -#' ds <- tbl_df(mtcars) -#' ds -#' as.data.frame(ds) -#' -#' if (require("Lahman")) { -#' batting <- tbl_df(Batting) -#' dim(batting) -#' colnames(batting) -#' head(batting) -#' } +#' @keywords internal tbl_df <- function(data) { as_data_frame(data) } +methods::setOldClass(c("tbl_df", "tbl", "data.frame")) + # Standard data frame methods -------------------------------------------------- #' @export @@ -61,8 +51,9 @@ print.tbl_df <- function(x, ..., n = NULL, width = NULL) { #' @export `[[.tbl_df` <- function(x, i, exact = TRUE) { - if (is.character(i) && length(i) == 1L && !(i %in% names(x))) - stop("Unknown name", call. = FALSE) + if (is.character(i) && length(i) == 1L && !(i %in% names(x))) { + stop("Unknown column '", i, "'", call. = FALSE) + } if (!exact) { warning("exact ignored", call. = FALSE) } @@ -72,8 +63,9 @@ print.tbl_df <- function(x, ..., n = NULL, width = NULL) { #' @export `$.tbl_df` <- function(x, i) { - if (is.character(i) && !(i %in% names(x))) - stop("Unknown name", call. = FALSE) + if (is.character(i) && !(i %in% names(x))) { + stop("Unknown column '", i, "'", call. = FALSE) + } .subset2(x, i) } diff --git a/R/type-sum.r b/R/type-sum.r index d67e347ee..bd94a2e8a 100644 --- a/R/type-sum.r +++ b/R/type-sum.r @@ -1,35 +1,29 @@ -#' Provide a succinct summary of a type +#' Provide a succinct summary of an object #' -#' All methods should return a string with four or less characters, suitable -#' for succinctly display column types. +#' \code{type_sum} gives a brief summary of object type. Objects that commonly +#' occur in a data frame should return a string with four or less characters. +#' \code{obj_sum} also includes the size of the object if \code{is_vector_s3} +#' is \code{TRUE}. #' #' @param x an object to summarise. Generally only methods of atomic vectors #' and variants have been implemented.
#' @keywords internal -#' @export #' @examples -#' type_sum(1:10) -#' type_sum(matrix(1:10)) -#' type_sum(Sys.Date()) -#' type_sum(Sys.time()) -#' type_sum(mean) -type_sum <- function(x) UseMethod("type_sum") - +#' obj_sum(1:10) +#' obj_sum(matrix(1:10)) +#' obj_sum(Sys.Date()) +#' obj_sum(Sys.time()) +#' obj_sum(mean) #' @export -type_sum.data.frame <- function(x) { - if (length(x) == 0) return(character(0)) - - vapply(x, type_sum, character(1)) +obj_sum <- function(x) UseMethod("obj_sum") +#' @export +obj_sum.default <- function(x) { + paste0(type_sum(x), if (is_vector_s3(x)) size_sum(x)) } #' @export -type_sum.numeric <- function(x) "dbl" -#' @export -type_sum.integer <- function(x) "int" -#' @export -type_sum.logical <- function(x) "lgl" -#' @export -type_sum.character <- function(x) "chr" +#' @rdname obj_sum +type_sum <- function(x) UseMethod("type_sum") #' @export type_sum.factor <- function(x) "fctr" @@ -37,13 +31,45 @@ type_sum.factor <- function(x) "fctr" type_sum.POSIXt <- function(x) "time" #' @export type_sum.Date <- function(x) "date" - #' @export -type_sum.matrix <- function(x) { - paste0(NextMethod(), "[", paste0(dim(x), collapse = ","), "]") -} +type_sum.data.frame <- function(x) class(x)[[1]] #' @export -type_sum.array <- type_sum.matrix +type_sum.default <- function(x) { + if (!is.object(x)) { + switch(typeof(x), + logical = "lgl", + integer = "int", + double = "dbl", + character = "chr", + complex = "cplx", + closure = "fun", + environment = "env", + typeof(x) + ) + } else if (!isS4(x)) { + paste0("S3: ", paste0(class(x), collapse = "/")) + } else { + paste0("S4: ", methods::is(x)[[1]]) + } +} + +size_sum <- function(x) { + if (!is_vector(x)) return("") + + dim <- dim(x) %||% length(x) + paste0(" [", paste0(dim, collapse = ","), "]" ) +} #' @export -type_sum.default <- function(x) unname(abbreviate(class(x)[1], 4)) +#' @rdname obj_sum +is_vector_s3 <- function(x) UseMethod("is_vector_s3") +#' @export +is_vector_s3.factor <- function(x) TRUE +#' 
@export +is_vector_s3.Date <- function(x) TRUE +#' @export +is_vector_s3.POSIXct <- function(x) TRUE +#' @export +is_vector_s3.data.frame <- function(x) TRUE +#' @export +is_vector_s3.default <- function(x) !is.object(x) && is_vector(x) diff --git a/R/utils-format.r b/R/utils-format.r index bbbd7d7d9..0de5430d1 100644 --- a/R/utils-format.r +++ b/R/utils-format.r @@ -74,9 +74,12 @@ shrink_mat <- function(df, width, n_extra, var_names, var_types, rows, n) { } # List columns need special treatment because format can't be trusted - classes <- paste0("(", vapply(df, type_sum, character(1)), ")") + classes <- paste0("<", vapply(df, type_sum, character(1)), ">") is_list <- vapply(df, is.list, logical(1)) - df[is_list] <- lapply(df[is_list], function(x) vapply(x, obj_type, character(1))) + df[is_list] <- lapply(df[is_list], function(x) { + summary <- vapply(x, obj_sum, character(1)) + paste0("<", summary, ">") + }) mat <- format(df, justify = "left") values <- c(format(rownames(mat))[[1]], unlist(mat[1, ])) @@ -133,7 +136,7 @@ print.trunc_mat <- function(x, ...) 
{ } if (length(x$extra) > 0) { - var_types <- paste0(names(x$extra), " (", x$extra, ")", collapse = ", ") + var_types <- paste0(names(x$extra), " <", x$extra, ">", collapse = ", ") cat(wrap("Variables not shown: ", var_types, width = x$width), ".\n", sep = "") } @@ -147,7 +150,7 @@ knit_print.trunc_mat <- function(x, options) { kable <- knitr::kable(x$table, row.names = FALSE) if (length(x$extra) > 0) { - var_types <- paste0(names(x$extra), " (", x$extra, ")", collapse = ", ") + var_types <- paste0(names(x$extra), " <", x$extra, ">", collapse = ", ") extra <- wrap("\n(_Variables not shown_: ", var_types, ")", width = x$width) } else { extra <- "\n" @@ -165,28 +168,7 @@ wrap <- function(..., indent = 0, width) { paste0(wrapped, collapse = "\n") } -obj_type <- function(x) UseMethod("obj_type") -#' @export -obj_type.NULL <- function(x) "" -#' @export -obj_type.default <- function(x) { - if (!is.object(x)) { - paste0("<", type_sum(x), if (!is.array(x)) paste0("[", length(x), "]"), ">") - } else if (!isS4(x)) { - paste0("") - } else { - paste0("") - } -} -#' @export -obj_type.data.frame <- function(x) { - paste0("<", class(x)[1], " [", paste0(dim(x), collapse = ","), "]", ">") -} -#' @export -obj_type.data_frame <- function(x) { - paste0("") -} # function for the thousand separator, # returns "," unless it's used for the decimal point, in which case returns "." @@ -194,6 +176,3 @@ big_mark <- function(x, ...) { mark <- if (identical(getOption("OutDec"), ",")) "." else "," formatC(x, big.mark = mark, ...) 
} - -# trimws() is not available in R 3.1.3 -trim_ws <- function(x) gsub("^ *(|(.*[^ ])) *$", "\\1", x) diff --git a/R/utils.r b/R/utils.r index acf7b1f1d..53c2952f9 100644 --- a/R/utils.r +++ b/R/utils.r @@ -1,5 +1,10 @@ names2 <- function(x) { - names(x) %||% rep("", length(x)) + xnames <- names(x) + if (is.null(xnames)) { + rep("", length(x)) + } else { + ifelse(is.na(xnames), "", xnames) + } } "%||%" <- function(x, y) { @@ -11,6 +16,10 @@ is_atomic <- function(x) { is.atomic(x) && !is.null(x) } +is_vector <- function(x) { + is_atomic(x) || is.list(x) +} + is_1d <- function(x) { # dimension check is for matrices and data.frames (is_atomic(x) || is.list(x)) && length(dim(x)) <= 1 diff --git a/README.Rmd b/README.Rmd index 02b338b7f..179edba94 100644 --- a/README.Rmd +++ b/README.Rmd @@ -1,7 +1,5 @@ --- -output: - md_document: - variant: markdown_github +output: github_document --- @@ -14,14 +12,56 @@ knitr::opts_chunk$set( ) ``` -# tibble [![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) +# tibble -Data frames in `dplyr` style. +[![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) +[![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) +[![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) + +tibble extracts the idea of a __data_frame__ (aka a tibble diff, or tibble for short) from dplyr. 
As the name suggests a __data_frame__ is a modern reimagining of a data.frame, keeping what time has proven to be effective, and throwing out what is not. In spoken language it's hard to tell the difference between a `data.frame` and `data_frame` so we call the new style tibble dfs (inspired by `dplyr::tbl_df()`), or just tibbles for short. + +## Creating tibbles + +You can create a tibble from an existing object with `as_data_frame()`: ```{r} library(tibble) -tbl_df(iris) -glimpse(iris) -head(rownames_to_column(mtcars, "model")) -trunc_mat(iris) +as_data_frame(iris) +``` + +You can create a new tibble from vectors that represent the columns with `data_frame()`: + +```{r} +data_frame(x = 1:5, y = 1, z = x ^ 2 + y) +``` + +`data_frame()` does much less than `data.frame()`: it never changes the type of the inputs (e.g. it never converts strings to factors!), it never changes the names of variables, and it never creates `row.names()`. You can read more about these features in the vignette, `vignette("tibble")`. + +You can define a tibble row-by-row with `frame_data()`: + +```{r} +frame_data( + ~x, ~y, ~z, + "a", 2, 3.6, + "b", 1, 8.5 +) +``` + +## Tibbles vs data frames + +There are two main differences in the usage of a data frame vs a tibble: printing, and subsetting. + +Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. Each column gives both the name and its type. This makes it much easier to work with large data: + +```{r} +library(nycflights13) +flights +``` + +Tibbles are strict about subsetting. If you try and access a variable that does not exist, you'll get an error: + +```{r, error = TRUE} +flights$yea ``` + +Tibbles clearly delineate `[` and `[[`: `[` always returns another tibble, `[[` always returns a vector.
diff --git a/README.md b/README.md index a36c07ab2..47c69bbf3 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,20 @@ + -tibble [![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) -===================================================================================================================================================================================================================================================================================================================================================================================================================================== +tibble +====== + +[![Build Status](https://travis-ci.org/hadley/tibble.svg?branch=master)](https://travis-ci.org/hadley/tibble) [![AppVeyor Build Status](https://ci.appveyor.com/api/projects/status/github/hadley/tibble?branch=master&svg=true)](https://ci.appveyor.com/project/hadley/tibble) [![Coverage Status](https://img.shields.io/codecov/c/github/hadley/tibble/master.svg)](https://codecov.io/github/hadley/tibble?branch=master) + +tibble extracts the idea of a **data\_frame** (aka a tibble diff, or tibble for short) from dplyr. As the name suggests a **data\_frame** is a modern reimagining of a data.frame, keeping what time has proven to be effective, and throwing out what is not. In spoken language it's hard to tell the difference between a `data.frame` and `data_frame` so we call the new style tibble dfs (inspired by `dplyr::tbl_df()`), or just tibbles for short. + +Creating tibbles +---------------- -Data frames in `dplyr` style. 
+You can create a tibble from an existing object with `as_data_frame()`: ``` r library(tibble) -tbl_df(iris) +as_data_frame(iris) #> Source: local data frame [150 x 5] #> #> Sepal.Length Sepal.Width Petal.Length Petal.Width Species @@ -22,36 +30,75 @@ tbl_df(iris) #> 9 4.4 2.9 1.4 0.2 setosa #> 10 4.9 3.1 1.5 0.1 setosa #> .. ... ... ... ... ... -glimpse(iris) -#> Observations: 150 -#> Variables: 5 -#> $ Sepal.Length (dbl) 5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.6, 5.0, 4.4, 4.9,... -#> $ Sepal.Width (dbl) 3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 3.4, 2.9, 3.1,... -#> $ Petal.Length (dbl) 1.4, 1.4, 1.3, 1.5, 1.4, 1.7, 1.4, 1.5, 1.4, 1.5,... -#> $ Petal.Width (dbl) 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.3, 0.2, 0.2, 0.1,... -#> $ Species (fctr) setosa, setosa, setosa, setosa, setosa, setosa, ... -head(rownames_to_column(mtcars, "model")) -#> model mpg cyl disp hp drat wt qsec vs am gear carb -#> 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 -#> 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 -#> 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 -#> 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 -#> 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 -#> 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 -trunc_mat(iris) ``` -| Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species | -|:-------------|:------------|:-------------|:------------|:--------| -| (dbl) | (dbl) | (dbl) | (dbl) | (fctr) | -| 5.1 | 3.5 | 1.4 | 0.2 | setosa | -| 4.9 | 3.0 | 1.4 | 0.2 | setosa | -| 4.7 | 3.2 | 1.3 | 0.2 | setosa | -| 4.6 | 3.1 | 1.5 | 0.2 | setosa | -| 5.0 | 3.6 | 1.4 | 0.2 | setosa | -| 5.4 | 3.9 | 1.7 | 0.4 | setosa | -| 4.6 | 3.4 | 1.4 | 0.3 | setosa | -| 5.0 | 3.4 | 1.5 | 0.2 | setosa | -| 4.4 | 2.9 | 1.4 | 0.2 | setosa | -| 4.9 | 3.1 | 1.5 | 0.1 | setosa | -| ... | ... | ... | ... | ... 
| +You can create a new tibble from vectors that represent the columns with `data_frame()`: + +``` r +data_frame(x = 1:5, y = 1, z = x ^ 2 + y) +#> Source: local data frame [5 x 3] +#> +#> x y z +#> (int) (dbl) (dbl) +#> 1 1 1 2 +#> 2 2 1 5 +#> 3 3 1 10 +#> 4 4 1 17 +#> 5 5 1 26 +``` + +`data_frame()` does much less than `data.frame()`: it never changes the type of the inputs (e.g. it never converts strings to factors!), it never changes the names of variables, and it never creates `row.names()`. You can read more about these features in the vignette, `vignette("tibble")`. + +You can define a tibble row-by-row with `frame_data()`: + +``` r +frame_data( + ~x, ~y, ~z, + "a", 2, 3.6, + "b", 1, 8.5 +) +#> Source: local data frame [2 x 3] +#> +#> x y z +#> (chr) (dbl) (dbl) +#> 1 a 2 3.6 +#> 2 b 1 8.5 +``` + +Tibbles vs data frames +---------------------- + +There are two main differences in the usage of a data frame vs a tibble: printing, and subsetting. + +Tibbles have a refined print method that shows only the first 10 rows, and all the columns that fit on screen. Each column gives both the name and its type. This makes it much easier to work with large data: + +``` r +library(nycflights13) +flights +#> Source: local data frame [336,776 x 16] +#> +#> year month day dep_time dep_delay arr_time arr_delay carrier tailnum +#> (int) (int) (int) (int) (dbl) (int) (dbl) (chr) (chr) +#> 1 2013 1 1 517 2 830 11 UA N14228 +#> 2 2013 1 1 533 4 850 20 UA N24211 +#> 3 2013 1 1 542 2 923 33 AA N619AA +#> 4 2013 1 1 544 -1 1004 -18 B6 N804JB +#> 5 2013 1 1 554 -6 812 -25 DL N668DN +#> 6 2013 1 1 554 -4 740 12 UA N39463 +#> 7 2013 1 1 555 -5 913 19 B6 N516JB +#> 8 2013 1 1 557 -3 709 -14 EV N829AS +#> 9 2013 1 1 557 -3 838 -8 B6 N593JB +#> 10 2013 1 1 558 -2 753 8 AA N3ALAA +#> .. ... ... ... ... ... ... ... ... ... +#> Variables not shown: flight (int), origin (chr), dest (chr), air_time +#> (dbl), distance (dbl), hour (dbl), minute (dbl).
+``` + +Tibbles are strict about subsetting. If you try and access a variable that does not exist, you'll get an error: + +``` r +flights$yea +#> Error: Unknown column 'yea' +``` + +Tibbles clearly delineate `[` and `[[`: `[` always returns another tibble, `[[` always returns a vector. diff --git a/man/frame_data.Rd b/man/frame_data.Rd index 7e4ec1cf8..0b2a6fe45 100644 --- a/man/frame_data.Rd +++ b/man/frame_data.Rd @@ -3,17 +3,23 @@ \name{frame_data} \alias{frame_data} \alias{tibble} -\title{Row-wise data_frame creation} +\title{Row-wise tibble creation} \usage{ frame_data(...) tibble(...) } \arguments{ -\item{...}{Arguments specifying the structure of a \code{data_frame}.} +\item{...}{Arguments specifying the structure of a \code{data_frame}. +Variable names should be formulas, and may only appear before the data.} +} +\value{ +A \code{\link{tbl_df}}. } \description{ -Create a row-wise \code{\link{data_frame}}. +Create \code{\link{data_frame}}s laying out the data in rows, rather than +in columns. This is useful for small tables of data where readability is +important. } \examples{ frame_data( @@ -22,5 +28,13 @@ frame_data( "b", 2, "c", 3 ) + +# frame_data will create a list column if the value in each cell is +# not a scalar +frame_data( + ~x, ~y, + "a", 1:3, + "b", 4:6 +) } diff --git a/man/obj_sum.Rd b/man/obj_sum.Rd new file mode 100644 index 000000000..27826da58 --- /dev/null +++ b/man/obj_sum.Rd @@ -0,0 +1,33 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/type-sum.r +\name{obj_sum} +\alias{is_vector_s3} +\alias{obj_sum} +\alias{type_sum} +\title{Provide a succinct summary of an object} +\usage{ +obj_sum(x) + +type_sum(x) + +is_vector_s3(x) +} +\arguments{ +\item{x}{an object to summarise. Generally only methods of atomic vectors +and variants have been implemented.} +} +\description{ +\code{type_sum} gives a brief summary of object type.
Objects that commonly +occur in a data frame should return a string with four or less characters. +\code{obj_sum} also includes the size of the object if \code{is_s3_vector} +is \code{TRUE}. +} +\examples{ +obj_sum(1:10) +obj_sum(matrix(1:10)) +obj_sum(Sys.Date()) +obj_sum(Sys.time()) +obj_sum(mean) +} +\keyword{internal} + diff --git a/man/repair_names.Rd b/man/repair_names.Rd index 7ef434a5f..bae3e9814 100644 --- a/man/repair_names.Rd +++ b/man/repair_names.Rd @@ -11,7 +11,7 @@ repair_names(x, prefix = "V", sep = "") \item{prefix}{A string, the prefix to use for new column names.} -\item{sep}{A string, inserted between the column name and de-duplicating +\item{sep}{A string inserted between the column name and de-duplicating number.} } \value{ @@ -19,8 +19,8 @@ number.} } \description{ \code{repair_names} ensures its input has non-missing and -unique names. It also strips any leading or trailing spaces. -Valid names are left as is. +unique names (duplicated names get a numeric suffix). Valid names are +left as is. } \examples{ repair_names(list(3, 4, 5)) # works for lists, too diff --git a/man/rownames.Rd b/man/rownames.Rd index c4ae6180c..0dc9597cf 100644 --- a/man/rownames.Rd +++ b/man/rownames.Rd @@ -1,11 +1,12 @@ % Generated by roxygen2: do not edit by hand % Please edit documentation in R/rownames.R -\name{has_rownames} +\name{rownames} \alias{column_to_rownames} \alias{has_rownames} \alias{remove_rownames} +\alias{rownames} \alias{rownames_to_column} -\title{Row names} +\title{Tools for working with row names} \usage{ has_rownames(df) @@ -16,29 +17,27 @@ rownames_to_column(df, var = "rowname") column_to_rownames(df, var = "rowname") } \arguments{ -\item{df}{Input data frame} +\item{df}{A data frame} -\item{var}{Name of variable to use} +\item{var}{Name of column to use for rownames.} } \description{ -\code{has_rownames} checks if a data frame has row names. - -\code{remove_rownames} removes all row names. 
- -\code{rownames_to_column} convert row names to an explicit variable. - -\code{column_to_rownames} convert a column variable to row names. This is an -inverted operation of \code{rownames_to_column}. +Generally, it is best to avoid row names, because they are basically a +character column with different semantics to every other column. These +functions allow to you detect if a data frame has row names +(\code{has_rownames}), remove them (\code{remove_rownames}), or convert +them back-and-forth between an explicit column (\code{rownames_to_column}, +and \code{column_to_rownames}). } \examples{ has_rownames(mtcars) has_rownames(iris) -rownames(remove_rownames(mtcars)) -rownames_to_column(mtcars) +has_rownames(remove_rownames(mtcars)) + +head(rownames_to_column(mtcars)) mtcars_tbl <- rownames_to_column(tbl_df(mtcars)) mtcars_tbl - column_to_rownames(mtcars_tbl) } diff --git a/man/tbl_df.Rd b/man/tbl_df.Rd index 36dced2a2..9abd40fa3 100644 --- a/man/tbl_df.Rd +++ b/man/tbl_df.Rd @@ -2,7 +2,7 @@ % Please edit documentation in R/tbl-df.r \name{tbl_df} \alias{tbl_df} -\title{Create a data frame tbl.} +\title{S3 class: tbl_df} \usage{ tbl_df(data) } @@ -28,16 +28,5 @@ screen, describing the rest of it as text. so is considerably faster. Throws error if column does not exist.} } } -\examples{ -ds <- tbl_df(mtcars) -ds -as.data.frame(ds) - -if (require("Lahman")) { -batting <- tbl_df(Batting) -dim(batting) -colnames(batting) -head(batting) -} -} +\keyword{internal} diff --git a/man/type_sum.Rd b/man/type_sum.Rd deleted file mode 100644 index b661322fc..000000000 --- a/man/type_sum.Rd +++ /dev/null @@ -1,25 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/type-sum.r -\name{type_sum} -\alias{type_sum} -\title{Provide a succinct summary of a type} -\usage{ -type_sum(x) -} -\arguments{ -\item{x}{an object to summarise. 
Generally only methods of atomic vectors -and variants have been implemented.} -} -\description{ -All methods should return a string with four or less characters, suitable -for succinctly display column types. -} -\examples{ -type_sum(1:10) -type_sum(matrix(1:10)) -type_sum(Sys.Date()) -type_sum(Sys.time()) -type_sum(mean) -} -\keyword{internal} - diff --git a/tests/testthat/test-frame-data.R b/tests/testthat/test-frame-data.R index 3febd649d..c5e79b290 100644 --- a/tests/testthat/test-frame-data.R +++ b/tests/testthat/test-frame-data.R @@ -11,8 +11,6 @@ test_that("frame_data() constructs 'data_frame' as expected", { compared <- data_frame(colA = c("a", "b"), colB = c(1, 2)) expect_equal(result, compared) - expect_identical(frame_data(~a, ~b), data_frame()) - ## wide wide <- frame_data( ~colA, ~colB, ~colC, ~colD, @@ -92,3 +90,8 @@ test_that("frame_data can have list columns", { expect_equal(df$x, c(1, 2)) expect_equal(df$y, list(list(a = 1), list(b = 2))) }) + +test_that("frame_data creates n-col empty data frame", { + df <- frame_data(~x, ~y) + expect_equal(names(df), c("x", "y")) +}) diff --git a/tests/testthat/test-obj-sum.R b/tests/testthat/test-obj-sum.R new file mode 100644 index 000000000..8ac4d03cd --- /dev/null +++ b/tests/testthat/test-obj-sum.R @@ -0,0 +1,35 @@ +context("obj_sum") + +# obj_sum ---------------------------------------------------------------- + +test_that("shows only first class name for S4", { + A <- methods::setClass("A") + expect_equal(obj_sum(A), "S4: classGeneratorFunction") +}) + +test_that("NULL handled specially", { + expect_equal(obj_sum(NULL), "NULL") +}) + +test_that("data frame includes rows and cols", { + expect_equal(obj_sum(mtcars), "data.frame [32,11]") +}) + +test_that("S3 others list all classes", { + x <- structure(list(), class = c("a", "b", "c")) + expect_equal(obj_sum(x), "S3: a/b/c") +}) + +test_that("common data vectors treated as atomic", { + expect_equal(obj_sum(factor(1:3)), "fctr [3]") + 
expect_equal(obj_sum(Sys.Date() + 1:3), "date [3]") + expect_equal(obj_sum(Sys.time() + 1:3), "time [3]") +}) + + +# type_sum ---------------------------------------------------------------- + +test_that("less common objects get abbreviations", { + expect_equal(type_sum(environment()), "env") + expect_equal(type_sum(environment), "fun") +}) diff --git a/tests/testthat/test-repair_names.R b/tests/testthat/test-repair_names.R index 199917676..49efdd289 100644 --- a/tests/testthat/test-repair_names.R +++ b/tests/testthat/test-repair_names.R @@ -1,87 +1,42 @@ context("repair_names") -test_that("trim_ws", { - expect_equal(trim_ws(" a"), "a") - expect_equal(trim_ws("a "), "a") - expect_equal(trim_ws(" a "), "a") +test_that("zero-length inputs given character names", { + out <- repair_names(character()) + expect_equal(names(out), character()) }) -test_that("repair missing column names", { - dat <- data.frame(a = 1, b = 2, c = 3) - colnames(dat)[2] <- NA - - # ensure we start with a "bad" state - expect_true(any(is.na(colnames(dat)))) +test_that("unnamed input gives uniquely named output", { + out <- repair_names(1:3) + expect_equal(names(out), c("V1", "V2", "V3")) +}) - fixed_dat <- repair_names(dat) - fixed_names <- colnames(fixed_dat) - # no repeats - expect_false(any(table(fixed_names) > 1)) +# make_unique ------------------------------------------------------------- - # ensure all valid column names are retained - expect_equal(length(setdiff(Filter(function(a) ! 
(is.na(a) | a == ''), - colnames(dat)), - fixed_names)), 0) +test_that("duplicates are de-duped", { + expect_equal(make_unique(c("x", "x")), c("x", "x1")) }) -test_that("repair various name problems", { - combos <- list(Null = NULL, - Empty = c('', '', ''), - Spaces = c('a', 'b', ' '), - EmptyWithNA = c('', NA, NA), - Dup1 = c('a', 'a', 'b'), - Evil1 = c('a', 'a ', 'a1'), - OneNA = c('a', 'b', NA), - Missing2 = c('', '', 'b'), - Vnames1 = c('V1', '', ''), - Vnames2 = c('V2', ' ', ''), - Vnames3 = c('V1', '', 'a'), - VnamesDup1 = c('V1', ' V1 ', 'c'), - VnamesDup2 = c(' V1', 'V1', '') - ) - for (combo_name in names(combos)) { - dat <- data.frame(a = 1, b = 2, c = 3) - colnames(dat) <- combos[[ combo_name ]] +test_that("blanks get prefix + numeric id", { + expect_equal(make_unique(c("", "")), c("V1", "V2")) +}) - # ensure we start with a "bad" state - old_names <- colnames(dat) - if (!is.null(old_names)) - old_names <- trim_ws(old_names) - expect_true(is.null(old_names) || - any(table(old_names) > 1) || - any(old_names == '' | is.na(old_names)) || - any(grepl('^ +| +$', old_names)), - info = combo_name) +test_that("blanks skip existing names", { + expect_equal(make_unique(c("", "V1")), c("V2", "V1")) +}) - fixed_dat <- repair_names(dat) - fixed_names <- colnames(fixed_dat) +test_that("blanks skip names created when de-duping", { + expect_equal(make_unique(c("", "V", "V")), c("V2", "V", "V1")) +}) - # no repeats - expect_false(any(table(fixed_names) > 1), info = combo_name) +# names2 ------------------------------------------------------------------ - # ensure all valid column names are retained - if (! is.null(old_names)) { - valid <- ! is.na(old_names) & old_names != '' & - !
duplicated(old_names) - expect_equal(fixed_names[valid], old_names[valid]) - } - } +test_that("names2 returns character vector even if names NULL", { + expect_equal(names2(1:3), rep("", 3)) }) -test_that("check pathological cases", { - df <- data.frame() - expect_identical(repair_names(df), df) - df <- data.frame(row.names = 1:3) - expect_identical(repair_names(df), df) - l <- list(3, 4, 5) - expect_identical(repair_names(l), setNames(l, paste0("V", 1:3))) - l <- list(V = 3, W = 4, 5) - expect_identical(repair_names(l), setNames(l, c("V", "W", "V1"))) -}) +test_that("names2 replaces missing value with blanks", { + x <- 1:3 + names(x) <- c("a", "b", NA) -test_that("check object class", { - expect_equal(class(iris), class(repair_names(iris))) - expect_equal(class(tbl_df(iris)), class(repair_names(tbl_df(iris)))) - expect_equal(class(repair_names(1:10)), "integer") - expect_error(repair_names(cat), "non-vector") + expect_equal(names2(x), c("a", "b", "")) }) diff --git a/tests/testthat/test-tbl-df.r b/tests/testthat/test-tbl-df.r index bb727a660..8dd2f158b 100644 --- a/tests/testthat/test-tbl-df.r +++ b/tests/testthat/test-tbl-df.r @@ -1,5 +1,8 @@ context("tbl_df") + +# [ ----------------------------------------------------------------------- + test_that("[ never drops", { mtcars2 <- tbl_df(mtcars) expect_is(mtcars2[, 1], "data.frame") @@ -32,6 +35,9 @@ test_that("[.tbl_df is careful about names (#1245)",{ expect_error( foo[, c("x", "y", "z") ] ) }) + +# [[ ---------------------------------------------------------------------- + test_that("[[.tbl_df ignores exact argument",{ foo <- data_frame(x = 1:10, y = 1:10) expect_warning(foo[["x"]], NA) @@ -44,3 +50,15 @@ test_that("can use recursive indexing with [[", { expect_equal(foo[[c(1, 1)]], 1:3) expect_equal(foo[[c("x", "y")]], 1:3) }) + +test_that("[[ throws error if name doesn't exist", { + df <- data_frame(x = 1) + expect_error(df[["y"]], "Unknown column 'y'") +}) + +# $ 
----------------------------------------------------------------------- + +test_that("$ throws error if name doesn't exist", { + df <- data_frame(x = 1) + expect_error(df$y, "Unknown column 'y'") +}) diff --git a/tests/testthat/test-trunc-mat.r b/tests/testthat/test-trunc-mat.r index 6efbd1dea..3c36d8350 100644 --- a/tests/testthat/test-trunc-mat.r +++ b/tests/testthat/test-trunc-mat.r @@ -5,7 +5,7 @@ test_that("trunc_mat output matches known output", { capture.output(print(tbl_df(mtcars), n = 8L, width = 30L)), c("Source: local data frame [32 x 11]", "", " mpg cyl disp hp", - " (dbl) (dbl) (dbl) (dbl)", + " ", "1 21.0 6 160.0 110", "2 21.0 6 160.0 110", "3 22.8 4 108.0 93", @@ -16,9 +16,9 @@ test_that("trunc_mat output matches known output", { "8 24.4 4 146.7 62", ".. ... ... ... ...", "Variables not shown: drat", - " (dbl), wt (dbl), qsec", - " (dbl), vs (dbl), am (dbl),", - " gear (dbl), carb (dbl)." + " , wt , qsec", + " , vs , am ,", + " gear , carb ." ) ) @@ -26,7 +26,7 @@ test_that("trunc_mat output matches known output", { capture.output(print(tbl_df(iris), n = 5L, width = 30L)), c("Source: local data frame [150 x 5]", "", " Sepal.Length Sepal.Width", - " (dbl) (dbl)", + " ", "1 5.1 3.5", "2 4.9 3.0", "3 4.7 3.2", @@ -34,15 +34,15 @@ test_that("trunc_mat output matches known output", { "5 5.0 3.6", ".. ...
...", "Variables not shown:", - " Petal.Length (dbl),", - " Petal.Width (dbl), Species", - " (fctr).")) + " Petal.Length ,", + " Petal.Width , Species", + " .")) expect_identical( capture.output(print(tbl_df(iris), n = 3L, width = 5L))[1:8], c("Source: local data frame [150 x 5]", "", " Sepal.Length", - " (dbl)", + " ", "1 5.1", "2 4.9", "3 4.7", @@ -52,20 +52,20 @@ test_that("trunc_mat output matches known output", { capture.output(print(df_all, n = NULL, width = 30L)), c("Source: local data frame [2 x 8]", "", " a b c d", - " (dbl) (int) (lgl) (chr)", + " ", "1 1.0 1 TRUE a", "2 2.5 2 FALSE b", "Variables not shown: e", - " (fctr), f (date), g (time),", - " h (list).")) + " , f , g