From 92cf882a731cc063bd63bb51625a9b8eadcc89ae Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Wed, 19 Nov 2025 17:08:14 -0500 Subject: [PATCH 01/14] Implement `filter_out()` --- NAMESPACE | 2 + R/data-mask.R | 3 +- R/filter.R | 91 ++++++++++++++++++++++++++++----- man/filter.Rd | 3 ++ src/dplyr.h | 2 +- src/filter.cpp | 7 +++ src/init.cpp | 2 +- tests/testthat/_snaps/filter.md | 4 +- 8 files changed, 95 insertions(+), 19 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index b898e342cb..c26586125c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -61,6 +61,7 @@ S3method(filter_,data.frame) S3method(filter_,tbl_df) S3method(filter_bullets,"dplyr:::filter_incompatible_size") S3method(filter_bullets,"dplyr:::filter_incompatible_type") +S3method(filter_out,data.frame) S3method(full_join,data.frame) S3method(group_by,data.frame) S3method(group_by_,data.frame) @@ -281,6 +282,7 @@ export(filter_) export(filter_all) export(filter_at) export(filter_if) +export(filter_out) export(first) export(full_join) export(funs) diff --git a/R/data-mask.R b/R/data-mask.R index 56c2353d5c..5c7eca71a6 100644 --- a/R/data-mask.R +++ b/R/data-mask.R @@ -116,11 +116,12 @@ DataMask <- R6Class( eval() }, - eval_all_filter = function(quos, env_filter) { + eval_all_filter = function(quos, invert, env_filter) { eval <- function() { .Call( `dplyr_mask_eval_all_filter`, quos, + invert, private, private$size, env_filter diff --git a/R/filter.R b/R/filter.R index 269c74971c..8b38e1b091 100644 --- a/R/filter.R +++ b/R/filter.R @@ -72,7 +72,9 @@ #' #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. 
-#' @export +#' +#' @name filter +#' #' @examples #' # Filtering by one criterion #' filter(starwars, species == "Human") @@ -107,31 +109,76 @@ #' .data[[vars[[2]]]] > cond[[2]] #' ) #' # Learn more in ?rlang::args_data_masking +NULL + +#' @rdname filter +#' @export filter <- function(.data, ..., .by = NULL, .preserve = FALSE) { check_by_typo(...) - - by <- enquo(.by) - - if (!quo_is_null(by) && !is_false(.preserve)) { - abort("Can't supply both `.by` and `.preserve`.") - } - + check_not_both_by_and_preserve({{ .by }}, .preserve) UseMethod("filter") } +#' @rdname filter +#' @export +filter_out <- function(.data, ..., .by = NULL, .preserve = FALSE) { + check_by_typo(...) + check_not_both_by_and_preserve({{ .by }}, .preserve) + UseMethod("filter_out") +} + #' @export filter.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + filter_impl( + .data = .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + .verb = "filter" + ) +} + +#' @export +filter_out.data.frame <- function(.data, ..., .by = NULL, .preserve = FALSE) { + filter_impl( + .data = .data, + ..., + .by = {{ .by }}, + .preserve = .preserve, + .verb = "filter_out" + ) +} + +filter_impl <- function( + .data, + ..., + .by, + .preserve, + .invert, + .verb, + .error_call = caller_env(), + .user_env = caller_env(2) +) { dots <- dplyr_quosures(...) 
- check_filter(dots) + check_filter(dots, error_call = .error_call) by <- compute_by( by = {{ .by }}, data = .data, by_arg = ".by", - data_arg = ".data" + data_arg = ".data", + error_call = .error_call + ) + + loc <- filter_rows( + data = .data, + dots = dots, + by = by, + verb = .verb, + error_call = .error_call, + user_env = .user_env ) - loc <- filter_rows(.data, dots, by) dplyr_row_slice(.data, loc, preserve = .preserve) } @@ -139,20 +186,24 @@ filter_rows <- function( data, dots, by, + verb, error_call = caller_env(), user_env = caller_env(2) ) { error_call <- dplyr_error_call(error_call) - mask <- DataMask$new(data, by, "filter", error_call = error_call) + mask <- DataMask$new(data, by, verb, error_call = error_call) on.exit(mask$forget(), add = TRUE) # 1:1 mapping between `dots` and `dots_expanded` dots_expanded <- filter_expand(dots, mask = mask, error_call = error_call) + invert <- verb == "filter_out" + filter_eval( dots = dots, dots_expanded = dots_expanded, + invert = invert, mask = mask, error_call = error_call, user_env = user_env @@ -208,6 +259,7 @@ filter_expand <- function(dots, mask, error_call = caller_env()) { filter_eval <- function( dots, dots_expanded, + invert, mask, error_call = caller_env(), user_env = caller_env(2) @@ -229,7 +281,7 @@ filter_eval <- function( ) out <- withCallingHandlers( - mask$eval_all_filter(dots_expanded, env_filter), + mask$eval_all_filter(dots_expanded, invert, env_filter), error = dplyr_error_handler( dots = dots, mask = mask, @@ -288,10 +340,21 @@ filter_bullets <- function(cnd, ...) 
{ warn_filter_one_column_matrix <- function(env, user_env) { lifecycle::deprecate_warn( when = "1.1.0", - what = I("Using one column matrices in `filter()`"), + what = I("Using one column matrices in `filter()` or `filter_out()`"), with = I("one dimensional logical vectors"), env = env, user_env = user_env, always = TRUE ) } + +check_not_both_by_and_preserve <- function( + .by, + .preserve, + error_call = caller_env() +) { + if (!quo_is_null(enquo(.by)) && !is_false(.preserve)) { + abort("Can't supply both `.by` and `.preserve`.", call = error_call) + } + invisible(NULL) +} diff --git a/man/filter.Rd b/man/filter.Rd index a4e7c94ef2..b38bb4c535 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -2,9 +2,12 @@ % Please edit documentation in R/filter.R \name{filter} \alias{filter} +\alias{filter_out} \title{Keep rows that match a condition} \usage{ filter(.data, ..., .by = NULL, .preserve = FALSE) + +filter_out(.data, ..., .by = NULL, .preserve = FALSE) } \arguments{ \item{.data}{A data frame, data frame extension (e.g. 
a tibble), or a diff --git a/src/dplyr.h b/src/dplyr.h index 2f9170df4c..b25c5b2d0b 100644 --- a/src/dplyr.h +++ b/src/dplyr.h @@ -94,7 +94,7 @@ SEXP dplyr_validate_rowwise_df(SEXP df); SEXP dplyr_mask_eval_all(SEXP quo, SEXP env_private); SEXP dplyr_mask_eval_all_summarise(SEXP quo, SEXP env_private); SEXP dplyr_mask_eval_all_mutate(SEXP quo, SEXP env_private); -SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP env_private, SEXP s_n, SEXP env_filter); +SEXP dplyr_mask_eval_all_filter(SEXP quos, SEXP invert, SEXP env_private, SEXP s_n, SEXP env_filter); SEXP dplyr_summarise_check_all_size_one(SEXP list_of_chunks); SEXP dplyr_reframe_recycle_horizontally_in_place(SEXP list_of_chunks, SEXP list_of_result); SEXP dplyr_group_indices(SEXP data, SEXP rows); diff --git a/src/filter.cpp b/src/filter.cpp index ab1ec9a03f..4f07241436 100644 --- a/src/filter.cpp +++ b/src/filter.cpp @@ -141,6 +141,7 @@ SEXP eval_filter_one(SEXP quos, } SEXP dplyr_mask_eval_all_filter(SEXP quos, + SEXP invert, SEXP env_private, SEXP s_n, SEXP env_filter) { @@ -181,6 +182,12 @@ SEXP dplyr_mask_eval_all_filter(SEXP quos, DPLYR_MASK_ITERATION_FINALISE(); } + if (LOGICAL_ELT(invert, 0)) { + for (R_xlen_t i = 0; i < n; ++i) { + p_keep[i] = !p_keep[i]; + } + } + UNPROTECT(1); DPLYR_MASK_FINALISE(); diff --git a/src/init.cpp b/src/init.cpp index 863f24a94d..308be26f64 100644 --- a/src/init.cpp +++ b/src/init.cpp @@ -109,7 +109,7 @@ static const R_CallMethodDef CallEntries[] = { {"dplyr_mask_eval_all", (DL_FUNC)& dplyr_mask_eval_all, 2}, {"dplyr_mask_eval_all_summarise", (DL_FUNC)& dplyr_mask_eval_all_summarise, 2}, {"dplyr_mask_eval_all_mutate", (DL_FUNC)& dplyr_mask_eval_all_mutate, 2}, - {"dplyr_mask_eval_all_filter", (DL_FUNC)& dplyr_mask_eval_all_filter, 4}, + {"dplyr_mask_eval_all_filter", (DL_FUNC)& dplyr_mask_eval_all_filter, 5}, {"dplyr_summarise_check_all_size_one", (DL_FUNC)& dplyr_summarise_check_all_size_one, 1}, {"dplyr_reframe_recycle_horizontally_in_place", (DL_FUNC)& 
dplyr_reframe_recycle_horizontally_in_place, 2}, diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md index 84ea11cbab..24e4bd32d7 100644 --- a/tests/testthat/_snaps/filter.md +++ b/tests/testthat/_snaps/filter.md @@ -4,7 +4,7 @@ out <- filter(df, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: - Using one column matrices in `filter()` was deprecated in dplyr 1.1.0. + Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. --- @@ -13,7 +13,7 @@ out <- filter(gdf, matrix(c(TRUE, FALSE), nrow = 2)) Condition Warning: - Using one column matrices in `filter()` was deprecated in dplyr 1.1.0. + Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. # filter() disallows matrices with >1 column From 822c679534e0d3128294fbaad99ecd41fae41319 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Thu, 20 Nov 2025 17:08:59 -0500 Subject: [PATCH 02/14] Document `filter_out()` --- R/filter.R | 177 ++++++++++++++++++++++++++++++++++++------------- man/filter.Rd | 179 ++++++++++++++++++++++++++++++++++++++++---------- man/slice.Rd | 5 +- 3 files changed, 280 insertions(+), 81 deletions(-) diff --git a/R/filter.R b/R/filter.R index 8b38e1b091..0ede7e8f7f 100644 --- a/R/filter.R +++ b/R/filter.R @@ -1,17 +1,107 @@ -#' Keep rows that match a condition +#' Keep or drop rows that match a condition #' -#' The `filter()` function is used to subset a data frame, -#' retaining all rows that satisfy your conditions. -#' To be retained, the row must produce a value of `TRUE` for all conditions. -#' Note that when a condition evaluates to `NA` -#' the row will be dropped, unlike base subsetting with `[`. +#' @description +#' These functions are used to subset a data frame, applying the expressions in +#' `...` to determine which rows should be kept or dropped. 
#' -#' The `filter()` function is used to subset the rows of -#' `.data`, applying the expressions in `...` to the column values to determine which -#' rows should be retained. It can be applied to both grouped and ungrouped data (see [group_by()] and -#' [ungroup()]). However, dplyr is not yet smart enough to optimise the filtering -#' operation on grouped datasets that do not need grouped calculations. For this -#' reason, filtering is often considerably faster on ungrouped data. +#' - `filter()` _keeps_ rows where the conditions evaluate to `TRUE`. +#' +#' - `filter_out()` _drops_ rows where the conditions evaluate to `TRUE`. +#' +#' Multiple conditions can be supplied separated by a comma. These will be +#' combined with the `&` operator. +#' +#' Both `filter()` and `filter_out()` treat `NA` like `FALSE`. This subtle +#' behavior can impact how you write your conditions when missing values are +#' involved. See the section on `Missing values` for important details and +#' examples. +#' +#' @inheritParams arrange +#' @inheritParams args_by +#' +#' @param ... <[`data-masking`][rlang::args_data_masking]> Expressions that +#' return a logical value, and are defined in terms of the variables in +#' `.data`. If multiple expressions are included, they are combined with the +#' `&` operator. Only rows for which all conditions evaluate to `TRUE` are +#' kept (for `filter()`) or dropped (for `filter_out()`). +#' +#' @param .preserve Relevant when the `.data` input is grouped. If `.preserve = +#' FALSE` (the default), the grouping structure is recalculated based on the +#' resulting data, otherwise the grouping is kept as is. +#' +#' @returns +#' An object of the same type as `.data`. The output has the following +#' properties: +#' +#' * Rows are a subset of the input, but appear in the same order. +#' * Columns are not modified. +#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). +#' * Data frame attributes are preserved. 
+#' +#' @section Missing values: +#' +#' Both `filter()` and `filter_out()` treat `NA` like `FALSE`. This results in +#' the following behavior: +#' +#' - `filter()` _drops_ both `NA` and `FALSE`. +#' +#' - `filter_out()` _keeps_ both `NA` and `FALSE`. +#' +#' The `NA` handling of these functions has been designed to match your +#' _intent_. When your intent is to keep rows, use `filter()`. When your intent +#' is to drop rows, use `filter_out()`. +#' +#' For example, if your goal with this `cars` data is to "drop rows where the +#' `class` is suv", then you might write this in one of two ways: +#' +#' ```{r} +#' cars <- tibble(class = c("suv", NA, "coupe")) +#' cars +#' ``` +#' +#' ```{r} +#' cars |> filter(class != "suv") +#' ``` +#' +#' ```{r} +#' cars |> filter_out(class == "suv") +#' ``` +#' +#' Note how `filter()` drops the `NA` rows even though our goal was only to drop +#' `"suv"` rows, but `filter_out()` matches our intuition. +#' +#' To generate the correct result with `filter()`, you'd need to use: +#' +#' ```{r} +#' cars |> filter(class != "suv" | is.na(class)) +#' ``` +#' +#' This quickly gets unwieldy when multiple conditions are involved. +#' +#' In general, if you find yourself: +#' +#' - Using "negative" operators like `!=` or `!` +#' - Adding in `NA` handling like `| is.na(col)` or `& !is.na(col)` +#' +#' then you should consider if swapping to the other filtering variant would +#' make your conditions simpler. +#' +#' ## Comparison to base subsetting +#' +#' Base subsetting with `[` doesn't treat `NA` like `TRUE` or `FALSE`. Instead, +#' it generates a fully missing row, which is different from how both `filter()` +#' and `filter_out()` work. 
+#' +#' ```{r} +#' cars <- tibble(class = c("suv", NA, "coupe"), mpg = c(10, 12, 14)) +#' cars +#' ``` +#' +#' ```{r} +#' cars[cars$class == "suv",] +#' +#' cars |> filter(class == "suv") +#' ``` #' #' @section Useful filter functions: #' @@ -25,10 +115,10 @@ #' #' @section Grouped tibbles: #' -#' Because filtering expressions are computed within groups, they may -#' yield different results on grouped tibbles. This will be the case -#' as soon as an aggregating, lagging, or ranking function is -#' involved. Compare this ungrouped filtering: +#' Because filtering expressions are computed within groups, they may yield +#' different results on grouped tibbles. This will be the case as soon as an +#' aggregating, lagging, or ranking function is involved. Compare this ungrouped +#' filtering: #' #' ``` #' starwars |> filter(mass > mean(mass, na.rm = TRUE)) @@ -37,35 +127,17 @@ #' With the grouped equivalent: #' #' ``` -#' starwars |> group_by(gender) |> filter(mass > mean(mass, na.rm = TRUE)) +#' starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) #' ``` #' -#' In the ungrouped version, `filter()` compares the value of `mass` in each row to -#' the global average (taken over the whole data set), keeping only the rows with -#' `mass` greater than this global average. In contrast, the grouped version calculates -#' the average mass separately for each `gender` group, and keeps rows with `mass` greater -#' than the relevant within-gender average. -#' -#' @family single table verbs -#' @inheritParams arrange -#' @inheritParams args_by -#' @param ... <[`data-masking`][rlang::args_data_masking]> Expressions that -#' return a logical value, and are defined in terms of the variables in -#' `.data`. If multiple expressions are included, they are combined with the -#' `&` operator. Only rows for which all conditions evaluate to `TRUE` are -#' kept. -#' @param .preserve Relevant when the `.data` input is grouped. 
-#' If `.preserve = FALSE` (the default), the grouping structure -#' is recalculated based on the resulting data, otherwise the grouping is kept as is. -#' @return -#' An object of the same type as `.data`. The output has the following properties: -#' -#' * Rows are a subset of the input, but appear in the same order. -#' * Columns are not modified. -#' * The number of groups may be reduced (if `.preserve` is not `TRUE`). -#' * Data frame attributes are preserved. +#' In the ungrouped version, `filter()` compares the value of `mass` in each row +#' to the global average (taken over the whole data set), keeping only the rows +#' with `mass` greater than this global average. In contrast, the grouped +#' version calculates the average mass separately for each `gender` group, and +#' keeps rows with `mass` greater than the relevant within-gender average. #' #' @section Methods: +#' #' This function is a **generic**, which means that packages can provide #' implementations (methods) for other classes. See the documentation of #' individual methods for extra arguments and differences in behaviour. @@ -73,6 +145,7 @@ #' The following methods are currently available in loaded packages: #' \Sexpr[stage=render,results=rd]{dplyr:::methods_rd("filter")}. #' +#' @family single table verbs #' @name filter #' #' @examples @@ -91,16 +164,30 @@ #' # The filtering operation may yield different results on grouped #' # tibbles because the expressions are computed within groups. 
#' # -#' # The following filters rows where `mass` is greater than the +#' # The following keeps rows where `mass` is greater than the #' # global average: #' starwars |> filter(mass > mean(mass, na.rm = TRUE)) #' -#' # Whereas this keeps rows with `mass` greater than the gender +#' # Whereas this keeps rows with `mass` greater than the per `gender` #' # average: -#' starwars |> group_by(gender) |> filter(mass > mean(mass, na.rm = TRUE)) +#' starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) +#' +#' # If your intent is to drop rows, use `filter_out()`. +#' # To remove blond individuals: +#' starwars |> filter_out(hair_color == "blond") +#' +#' # Notice how this is different from using `filter()` and `!= "blond"`. +#' # With `filter()`, rows with a `hair_color` of `NA` are also unintentionally +#' # dropped. +#' starwars |> filter(hair_color != "blond") #' +#' # To retain `NA`s with `filter()`, you'd need `| is.na()`. +#' # If you find yourself using `is.na()` in this way, consider switching to +#' # `filter_out()` instead. +#' starwars |> filter(hair_color != "blond" | is.na(hair_color)) #' -#' # To refer to column names that are stored as strings, use the `.data` pronoun: +#' # To refer to column names that are stored as strings, use the `.data` +#' # pronoun: #' vars <- c("mass", "height") #' cond <- c(80, 150) #' starwars |> diff --git a/man/filter.Rd b/man/filter.Rd index b38bb4c535..e9f76bc04f 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -3,7 +3,7 @@ \name{filter} \alias{filter} \alias{filter_out} -\title{Keep rows that match a condition} +\title{Keep or drop rows that match a condition} \usage{ filter(.data, ..., .by = NULL, .preserve = FALSE) @@ -18,18 +18,18 @@ more details.} return a logical value, and are defined in terms of the variables in \code{.data}. If multiple expressions are included, they are combined with the \code{&} operator. 
Only rows for which all conditions evaluate to \code{TRUE} are -kept.} +kept (for \code{filter()}) or dropped (for \code{filter_out()}).} \item{.by}{<\code{\link[=dplyr_tidy_select]{tidy-select}}> Optionally, a selection of columns to group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} -\item{.preserve}{Relevant when the \code{.data} input is grouped. -If \code{.preserve = FALSE} (the default), the grouping structure -is recalculated based on the resulting data, otherwise the grouping is kept as is.} +\item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the +resulting data, otherwise the grouping is kept as is.} } \value{ -An object of the same type as \code{.data}. The output has the following properties: +An object of the same type as \code{.data}. The output has the following +properties: \itemize{ \item Rows are a subset of the input, but appear in the same order. \item Columns are not modified. @@ -38,20 +38,118 @@ An object of the same type as \code{.data}. The output has the following propert } } \description{ -The \code{filter()} function is used to subset a data frame, -retaining all rows that satisfy your conditions. -To be retained, the row must produce a value of \code{TRUE} for all conditions. -Note that when a condition evaluates to \code{NA} -the row will be dropped, unlike base subsetting with \code{[}. -} -\details{ -The \code{filter()} function is used to subset the rows of -\code{.data}, applying the expressions in \code{...} to the column values to determine which -rows should be retained. It can be applied to both grouped and ungrouped data (see \code{\link[=group_by]{group_by()}} and -\code{\link[=ungroup]{ungroup()}}). 
However, dplyr is not yet smart enough to optimise the filtering -operation on grouped datasets that do not need grouped calculations. For this -reason, filtering is often considerably faster on ungrouped data. +These functions are used to subset a data frame, applying the expressions in +\code{...} to determine which rows should be kept or dropped. +\itemize{ +\item \code{filter()} \emph{keeps} rows where the conditions evaluate to \code{TRUE}. +\item \code{filter_out()} \emph{drops} rows where the conditions evaluate to \code{TRUE}. +} + +Multiple conditions can be supplied separated by a comma. These will be +combined with the \code{&} operator. + +Both \code{filter()} and \code{filter_out()} treat \code{NA} like \code{FALSE}. This subtle +behavior can impact how you write your conditions when missing values are +involved. See the section on \verb{Missing values} for important details and +examples. +} +\section{Missing values}{ + + +Both \code{filter()} and \code{filter_out()} treat \code{NA} like \code{FALSE}. This results in +the following behavior: +\itemize{ +\item \code{filter()} \emph{drops} both \code{NA} and \code{FALSE}. +\item \code{filter_out()} \emph{keeps} both \code{NA} and \code{FALSE}. } + +The \code{NA} handling of these functions has been designed to match your +\emph{intent}. When your intent is to keep rows, use \code{filter()}. When your intent +is to drop rows, use \code{filter_out()}. + +For example, if your goal with this \code{cars} data is to "drop rows where the +\code{class} is suv", then you might write this in one of two ways: + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars <- tibble(class = c("suv", NA, "coupe")) +cars +#> # A tibble: 3 x 1 +#> class +#> <chr> +#> 1 suv +#> 2 <NA> +#> 3 coupe +}\if{html}{\out{</div>
}} + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars |> filter(class != "suv") +#> # A tibble: 1 x 1 +#> class +#> <chr> +#> 1 coupe +}\if{html}{\out{</div>
}} + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars |> filter_out(class == "suv") +#> # A tibble: 2 x 1 +#> class +#> <chr> +#> 1 <NA> +#> 2 coupe +}\if{html}{\out{</div>
}} + +Note how \code{filter()} drops the \code{NA} rows even though our goal was only to drop +\code{"suv"} rows, but \code{filter_out()} matches our intuition. + +To generate the correct result with \code{filter()}, you'd need to use: + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars |> filter(class != "suv" | is.na(class)) +#> # A tibble: 2 x 1 +#> class +#> <chr> +#> 1 <NA> +#> 2 coupe +}\if{html}{\out{</div>
}} + +This quickly gets unwieldy when multiple conditions are involved. + +In general, if you find yourself: +\itemize{ +\item Using "negative" operators like \code{!=} or \code{!} +\item Adding in \code{NA} handling like \verb{| is.na(col)} or \verb{& !is.na(col)} +} + +then you should consider if swapping to the other filtering variant would +make your conditions simpler. +\subsection{Comparison to base subsetting}{ + +Base subsetting with \code{[} doesn't treat \code{NA} like \code{TRUE} or \code{FALSE}. Instead, +it generates a fully missing row, which is different from how both \code{filter()} +and \code{filter_out()} work. + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars <- tibble(class = c("suv", NA, "coupe"), mpg = c(10, 12, 14)) +cars +#> # A tibble: 3 x 2 +#> class mpg +#> <chr> <dbl> +#> 1 suv 10 +#> 2 <NA> 12 +#> 3 coupe 14 +}\if{html}{\out{</div>
}} + +\if{html}{\out{
<div class="sourceCode r">}}\preformatted{cars[cars$class == "suv",] +#> # A tibble: 2 x 2 +#> class mpg +#> <chr> <dbl> +#> 1 suv 10 +#> 2 <NA> NA + +cars |> filter(class == "suv") +#> # A tibble: 1 x 2 +#> class mpg +#> <chr> <dbl> +#> 1 suv 10 +}\if{html}{\out{</div>
}} +} +} + \section{Useful filter functions}{ @@ -68,28 +166,29 @@ expressions used to filter the data: \section{Grouped tibbles}{ -Because filtering expressions are computed within groups, they may -yield different results on grouped tibbles. This will be the case -as soon as an aggregating, lagging, or ranking function is -involved. Compare this ungrouped filtering: +Because filtering expressions are computed within groups, they may yield +different results on grouped tibbles. This will be the case as soon as an +aggregating, lagging, or ranking function is involved. Compare this ungrouped +filtering: \if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars |> filter(mass > mean(mass, na.rm = TRUE)) }\if{html}{\out{</div>
}} With the grouped equivalent: -\if{html}{\out{
<div class="sourceCode">}}\preformatted{starwars |> group_by(gender) |> filter(mass > mean(mass, na.rm = TRUE)) +\if{html}{\out{<div class="sourceCode">
}}\preformatted{starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) }\if{html}{\out{</div>
}} -In the ungrouped version, \code{filter()} compares the value of \code{mass} in each row to -the global average (taken over the whole data set), keeping only the rows with -\code{mass} greater than this global average. In contrast, the grouped version calculates -the average mass separately for each \code{gender} group, and keeps rows with \code{mass} greater -than the relevant within-gender average. +In the ungrouped version, \code{filter()} compares the value of \code{mass} in each row +to the global average (taken over the whole data set), keeping only the rows +with \code{mass} greater than this global average. In contrast, the grouped +version calculates the average mass separately for each \code{gender} group, and +keeps rows with \code{mass} greater than the relevant within-gender average. } \section{Methods}{ + This function is a \strong{generic}, which means that packages can provide implementations (methods) for other classes. See the documentation of individual methods for extra arguments and differences in behaviour. @@ -114,16 +213,30 @@ filter(starwars, hair_color == "none", eye_color == "black") # The filtering operation may yield different results on grouped # tibbles because the expressions are computed within groups. # -# The following filters rows where `mass` is greater than the +# The following keeps rows where `mass` is greater than the # global average: starwars |> filter(mass > mean(mass, na.rm = TRUE)) -# Whereas this keeps rows with `mass` greater than the gender +# Whereas this keeps rows with `mass` greater than the per `gender` # average: -starwars |> group_by(gender) |> filter(mass > mean(mass, na.rm = TRUE)) +starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) + +# If your intent is to drop rows, use `filter_out()`. +# To remove blond individuals: +starwars |> filter_out(hair_color == "blond") + +# Notice how this is different from using `filter()` and `!= "blond"`. 
+# With `filter()`, rows with a `hair_color` of `NA` are also unintentionally +# dropped. +starwars |> filter(hair_color != "blond") +# To retain `NA`s with `filter()`, you'd need `| is.na()`. +# If you find yourself using `is.na()` in this way, consider switching to +# `filter_out()` instead. +starwars |> filter(hair_color != "blond" | is.na(hair_color)) -# To refer to column names that are stored as strings, use the `.data` pronoun: +# To refer to column names that are stored as strings, use the `.data` +# pronoun: vars <- c("mass", "height") cond <- c(80, 150) starwars |> diff --git a/man/slice.Rd b/man/slice.Rd index eb9e31d9d7..e4edf92ae4 100644 --- a/man/slice.Rd +++ b/man/slice.Rd @@ -57,9 +57,8 @@ For \verb{slice_*()}, these arguments are passed on to methods.} group by for just this operation, functioning as an alternative to \code{\link[=group_by]{group_by()}}. For details and examples, see \link[=dplyr_by]{?dplyr_by}.} -\item{.preserve}{Relevant when the \code{.data} input is grouped. -If \code{.preserve = FALSE} (the default), the grouping structure -is recalculated based on the resulting data, otherwise the grouping is kept as is.} +\item{.preserve}{Relevant when the \code{.data} input is grouped. If \code{.preserve = FALSE} (the default), the grouping structure is recalculated based on the +resulting data, otherwise the grouping is kept as is.} \item{n, prop}{Provide either \code{n}, the number of rows, or \code{prop}, the proportion of rows to select. 
If neither are supplied, \code{n = 1} will be From dee6b99523472931cfd43040c3603230b1fa7ddf Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:10:41 -0500 Subject: [PATCH 03/14] Revise examples --- R/filter.R | 33 ++++++++++++++++++++------------- man/filter.Rd | 33 ++++++++++++++++++++------------- 2 files changed, 40 insertions(+), 26 deletions(-) diff --git a/R/filter.R b/R/filter.R index 0ede7e8f7f..95a26da7f1 100644 --- a/R/filter.R +++ b/R/filter.R @@ -149,17 +149,24 @@ #' @name filter #' #' @examples -#' # Filtering by one criterion +#' # Filtering for one criterion #' filter(starwars, species == "Human") -#' filter(starwars, mass > 1000) #' -#' # Filtering by multiple criteria within a single logical expression +#' # Filtering for multiple criteria within a single logical expression #' filter(starwars, hair_color == "none" & eye_color == "black") #' filter(starwars, hair_color == "none" | eye_color == "black") #' #' # When multiple expressions are used, they are combined using & #' filter(starwars, hair_color == "none", eye_color == "black") #' +#' # Filtering out to drop rows +#' filter_out(starwars, hair_color == "none") +#' +#' # When filtering out, it can be useful to first interactively filter for the +#' # rows you want to drop, just to double check that you've written the +#' # conditions correctly. Then, just change `filter()` to `filter_out()`. +#' filter(starwars, mass > 1000, eye_color == "orange") +#' filter_out(starwars, mass > 1000, eye_color == "orange") #' #' # The filtering operation may yield different results on grouped #' # tibbles because the expressions are computed within groups. @@ -172,20 +179,20 @@ #' # average: #' starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) #' -#' # If your intent is to drop rows, use `filter_out()`. -#' # To remove blond individuals: -#' starwars |> filter_out(hair_color == "blond") -#' -#' # Notice how this is different from using `filter()` and `!= "blond"`. 
-#' # With `filter()`, rows with a `hair_color` of `NA` are also unintentionally -#' # dropped. +#' # If you find yourself trying to use a `filter()` to drop rows, then +#' # you should consider if switching to `filter_out()` can simplify your +#' # conditions. For example, to drop blond individuals, you might try: #' starwars |> filter(hair_color != "blond") #' -#' # To retain `NA`s with `filter()`, you'd need `| is.na()`. -#' # If you find yourself using `is.na()` in this way, consider switching to -#' # `filter_out()` instead. +#' # But this also drops rows with an `NA` hair color! To retain those: #' starwars |> filter(hair_color != "blond" | is.na(hair_color)) #' +#' # But explicit `NA` handling like this can quickly get unwieldy, especially +#' # with multiple conditions. Since your intent was to specify rows to drop +#' # rather than rows to keep, use `filter_out()`. This also removes the need +#' # for any explicit `NA` handling. +#' starwars |> filter_out(hair_color == "blond") +#' #' # To refer to column names that are stored as strings, use the `.data` #' # pronoun: #' vars <- c("mass", "height") diff --git a/man/filter.Rd b/man/filter.Rd index e9f76bc04f..8bb0886e1b 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -198,17 +198,24 @@ The following methods are currently available in loaded packages: } \examples{ -# Filtering by one criterion +# Filtering for one criterion filter(starwars, species == "Human") -filter(starwars, mass > 1000) -# Filtering by multiple criteria within a single logical expression +# Filtering for multiple criteria within a single logical expression filter(starwars, hair_color == "none" & eye_color == "black") filter(starwars, hair_color == "none" | eye_color == "black") # When multiple expressions are used, they are combined using & filter(starwars, hair_color == "none", eye_color == "black") +# Filtering out to drop rows +filter_out(starwars, hair_color == "none") + +# When filtering out, it can be useful to first 
interactively filter for the +# rows you want to drop, just to double check that you've written the +# conditions correctly. Then, just change `filter()` to `filter_out()`. +filter(starwars, mass > 1000, eye_color == "orange") +filter_out(starwars, mass > 1000, eye_color == "orange") # The filtering operation may yield different results on grouped # tibbles because the expressions are computed within groups. @@ -221,20 +228,20 @@ starwars |> filter(mass > mean(mass, na.rm = TRUE)) # average: starwars |> filter(mass > mean(mass, na.rm = TRUE), .by = gender) -# If your intent is to drop rows, use `filter_out()`. -# To remove blond individuals: -starwars |> filter_out(hair_color == "blond") - -# Notice how this is different from using `filter()` and `!= "blond"`. -# With `filter()`, rows with a `hair_color` of `NA` are also unintentionally -# dropped. +# If you find yourself trying to use a `filter()` to drop rows, then +# you should consider if switching to `filter_out()` can simplify your +# conditions. For example, to drop blond individuals, you might try: starwars |> filter(hair_color != "blond") -# To retain `NA`s with `filter()`, you'd need `| is.na()`. -# If you find yourself using `is.na()` in this way, consider switching to -# `filter_out()` instead. +# But this also drops rows with an `NA` hair color! To retain those: starwars |> filter(hair_color != "blond" | is.na(hair_color)) +# But explicit `NA` handling like this can quickly get unwieldy, especially +# with multiple conditions. Since your intent was to specify rows to drop +# rather than rows to keep, use `filter_out()`. This also removes the need +# for any explicit `NA` handling. 
+starwars |> filter_out(hair_color == "blond") + # To refer to column names that are stored as strings, use the `.data` # pronoun: vars <- c("mass", "height") From 3c1c4867e5c8bf71f39782aecd34a63f02505bb7 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:44:49 -0500 Subject: [PATCH 04/14] Overhaul tests to include `filter_out()` --- tests/testthat/_snaps/filter.md | 175 +++++++++++++++++++++-- tests/testthat/test-filter.R | 243 ++++++++++++++++++++++++++------ 2 files changed, 368 insertions(+), 50 deletions(-) diff --git a/tests/testthat/_snaps/filter.md b/tests/testthat/_snaps/filter.md index 24e4bd32d7..bf68a9ade5 100644 --- a/tests/testthat/_snaps/filter.md +++ b/tests/testthat/_snaps/filter.md @@ -1,4 +1,4 @@ -# filter() allows matrices with 1 column with a deprecation warning (#6091) +# filter() and filter_out() allow matrices with 1 column with a deprecation warning (#6091) Code out <- filter(df, matrix(c(TRUE, FALSE), nrow = 2)) @@ -7,6 +7,15 @@ Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. +--- + + Code + out <- filter_out(df, matrix(c(TRUE, FALSE), nrow = 2)) + Condition + Warning: + Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. + i Please use one dimensional logical vectors instead. + --- Code @@ -16,28 +25,55 @@ Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. i Please use one dimensional logical vectors instead. -# filter() disallows matrices with >1 column +--- Code - (expect_error(filter(df, matrix(TRUE, nrow = 3, ncol = 2)))) - Output - + out <- filter_out(gdf, matrix(c(TRUE, FALSE), nrow = 2)) + Condition + Warning: + Using one column matrices in `filter()` or `filter_out()` was deprecated in dplyr 1.1.0. + i Please use one dimensional logical vectors instead. 
+ +# filter() and filter_out() disallow matrices with >1 column + + Code + filter(df, matrix(TRUE, nrow = 3, ncol = 2)) + Condition Error in `filter()`: i In argument: `matrix(TRUE, nrow = 3, ncol = 2)`. Caused by error: ! `..1` must be a logical vector, not a logical matrix. -# filter() disallows arrays with >2 dimensions +--- Code - (expect_error(filter(df, array(TRUE, dim = c(3, 1, 1))))) - Output - + filter_out(df, matrix(TRUE, nrow = 3, ncol = 2)) + Condition + Error in `filter_out()`: + i In argument: `matrix(TRUE, nrow = 3, ncol = 2)`. + Caused by error: + ! `..1` must be a logical vector, not a logical matrix. + +# filter() and filter_out() disallow arrays with >2 dimensions + + Code + filter(df, array(TRUE, dim = c(3, 1, 1))) + Condition Error in `filter()`: i In argument: `array(TRUE, dim = c(3, 1, 1))`. Caused by error: ! `..1` must be a logical vector, not a logical array. +--- + + Code + filter_out(df, array(TRUE, dim = c(3, 1, 1))) + Condition + Error in `filter_out()`: + i In argument: `array(TRUE, dim = c(3, 1, 1))`. + Caused by error: + ! `..1` must be a logical vector, not a logical array. + # filter() gives useful error messages Code @@ -150,7 +186,7 @@ Caused by error: ! { -# Using data frames in `filter()` is defunct (#7758) +# Using data frames in `filter()` and `filter_out()` is defunct (#7758) Code filter(df, across(everything(), ~ .x > 0)) @@ -161,6 +197,17 @@ ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(df, across(everything(), ~ .x > 0)) + Condition + Error in `filter_out()`: + i In argument: `across(everything(), ~.x > 0)`. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + --- Code @@ -173,6 +220,18 @@ ! `..1` must be a logical vector, not a object. 
i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(gdf, across(everything(), ~ .x > 0)) + Condition + Error in `filter_out()`: + i In argument: `across(everything(), ~.x > 0)`. + i In group 1: `x = 1`. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + --- Code @@ -185,6 +244,18 @@ ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(rdf, across(everything(), ~ .x > 0)) + Condition + Error in `filter_out()`: + i In argument: `across(everything(), ~.x > 0)`. + i In row 1. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + --- Code @@ -196,6 +267,17 @@ ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(df, tibble(x > 0, y > 0)) + Condition + Error in `filter_out()`: + i In argument: `tibble(x > 0, y > 0)`. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + --- Code @@ -208,6 +290,18 @@ ! `..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(gdf, tibble(x > 0, y > 0)) + Condition + Error in `filter_out()`: + i In argument: `tibble(x > 0, y > 0)`. + i In group 1: `x = 1`. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + --- Code @@ -220,6 +314,18 @@ ! 
`..1` must be a logical vector, not a object. i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. +--- + + Code + filter_out(rdf, tibble(x > 0, y > 0)) + Condition + Error in `filter_out()`: + i In argument: `tibble(x > 0, y > 0)`. + i In row 1. + Caused by error: + ! `..1` must be a logical vector, not a object. + i If you used `across()` to generate this data frame, please use `if_any()` or `if_all()` instead. + # `filter()` doesn't allow data frames with missing or empty names (#6758) Code @@ -228,6 +334,14 @@ Error in `filter()`: ! Can't transform a data frame with `NA` or `""` names. +--- + + Code + filter_out(df1) + Condition + Error in `filter_out()`: + ! Can't transform a data frame with `NA` or `""` names. + --- Code @@ -236,6 +350,14 @@ Error in `filter()`: ! Can't transform a data frame with missing names. +--- + + Code + filter_out(df2) + Condition + Error in `filter_out()`: + ! Can't transform a data frame with missing names. + # can't use `.by` with `.preserve` Code @@ -244,6 +366,14 @@ Error in `filter()`: ! Can't supply both `.by` and `.preserve`. +--- + + Code + filter_out(df, .by = x, .preserve = TRUE) + Condition + Error in `filter_out()`: + ! Can't supply both `.by` and `.preserve`. + # catches `.by` with grouped-df Code @@ -252,6 +382,14 @@ Error in `filter()`: ! Can't supply `.by` when `.data` is a grouped data frame. +--- + + Code + filter_out(gdf, .by = x) + Condition + Error in `filter_out()`: + ! Can't supply `.by` when `.data` is a grouped data frame. + # catches `.by` with rowwise-df Code @@ -260,6 +398,14 @@ Error in `filter()`: ! Can't supply `.by` when `.data` is a rowwise data frame. +--- + + Code + filter_out(rdf, .by = x) + Condition + Error in `filter_out()`: + ! Can't supply `.by` when `.data` is a rowwise data frame. + # catches `by` typo (#6647) Code @@ -269,3 +415,12 @@ ! Can't specify an argument named `by` in this verb. i Did you mean to use `.by` instead? 
+--- + + Code + filter_out(df, by = x) + Condition + Error in `filter_out()`: + ! Can't specify an argument named `by` in this verb. + i Did you mean to use `.by` instead? + diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R index e5ca66f5c3..e89ae5894f 100644 --- a/tests/testthat/test-filter.R +++ b/tests/testthat/test-filter.R @@ -56,20 +56,32 @@ test_that("filter handlers scalar results", { ) }) -test_that("filter propagates attributes", { +test_that("filter and filter_out propagate attributes", { date.start <- ISOdate(2010, 01, 01, 0) test <- data.frame(Date = ISOdate(2010, 01, 01, 1:10)) test2 <- test |> filter(Date < ISOdate(2010, 01, 01, 5)) expect_equal(test$Date[1:4], test2$Date) + test2 <- test |> filter_out(Date < ISOdate(2010, 01, 01, 5)) + expect_equal(test$Date[5:10], test2$Date) }) -test_that("filter discards NA", { +test_that("filter discards NA and filter_out retains NA", { temp <- data.frame( i = 1:5, x = c(NA, 1L, 1L, 0L, 0L) ) + res <- filter(temp, x == 1) - expect_equal(nrow(res), 2L) + expect_identical( + res, + data.frame(i = c(2L, 3L), x = c(1L, 1L)) + ) + + res <- filter_out(temp, x == 1) + expect_identical( + res, + data.frame(i = c(1L, 4L, 5L), x = c(NA, 0L, 0L)) + ) }) test_that("date class remains on filter (#273)", { @@ -97,9 +109,16 @@ test_that("filter handles $ correctly (#278)", { expect_equal(res1, res2) }) -test_that("filter() returns the input data if no parameters are given", { +test_that("filter() and filter_out() are still a union if no parameters are given", { + # Justification is that `filter(df, ...)` performs a `pall(...)` style + # operation over the `...` to determine which rows to keep. This defaults to + # `TRUE` for each row when no inputs are provided, so in `filter()` all rows + # are retained. Which implies that `filter_out()` retains no rows.
expect_identical(filter(mtcars), mtcars) + expect_identical(filter_out(mtcars), mtcars[0, ]) + expect_identical(filter(mtcars, !!!list()), mtcars) + expect_identical(filter_out(mtcars, !!!list()), mtcars[0, ]) }) test_that("$ does not end call traversing. #502", { @@ -183,19 +202,21 @@ test_that("hybrid evaluation handles $ correctly (#1134)", { expect_equal(nrow(res), 9L) }) -test_that("filter correctly handles empty data frames (#782)", { - res <- tibble() |> filter(F) - expect_equal(nrow(res), 0L) - expect_equal(length(names(res)), 0L) +test_that("filter and filter_out correctly handle empty data frames (#782)", { + expect_identical(filter(tibble(), TRUE), tibble()) + expect_identical(filter(tibble(), FALSE), tibble()) + + expect_identical(filter_out(tibble(), TRUE), tibble()) + expect_identical(filter_out(tibble(), FALSE), tibble()) }) test_that("filter(.,TRUE,TRUE) works (#1210)", { df <- data.frame(x = 1:5) - res <- filter(df, TRUE, TRUE) - expect_equal(res, df) + expect_identical(filter(df, TRUE, TRUE), df) + expect_identical(filter_out(df, TRUE, TRUE), df[0, , drop = FALSE]) }) -test_that("filter, slice and arrange preserves attributes (#1064)", { +test_that("filter, slice, and arrange preserves attributes (#1064)", { df <- structure( data.frame(x = 1:10, g1 = rep(1:2, each = 5), g2 = rep(1:5, 2)), meta = "this is important" @@ -203,9 +224,15 @@ test_that("filter, slice and arrange preserves attributes (#1064)", { res <- filter(df, x < 5) |> attr("meta") expect_equal(res, "this is important") + res <- filter_out(df, x < 5) |> attr("meta") + expect_equal(res, "this is important") + res <- filter(df, x < 5, x > 4) |> attr("meta") expect_equal(res, "this is important") + res <- filter_out(df, x < 5, x > 4) |> attr("meta") + expect_equal(res, "this is important") + res <- df |> slice(1:50) |> attr("meta") expect_equal(res, "this is important") @@ -231,21 +258,34 @@ test_that("grouped filter handles indices (#880)", { expect_equal(group_keys(res), 
group_keys(res2)) }) -test_that("filter(FALSE) handles indices", { +test_that("filter(FALSE) and filter_out(TRUE) handle indices", { + indices <- list_of(integer(), integer(), integer(), .ptype = integer()) + out <- mtcars |> group_by(cyl) |> filter(FALSE, .preserve = TRUE) |> group_rows() - expect_identical( - out, - list_of(integer(), integer(), integer(), .ptype = integer()) - ) + expect_identical(out, indices) + + out <- mtcars |> + group_by(cyl) |> + filter_out(TRUE, .preserve = TRUE) |> + group_rows() + expect_identical(out, indices) + + indices <- list_of(.ptype = integer()) out <- mtcars |> group_by(cyl) |> filter(FALSE, .preserve = FALSE) |> group_rows() - expect_identical(out, list_of(.ptype = integer())) + expect_identical(out, indices) + + out <- mtcars |> + group_by(cyl) |> + filter_out(TRUE, .preserve = FALSE) |> + group_rows() + expect_identical(out, indices) }) test_that("filter handles S4 objects (#1366)", { @@ -319,8 +359,13 @@ test_that("hybrid function row_number does not trigger warning in filter (#3750) expect_true(out) }) -test_that("filter() preserve order across groups (#3989)", { - df <- tibble(g = c(1, 2, 1, 2, 1), time = 5:1, x = 5:1) +test_that("filter() and filter_out() preserve order across groups (#3989)", { + df <- tibble( + g = c(1, 2, 1, 2, 1), + time = 5:1, + x = 5:1 + ) + res1 <- df |> group_by(g) |> filter(x <= 4) |> @@ -336,11 +381,28 @@ test_that("filter() preserve order across groups (#3989)", { arrange(time) |> group_by(g) + expect_identical(res1$time, 1:4) + expect_equal(res1, res2) + expect_equal(res1, res3) + + res1 <- df |> + group_by(g) |> + filter_out(x <= 2) |> + arrange(time) + + res2 <- df |> + group_by(g) |> + arrange(time) |> + filter_out(x <= 2) + + res3 <- df |> + filter_out(x <= 2) |> + arrange(time) |> + group_by(g) + + expect_identical(res1$time, 3:5) expect_equal(res1, res2) expect_equal(res1, res3) - expect_false(is.unsorted(res1$time)) - expect_false(is.unsorted(res2$time)) - 
expect_false(is.unsorted(res3$time)) }) test_that("filter() with two conditions does not freeze (#4049)", { @@ -376,9 +438,12 @@ test_that("filter() handles matrix and data frame columns (#3630)", { expect_equal(filter(gdf, z$A == 1), gdf[1, ]) }) -test_that("filter() handles named logical (#4638)", { +test_that("filter() and filter_out() handle named logical (#4638)", { tbl <- tibble(a = c(a = TRUE)) - expect_equal(filter(tbl, a), tbl) + expect_identical(filter(tbl, a), tbl) + + tbl <- tibble(a = c(a = FALSE)) + expect_identical(filter_out(tbl, a), tbl) }) test_that("filter() allows named constants that resolve to logical vectors (#4612)", { @@ -394,40 +459,55 @@ test_that("filter() allows named constants that resolve to logical vectors (#461 ) }) -test_that("filter() allows 1 dimension arrays", { +test_that("filter() and filter_out() allow 1 dimension arrays", { df <- tibble(x = array(c(TRUE, FALSE, TRUE))) expect_identical(filter(df, x), df[c(1, 3), ]) + expect_identical(filter_out(df, x), df[2, ]) }) -test_that("filter() allows matrices with 1 column with a deprecation warning (#6091)", { +test_that("filter() and filter_out() allow matrices with 1 column with a deprecation warning (#6091)", { df <- tibble(x = 1:2) expect_snapshot({ out <- filter(df, matrix(c(TRUE, FALSE), nrow = 2)) }) expect_identical(out, tibble(x = 1L)) + expect_snapshot({ + out <- filter_out(df, matrix(c(TRUE, FALSE), nrow = 2)) + }) + expect_identical(out, tibble(x = 2L)) # Only warns once when grouped - df <- tibble(x = c(1, 1, 2, 2)) + df <- tibble(x = c(1, 1, 2, 2), y = c(1, 2, 3, 4)) gdf <- group_by(df, x) expect_snapshot({ out <- filter(gdf, matrix(c(TRUE, FALSE), nrow = 2)) }) - expect_identical(out, group_by(tibble(x = c(1, 2)), x)) + expect_identical(out, group_by(tibble(x = c(1, 2), y = c(1, 3)), x)) + expect_snapshot({ + out <- filter_out(gdf, matrix(c(TRUE, FALSE), nrow = 2)) + }) + expect_identical(out, group_by(tibble(x = c(1, 2), y = c(2, 4)), x)) }) -test_that("filter() 
disallows matrices with >1 column", { +test_that("filter() and filter_out() disallow matrices with >1 column", { df <- tibble(x = 1:3) - expect_snapshot({ - (expect_error(filter(df, matrix(TRUE, nrow = 3, ncol = 2)))) + expect_snapshot(error = TRUE, { + filter(df, matrix(TRUE, nrow = 3, ncol = 2)) + }) + expect_snapshot(error = TRUE, { + filter_out(df, matrix(TRUE, nrow = 3, ncol = 2)) }) }) -test_that("filter() disallows arrays with >2 dimensions", { +test_that("filter() and filter_out() disallow arrays with >2 dimensions", { df <- tibble(x = 1:3) - expect_snapshot({ - (expect_error(filter(df, array(TRUE, dim = c(3, 1, 1))))) + expect_snapshot(error = TRUE, { + filter(df, array(TRUE, dim = c(3, 1, 1))) + }) + expect_snapshot(error = TRUE, { + filter_out(df, array(TRUE, dim = c(3, 1, 1))) }) }) @@ -501,7 +581,7 @@ test_that("filter() gives useful error messages", { }) }) -test_that("Using data frames in `filter()` is defunct (#7758)", { +test_that("Using data frames in `filter()` and `filter_out()` is defunct (#7758)", { df <- data.frame(x = 1, y = 1) gdf <- group_by(df, x) rdf <- rowwise(df, x) @@ -510,26 +590,44 @@ test_that("Using data frames in `filter()` is defunct (#7758)", { expect_snapshot(error = TRUE, { filter(df, across(everything(), ~ .x > 0)) }) + expect_snapshot(error = TRUE, { + filter_out(df, across(everything(), ~ .x > 0)) + }) expect_snapshot(error = TRUE, { filter(gdf, across(everything(), ~ .x > 0)) }) + expect_snapshot(error = TRUE, { + filter_out(gdf, across(everything(), ~ .x > 0)) + }) expect_snapshot(error = TRUE, { filter(rdf, across(everything(), ~ .x > 0)) }) + expect_snapshot(error = TRUE, { + filter_out(rdf, across(everything(), ~ .x > 0)) + }) # Can't filter with a data frame of logicals (same as the `across()` case) expect_snapshot(error = TRUE, { filter(df, tibble(x > 0, y > 0)) }) + expect_snapshot(error = TRUE, { + filter_out(df, tibble(x > 0, y > 0)) + }) expect_snapshot(error = TRUE, { filter(gdf, tibble(x > 0, y > 0)) }) + 
expect_snapshot(error = TRUE, { + filter_out(gdf, tibble(x > 0, y > 0)) + }) expect_snapshot(error = TRUE, { filter(rdf, tibble(x > 0, y > 0)) }) + expect_snapshot(error = TRUE, { + filter_out(rdf, tibble(x > 0, y > 0)) + }) }) -test_that("filter preserves grouping", { +test_that("filter and filter_out preserve grouping", { gf <- group_by(tibble(g = c(1, 1, 1, 2, 2), x = 1:5), g) i <- count_regroups(out <- filter(gf, x %in% c(3, 4))) @@ -537,13 +635,23 @@ test_that("filter preserves grouping", { expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(1L, 2L)) + i <- count_regroups(out <- filter_out(gf, x %in% c(3, 4))) + expect_equal(i, 0L) + expect_equal(group_vars(gf), "g") + expect_equal(group_rows(out), list_of(1:2, 3L)) + i <- count_regroups(out <- filter(gf, x < 3)) expect_equal(i, 0L) expect_equal(group_vars(gf), "g") expect_equal(group_rows(out), list_of(c(1L, 2L))) + + i <- count_regroups(out <- filter_out(gf, x < 3)) + expect_equal(i, 0L) + expect_equal(group_vars(gf), "g") + expect_equal(group_rows(out), list_of(1L, 2:3)) }) -test_that("filter() with empty dots still calls dplyr_row_slice()", { +test_that("filter() and filter_out() with empty dots still calls dplyr_row_slice()", { tbl <- new_tibble(list(x = 1), nrow = 1L) foo <- structure(tbl, class = c("foo_df", class(tbl))) @@ -556,7 +664,10 @@ test_that("filter() with empty dots still calls dplyr_row_slice()", { ) expect_s3_class(filter(foo), class(tbl), exact = TRUE) + expect_s3_class(filter_out(foo), class(tbl), exact = TRUE) + expect_s3_class(filter(foo, x == 1), class(tbl), exact = TRUE) + expect_s3_class(filter_out(foo, x == 1), class(tbl), exact = TRUE) }) test_that("can filter() with unruly class", { @@ -588,18 +699,27 @@ test_that("filter() preserves the call stack on error (#5308)", { test_that("if_any() and if_all() work", { df <- tibble(x1 = 1:10, x2 = c(1:5, 10:6)) + expect_equal( filter(df, if_all(starts_with("x"), ~ . 
> 6)), filter(df, x1 > 6 & x2 > 6) ) + expect_equal( + filter_out(df, if_all(starts_with("x"), ~ . > 6)), + filter_out(df, x1 > 6 & x2 > 6) + ) expect_equal( filter(df, if_any(starts_with("x"), ~ . > 6)), filter(df, x1 > 6 | x2 > 6) ) + expect_equal( + filter_out(df, if_any(starts_with("x"), ~ . > 6)), + filter_out(df, x1 > 6 | x2 > 6) + ) }) -test_that("filter keeps zero length groups", { +test_that("filter and filter_out keep zero length groups", { df <- tibble( e = 1, f = factor(c(1, 1, 2, 2), levels = 1:3), @@ -608,7 +728,8 @@ test_that("filter keeps zero length groups", { ) df <- group_by(df, e, f, g, .drop = FALSE) - expect_equal(group_size(filter(df, f == 1)), c(2, 0, 0)) + expect_identical(group_size(filter(df, f == 1)), c(2L, 0L, 0L)) + expect_identical(group_size(filter_out(df, f == 1)), c(0L, 2L, 0L)) }) test_that("filtering retains labels for zero length groups", { @@ -629,6 +750,15 @@ test_that("filtering retains labels for zero length groups", { n = c(2L, 0L, 0L) ) ) + expect_equal( + ungroup(count(filter_out(df, f == 1))), + tibble( + e = 1, + f = factor(1:3), + g = c(1, 2, NA), + n = c(0L, 2L, 0L) + ) + ) }) test_that("`filter()` doesn't allow data frames with missing or empty names (#6758)", { @@ -638,9 +768,16 @@ test_that("`filter()` doesn't allow data frames with missing or empty names (#67 expect_snapshot(error = TRUE, { filter(df1) }) + expect_snapshot(error = TRUE, { + filter_out(df1) + }) + expect_snapshot(error = TRUE, { filter(df2) }) + expect_snapshot(error = TRUE, { + filter_out(df2) + }) }) # .by ------------------------------------------------------------------------- @@ -649,16 +786,24 @@ test_that("can group transiently using `.by`", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) out <- filter(df, x > mean(x), .by = g) - expect_identical(out$g, c(1, 2)) expect_identical(out$x, c(10, 3)) expect_s3_class(out, class(df), exact = TRUE) + + out <- filter_out(df, x > mean(x), .by = g) + expect_identical(out$g, c(1, 2, 1)) + 
expect_identical(out$x, c(5, 1, 2)) + expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains bare data.frame class", { df <- tibble(g = c(1, 1, 2, 1, 2), x = c(5, 10, 1, 2, 3)) + out <- filter(df, x > mean(x), .by = g) expect_s3_class(out, class(df), exact = TRUE) + + out <- filter_out(df, x > mean(x), .by = g) + expect_s3_class(out, class(df), exact = TRUE) }) test_that("transient grouping retains data frame attributes", { @@ -672,8 +817,14 @@ test_that("transient grouping retains data frame attributes", { out <- filter(df, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") + out <- filter_out(df, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") + out <- filter(tbl, x > mean(x), .by = g) expect_identical(attr(out, "foo"), "bar") + + out <- filter_out(tbl, x > mean(x), .by = g) + expect_identical(attr(out, "foo"), "bar") }) test_that("can't use `.by` with `.preserve`", { @@ -682,6 +833,9 @@ test_that("can't use `.by` with `.preserve`", { expect_snapshot(error = TRUE, { filter(df, .by = x, .preserve = TRUE) }) + expect_snapshot(error = TRUE, { + filter_out(df, .by = x, .preserve = TRUE) + }) }) test_that("catches `.by` with grouped-df", { @@ -691,6 +845,9 @@ test_that("catches `.by` with grouped-df", { expect_snapshot(error = TRUE, { filter(gdf, .by = x) }) + expect_snapshot(error = TRUE, { + filter_out(gdf, .by = x) + }) }) test_that("catches `.by` with rowwise-df", { @@ -700,6 +857,9 @@ test_that("catches `.by` with rowwise-df", { expect_snapshot(error = TRUE, { filter(rdf, .by = x) }) + expect_snapshot(error = TRUE, { + filter_out(rdf, .by = x) + }) }) test_that("catches `by` typo (#6647)", { @@ -708,4 +868,7 @@ test_that("catches `by` typo (#6647)", { expect_snapshot(error = TRUE, { filter(df, by = x) }) + expect_snapshot(error = TRUE, { + filter_out(df, by = x) + }) }) From f40e8e0b1cf67c90ebc4f917aeb1390af15948af Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:58:22 -0500 
Subject: [PATCH 05/14] Add to list of `.by` supported verbs --- man/dplyr_by.Rd | 1 + man/rmd/by.Rmd | 2 ++ 2 files changed, 3 insertions(+) diff --git a/man/dplyr_by.Rd b/man/dplyr_by.Rd index b398644417..3d4a381092 100644 --- a/man/dplyr_by.Rd +++ b/man/dplyr_by.Rd @@ -26,6 +26,7 @@ This idea comes from \href{https://CRAN.R-project.org/package=data.table}{data.t \item \code{\link[=summarise]{summarise(.by = )}} \item \code{\link[=reframe]{reframe(.by = )}} \item \code{\link[=filter]{filter(.by = )}} +\item \code{\link[=filter_out]{filter_out(.by = )}} \item \code{\link[=slice]{slice(.by = )}} \item \code{\link[=slice_head]{slice_head(by = )}} and \code{\link[=slice_tail]{slice_tail(by = )}} \item \code{\link[=slice_min]{slice_min(by = )}} and \code{\link[=slice_max]{slice_max(by = )}} diff --git a/man/rmd/by.Rmd b/man/rmd/by.Rmd index 0744777e48..9a0ff472d6 100644 --- a/man/rmd/by.Rmd +++ b/man/rmd/by.Rmd @@ -35,6 +35,8 @@ This idea comes from [data.table](https://CRAN.R-project.org/package=data.table) - [`filter(.by = )`][filter()] +- [`filter_out(.by = )`][filter_out()] + - [`slice(.by = )`][slice()] - [`slice_head(by = )`][slice_head()] and [`slice_tail(by = )`][slice_tail()] From 584fdc0ff3f65431137e5b2b32a4683f47c17af2 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:59:03 -0500 Subject: [PATCH 06/14] Mention in extending dplyr docs --- R/generics.R | 7 ++++--- man/dplyr_extending.Rd | 7 ++++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/R/generics.R b/R/generics.R index f16e073724..df6a5b4277 100644 --- a/R/generics.R +++ b/R/generics.R @@ -52,9 +52,10 @@ #' #' # Current usage #' -#' * `arrange()`, `filter()`, `slice()` (and the rest of the `slice_*()` -#' family), `semi_join()`, and `anti_join()` work by generating a vector of -#' row indices, and then subsetting with `dplyr_row_slice()`. 
+#' * `arrange()`, `filter()` (and `filter_out()`), `slice()` (and the rest of +#' the `slice_*()` family), `semi_join()`, and `anti_join()` work by +#' generating a vector of row indices, and then subsetting with +#' `dplyr_row_slice()`. #' #' * `mutate()` generates a list of new column value (using `NULL` to indicate #' when columns should be deleted), then passes that to `dplyr_col_modify()`. diff --git a/man/dplyr_extending.Rd b/man/dplyr_extending.Rd index 0aec418efe..7614daa905 100644 --- a/man/dplyr_extending.Rd +++ b/man/dplyr_extending.Rd @@ -73,9 +73,10 @@ methods as needed. \section{Current usage}{ \itemize{ -\item \code{arrange()}, \code{filter()}, \code{slice()} (and the rest of the \verb{slice_*()} -family), \code{semi_join()}, and \code{anti_join()} work by generating a vector of -row indices, and then subsetting with \code{dplyr_row_slice()}. +\item \code{arrange()}, \code{filter()} (and \code{filter_out()}), \code{slice()} (and the rest of +the \verb{slice_*()} family), \code{semi_join()}, and \code{anti_join()} work by +generating a vector of row indices, and then subsetting with +\code{dplyr_row_slice()}. \item \code{mutate()} generates a list of new column value (using \code{NULL} to indicate when columns should be deleted), then passes that to \code{dplyr_col_modify()}. It also uses 1d \code{[} to implement \code{.keep}, and will call \code{relocate()} if From be08beb2a0a85ff1922f6802d9a2f91a0b01895b Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:59:13 -0500 Subject: [PATCH 07/14] Mention in `across()` example --- R/across.R | 5 +++++ man/across.Rd | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/R/across.R b/R/across.R index fb916b446f..16eb3d7666 100644 --- a/R/across.R +++ b/R/across.R @@ -204,7 +204,12 @@ #' iris |> #' filter(if_any(ends_with("Width"), ~ . > 4)) #' iris |> +#' filter_out(if_any(ends_with("Width"), ~ . > 4)) +#' +#' iris |> #' filter(if_all(ends_with("Width"), ~ . 
> 2)) +#' iris |> +#' filter_out(if_all(ends_with("Width"), ~ . > 2)) #' #' @export #' @seealso [c_across()] for a function that returns a vector diff --git a/man/across.Rd b/man/across.Rd index 2087b07eb6..1093645a33 100644 --- a/man/across.Rd +++ b/man/across.Rd @@ -248,8 +248,13 @@ iris |> # if_any() and if_all() ---------------------------------------------------- iris |> filter(if_any(ends_with("Width"), ~ . > 4)) +iris |> + filter_out(if_any(ends_with("Width"), ~ . > 4)) + iris |> filter(if_all(ends_with("Width"), ~ . > 2)) +iris |> + filter_out(if_all(ends_with("Width"), ~ . > 2)) } \seealso{ From 443eb14ab722da1af2fbf4deec48b999d92f5e1a Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:59:30 -0500 Subject: [PATCH 08/14] Test alongside `pick()` --- tests/testthat/_snaps/pick.md | 46 +++++++++++++++++++++++++++++++++-- tests/testthat/test-pick.R | 34 ++++++++++++++++++++------ 2 files changed, 71 insertions(+), 9 deletions(-) diff --git a/tests/testthat/_snaps/pick.md b/tests/testthat/_snaps/pick.md index 7ba36b24ee..3af20c600a 100644 --- a/tests/testthat/_snaps/pick.md +++ b/tests/testthat/_snaps/pick.md @@ -185,7 +185,7 @@ Caused by error in `foo()`: ! could not find function "foo" -# `filter()` with `pick()` that uses invalid tidy-selection errors +# `filter()` / `filter_out()` with `pick()` that uses invalid tidy-selection errors Code filter(df, pick(x, a)) @@ -207,7 +207,29 @@ ! Can't select columns that don't exist. x Column `a` doesn't exist. -# `filter()` that doesn't use `pick()` result correctly errors +--- + + Code + filter_out(df, pick(x, a)) + Condition + Error in `filter_out()`: + i In argument: `pick(x, a)`. + Caused by error in `pick()`: + ! Can't select columns that don't exist. + x Column `a` doesn't exist. + +--- + + Code + filter_out(df, pick_wrapper(x, a)) + Condition + Error in `filter_out()`: + i In argument: `pick_wrapper(x, a)`. + Caused by error in `pick()`: + ! Can't select columns that don't exist. 
+ x Column `a` doesn't exist. + +# `filter()` / `filter_out()` that doesn't use `pick()` result correctly errors Code filter(df, pick(x, y)$x) @@ -227,3 +249,23 @@ Caused by error: ! `..1` must be a logical vector, not a double vector. +--- + + Code + filter_out(df, pick(x, y)$x) + Condition + Error in `filter_out()`: + i In argument: `pick(x, y)$x`. + Caused by error: + ! `..1` must be a logical vector, not a double vector. + +--- + + Code + filter_out(df, pick_wrapper(x, y)$x) + Condition + Error in `filter_out()`: + i In argument: `pick_wrapper(x, y)$x`. + Caused by error: + ! `..1` must be a logical vector, not a double vector. + diff --git a/tests/testthat/test-pick.R b/tests/testthat/test-pick.R index de370056cb..19c40312b0 100644 --- a/tests/testthat/test-pick.R +++ b/tests/testthat/test-pick.R @@ -466,19 +466,25 @@ test_that("`pick()` errors in `arrange()` are useful", { }) # ------------------------------------------------------------------------------ -# pick() + filter() +# pick() + filter() / filter_out() -test_that("can `pick()` inside `filter()`", { +test_that("can `pick()` inside `filter()` / `filter_out()`", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) + expect <- df[c(1, 4), ] out <- filter(df, vec_detect_complete(pick(x, y))) - expect_identical(out, df[c(1, 4), ]) - + expect_identical(out, expect) out <- filter(df, vec_detect_complete(pick_wrapper(x, y))) - expect_identical(out, df[c(1, 4), ]) + expect_identical(out, expect) + + expect <- df[c(2, 3), ] + out <- filter_out(df, vec_detect_complete(pick(x, y))) + expect_identical(out, expect) + out <- filter_out(df, vec_detect_complete(pick_wrapper(x, y))) + expect_identical(out, expect) }) -test_that("`filter()` with `pick()` that uses invalid tidy-selection errors", { +test_that("`filter()` / `filter_out()` with `pick()` that uses invalid tidy-selection errors", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) expect_snapshot(error = TRUE, { @@ -487,9 +493,16 @@ 
test_that("`filter()` with `pick()` that uses invalid tidy-selection errors", { expect_snapshot(error = TRUE, { filter(df, pick_wrapper(x, a)) }) + + expect_snapshot(error = TRUE, { + filter_out(df, pick(x, a)) + }) + expect_snapshot(error = TRUE, { + filter_out(df, pick_wrapper(x, a)) + }) }) -test_that("`filter()` that doesn't use `pick()` result correctly errors", { +test_that("`filter()` / `filter_out()` that doesn't use `pick()` result correctly errors", { df <- tibble(x = c(1, 2, NA, 3), y = c(2, NA, 5, 3)) expect_snapshot(error = TRUE, { @@ -498,6 +511,13 @@ test_that("`filter()` that doesn't use `pick()` result correctly errors", { expect_snapshot(error = TRUE, { filter(df, pick_wrapper(x, y)$x) }) + + expect_snapshot(error = TRUE, { + filter_out(df, pick(x, y)$x) + }) + expect_snapshot(error = TRUE, { + filter_out(df, pick_wrapper(x, y)$x) + }) }) # ------------------------------------------------------------------------------ From c79eb828ad407530a5420a40cac0399b8ab27332 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 13:59:38 -0500 Subject: [PATCH 09/14] Test alongside `rowwise()` --- tests/testthat/test-rowwise.R | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/testthat/test-rowwise.R b/tests/testthat/test-rowwise.R index cc3dbb6446..124208bbaf 100644 --- a/tests/testthat/test-rowwise.R +++ b/tests/testthat/test-rowwise.R @@ -9,6 +9,10 @@ test_that("rowwise status preserved by major verbs", { expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") + out <- filter_out(rf, x < 3) + expect_s3_class(out, "rowwise_df") + expect_equal(group_vars(out), "x") + out <- mutate(rf, x = x + 1) expect_s3_class(out, "rowwise_df") expect_equal(group_vars(out), "x") From ce87a10567bb53dd039b85f5a8b6b4eda0b9a12b Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 14:08:06 -0500 Subject: [PATCH 10/14] Use `filter_out()` in a few vignettes --- vignettes/colwise.Rmd | 10 +++++----- vignettes/grouping.Rmd | 6 
+++--- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/vignettes/colwise.Rmd b/vignettes/colwise.Rmd index a9f86176ec..2d4f8a0d64 100644 --- a/vignettes/colwise.Rmd +++ b/vignettes/colwise.Rmd @@ -211,24 +211,24 @@ For some verbs, like `group_by()`, `count()` and `distinct()`, you don't need to `across()` doesn't work with `select()` or `rename()` because they already use tidy select syntax; if you want to transform column names with a function, you can use `rename_with()`. -### filter() +### filter() and filter_out() -We cannot directly use `across()` in `filter()` because we need an extra step to combine -the results. To that end, `filter()` has two special purpose companion functions: +We cannot directly use `across()` in `filter()` or `filter_out()` because we need an extra step to combine +the results into a single logical vector. To that end, `filter()` and `filter_out()` have two special purpose companion functions (the examples below use `filter_out()`, so the rows each one selects are dropped rather than kept): * `if_any()` keeps the rows where the predicate is true for *at least one* selected column: ```{r} starwars |> - filter(if_any(everything(), ~ !is.na(.x))) + filter_out(if_any(everything(), is.na)) ``` * `if_all()` keeps the rows where the predicate is true for *all* selected columns: ```{r} starwars |> - filter(if_all(everything(), ~ !is.na(.x))) + filter_out(if_all(everything(), is.na)) ``` ## `_if`, `_at`, `_all` diff --git a/vignettes/grouping.Rmd b/vignettes/grouping.Rmd index bbd474a4cc..88c4817914 100644 --- a/vignettes/grouping.Rmd +++ b/vignettes/grouping.Rmd @@ -245,11 +245,11 @@ by_species |> filter(height == max(height)) ``` -You can also use `filter()` to remove entire groups. For example, the following code eliminates all groups that only have a single member: +You can also use `filter_out()` to remove entire groups.
For example, the following code eliminates all groups that only have a single member: ```{r filter_group} by_species |> - filter(n() != 1) |> + filter_out(n() == 1) |> tally() ``` @@ -267,6 +267,6 @@ Similarly, we can use `slice_min()` to select the smallest `n` values of a varia ```{r slice_min} by_species |> - filter(!is.na(height)) |> + filter_out(is.na(height)) |> slice_min(height, n = 2) ``` From 4901aa4fd75dc5760a22cd70b97e922432b2face Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 14:09:47 -0500 Subject: [PATCH 11/14] Mark `filter_out()` as experimental --- R/filter.R | 3 ++- man/filter.Rd | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/R/filter.R b/R/filter.R index 95a26da7f1..6b262b6896 100644 --- a/R/filter.R +++ b/R/filter.R @@ -6,7 +6,8 @@ #' #' - `filter()` _keeps_ rows where the conditions evaluate to `TRUE`. #' -#' - `filter_out()` _drops_ rows where the conditions evaluate to `TRUE`. +#' - `r lifecycle::badge("experimental")` `filter_out()` _drops_ rows where the +#' conditions evaluate to `TRUE`. #' #' Multiple conditions can be supplied separated by a comma. These will be #' combined with the `&` operator. diff --git a/man/filter.Rd b/man/filter.Rd index 8bb0886e1b..08a4c509df 100644 --- a/man/filter.Rd +++ b/man/filter.Rd @@ -42,7 +42,8 @@ These functions are used to subset a data frame, applying the expressions in \code{...} to determine which rows should be kept or dropped. \itemize{ \item \code{filter()} \emph{keeps} rows where the conditions evaluate to \code{TRUE}. -\item \code{filter_out()} \emph{drops} rows where the conditions evaluate to \code{TRUE}. +\item \ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#experimental}{\figure{lifecycle-experimental.svg}{options: alt='[Experimental]'}}}{\strong{[Experimental]}} \code{filter_out()} \emph{drops} rows where the +conditions evaluate to \code{TRUE}. } Multiple conditions can be supplied separated by a comma. 
These will be From 2e377e79e4f9b8873edb45d8a03df4b6eca03d32 Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 14:22:40 -0500 Subject: [PATCH 12/14] NEWS bullet --- NEWS.md | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/NEWS.md b/NEWS.md index 06c5ad86b7..9df68ced32 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,23 @@ # dplyr (development version) +* New experimental `filter_out()` companion to `filter()`. + + * Use `filter()` when specifying rows to _keep_. + + * Use `filter_out()` when specifying rows to _drop_. + + `filter_out()` simplifies cases where you would have previously used a `filter()` to drop rows. It is particularly useful when missing values are involved. For example, to drop rows where the `count` is zero: + + ```r + df |> filter(count != 0 | !is.na(count)) + + df |> filter_out(count == 0) + ``` + + With `filter()`, you must provide a "negative" condition of `!= 0` and must explicitly guard against accidentally dropping rows with `NA`. With `filter_out()`, you directly specify rows to drop and you don't have to guard against dropping rows with `NA`, which tends to result in much clearer code. + + This work is a result of [Tidyup 8: Expanding the `filter()` family](https://github.com/tidyverse/tidyups/pull/30), with a lot of great feedback from the community (#6560, #6891). + * The `.groups` message emitted by `summarise()` is hopefully more clear now (#6986). * `if_any()` and `if_all()` are now more consistent in all use cases (#7059, #7077, #7746, @jrwinget). 
In particular: From 3155f84cda29a723269a934a0c5f3a4bfc9405bb Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 14:25:48 -0500 Subject: [PATCH 13/14] Add complement test --- tests/testthat/test-filter.R | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/testthat/test-filter.R b/tests/testthat/test-filter.R index e89ae5894f..3019eee421 100644 --- a/tests/testthat/test-filter.R +++ b/tests/testthat/test-filter.R @@ -780,6 +780,20 @@ test_that("`filter()` doesn't allow data frames with missing or empty names (#67 }) }) +test_that("`filter()` and `filter_out()` are complements", { + df <- tibble( + x = c(TRUE, TRUE, TRUE, FALSE, FALSE, FALSE, NA, NA, NA), + y = c(TRUE, FALSE, NA, TRUE, FALSE, NA, TRUE, FALSE, NA) + ) + + # Important invariant is that these are equivalent up to row ordering + # `union(filter(df, ...), filter_out(df, ...)) ~= df` + expect_identical( + union(filter(df, x, y), filter_out(df, x, y)) |> arrange(x, y), + df |> arrange(x, y) + ) +}) + # .by ------------------------------------------------------------------------- test_that("can group transiently using `.by`", { From 6054744ddd66d0912c2cb82d0bfa6c027e0cff0f Mon Sep 17 00:00:00 2001 From: Davis Vaughan Date: Tue, 25 Nov 2025 14:34:11 -0500 Subject: [PATCH 14/14] Fix NEWS example BECAUSE THIS IS HARD Y'ALL --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 9df68ced32..fd2cb5c6f1 100644 --- a/NEWS.md +++ b/NEWS.md @@ -9,7 +9,7 @@ `filter_out()` simplifies cases where you would have previously used a `filter()` to drop rows. It is particularly useful when missing values are involved. For example, to drop rows where the `count` is zero: ```r - df |> filter(count != 0 | !is.na(count)) + df |> filter(count != 0 | is.na(count)) df |> filter_out(count == 0) ```