Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add view to filter, sort, dropmissing, and unique #2386

Merged
merged 19 commits into from
Sep 9, 2020
Merged
Show file tree
Hide file tree
Changes from 11 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 77 additions & 48 deletions src/abstractdataframe/abstractdataframe.jl
Original file line number Diff line number Diff line change
Expand Up @@ -747,15 +747,20 @@ completecases(df::AbstractDataFrame, cols::MultiColumnIndex) =
completecases(df[!, cols])

"""
dropmissing(df::AbstractDataFrame, cols=:; disallowmissing::Bool=true)
dropmissing(df::AbstractDataFrame, cols=:; view::Bool=false, disallowmissing::Bool=!view)

Return a copy of data frame `df` excluding rows with missing values.
Return a data frame excluding rows with missing values in `df`.

If `cols` is provided, only missing values in the corresponding columns are considered.
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).

If `disallowmissing` is `true` (the default) then columns specified in `cols` will
be converted so as not to allow for missing values using [`disallowmissing!`](@ref).
If `view=false` a freshly allocated `DataFrame` is returned.
If `view=true` then a view into `df` is returned. In this case
bkamins marked this conversation as resolved.
Show resolved Hide resolved
`disallowmissing` must be `false`.

If `disallowmissing` is `true` (the default when `view` is `false`)
then columns specified in `cols` will be converted so as not to allow for missing
values using [`disallowmissing!`](@ref).

See also: [`completecases`](@ref) and [`dropmissing!`](@ref).

Expand Down Expand Up @@ -809,12 +814,20 @@ julia> dropmissing(df, [:x, :y])
│ 2 │ 5 │ 1 │ e │
```
"""
function dropmissing(df::AbstractDataFrame,
@inline function dropmissing(df::AbstractDataFrame,
cols::Union{ColumnIndex, MultiColumnIndex}=:;
bkamins marked this conversation as resolved.
Show resolved Hide resolved
disallowmissing::Bool=true)
newdf = df[completecases(df, cols), :]
disallowmissing && disallowmissing!(newdf, cols)
newdf
view::Bool=false, disallowmissing::Bool=!view)
rowidxs = completecases(df, cols)
if view
if disallowmissing
throw(ArgumentError("disallowmissing=true is incompatible with view=true"))
end
return Base.view(df, rowidxs, :)
else
newdf = df[rowidxs, :]
disallowmissing && disallowmissing!(newdf, cols)
return newdf
end
end

"""
Expand Down Expand Up @@ -887,10 +900,10 @@ function dropmissing!(df::AbstractDataFrame,
end

"""
filter(fun, df::AbstractDataFrame)
filter(cols => fun, df::AbstractDataFrame)
filter(fun, df::AbstractDataFrame; view::Bool=false)
filter(cols => fun, df::AbstractDataFrame; view::Bool=false)

Return a copy of data frame `df` containing only rows for which `fun`
Return a data frame containing only rows from `df` for which `fun`
returns `true`.

If `cols` is not specified then the predicate `fun` is passed `DataFrameRow`s.
Expand All @@ -902,6 +915,9 @@ corresponding columns as separate positional arguments, unless `cols` is an
column duplicates are allowed if a vector of `Symbol`s, strings, or integers is
passed.

If `view=false` a freshly allocated `DataFrame` is returned.
If `view=true` then a view into `df` is returned.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

Passing `cols` leads to a more efficient execution of the operation for large data frames.

See also: [`filter!`](@ref)
Expand Down Expand Up @@ -953,38 +969,44 @@ julia> filter(AsTable(:) => nt -> nt.x == 1 || nt.y == "b", df)
│ 3 │ 1 │ b │
```
"""
Base.filter(f, df::AbstractDataFrame) = _filter_helper(df, f, eachrow(df))
Base.filter((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame) =
_filter_helper(df, f, df[!, col])
Base.filter((cols, f)::Pair{<:AbstractVector{Symbol}}, df::AbstractDataFrame) =
filter([index(df)[col] for col in cols] => f, df)
Base.filter((cols, f)::Pair{<:AbstractVector{<:AbstractString}}, df::AbstractDataFrame) =
filter([index(df)[col] for col in cols] => f, df)
Base.filter((cols, f)::Pair, df::AbstractDataFrame) =
filter(index(df)[cols] => f, df)

function Base.filter((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame)
cdf = _columns(df)
return _filter_helper(df, f, (cdf[i] for i in cols)...)
end

function _filter_helper(df::AbstractDataFrame, f, cols...)
# Row-wise filter: the predicate `f` is applied to every element of `eachrow(df)`.
# NOTE(review): `@inline` presumably lets the compiler propagate a constant `view`
# so the return type can be inferred — confirm before removing it.
@inline function Base.filter(f, df::AbstractDataFrame; view::Bool=false)
    return _filter_helper(df, f, eachrow(df); view=view)
end
# Single-column filter: `f` is applied element-wise to the column selected by `col`.
# The column is taken with `df[!, col]`, i.e. without copying.
@inline function Base.filter((col, f)::Pair{<:ColumnIndex}, df::AbstractDataFrame;
                             view::Bool=false)
    column = df[!, col]
    return _filter_helper(df, f, column; view=view)
end
bkamins marked this conversation as resolved.
Show resolved Hide resolved
# Vector-of-`Symbol` selector: translate every name to its integer position and
# forward to the method that handles a vector of integer column indices.
@inline function Base.filter((cols, f)::Pair{<:AbstractVector{Symbol}},
                             df::AbstractDataFrame; view::Bool=false)
    colidxs = [index(df)[name] for name in cols]
    return filter(colidxs => f, df; view=view)
end
# Vector-of-strings selector: translate every name to its integer position and
# forward to the method that handles a vector of integer column indices.
@inline function Base.filter((cols, f)::Pair{<:AbstractVector{<:AbstractString}},
                             df::AbstractDataFrame; view::Bool=false)
    colidxs = [index(df)[name] for name in cols]
    return filter(colidxs => f, df; view=view)
end
# Generic column selector: resolve `cols` through the data frame's index and
# re-dispatch `filter` on the resolved selector.
@inline function Base.filter((cols, f)::Pair, df::AbstractDataFrame; view::Bool=false)
    resolved = index(df)[cols]
    return filter(resolved => f, df; view=view)
end
# Integer-vector selector: each selected column (taken with `df[!, i]`, no copy) is
# passed to the helper as a separate positional argument, in the order given in `cols`.
@inline function Base.filter((cols, f)::Pair{<:AbstractVector{Int}},
                             df::AbstractDataFrame; view::Bool=false)
    return _filter_helper(df, f, (df[!, colidx] for colidx in cols)...; view=view)
end

@inline function _filter_helper(df::AbstractDataFrame, f, cols...; view::Bool)
if length(cols) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
end
return df[((x...) -> f(x...)::Bool).(cols...), :]
rowidxs = ((x...) -> f(x...)::Bool).(cols...)
return view ? Base.view(df, rowidxs, :) : df[rowidxs, :]
end

function Base.filter((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame)
@inline function Base.filter((cols, f)::Pair{<:AsTable}, df::AbstractDataFrame;
view::Bool=false)
df_tmp = select(df, cols.cols, copycols=false)
if ncol(df_tmp) == 0
throw(ArgumentError("At least one column must be passed to filter on"))
end
return _filter_helper_astable(df, Tables.namedtupleiterator(df_tmp), f)
return _filter_helper_astable(df, Tables.namedtupleiterator(df_tmp), f, view=view)
end

_filter_helper_astable(df::AbstractDataFrame, nti::Tables.NamedTupleIterator, f) =
df[(x -> f(x)::Bool).(nti), :]
# Evaluate the predicate `f` on every named tuple produced by `nti` and select the
# rows of `df` for which it returned `true`. The `::Bool` assertion makes a
# non-`Bool` predicate result raise a `TypeError`.
# Returns a view into `df` when `view=true`, otherwise `df[rowmask, :]`.
@inline function _filter_helper_astable(df::AbstractDataFrame,
                                        nti::Tables.NamedTupleIterator, f; view::Bool)
    rowmask = broadcast(row -> f(row)::Bool, nti)
    if view
        return Base.view(df, rowmask, :)
    else
        return df[rowmask, :]
    end
end

"""
filter!(fun, df::AbstractDataFrame)
Expand Down Expand Up @@ -1070,11 +1092,8 @@ Base.filter!((cols, f)::Pair{<:AbstractVector{<:AbstractString}}, df::AbstractDa
filter!([index(df)[col] for col in cols] => f, df)
Base.filter!((cols, f)::Pair, df::AbstractDataFrame) =
filter!(index(df)[cols] => f, df)

function Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame)
cdf = _columns(df)
return _filter!_helper(df, f, (cdf[i] for i in cols)...)
end
Base.filter!((cols, f)::Pair{<:AbstractVector{Int}}, df::AbstractDataFrame) =
_filter!_helper(df, f, (df[!, i] for i in cols)...)

function _filter!_helper(df::AbstractDataFrame, f, cols...)
if length(cols) == 0
Expand Down Expand Up @@ -1175,22 +1194,32 @@ Base.unique!(df::AbstractDataFrame, cols) =
delete!(df, findall(nonunique(df, cols)))

# Unique rows of an AbstractDataFrame.
Base.unique(df::AbstractDataFrame) = df[(!).(nonunique(df)), :]
Base.unique(df::AbstractDataFrame, cols) =
df[(!).(nonunique(df, cols)), :]
# Keep the rows of `df` that `nonunique` does not flag as duplicates.
# Returns a view into `df` when `view=true`, otherwise `df[keep, :]`.
# `Base.view` is qualified because the keyword argument shadows the function name.
@inline function Base.unique(df::AbstractDataFrame; view::Bool=false)
    keep = .!nonunique(df)
    if view
        return Base.view(df, keep, :)
    else
        return df[keep, :]
    end
end

# Keep the rows of `df` that `nonunique(df, cols)` does not flag as duplicates.
# Returns a view into `df` when `view=true`, otherwise `df[keep, :]`.
# `Base.view` is qualified because the keyword argument shadows the function name.
@inline function Base.unique(df::AbstractDataFrame, cols; view::Bool=false)
    keep = .!nonunique(df, cols)
    if view
        return Base.view(df, keep, :)
    else
        return df[keep, :]
    end
end

"""
unique(df::AbstractDataFrame)
unique(df::AbstractDataFrame, cols)
unique(df::AbstractDataFrame; view::Bool=false)
unique(df::AbstractDataFrame, cols; view::Bool=false)
unique!(df::AbstractDataFrame)
unique!(df::AbstractDataFrame, cols)

Delete duplicate rows of data frame `df`, keeping only the first occurrence of unique rows.
When `cols` is specified, the returned `DataFrame` contains complete rows,
retaining in each case the first instance for which `df[cols]` is unique.
`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).
Return a data frame containing only the first occurrence of unique rows in `df`.
When `cols` is specified, the returned data frame contains
complete rows, retaining in each case the first instance for which `df[cols]` is
unique. `cols` can be any column selector ($COLUMNINDEX_STR;
$MULTICOLUMNINDEX_STR).

For `unique`, if `view=false` a freshly allocated `DataFrame` is returned,
and if `view=true` then a view into `df` is returned.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

When `unique` is called a new data frame is returned; `unique!` updates `df` in-place.
`unique!` updates `df` in-place and does not support the `view` keyword argument.

See also [`nonunique`](@ref).

Expand Down
58 changes: 28 additions & 30 deletions src/abstractdataframe/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -332,34 +332,12 @@ function Base.issorted(df::AbstractDataFrame, cols=[];
end
end

# sort and sortperm functions

for s in [:(Base.sort), :(Base.sortperm)]
@eval begin
function $s(df::AbstractDataFrame, cols=[];
alg=nothing, lt=isless, by=identity, rev=false, order=Forward)
if !(isa(by, Function) || eltype(by) <: Function)
msg = "'by' must be a Function or a vector of Functions. " *
" Perhaps you wanted 'cols'."
throw(ArgumentError(msg))
end
# exclude AbstractVector as in that case cols can contain order(...) clauses
if cols isa MultiColumnIndex && !(cols isa AbstractVector)
cols = index(df)[cols]
end
ord = ordering(df, cols, lt, by, rev, order)
_alg = Sort.defalg(df, ord; alg=alg, cols=cols)
return $s(df, _alg, ord)
end
end
end

"""
sort(df::AbstractDataFrame, cols;
alg::Union{Algorithm, Nothing}=nothing, lt=isless, by=identity,
rev::Bool=false, order::Ordering=Forward)
rev::Bool=false, order::Ordering=Forward, view::Bool=false)

Return a copy of data frame `df` sorted by column(s) `cols`.
Return a data frame containing the rows in `df` sorted by column(s) `cols`.

`cols` can be any column selector ($COLUMNINDEX_STR; $MULTICOLUMNINDEX_STR).

Expand All @@ -369,6 +347,10 @@ on the type of the sorting columns and on the number of rows in `df`.
If `rev` is `true`, reverse sorting is performed. To enable reverse sorting
only for some columns, pass `order(c, rev=true)` in `cols`, with `c` the
corresponding column index (see example below).

If `view=false` a freshly allocated `DataFrame` is returned.
If `view=true` then a view into `df` is returned.
bkamins marked this conversation as resolved.
Show resolved Hide resolved

See [`sort!`](@ref) for a description of other keyword arguments.

# Examples
Expand Down Expand Up @@ -424,7 +406,11 @@ julia> sort(df, [:x, order(:y, rev=true)])
│ 4 │ 3 │ b │
```
"""
sort(::AbstractDataFrame, ::Any)
# Compute the sorting permutation with `sortperm` (all sorting keyword arguments are
# forwarded unchanged) and use it to select rows: a view into `df` when `view=true`,
# otherwise `df[perm, :]`.
@inline function Base.sort(df::AbstractDataFrame, cols=[]; alg=nothing, lt=isless,
                           by=identity, rev=false, order=Forward, view::Bool=false)
    perm = sortperm(df, cols; alg=alg, lt=lt, by=by, rev=rev, order=order)
    if view
        return Base.view(df, perm, :)
    else
        return df[perm, :]
    end
end

"""
sortperm(df::AbstractDataFrame, cols;
Expand Down Expand Up @@ -485,11 +471,23 @@ julia> sortperm(df, (:x, :y), rev=true)
1
```
"""
sortperm(::AbstractDataFrame, ::Any)
function Base.sortperm(df::AbstractDataFrame, cols=[];
                       alg=nothing, lt=isless, by=identity, rev=false, order=Forward)
    # `by` has to be a function or a collection whose element type is a function;
    # anything else most likely means the sort columns were passed in the wrong slot.
    if !(by isa Function) && !(eltype(by) <: Function)
        throw(ArgumentError("'by' must be a Function or a vector of Functions. " *
                            " Perhaps you wanted 'cols'."))
    end
    # Resolve multi-column selectors to column indices, except for vectors:
    # a vector may contain order(...) clauses and must be passed on untouched.
    if !(cols isa AbstractVector) && cols isa MultiColumnIndex
        cols = index(df)[cols]
    end
    rowordering = ordering(df, cols, lt, by, rev, order)
    algorithm = Sort.defalg(df, rowordering; alg=alg, cols=cols)
    return _sortperm(df, algorithm, rowordering)
end

Base.sort(df::AbstractDataFrame, a::Algorithm, o::Ordering) =
df[sortperm(df, a, o),:]
Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Union{Perm,DFPerm}) =
# Build the vector of row numbers 1, 2, ..., size(df, 1) and sort it in place
# using algorithm `a` and ordering `o`; the sorted vector is the row permutation.
function _sortperm(df::AbstractDataFrame, a::Algorithm, o::Union{Perm,DFPerm})
    rownumbers = collect(1:size(df, 1))
    return sort!(rownumbers, a, o)
end
Base.sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) =
# Fallback for a generic `Ordering`: wrap it in a `DFPerm` bound to `df` and delegate
# to the `_sortperm` method that accepts `Perm`/`DFPerm` orderings.
# The delegation must target `_sortperm`, not `sortperm`: the three-positional-argument
# `sortperm(df, alg, ordering)` methods were renamed to `_sortperm`, so calling
# `sortperm` here would no longer reach them.
_sortperm(df::AbstractDataFrame, a::Algorithm, o::Ordering) =
    _sortperm(df, a, DFPerm(o, df))
2 changes: 1 addition & 1 deletion src/dataframe/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ function Base.sort!(df::DataFrame, cols=[]; alg=nothing,
end

function Base.sort!(df::DataFrame, a::Base.Sort.Algorithm, o::Base.Sort.Ordering)
p = sortperm(df, a, o)
p = _sortperm(df, a, o)
pp = similar(p)
c = _columns(df)

Expand Down
36 changes: 36 additions & 0 deletions test/data.jl
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,18 @@ end
@test eltype(dropmissing!(df).b) == Int
end

@testset "dropmissing and unique view kwarg test" begin
df = DataFrame(rand(3,4))
for fun in (dropmissing, unique)
@test fun(df) isa DataFrame
@test fun(view(df, 1:2, 1:2)) isa DataFrame
@test fun(df, view=false) isa DataFrame
@test fun(view(df, 1:2, 1:2), view=false) isa DataFrame
@test fun(df, view=true) isa SubDataFrame
@test fun(view(df, 1:2, 1:2), view=true) isa SubDataFrame
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also test returned value? Or is that covered elsewhere?

It would also be nice to use `@inferred` for the calls where the `view` keyword isn't specified, to prevent any regression: it would be easy to remove one of the `@inline` annotations without realizing why they are here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added the test and @inferred (I thought it would fail on Julia 1.0, but it passes - which is good)

end
end

@testset "merge" begin
Random.seed!(1)
df1 = DataFrame(a = shuffle!(Vector{Union{Int, Missing}}(1:10)),
Expand Down Expand Up @@ -389,6 +401,30 @@ end
@test_throws TypeError filter!((:) => (r...) -> r[1] > 1, df)
end

# Checks, for every supported kind of column selector, that `filter` returns a
# `DataFrame` by default and with `view=false`, and a `SubDataFrame` with `view=true`.
# Besides the return type, the returned rows are compared with the default result,
# and the view is checked to be backed by the original data frame.
@testset "filter view kwarg test" begin
    df = DataFrame(rand(3,4))
    # integer selectors (`1` and `[1]`) are included so that the
    # `Pair{<:AbstractVector{Int}}` method is exercised directly
    for fun in (row -> row.x1 > 0, :x1 => >(0), "x1" => >(0), 1 => >(0),
                [:x1] => >(0), ["x1"] => >(0), [1] => >(0),
                r"1" => >(0), AsTable(:) => x -> x.x1 > 0)
        for src in (df, view(df, 1:2, 1:2))
            expected = filter(fun, src)
            @test expected isa DataFrame

            copied = filter(fun, src, view=false)
            @test copied isa DataFrame
            @test copied == expected

            viewed = filter(fun, src, view=true)
            @test viewed isa SubDataFrame
            @test viewed == expected
            # a view of a view must still point at the original parent
            @test parent(viewed) === df
        end
    end
end

# `filter` on a `SubDataFrame` must work both in copying and in view mode (the view
# sharing the parent of the source view), while `filter!` must be rejected.
@testset "filter and filter! with SubDataFrame" begin
    source = DataFrame(x = [0, 0, 3, 1, 3, 1], y = 1:6)
    dfv = view(source, 3:6, 1:1)
    expected = DataFrame(x = [3, 3])

    @test filter(:x => x -> x > 2, dfv) == expected

    filtered_view = filter(:x => x -> x > 2, dfv, view=true)
    @test filtered_view == expected
    @test parent(filtered_view) === parent(dfv)

    @test_throws ArgumentError filter!(:x => x -> x > 2, dfv)
end

@testset "filter and filter! with AsTable" begin
df = DataFrame(x = [3, 1, 2, 1], y = ["b", "c", "a", "b"])

Expand Down
10 changes: 10 additions & 0 deletions test/sort.jl
Original file line number Diff line number Diff line change
Expand Up @@ -147,4 +147,14 @@ end
end
end

# Checks that `sort` returns a `DataFrame` by default and with `view=false`, and a
# `SubDataFrame` with `view=true`, for both a `DataFrame` and a `SubDataFrame` source.
# Besides the return type, the returned rows are compared with the default result,
# and the view is checked to be sorted and backed by the original data frame.
@testset "view kwarg test" begin
    df = DataFrame(rand(3,4))
    for src in (df, view(df, 1:2, 1:2))
        expected = sort(src)
        @test expected isa DataFrame

        copied = sort(src, view=false)
        @test copied isa DataFrame
        @test copied == expected

        viewed = sort(src, view=true)
        @test viewed isa SubDataFrame
        @test viewed == expected
        @test issorted(viewed)
        # a view of a view must still point at the original parent
        @test parent(viewed) === df
    end
end

end # module