-
Notifications
You must be signed in to change notification settings - Fork 11
Enhance joining and grouping #17
Changes from 1 commit
dd68a65
a5fd472
9424201
a652768
0cdf755
d292dd3
53774f5
2adc883
d52c791
5e9664a
e1b4d0e
74c36d1
de09a5c
160be5c
7f28a14
61bf607
6147d0c
bab097f
cdac010
8cf4a67
199f96b
f3b06a3
8308879
1c842dc
637b8cf
49d6328
b6c1f98
839c558
46aaae2
01b3ce8
7b9b8e2
7fe0389
cf0486a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,18 +37,17 @@ Base.convert(::Type{Array}, r::DataTableRow) = convert(Array, r.dt[r.row,:]) | |
|
||
Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r] | ||
|
||
# the equal elements of nullable and normal arrays would have the same hashes | ||
const NULL_MAGIC = 0xBADDEED # what to hash if the element is null | ||
|
||
# hash column element | ||
Base.@propagate_inbounds hash_colel(v::AbstractArray, i, h::UInt = zero(UInt)) = hash(v[i], h) | ||
Base.@propagate_inbounds hash_colel{T<:Nullable}(v::AbstractArray{T}, i, h::UInt = zero(UInt)) = | ||
isnull(v[i]) ? hash(Base.nullablehash_seed, h) : hash(get(v[i]), h) | ||
Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) = | ||
isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h) | ||
isnull(v, i) ? hash(Base.nullablehash_seed, h) : hash(v.values[i], h) | ||
Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) = | ||
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Are you sure this is really more efficient than the default There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I guess it's not as efficient. But in these functions the constraint is to make hashes invariant to the hashed value representation: whether it's nullable or not and whether it's stored "as is" or in a categorical array. Otherwise joins would not work (we may require that joins only use the columns of identical types, but that would result in too much overhead on the user side). So we have to check if the default hash functions have this property ( There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It would be surprising that it would be significantly slower, since the code is very similar. Though since we need the special method for |
||
Base.@propagate_inbounds function hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) | ||
Base.@propagate_inbounds function hash_colel{T}(v::AbstractNullableCategoricalArray{T}, i, h::UInt = zero(UInt)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Need to add a test to distinguish this from the above function, per the now-out-of-date comment.
|
||
ref = v.refs[i] | ||
ref == 0 ? hash(NULL_MAGIC, h) : hash(CategoricalArrays.index(v.pool)[ref], h) | ||
ref == 0 ? hash(Base.nullablehash_seed, h) : hash(CategoricalArrays.index(v.pool)[ref], h) | ||
end | ||
|
||
# hash of DataTable rows based on its values | ||
|
@@ -79,7 +78,7 @@ function @compat(Base.:(==))(r1::DataTableRow, r2::DataTableRow) | |
end | ||
return eq | ||
else | ||
r1.row == r2.row && return Nullable(true) | ||
r1.row == r2.row && return Nullable(true) | ||
eq = Nullable(true) | ||
@inbounds for col in columns(r1.dt) | ||
eq_col = convert(Nullable{Bool}, col[r1.row] == col[r2.row]) | ||
|
@@ -104,13 +103,13 @@ function isequal_colel{T}(col::Union{NullableArray{T}, | |
end | ||
|
||
isequal_colel(a::Any, b::Any) = isequal(a, b) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. These two-argument definitions are not needed AFAICT. The only place where they are called could use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Tests fail after making the changes. They all seem necessary to get around Nullable comparisons:

```julia
julia> using DataTables

julia> DataTables.isequal_colel(Nullable(1), 1)
true

julia> isequal(Nullable(1), 1)
false
```

There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, right, I forgot about the need to unwrap nullables when comparing with non-nullable. That behavior is quite annoying, but well... |
||
isequal_colel(a::Nullable, b::Any) = !isnull(a) && isequal(get(a), b) | ||
isequal_colel(a::Nullable, b::Any) = !isnull(a) & isequal(unsafe_get(a), b) | ||
isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a) | ||
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(get(a), get(b))) | ||
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(a, b)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suppose here it could be just isequal_colel(a::Nullable, b::Nullable) = isequal(a, b) |
||
|
||
# comparison of DataTable rows | ||
function isequal_row(dt::AbstractDataTable, r1::Int, r2::Int) | ||
(r1 == r2) && return true # same raw | ||
(r1 == r2) && return true # same row | ||
@inbounds for col in columns(dt) | ||
isequal_colel(col, r1, r2) || return false | ||
end | ||
|
@@ -120,7 +119,7 @@ end | |
function isequal_row(dt1::AbstractDataTable, r1::Int, dt2::AbstractDataTable, r2::Int) | ||
(dt1 === dt2) && return isequal_row(dt1, r1, r2) | ||
(ncol(dt1) == ncol(dt2)) || | ||
throw(ArgumentError("Rows of the data frames that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))")) | ||
throw(ArgumentError("Rows of the data tables that have different number of columns cannot be compared ($(ncol(dt1)) and $(ncol(dt2)))")) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. "of data tables" |
||
@inbounds for (col1, col2) in zip(columns(dt1), columns(dt2)) | ||
isequal_colel(col1[r1], col2[r2]) || return false | ||
end | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -25,11 +25,11 @@ function hashrows_col!(h::Vector{UInt}, v::AbstractVector) | |
h | ||
end | ||
|
||
function hashrows_col!{T}(h::Vector{UInt}, v::AbstractVector{T}) | ||
function hashrows_col!{T<:Nullable}(h::Vector{UInt}, v::AbstractVector{T}) | ||
@inbounds for i in eachindex(h) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't you call There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. AFAIR, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, |
||
h[i] = isnull(v[i]) ? | ||
hash(NULL_MAGIC, h[i]) : | ||
hash(v[i], h[i]) | ||
hash(Base.nullablehash_seed, h[i]) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Also applies above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I got hash errors after making these changes (and others probably downstream of this):

```
Test Failed
  Expression: dt_rowhashes == [hash(dr) for dr = eachrow(dt)]
  Evaluated: UInt64[0x1bdefb2976bd94c3,0xdd9fa2a42135ac50,0x1e6098864a87ed7d,0x1bdefb2976bd94c3,0xdd9fa2a42135ac50,0x0e42569badde05fc] == UInt64[0x1bdefb2976bd94c3,0xbb2ac60630b5eb56,0x1e6098864a87ed7d,0x1bdefb2976bd94c3,0xbb2ac60630b5eb56,0xd4ac22e6c0fc8065]
```

I think you're referencing this line, but There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I didn't want to imply that that function was called, just that it was consistent with it. I don't think changing this should generate any failures since the hash functions are completely under our control. Have you made the same change to all places where |
||
hash(unsafe_get(v[i]), h[i]) | ||
end | ||
h | ||
end | ||
|
@@ -48,7 +48,7 @@ function hashrows_col!{T}(h::Vector{UInt}, v::AbstractNullableCategoricalVector{ | |
# TODO is it possible to optimize by hashing the pool values once? | ||
@inbounds for (i, ref) in enumerate(v.refs) | ||
h[i] = ref == 0 ? | ||
hash(NULL_MAGIC, h[i]) : | ||
hash(Base.nullablehash_seed, h[i]) : | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same here. |
||
hash(CategoricalArrays.index(v.pool)[ref], h[i]) | ||
end | ||
h | ||
|
@@ -72,7 +72,7 @@ end | |
# the indices of the first row in a group | ||
# Optional group vector is set to the group indices of each row | ||
function row_group_slots(dt::AbstractDataTable, | ||
groups::Union{Vector{Int}, Void} = nothing) | ||
groups::Union{Vector{Int}, Void} = nothing) | ||
@assert groups === nothing || length(groups) == nrow(dt) | ||
rhashes = hashrows(dt) | ||
sz = Base._tablesz(length(rhashes)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can you add a comment explaining why we need this? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function is based on the Dict hash table from the Base. AFAIR, this line should reduce the number of reallocations, since we know the final hash size. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK. So would be good to explain that it's based on the Base code. Isn't there any generic data structure that we could use instead of custom code? Would it make sense to have this in e.g. DataStructures.jl? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @alyst Could you explain what's the general strategy here and how it differs from the current one? Can it be as fast as the current one when grouping on While the benchmark improvements compared with master are great, they don't seem to match the performance I obtained in #12 for the most efficient case. Of course this PR greatly improves performance for other cases, which are quite common, but ideally we would get the best possible performance for each case. See also my question above. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nalimilan I'm not quite sure I understand what you mean by the general strategy. Partly I explained it above. Here we are using hash to group the identical rows. AFAIR, at the time I was writing the original PR DataFrames used a temporary row Ids array and indexed like this
where There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, the old code overflowed quite easily, but Pandas (from which it was inspired) has a solution to compress the integer codes to avoid that. I'm not saying it's necessarily the best solution but they must have considered this question quite carefully. So before adopting a different approach, I'd like to be sure it can be about as fast as Pandas. I guess the only way to find out is to benchmark it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @nalimilan Benchmarking is a nice idea anyway, because the proper benchmark should be based on some big data-derived datasets (millions of rows, >1000-categories arrays, multi-column joins using different data types, etc). There's a long term benefit in having it. I was testing PR850 using somewhat realistic datasets. Unfortunately, ATM I don't have a test system and time resources to help you with the benchmark. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also explain why we can't simply use a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. One of my intermediate implementations was using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. OK, thanks, it's good to have this written somewhere. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 
Can you just add a comment mentioning that this code is inspired by the dict code from Base? |
||
|
@@ -106,7 +106,7 @@ function row_group_slots(dt::AbstractDataTable, | |
end | ||
slotix = slotix & szm1 + 1 # check the next slot | ||
probe += 1 | ||
probe < sz || error("Cannot find free row slot") | ||
@assert probe < sz | ||
end | ||
if groups !== nothing | ||
groups[i] = gix | ||
|
@@ -115,15 +115,15 @@ function row_group_slots(dt::AbstractDataTable, | |
return ngroups, rhashes, gslots | ||
end | ||
|
||
# Builds RowGroupDict for a given dataframe. | ||
# Builds RowGroupDict for a given datatable. | ||
# Partly uses the code of Wes McKinney's groupsort_indexer in pandas (file: src/groupby.pyx). | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should update the name of the file while we remember it. |
||
function group_rows(dt::AbstractDataTable) | ||
groups = Vector{Int}(nrow(dt)) | ||
ngroups, rhashes, gslots = row_group_slots(dt, groups) | ||
|
||
# count elements in each group | ||
stops = zeros(Int, ngroups) | ||
for g_ix in groups | ||
@inbounds for g_ix in groups | ||
stops[g_ix] += 1 | ||
end | ||
|
||
|
@@ -170,20 +170,20 @@ function findrow(gd::RowGroupDict, dt::DataTable, row::Int) | |
return 0 # not found | ||
end | ||
|
||
# Finds indices of rows in 'gd' that match given row by content. | ||
# returns empty set if no row matches | ||
# Find indices of rows in 'gd' that match given row by content. | ||
# return empty set if no row matches | ||
function Base.get(gd::RowGroupDict, dt::DataTable, row::Int) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This function and the next one don't seem to match the signature and behavior of the corresponding Base methods. Better give them a separate name or adapt their signature so that they take a single key/index. |
||
g_row = findrow(gd, dt, row) | ||
(g_row == 0) && return Compat.view(gd.rperm, 0:-1) | ||
(g_row == 0) && return view(gd.rperm, 0:-1) | ||
gix = gd.groups[g_row] | ||
return Compat.view(gd.rperm, gd.starts[gix]:gd.stops[gix]) | ||
return view(gd.rperm, gd.starts[gix]:gd.stops[gix]) | ||
end | ||
|
||
function Base.getindex(gd::RowGroupDict, dtr::DataTableRow) | ||
g_row = findrow(gd, dtr.dt, dtr.row) | ||
(g_row == 0) && throw(KeyError(dtr)) | ||
gix = gd.groups[g_row] | ||
return Compat.view(gd.rperm, gd.starts[gix]:gd.stops[gix]) | ||
return view(gd.rperm, gd.starts[gix]:gd.stops[gix]) | ||
end | ||
|
||
# Check if there is matching row in gd | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Also use `unsafe_get` here.