This repository has been archived by the owner on May 5, 2019. It is now read-only.

Enhance joining and grouping #17

Merged
merged 33 commits into from
Mar 6, 2017

Conversation

cjprybol
Contributor

The implementation @alyst wrote for JuliaData/DataFrames.jl#850 is very well done. It's clearly more performant than the current master and what I wrote for #3. I merged JuliaData/DataFrames.jl#850 into DataTables, but if I could get some keen eyes, particularly @alyst's, to make sure I didn't make any meaningful errors while fixing merge conflicts, that would be appreciated. I merged benchmarks from JuliaData/DataFrames.jl#850, #3, and #12 to generate this list of benchmarks.

benchmarks

using DataTables
using BenchmarkTools
using Distributions

srand(1);
a = rand([1, 2, Nullable(), 4], 20);
b = rand([:a, :b], 20);
c = rand(["a", "b"], 20);
d = rand(rand(2), 20);
small = DataTable(A = a, B = b, C = c, D = d);
A = repeat(rand(8891), inner=2);
B = repeat(rand(8891), inner=2);
C = repeat(rand(8891), inner=2);
D = repeat(rand(8891), inner=2);
E = repeat(rand(8891), inner=2);
F = repeat(rand(17782));
large = DataTable(A = A, B = B, C = C, D = D, E = E, F = F);
small_ordered = DataTable(v1 = categorical(collect(1:1000)), v2 = categorical(fill(1, 1000)));
small_unordered = DataTable(v1 = categorical(rand(1000)), v2 = categorical(rand(1000)));
large_ordered = DataTable(v1 = categorical(collect(1:100000)), v2 = categorical(fill(1, 100000)));
large_unordered = DataTable(v1 = categorical(rand(100000)), v2 = categorical(rand(100000)));
wide_ordered = DataTable(v1 = categorical(collect(1:100000)),
                         v2 = categorical(fill(1, 100000)),
                         v3 = categorical(collect(1:100000)),
                         v4 = categorical(collect(1:100000)));

dt1 = DataTable(v1 = CategoricalArray(repeat(1:10, inner=1000)), v2 = CategoricalArray(repeat(1:10, outer=1000)));
dt2 = DataTable(v1 = CategoricalArray(repeat(1:100, inner=100)), v2 = CategoricalArray(repeat(1:100, outer=100)));
dt3 = hcat(dt1, dt2);

function random_frame(nrow::Int, col_values::Dict{Symbol, Any})
  DataTable(Any[isa(col_values[key], CategoricalArray) ?
                sample(col_values[key], nrow) :
                sample(col_values[key], nrow) for key in keys(col_values)],
            keys(col_values) |> collect)
end

function random_join(kind::Symbol, nrow_left::Int, nrow_right::Int,
                     on_col_values::Dict{Symbol, Any},
                     left_col_values::Dict{Symbol, Any},
                     right_col_values::Dict{Symbol, Any})
  dtl = random_frame(nrow_left, merge(on_col_values, left_col_values))
  dtr = random_frame(nrow_right, merge(on_col_values, right_col_values))
  join(dtl, dtr, on = keys(on_col_values) |> collect, kind = kind)
end

function f(n::Int)
    for i in 1:n
        r = random_join(:outer, 1000, 2000,
                        Dict{Symbol,Any}(:A => 1:10, :B => [:A, :B, :C, :D],
                                         :C => 1:10, :D => 1:10),
                        Dict{Symbol,Any}(:E => 1:10, :F => [:A, :B, :C, :D]),
                        Dict{Symbol,Any}(:G => 1:10, :H => [:A, :B, :C, :D]))
    end
end
f(1)

function h(n::Int)
    for i in 1:n
        r = random_join(:outer, 10000, 20000,
                        Dict{Symbol,Any}(:A => collect(1:10000)),
                        Dict{Symbol,Any}(:B => collect(1:10000)),
                        Dict{Symbol,Any}(:C => collect(1:10000)))
    end
end

h(1)

@benchmark groupby(small, [:A, :B])
@benchmark groupby(large, [:A, :B])
@benchmark groupby(large, [:A, :B, :C, :D, :E])
@benchmark groupby(small_ordered, [:v1, :v2])
@benchmark groupby(small_unordered, [:v1, :v2])
@benchmark groupby(large_ordered, [:v1, :v2])
@benchmark groupby(large_unordered, [:v1, :v2])
@benchmark groupby(wide_ordered, [:v1, :v2, :v3, :v4])

@benchmark groupby(dt1, [:v1, :v2])
@benchmark groupby(dt2, [:v1, :v2])
@benchmark groupby(dt3, [:v1, :v2, :v1_1, :v2_1])

@time f(100)
@benchmark f(100)
@time h(100)
@benchmark h(100)

results

julia> @benchmark groupby(small, [:A, :B])
  #master
  memory estimate:  11.95 KiB
  allocs estimate:  185
  median time:      51.035 μs (0.00% GC)
  #3
  memory estimate:  15.91 KiB
  allocs estimate:  516
  median time:      21.434 μs (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  6.55 KiB
  allocs estimate:  130
  median time:      4.213 μs (0.00% GC)


julia> @benchmark groupby(large, [:A, :B])
  #master
  memory estimate:  1.26 GiB
  allocs estimate:  388362
  median time:      1.418 s (21.80% GC)
  #3
  memory estimate:  18.87 MiB
  allocs estimate:  815974
  median time:      32.828 ms (14.98% GC)
  #this pr/DataFrames:850
  memory estimate:  1.39 MiB
  allocs estimate:  34592
  median time:      1.648 ms (0.00% GC)

julia> @benchmark groupby(large, [:A, :B, :C, :D, :E])
  #master
  ERROR: InexactError()
  #3
  memory estimate:  39.59 MiB
  allocs estimate:  1834418
  median time:      63.197 ms (16.02% GC)
  #this pr/DataFrames:850
  memory estimate:  2.18 MiB
  allocs estimate:  86411
  median time:      3.514 ms (0.00% GC)

julia> @benchmark groupby(small_ordered, [:v1, :v2])
  #master
  memory estimate:  329.38 KiB
  allocs estimate:  9412
  median time:      2.076 ms (0.00% GC)
  #3
  memory estimate:  923.17 KiB
  allocs estimate:  34720
  median time:      1.237 ms (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  57.98 KiB
  allocs estimate:  43
  median time:      51.100 μs (0.00% GC)

julia> @benchmark groupby(small_unordered, [:v1, :v2])
  #master
  memory estimate:  16.72 MiB
  allocs estimate:  15905
  median time:      14.618 ms (0.00% GC)
  #3
  memory estimate:  1018.45 KiB
  allocs estimate:  38912
  median time:      1.526 ms (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  57.98 KiB
  allocs estimate:  43
  median time:      56.681 μs (0.00% GC)

julia> @benchmark groupby(large_ordered, [:v1, :v2])
  #master
  memory estimate:  44.40 MiB
  allocs estimate:  1890360
  median time:      216.415 ms (0.00% GC)
  #3
  memory estimate:  104.09 MiB
  allocs estimate:  4357825
  median time:      313.996 ms (24.12% GC)
  #this pr/DataFrames:850
  memory estimate:  5.58 MiB
  allocs estimate:  50
  median time:      4.952 ms (0.00% GC)

julia> @benchmark groupby(large_unordered, [:v1, :v2])
  #master
  ERROR: InexactError()
  #3
  memory estimate:  104.21 MiB
  allocs estimate:  4362897
  median time:      312.446 ms (22.47% GC)
  #this pr/DataFrames:850
  memory estimate:  5.58 MiB
  allocs estimate:  50
  median time:      5.192 ms (0.00% GC)

julia> @benchmark groupby(wide_ordered, [:v1, :v2, :v3, :v4])
  #master
  ERROR: InexactError()
  #3
  memory estimate:  175.00 MiB
  allocs estimate:  7968344
  median time:      484.698 ms (22.13% GC)
  #this pr/DataFrames:850
  memory estimate:  5.58 MiB
  allocs estimate:  54
  median time:      5.723 ms (0.00% GC)

julia> @benchmark groupby(dt1, [:v1, :v2])
  #master
  memory estimate:  1.22 MiB
  allocs estimate:  66536
  median time:      3.547 ms (0.00% GC)
  #3
  memory estimate:  10.68 MiB
  allocs estimate:  476970
  median time:      19.039 ms (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  3.33 MiB
  allocs estimate:  115865
  median time:      2.119 ms (0.00% GC)

julia> @benchmark groupby(dt2, [:v1, :v2])
  #master
  memory estimate:  1.99 MiB
  allocs estimate:  85972
  median time:      4.067 ms (0.00% GC)
  #3
  memory estimate:  10.06 MiB
  allocs estimate:  409364
  median time:      18.255 ms (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  599.53 KiB
  allocs estimate:  50
  median time:      332.527 μs (0.00% GC)

julia> @benchmark groupby(dt3, [:v1, :v2, :v1_1, :v2_1])
  #master
  memory estimate:  20.45 MiB
  allocs estimate:  238751
  median time:      18.464 ms (13.56% GC)
  #3
  memory estimate:  16.74 MiB
  allocs estimate:  744976
  median time:      32.042 ms (15.40% GC)
  #this pr/DataFrames:850
  memory estimate:  599.70 KiB
  allocs estimate:  54
  median time:      416.803 μs (0.00% GC)

julia> @time f(100)
  #master
  0.519709 seconds (5.89 M allocations: 268.796 MB, 8.70% gc time)
  #3
  0.523668 seconds (5.89 M allocations: 268.796 MB, 7.42% gc time)
  #this pr/DataFrames:850
  0.487742 seconds (5.89 M allocations: 268.796 MB, 7.94% gc time)

julia> @benchmark f(100)
  #master
  memory estimate:  268.61 MiB
  allocs estimate:  5888594
  median time:      483.714 ms (7.89% GC)
  #3
  memory estimate:  268.61 MiB
  allocs estimate:  5888594
  median time:      524.871 ms (8.02% GC)
  #this pr/DataFrames:850
  memory estimate:  268.61 MiB
  allocs estimate:  5888594
  median time:      508.119 ms (7.48% GC)

julia> @time h(100)
  #master
  1.288805 seconds (9.32 M allocations: 733.918 MB, 16.57% gc time)
  #3
  1.230394 seconds (9.31 M allocations: 733.837 MB, 8.34% gc time)
  #this pr/DataFrames:850
  1.220218 seconds (9.31 M allocations: 733.837 MB, 9.96% gc time)

julia> @benchmark h(100)
  #master
  memory estimate:  733.79 MiB
  allocs estimate:  9312901
  median time:      1.277 s (9.13% GC)
  #3
  memory estimate:  733.81 MiB
  allocs estimate:  9312588
  median time:      1.275 s (8.45% GC)
  #this pr/DataFrames:850
  memory estimate:  733.79 MiB
  allocs estimate:  9312588
  median time:      1.186 s (9.87% GC)

@cjprybol changed the title from "https://github.com/JuliaStats/DataFrames.jl/pull/850 updated for DataFrames" to "https://github.com/JuliaStats/DataFrames.jl/pull/850 updated for DataTables" on Feb 21, 2017
@cjprybol
Contributor Author

and sorry for hijacking your pull request from DataFrames @alyst. Feel free to fork this and resubmit the pull request under your account if you'd like!

@alyst
Contributor

alyst commented Feb 22, 2017

@cjprybol Great, thanks a lot for transferring the code to DataTables and for the benchmarks! I've looked at it briefly and it LGTM. Do you also plan to cherry pick the other uses of row grouping, i.e. joins (which was my original motivation) and nonunique()?

@cjprybol
Contributor Author

Yes, thanks for mentioning those (joins and nonunique()); let's not leave anything behind! nonunique() was easy to add, but I couldn't get the joins to work:

julia> dt1 = DataTable(a = shuffle!([1:10;]),
                       b = [:A,:B][rand(1:2, 10)],
                       v1 = randn(10))
10×3 DataTables.DataTable
│ Row │ a  │ b  │ v1         │
├─────┼────┼────┼────────────┤
│ 1   │ 9  │ :A │ -1.72976   │
│ 2   │ 5  │ :A │ 0.795949   │
│ 3   │ 1  │ :A │ 0.670062   │
│ 4   │ 7  │ :B │ 0.550852   │
│ 5   │ 4  │ :A │ -0.0633746 │
│ 6   │ 10 │ :B │ 1.33694    │
│ 7   │ 6  │ :B │ -0.0731486 │
│ 8   │ 2  │ :B │ -0.745464  │
│ 9   │ 8  │ :A │ -1.22006   │
│ 10  │ 3  │ :B │ -0.0531773 │

julia> dt2 = DataTable(a = shuffle!(reverse([1:5;])),
                       b2 = [:A,:B,:C][rand(1:3, 5)],
                       v2 = randn(5))
5×3 DataTables.DataTable
│ Row │ a │ b2 │ v2        │
├─────┼───┼────┼───────────┤
│ 1   │ 1 │ :A │ 0.0704676 │
│ 2   │ 5 │ :A │ 0.341794  │
│ 3   │ 2 │ :C │ 1.73517   │
│ 4   │ 3 │ :A │ 1.29992   │
│ 5   │ 4 │ :B │ 0.206364  │

julia> m1 = join(dt1, dt2, on = :a)
0×5 DataTables.DataTable

I merged your join.jl as seen here. Any idea what I'm missing?
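For reference, an inner join on :a here should produce five rows, since each of dt2's keys 1–5 appears exactly once in dt1. A minimal Base-Julia sketch of the expected key-matching logic (no DataTables dependency; inner_join_indices is a hypothetical helper for illustration):

```julia
# Build a lookup for the right-hand keys, then scan the left-hand keys;
# returns (left index, right index) pairs for every matching key value.
function inner_join_indices(al::Vector{Int}, ar::Vector{Int})
    right = Dict{Int,Vector{Int}}()
    for (j, k) in enumerate(ar)
        push!(get!(right, k, Int[]), j)
    end
    pairs = Tuple{Int,Int}[]
    for (i, k) in enumerate(al)
        for j in get(right, k, Int[])
            push!(pairs, (i, j))
        end
    end
    return pairs
end

# Keys from the tables above: dt1.a is a shuffle of 1:10, dt2.a a shuffle of 1:5.
@assert length(inner_join_indices([9,5,1,7,4,10,6,2,8,3], [1,5,2,3,4])) == 5
```

An empty result like m1 above means the key columns hashed or compared as unequal even though they print identically.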

@alyst
Contributor

alyst commented Feb 22, 2017

@cjprybol Strange. Could you please give me the link to the Git branch of DataTables with joins merged so that I can try debugging it?

@nalimilan (Member) left a comment

Thank you both, that sounds really cool. I have reviewed what I could, without really diving into the details of the algorithm.

The benchmarks are really good; I wonder how they compare to e.g. Pandas. There's just one case where the number and amount of allocations increased, though, and it would be good to check that it's not due to an easy-to-fix mistake:

julia> @benchmark groupby(dt1, [:v1, :v2])
  #master
  memory estimate:  1.22 MiB
  allocs estimate:  66536
  median time:      3.547 ms (0.00% GC)
  #3
  memory estimate:  10.68 MiB
  allocs estimate:  476970
  median time:      19.039 ms (0.00% GC)
  #this pr/DataFrames:850
  memory estimate:  3.33 MiB
  allocs estimate:  115865
  median time:      2.119 ms (0.00% GC)

isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h)
Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) =
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h)
Base.@propagate_inbounds function hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt))
Member

AbstractCategoricalArray should be AbstractNullableCategoricalArray AFAICT. That also means a test is missing to catch this.

Base.@propagate_inbounds hash_colel{T}(v::NullableArray{T}, i, h::UInt = zero(UInt)) =
isnull(v, i) ? hash(NULL_MAGIC, h) : hash(get(v[i]), h)
Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) =
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h)
Member

Are you sure this is really more efficient than the default hash method for CategoricalValue?

@alyst (Contributor) commented Feb 22, 2017

I guess it's not as efficient. But in these functions the constraint is to make hashes invariant to the hashed value's representation: whether it's nullable or not, and whether it's stored "as is" or in a categorical array. Otherwise joins would not work (we could require that joins only use columns of identical types, but that would put too much overhead on the user side). So we have to check whether the default hash functions have this property (Nullable, AFAIR, does not).
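The invariance constraint described here can be sketched in a few lines. This uses modern Julia's missing in place of the Nullable of that era; hash_cell and NULL_SEED are hypothetical names, with the seed value taken from base/nullable.jl as quoted elsewhere in the thread:

```julia
# One hash method per representation, all agreeing on equal values:
const NULL_SEED = UInt === UInt64 ? 0x932e0143e51d0171 : 0xe51d0171

hash_cell(x, h::UInt = zero(UInt)) = hash(x, h)
hash_cell(::Missing, h::UInt = zero(UInt)) = hash(NULL_SEED, h)

# A value hashes the same whether or not its column allows nulls,
# so rows from a plain column can match rows from a nullable column:
plain    = [1, 2, 3]
nullable = Union{Int,Missing}[1, 2, missing]
@assert hash_cell(plain[2]) == hash_cell(nullable[2])
```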

@nalimilan (Member) commented Feb 22, 2017

It would be surprising if it were significantly slower, since the code is very similar. Though since we need the special method for NullableCategoricalArray to avoid the cost of creating a Nullable just to unwrap it, I guess it doesn't matter too much what we do here.

Base.sortperm(dt::AbstractDataTable, a::Algorithm, o::Ordering) = sortperm(dt, a, DTPerm(o,dt))

# Extras to speed up sorting
#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ForwardOrdering,V}) = sortperm(o.vec)
Member

Why is this commented out?

Contributor

I don't recall now. Maybe something to do with methods ambiguity, but worth rechecking now.

Contributor Author

FastPerm is part of DataArrays so these (this and below) aren't applicable. I'll remove them

Contributor Author

@nalimilan see here. I'll add these back

Member

Ah, funny how you can forget about issues. I'll try to add these.

#Base.sortperm{V}(dt::AbstractDataTable, a::Algorithm, o::FastPerm{Sort.ReverseOrdering,V}) = reverse(sortperm(o.vec))

# permute rows
function Base.permute!(dt::AbstractDataTable, p::AbstractVector)
Member

Could you add a docstring to explain that permutation is applied to rows?

Base.collect(r::DataTableRow) = Tuple{Symbol, Any}[x for x in r]

# the equal elements of nullable and normal arrays would have the same hashes
const NULL_MAGIC = 0xBADDEED # what to hash if the element is null
Member

There's already a hash for null values in base/nullable.jl:

const nullablehash_seed = UInt === UInt64 ? 0x932e0143e51d0171 : 0xe51d0171

Contributor

Ah, I missed that. Then we should use nullablehash_seed instead of NULL_MAGIC.

(gd.dt === dt) && return row # same frame, return itself
# different frames, content matching required
rhash = rowhash(dt, row)
szm1 = length(gd.gslots)-1
Member

Again move inside the loop.

return 0 # not found
end

# Finds indices of rows in 'gd' that match given row by content.
Member

Finds -> Find
returns -> return

# returns empty set if no row matches
function Base.get(gd::RowGroupDict, dt::DataTable, row::Int)
g_row = findrow(gd, dt, row)
(g_row == 0) && return Compat.view(gd.rperm, 0:-1)
Member

No need for Compat now (same below).

dt_groups = group_rows(sdt)
# sort the groups
if sort
group_perm = sortperm(sub(sdt, dt_groups.rperm[dt_groups.starts]))
Member

sub -> view

if sort
group_perm = sortperm(sub(sdt, dt_groups.rperm[dt_groups.starts]))
permute!(dt_groups.starts, group_perm)
permute!(dt_groups.stops, group_perm)
Member

The second call could be Base.permute!! since the second argument isn't used after that?
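For reference, Base.permute!! has the same effect as permute! but is allowed to destroy the permutation vector, saving an internal copy; a quick sketch with plain vectors:

```julia
v = [10, 20, 30]
p = [3, 1, 2]
Base.permute!!(v, p)      # p holds garbage afterwards; v is rearranged in place
@assert v == [30, 10, 20]
```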

@cjprybol
Contributor Author

cjprybol commented Feb 22, 2017

I just pushed the join.jl changes to this branch (available here https://github.com/cjprybol/DataTables.jl/tree/cjp/alyst-groupby)

EDIT: I added @alyst as a collaborator to my DataTables fork, so I think everyone has the ability to write to this pull request.

isequal_colel(a::Any, b::Nullable) = isequal_colel(b, a)
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(get(a), get(b)))
isequal_colel(a::Nullable, b::Nullable) = isnull(a)==isnull(b) && (isnull(a) || isequal(a, b))
Contributor

I suppose here it could be just

isequal_colel(a::Nullable, b::Nullable) = isequal(a, b)
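This simplification works because isequal on nullables already compares null-ness first and values second. The same contract holds for missing in modern Julia, which a couple of assertions illustrate (isequal_cell is a hypothetical stand-in):

```julia
# Delegating straight to isequal gives the null-aware semantics for free:
isequal_cell(a, b) = isequal(a, b)

@assert isequal_cell(missing, missing)   # two nulls compare equal
@assert !isequal_cell(missing, 1)        # null vs. value: unequal
@assert isequal_cell(1, 1)               # plain values compare by isequal
```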

isnull(v, i) ? hash(Base.nullablehash_seed, h) : hash(v.values[i], h)
Base.@propagate_inbounds hash_colel{T}(v::AbstractCategoricalArray{T}, i, h::UInt = zero(UInt)) =
hash(CategoricalArrays.index(v.pool)[v.refs[i]], h)
Base.@propagate_inbounds function hash_colel{T}(v::AbstractNullableCategoricalArray{T}, i, h::UInt = zero(UInt))
Contributor Author

Need to add a test to distinguish this from the function above, per this now out-of-date comment:

AbstractCategoricalArray should be AbstractNullableCategoricalArray AFAICT. That also means a test is missing to catch this.

@assert length(left_ixs) == length(right_ixs)
# compose left half of the result taking all left columns
# FIXME is it still relevant? complicated way to do vcat that avoids expensive setindex!() for PooledDataVector
all_orig_left_ixs = [left_ixs.orig; leftonly_ixs.orig]
Contributor Author

Do we want to change this to vcat, or keep it and remove the # FIXME? These might not be the best tests, but I'm not seeing any noticeable performance difference, so maybe this consideration doesn't apply to the CategoricalArrays that replaced PooledDataVector?

x = rand(1000); y = rand(100)
julia> @benchmark vcat(x, y)
BenchmarkTools.Trial:
  memory estimate:  8.75 KiB
  allocs estimate:  1
  --------------
  minimum time:     587.663 ns (0.00% GC)
  median time:      1.007 μs (0.00% GC)
  mean time:        1.967 μs (33.84% GC)
  maximum time:     17.909 μs (92.71% GC)
  --------------
  samples:          10000
  evals/sample:     181
  time tolerance:   5.00%
  memory tolerance: 1.00%

julia> @benchmark [x; y]
BenchmarkTools.Trial:
  memory estimate:  8.75 KiB
  allocs estimate:  1
  --------------
  minimum time:     599.674 ns (0.00% GC)
  median time:      1.004 μs (0.00% GC)
  mean time:        1.892 μs (33.37% GC)
  maximum time:     23.995 μs (76.91% GC)
  --------------
  samples:          10000
  evals/sample:     175
  time tolerance:   5.00%
  memory tolerance: 1.00%

x = CategoricalArray(rand(1000)); y = CategoricalArray(rand(100))

julia> @benchmark vcat(x, y)
BenchmarkTools.Trial:
  memory estimate:  494.70 KiB
  allocs estimate:  2834
  --------------
  minimum time:     340.367 μs (0.00% GC)
  median time:      382.938 μs (0.00% GC)
  mean time:        492.088 μs (13.61% GC)
  maximum time:     5.615 ms (79.17% GC)
  --------------
  samples:          9981
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

julia> @benchmark [x; y]
BenchmarkTools.Trial:
  memory estimate:  494.70 KiB
  allocs estimate:  2834
  --------------
  minimum time:     341.812 μs (0.00% GC)
  median time:      380.336 μs (0.00% GC)
  mean time:        496.584 μs (14.43% GC)
  maximum time:     5.783 ms (79.25% GC)
  --------------
  samples:          9897
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

Contributor

AFAIR setindex!(PooledDataVector) was very slow, because it did a linear search in the pool for the value being assigned. It should not be the case for CategoricalArray. The problem is that now I don't remember anymore how the simple version should look. :) Need to get into the code once again.

Member

vcat should be fast for CategoricalArray (and for PooledDataArray now too), it works directly with integer codes. (Though it still allocates copies of original codes before concatenating them, which could be improved for the simple 1-D case.)
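The integer-codes approach can be sketched without CategoricalArrays: concatenate the ref codes directly instead of assigning values one by one (the pool/refs names are hypothetical, and the pool-merging step is omitted by assuming a shared pool):

```julia
# Two "pooled" vectors sharing one pool: the i-th value is pool[refs[i]].
pool  = ["a", "b", "c"]
refs1 = UInt8[1, 2, 2]
refs2 = UInt8[3, 1]

# vcat on the codes alone -- no per-element search in the pool:
refs = vcat(refs1, refs2)
vals = pool[refs]
@assert vals == ["a", "b", "b", "c", "a"]
```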

dtr_noon = without(joiner.dtr, joiner.on_cols)
# FIXME is it still relevant? complicated way to do vcat that avoids expensive setindex!() for PooledDataVector
# permutation to swap rightonly and leftonly rows
right_perm = [1:length(right_ixs);
Contributor Author

same here re: # FIXME

@cjprybol
Contributor Author

I think I've worked everything out aside from interactions with CategoricalArrays. @nalimilan, we're back to the CategoricalArrays sort issue and a new one: resize!

julia> Pkg.test("DataTables")
INFO: Computing test dependencies for DataTables...
INFO: Installing Atom v0.5.9
INFO: Installing Blink v0.5.1
INFO: Installing CodeTools v0.4.3
INFO: Installing Codecs v0.2.0
INFO: Installing DataArrays v0.3.12
INFO: Installing HttpCommon v0.2.6
INFO: Installing HttpParser v0.2.0
INFO: Installing HttpServer v0.1.7
INFO: Installing LNR v0.0.2
INFO: Installing LaTeXStrings v0.2.0
INFO: Installing Lazy v0.11.5
INFO: Installing MbedTLS v0.4.3
INFO: Installing Mustache v0.1.3
INFO: Installing Mux v0.2.3
INFO: Installing RData v0.0.4
INFO: Installing RDatasets v0.2.0
INFO: Installing WebSockets v0.2.1
INFO: Building HttpParser
INFO: Building Homebrew
Already up-to-date.
INFO: Building MbedTLS
Using system libraries...
INFO: Testing DataTables
Running tests:
	PASSED: utils.jl
	PASSED: cat.jl
WARNING: using DataTables.sub in module TestData conflicts with an existing identifier.
	FAILED: data.jl
LoadError: Unordered CategoricalValue objects cannot be tested for order; use the ordered! function on the parent array to change this
 in macro expansion; at /Users/Cameron/.julia/v0.5/DataTables/test/runtests.jl:40 [inlined]
 in anonymous at ./<missing>:?
 in include_from_node1(::String) at ./loading.jl:488
 in include_from_node1(::String) at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
 in process_options(::Base.JLOptions) at ./client.jl:262
 in _start() at ./client.jl:318
 in _start() at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
while loading /Users/Cameron/.julia/v0.5/DataTables/test/data.jl, in expression starting on line 113
	PASSED: index.jl
	FAILED: datatable.jl
LoadError: Unordered CategoricalValue objects cannot be tested for order; use the ordered! function on the parent array to change this
 in macro expansion; at /Users/Cameron/.julia/v0.5/DataTables/test/runtests.jl:40 [inlined]
 in anonymous at ./<missing>:?
 in include_from_node1(::String) at ./loading.jl:488
 in include_from_node1(::String) at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
 in process_options(::Base.JLOptions) at ./client.jl:262
 in _start() at ./client.jl:318
 in _start() at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
while loading /Users/Cameron/.julia/v0.5/DataTables/test/datatable.jl, in expression starting on line 320
	PASSED: datatablerow.jl
	PASSED: io.jl
	PASSED: constructors.jl
	PASSED: conversions.jl
	PASSED: sort.jl
	PASSED: grouping.jl
	FAILED: join.jl
LoadError: MethodError: no method matching resize!(::CategoricalArrays.NullableCategoricalArray{Int64,1,UInt8}, ::Int64)
Closest candidates are:
  resize!(::Array{T,1}, ::Integer) at array.jl:510
  resize!(::BitArray{1}, ::Integer) at bitarray.jl:687
  resize!{T}(::NullableArrays.NullableArray{T,1}, ::Int64) at /Users/Cameron/.julia/v0.5/NullableArrays/src/primitives.jl:104
 in macro expansion; at /Users/Cameron/.julia/v0.5/DataTables/test/runtests.jl:40 [inlined]
 in anonymous at ./<missing>:?
 in include_from_node1(::String) at ./loading.jl:488
 in include_from_node1(::String) at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
 in process_options(::Base.JLOptions) at ./client.jl:262
 in _start() at ./client.jl:318
 in _start() at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
while loading /Users/Cameron/.julia/v0.5/DataTables/test/join.jl, in expression starting on line 100
	PASSED: iteration.jl
	PASSED: duplicates.jl
	PASSED: show.jl
ERROR: LoadError: "Tests failed"
 in include_from_node1(::String) at ./loading.jl:488
 in include_from_node1(::String) at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
 in process_options(::Base.JLOptions) at ./client.jl:262
 in _start() at ./client.jl:318
 in _start() at /Users/Cameron/julia/usr/lib/julia/sys.dylib:?
while loading /Users/Cameron/.julia/v0.5/DataTables/test/runtests.jl, in expression starting on line 47
===========================================================================================[ ERROR: DataTables ]============================================================================================

failed process: Process(`/Users/Cameron/julia/usr/bin/julia -Cnative -J/Users/Cameron/julia/usr/lib/julia/sys.dylib --compile=yes --depwarn=yes --check-bounds=yes --code-coverage=none --color=yes --compilecache=yes /Users/Cameron/.julia/v0.5/DataTables/test/runtests.jl`, ProcessExited(1)) [1]

============================================================================================================================================================================================================
INFO: Removing Atom v0.5.9
INFO: Removing Blink v0.5.1
INFO: Removing CodeTools v0.4.3
INFO: Removing Codecs v0.2.0
INFO: Removing DataArrays v0.3.12
INFO: Removing HttpCommon v0.2.6
INFO: Removing HttpParser v0.2.0
INFO: Removing HttpServer v0.1.7
INFO: Removing LNR v0.0.2
INFO: Removing LaTeXStrings v0.2.0
INFO: Removing Lazy v0.11.5
INFO: Removing MbedTLS v0.4.3
INFO: Removing Mustache v0.1.3
INFO: Removing Mux v0.2.3
INFO: Removing RData v0.0.4
INFO: Removing RDatasets v0.2.0
INFO: Removing WebSockets v0.2.1
ERROR: DataTables had test errors
 in #test#61(::Bool, ::Function, ::Array{AbstractString,1}) at ./pkg/entry.jl:740
 in (::Base.Pkg.Entry.#kw##test)(::Array{Any,1}, ::Base.Pkg.Entry.#test, ::Array{AbstractString,1}) at ./<missing>:0
 in (::Base.Pkg.Dir.##2#3{Array{Any,1},Base.Pkg.Entry.#test,Tuple{Array{AbstractString,1}}})() at ./pkg/dir.jl:31
 in cd(::Base.Pkg.Dir.##2#3{Array{Any,1},Base.Pkg.Entry.#test,Tuple{Array{AbstractString,1}}}, ::String) at ./file.jl:59
 in #cd#1(::Array{Any,1}, ::Function, ::Function, ::Array{AbstractString,1}, ::Vararg{Array{AbstractString,1},N}) at ./pkg/dir.jl:31
 in (::Base.Pkg.Dir.#kw##cd)(::Array{Any,1}, ::Base.Pkg.Dir.#cd, ::Function, ::Array{AbstractString,1}, ::Vararg{Array{AbstractString,1},N}) at ./<missing>:0
 in #test#3(::Bool, ::Function, ::String, ::Vararg{String,N}) at ./pkg/pkg.jl:258
 in test(::String, ::Vararg{String,N}) at ./pkg/pkg.jl:258
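For context on the resize! MethodError in the log above: NullableCategoricalArray would need a method following Base's resize! contract on a plain Vector, growing or shrinking in place (and presumably marking new slots null):

```julia
v = [1, 2, 3]
resize!(v, 5)            # grow: slots 4 and 5 exist but are uninitialized
v[4], v[5] = 4, 5
@assert v == [1, 2, 3, 4, 5]
resize!(v, 2)            # shrink: trailing elements are dropped
@assert v == [1, 2]
```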

@nalimilan
Member

Thank you both for persisting through this terribly long review! The features and benchmarks are great -- except of course for the last one, which regresses. Since the number of allocations is higher, it probably comes from a type instability; the memory profiler should tell. Note that AFAICT random_frame never creates CategoricalArray columns, since that type currently does not override sample: it creates Array{CategoricalValue} columns, which are really not supposed to exist, and which may generate all kinds of weird issues.
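A quick way to spot this kind of type instability is @code_warntype (or running julia with --track-allocation=user for the memory profiler); a minimal sketch of an instability it would flag:

```julia
# The branch gives `x` a Union{Int,Float64} type, which forces extra allocations:
function unstable(flag::Bool)
    x = flag ? 1 : 1.0
    return x + 1
end

@assert unstable(true) == 2
@assert unstable(false) == 2.0
# In the REPL, `@code_warntype unstable(true)` highlights the Union in the output.
```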

Now, to the next step...

@nalimilan
Copy link
Member

I've adapted the benchmarks a bit to run on DataFrames in order to compare the new DataTables with it. I've also dropped the "ordered" and "unordered" tables, which were actually all unordered and did not really correspond to a plausible scenario since they only contained unique values. The new code is here: https://gist.github.com/nalimilan/aa9391f204967adf70fce3700eab5885

Overall, DataTables master is faster, but the difference is not as large as in the comparison with the old DataTables, which suffered from major type instabilities. In some cases DataTables allocates more (e.g. dt1), which likely indicates a type instability. So it looks like we still have a long way to go before matching the performance of e.g. data.table, which was said to be much faster than DataFrames (see benchmarks at JuliaData/DataFramesMeta.jl#36, which we should replicate).

# DataTables
julia> @benchmark groupby(small, [:A, :B])
BenchmarkTools.Trial: 
  memory estimate:  362.80 KiB
  allocs estimate:  11016
  --------------
  minimum time:     263.435 μs (0.00% GC)
  median time:      284.065 μs (0.00% GC)
  mean time:        366.683 μs (18.16% GC)
  maximum time:     9.307 ms (94.58% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark groupby(small, [:A, :B])
BenchmarkTools.Trial: 
  memory estimate:  192.45 KiB
  allocs estimate:  7186
  --------------
  minimum time:     525.938 μs (0.00% GC)
  median time:      550.305 μs (0.00% GC)
  mean time:        600.612 μs (3.22% GC)
  maximum time:     5.727 ms (79.89% GC)
  --------------
  samples:          8227
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# DataTables
julia> @benchmark groupby(dt1, [:v1, :v2])
BenchmarkTools.Trial: 
  memory estimate:  2.13 MiB
  allocs estimate:  76276
  --------------
  minimum time:     1.434 ms (0.00% GC)
  median time:      1.614 ms (0.00% GC)
  mean time:        2.183 ms (20.18% GC)
  maximum time:     12.800 ms (78.78% GC)
  --------------
  samples:          2266
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark groupby(dt1, [:v1, :v2])
BenchmarkTools.Trial: 
  memory estimate:  606.95 KiB
  allocs estimate:  28487
  --------------
  minimum time:     1.468 ms (0.00% GC)
  median time:      1.537 ms (0.00% GC)
  mean time:        1.602 ms (3.19% GC)
  maximum time:     4.247 ms (46.88% GC)
  --------------
  samples:          3106
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# DataTables
julia> @benchmark groupby(dt2, [:v1, :v2])
BenchmarkTools.Trial: 
  memory estimate:  600.27 KiB
  allocs estimate:  61
  --------------
  minimum time:     294.550 μs (0.00% GC)
  median time:      324.514 μs (0.00% GC)
  mean time:        360.833 μs (5.49% GC)
  maximum time:     2.328 ms (69.33% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark groupby(dt2, [:v1, :v2])
BenchmarkTools.Trial: 
  memory estimate:  1.34 MiB
  allocs estimate:  47387
  --------------
  minimum time:     1.619 ms (0.00% GC)
  median time:      1.720 ms (0.00% GC)
  mean time:        1.861 ms (6.07% GC)
  maximum time:     6.255 ms (58.91% GC)
  --------------
  samples:          2672
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# DataTables
julia> @benchmark groupby(dt3, [:v1, :v2, :v1_1, :v2_1])
BenchmarkTools.Trial: 
  memory estimate:  600.44 KiB
  allocs estimate:  65
  --------------
  minimum time:     388.856 μs (0.00% GC)
  median time:      425.296 μs (0.00% GC)
  mean time:        456.659 μs (4.25% GC)
  maximum time:     2.979 ms (65.61% GC)
  --------------
  samples:          10000
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark groupby(dt3, [:v1, :v2, :v1_1, :v2_1])
BenchmarkTools.Trial: 
  memory estimate:  19.13 MiB
  allocs estimate:  161096
  --------------
  minimum time:     11.457 ms (0.00% GC)
  median time:      13.299 ms (11.49% GC)
  mean time:        13.629 ms (8.97% GC)
  maximum time:     27.229 ms (13.16% GC)
  --------------
  samples:          366
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# DataTables
julia> @benchmark f()
BenchmarkTools.Trial: 
  memory estimate:  2.29 MiB
  allocs estimate:  63263
  --------------
  minimum time:     3.119 ms (0.00% GC)
  median time:      3.654 ms (0.00% GC)
  mean time:        4.303 ms (9.84% GC)
  maximum time:     16.701 ms (52.10% GC)
  --------------
  samples:          1157
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark f()
BenchmarkTools.Trial: 
  memory estimate:  1.19 MiB
  allocs estimate:  24076
  --------------
  minimum time:     4.154 ms (0.00% GC)
  median time:      4.392 ms (0.00% GC)
  mean time:        4.628 ms (3.36% GC)
  maximum time:     10.013 ms (47.09% GC)
  --------------
  samples:          1075
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

# DataTables
julia> @benchmark h()
BenchmarkTools.Trial: 
  memory estimate:  13.46 MiB
  allocs estimate:  399702
  --------------
  minimum time:     21.948 ms (0.00% GC)
  median time:      26.196 ms (11.28% GC)
  mean time:        26.022 ms (7.88% GC)
  maximum time:     38.248 ms (10.68% GC)
  --------------
  samples:          192
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%
# DataFrames
julia> @benchmark h()
BenchmarkTools.Trial: 
  memory estimate:  8.00 MiB
  allocs estimate:  214342
  --------------
  minimum time:     12.387 ms (0.00% GC)
  median time:      13.102 ms (0.00% GC)
  mean time:        13.922 ms (6.42% GC)
  maximum time:     18.948 ms (0.00% GC)
  --------------
  samples:          359
  evals/sample:     1
  time tolerance:   5.00%
  memory tolerance: 1.00%

@cjprybol
Contributor Author

cjprybol commented Mar 7, 2017

Thanks Milan! I started a branch to try to get the benchmarks set up using PkgBenchmark.jl, as you suggested earlier. I wonder whether removing the column promotion to NullableArrays in #24 will have any effect on these benchmarks. I've also started working on JuliaData/DataFrames.jl#485, which may have implications for aggregate speed and is probably something we should benchmark as well. I think benchmarking transformations (stack, unstack, join, groupby, aggregate, etc.) will be a great start, and we can build out additional benchmarks from there.

@nalimilan
Member

I've just found this, might be relevant regarding possible optimizations: http://scattered-thoughts.net/blog/2016/10/11/a-practical-relational-query-compiler-in-500-lines/

@nalimilan
Member

Another possibly interesting reference: the Stata ftools package, whose algorithm is documented at https://github.com/sergiocorreia/ftools/blob/master/src/ftools.sthlp

@nalimilan
Member

This Discourse post shows that for a simple example our groupby is about four times slower than Pandas'. I've done some profiling, and it turns out most of the time is spent checking that elements are equal for rows whose hashes are equal (by calling isequal_colel). Removing that check makes the operation about twice as fast. It's kind of silly, since these checks are only needed in case hash collisions happen, which isn't that common. Of course, that's the standard behavior of dicts. So it could be a good idea to use a perfect hash function, even if I'm not sure how efficient it would be to build and to look up.
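To make the cost concrete, here is a toy sketch (in Python, not the DataTables code; `group_rows` is a hypothetical name) of hash-based row grouping: rows are bucketed by hash, and the full equality comparison against a bucket's representative -- the isequal_colel-style check -- is only needed when two rows land in the same hash slot.

```python
def group_rows(rows):
    """Assign a group id to each row; equal rows get the same id."""
    buckets = {}   # hash -> list of (representative_row, group_id)
    ids = []
    next_id = 0
    for row in rows:
        bucket = buckets.setdefault(hash(row), [])
        for rep, gid in bucket:
            if rep == row:      # equality check: only reached on a shared hash,
                ids.append(gid) # but paid on every repeated row in practice
                break
        else:
            bucket.append((row, next_id))
            ids.append(next_id)
            next_id += 1
    return ids
```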

For reference, Pandas uses a different strategy, which we also used before this PR: instead of hashing rows, it assigns an integer code to each level of each grouping variable and combines them. This has been explained here and here, though it's a bit old. (Our old implementation did not support compressing the integer codes when the number of possible combinations was too high, which Pandas does.) This also requires using a hash table to assign an integer code to each level (essentially what CategoricalArrays does), but separately for each column (the combination of codes is done in another step). Since the hash table isn't perfect, you also need to check for collisions. But this is likely much more efficient because you're working on one vector at a time, so the operation is type-stable and cache locality is good. Our current strategy, on the contrary, is quite inefficient, since we iterate over columns whose types are not statically known; and even if they were, the values are scattered across memory, so it could still be slower.
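The per-column scheme described above might look like this (a Python toy under simplifying assumptions: two columns, no missing values, no code compression; `factorize` and `group_codes` are hypothetical names, not the Pandas API). Each column is encoded into integer codes on its own, then the codes are combined arithmetically into a global code -- a perfect hash over level combinations, so no row-wise equality check is ever needed.

```python
def factorize(col):
    """Map each value to an integer code; one dict lookup per element."""
    levels = {}
    codes = [levels.setdefault(v, len(levels)) for v in col]
    return codes, len(levels)

def group_codes(col1, col2):
    c1, n1 = factorize(col1)   # type-stable pass over one column
    c2, n2 = factorize(col2)   # ... and over the other
    # Combined code is injective over (code1, code2) pairs, so equal
    # combined codes imply equal rows: no collision check when grouping.
    return [a * n2 + b for a, b in zip(c1, c2)]
```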

Overall, I'd be inclined to move back to the old strategy (which should only require changing limited parts of this PR). There must be a reason Pandas uses it. And among other advantages, it's really, really fast for CategoricalArrays, which is useful when you know you'll repeatedly group over a variable (see my previous concerns above).

@cjprybol @alyst Comments?

@alyst
Contributor

alyst commented Jun 16, 2017

In the current implementation the row hashes are updated column-wise, so that part is type-stable and memory-efficient.
Was the bottleneck the isequal_colel(col, i, g_row) call in row_group_slots()?

@nalimilan
Member

Yes. The problem is that, while hashing is optimized, the equality check isn't.

There's also a much bigger issue with joining, see the example at https://discourse.julialang.org/t/how-is-the-data-ecosystem-right-now-for-large-datasets/4281/19.

@alyst
Contributor

alyst commented Jun 16, 2017

One possible approach is for hashrows() to build a two-part hash: a unique part (combining all the columns that allow cheap unique hashing, e.g. categoricals or integers bounded to a reasonable range) and a non-unique part (strings and floating-point numbers).
Then, if both hashes are equal, only the columns contributing to the non-unique part need to be checked.
And of course, if grouping by strings or floats makes sense for a particular table, the user could convert them to categoricals so that the non-unique part of the hash is eliminated completely.

@nalimilan
Member

I'm not sure that's acceptable either. Grouping/joining on strings is quite common too, and even "reasonable ranges" can get out of control quickly if you combine multiple columns.

Basically, the two approaches are similar in many respects, but they differ in the way the per-column hashes are combined:

  • what we do now: compute a non-perfect hash for each column, combine it with that of the previous columns (on the fly), and then do a dict lookup to find out which group each row belongs to
  • what we did before, and what Pandas does: compute a hash for each column separately, do a dict lookup to transform each value into an integer code, and combine the integer codes into a global code (which is actually a non-minimal perfect hash)

So basically, the Pandas approach differs from our current one in that a perfect hashing function is created. The cost of checking for collisions is paid separately for each column, where an equality check is more efficient; after that, collisions cannot happen when grouping rows. AFAICT, the only drawback of that approach is that you need to compress the integer codes when the number of possible combinations of levels gets too high. But that's not the end of the world: we have code examples for that in Pandas, and recoding an integer vector is quite fast.
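The compression step mentioned above is just a re-factorization of one integer vector: sparse combined codes are recoded down to a dense 0..k-1 range. A minimal sketch (Python toy; `compress_codes` is a hypothetical name):

```python
def compress_codes(codes):
    """Recode arbitrary integer codes to dense 0-based group ids,
    preserving first-appearance order."""
    seen = {}
    return [seen.setdefault(c, len(seen)) for c in codes]
```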

What are the advantages of the current approach in your opinion? It could be efficient if the column types were statically known, but even then I'm not sure it would beat the other approach. If Pandas has used it since 2012, it probably means they couldn't find a more efficient solution, and they have put far more resources into this than we have.

@alyst
Contributor

alyst commented Jun 16, 2017

But, basically, "building a hash for each column separately" is an implicit conversion to a categorical. And then, IIUC, what I propose would match what Pandas is doing.
It's possible to do this conversion for all columns and keep only the perfect hash; I just thought that in certain situations (e.g. joins by string columns that contain few duplicated elements, where only a small fraction of rows match between the tables) the "conversion to a categorical" and maintaining the perfect hash might be more expensive.

One advantage (that motivated this PR) is that it doesn't rely on an efficient compression scheme, which was a big problem for multi-column joins of reasonably-sized tables at that time.

@nalimilan
Member

But, basically, "building a hash for each column separately" is an implicit conversion to a categorical. And then, IIUC, what I propose would match what Pandas is doing.
It's possible to do this conversion for all columns and keep only the perfect hash; I just thought that in certain situations (e.g. joins by string columns that contain few duplicated elements, where only a small fraction of rows match between the tables) the "conversion to a categorical" and maintaining the perfect hash might be more expensive.

What's the problem with "implicit conversion to categorical"? That's just a dict lookup. I've experimented a bit locally with a more efficient approach than the old one (which actually used to build a complete CategoricalVector): we can just create a dict mapping levels to integer codes, and combine these codes with those of previous columns on the fly, without allocating a new vector for each column. So it's really just an alternative way of computing hashes -- but hashes that are perfect. That approach can be combined with more efficient methods for particular types, e.g. for CategoricalArray the integer codes are already there, so it's much more efficient. We could do the same for integer vectors with a limited range.
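One way to picture this on-the-fly combination (a Python toy, not the actual implementation; `fold_column`/`group_ids` are hypothetical names): keep a single running code vector and fold each column into it with one small dict per column, never materializing a categorical vector. Keying the dict on (running code, value) pairs also keeps the codes dense, so no separate compression pass is needed.

```python
def fold_column(running, col):
    """Combine an existing code vector with one more column's values."""
    seen = {}
    return [seen.setdefault((r, v), len(seen)) for r, v in zip(running, col)]

def group_ids(columns):
    """Perfect, dense group ids over the rows of several columns."""
    codes = [0] * len(columns[0])
    for col in columns:          # one type-stable pass per column
        codes = fold_column(codes, col)
    return codes
```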

One advantage (that motivated this PR) is that it doesn't rely on an efficient compression scheme, which was a big problem for multi-column joins of reasonably-sized tables at that time.

AFAIK it was a big problem because compression wasn't implemented at all. Since Pandas uses it, it appears to work well in practice. We should run some benchmarks, but the cost of compressing an integer vector will likely be negligible compared with the cost of hashing and dict lookups.

I haven't looked too deeply at join algorithms yet, unfortunately there's no blog post by Wes McKinney to ease the understanding of the code.

@alyst
Contributor

alyst commented Jun 16, 2017

What's the problem with "implicit conversion to categorical"? That's just a dict lookup.

That's true, I'm just saying that in certain situations not all the keys would be looked up. But probably it doesn't justify complicating the code with unique and non-unique hashes.

As I see it now, to implement Pandas-like behaviour, one strategy could be to modify hashrows()/hashrows_cols!() to generate a perfect hash.
row_group_slots() already implements the compression (from rhashes to groups), so it could be reused, and the same code could be run after the perfect hash is built to generate the RowGroupDict.

In the current implementation, joins just enumerate all pairs of matching rows from the two tables: the right table (or the left one for a right join) is used to build a RowGroupDict, and the left one probes it.
The problem with a perfect hash is that it's perfect for just one table, whereas the columns of the tables being joined might even have different types.
The RowGroupDict approach relies on the fact that hashes are generated from the promotion type of the left and right columns. If that no longer holds, efficient joins might need a different strategy.
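The enumeration scheme described here is essentially a hash join, which can be sketched as (Python toy; `inner_join_pairs` is a hypothetical name, and the real RowGroupDict groups whole key rows rather than scalar keys):

```python
def inner_join_pairs(left_keys, right_keys):
    """Enumerate (left_row, right_row) index pairs with equal keys.
    Keys from both tables must hash and compare consistently,
    i.e. be drawn from a common (promoted) type."""
    right_groups = {}
    for j, key in enumerate(right_keys):        # build side: right table
        right_groups.setdefault(key, []).append(j)
    return [(i, j)                              # probe side: left table
            for i, key in enumerate(left_keys)
            for j in right_groups.get(key, [])]
```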

@nalimilan
Member

nalimilan commented Jun 18, 2017

I've worked on grouping (since it's simpler than joining), and by improving the implementation of the old algorithm I could make it about twice as fast as both the old code and this PR: #76. That means we should be roughly twice as slow as Pandas (for this particular example), which sounds promising (especially given that half of the time is now spent in areas I haven't explored yet).

Do you think you could have a look at joins? You're much more familiar with that code than I am. It would be great to summarize the differences between our implementation and Pandas' so that we can identify places where we could use a more clever algorithm.

This pull request was closed.