Commit 8541e13 (parent 5adad1e): 1 changed file with 10 additions and 6 deletions.
@andrewjradcliffe
I'm not sure what you mean when saying single-pass extrema isn't worth it. Single pass has a significant advantage starting from N = 64 on my computer, and is about twice as fast for larger values of N when the problem becomes memory-bandwidth dominated.

Definitions:
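A minimal sketch of what two-pass versus single-pass extrema kernels might look like with LoopVectorization's @turbo (the names extrema_twopass and extrema_singlepass are illustrative, not the benchmark's actual definitions):

```julia
using LoopVectorization

# Two-pass: stream the vector through the cache twice, one reduction per pass.
function extrema_twopass(x::AbstractVector{T}) where {T}
    mn = typemax(T)
    @turbo for i in eachindex(x)
        mn = min(mn, x[i])
    end
    mx = typemin(T)
    @turbo for i in eachindex(x)
        mx = max(mx, x[i])
    end
    (mn, mx)
end

# Single-pass: carry both reductions while streaming the vector once.
function extrema_singlepass(x::AbstractVector{T}) where {T}
    mn = typemax(T)
    mx = typemin(T)
    @turbo for i in eachindex(x)
        mn = min(mn, x[i])
        mx = max(mx, x[i])
    end
    (mn, mx)
end
```

Timed over a range of N (e.g. with BenchmarkTools), the gap should widen as the problem becomes memory-bandwidth dominated, as described above.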
Quite right you are, sir -- the single-pass is superior (as it should be: streaming the same array through the cache 2x should cost approximately double, as your benchmark series shows). The "not really worth it" remark actually referred to the effort involved when I wrote that comment (I was more troubled by the reduction-on-first-dimension error). Looking back, I see the problem is that I had been thinking of findmin/findmax, and not specifically just minimum/maximum. The reduction-on-first-dimension error arises from the booleans used to select the min/max (which are totally unnecessary >_<). I will have to rectify this ASAP.
Then, I also think of the collect(zip(A_min, A_max)) call, and realize that one can probably optimize further by wrapping only the inner reduction loop block in @turbo, taking the mn, mx and storing them directly in an Array{Tuple{T,T},N} -- this eliminates the wasteful memory allocation as well (and whatever cost that incurs). I had noticed a while ago that LV doesn't handle tuples, otherwise it would be possible to just continue wrapping both outer and inner blocks in @turbo, hence the current use of zip.
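A minimal sketch of that idea, assuming a matrix reduced over its first dimension (the name extrema_firstdim and the exact loop structure are illustrative, not this PR's actual code):

```julia
using LoopVectorization

# Only the inner reduction loop is wrapped in @turbo; the outer loop stays
# plain Julia so the (mn, mx) tuple can be written straight into the output,
# with no separate A_min/A_max arrays and no collect(zip(...)).
function extrema_firstdim(A::AbstractMatrix{T}) where {T}
    out = Vector{Tuple{T,T}}(undef, size(A, 2))  # i.e. Array{Tuple{T,T},1}
    for j in axes(A, 2)
        mn = typemax(T)
        mx = typemin(T)
        @turbo for i in axes(A, 1)  # contiguous, vectorized reduction
            mn = min(mn, A[i, j])
            mx = max(mx, A[i, j])
        end
        out[j] = (mn, mx)
    end
    out
end
```

Because the tuples only appear in the outer, non-@turbo loop, this sidesteps LV's lack of tuple support while still avoiding the intermediate allocations.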
@chriselrod
Incidentally, benchmarking reveals that non-zip is faster only when the array is very small (length(A) < 100), and/or when the reduction is taking place across a large chunk of the array. If the dimensions are of equal size, as in the examples above, this typically requires reducing over more than ndims(A) ÷ 2 of the dimensions. As one would expect, there is a dependence on memory-traversal order, i.e. which dimensions are being reduced over, as they must appear in the innermost loop. This is easier to elicit with equal-size dimensions. For unequal-size dimensions, cost modeling is needed to determine the optimal action -- out of scope for this little note. In any case, making all the loops available to LoopVectorization will yield superior performance in most cases, despite the need to zip the result.
An aside: when the reduction occurs over all dimensions, there is clearly no penalty to the non-zip method. The zip is an unfortunate side effect, but it goes unused in such a case.
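For contrast, a minimal sketch of the zip-style kernel discussed above, with both loops inside @turbo so LoopVectorization can reorder and block them (again illustrative names, same first-dimension reduction as before):

```julia
using LoopVectorization

# All loops are visible to LoopVectorization, at the cost of materializing
# A_min and A_max and zipping them into the final result afterwards.
function extrema_zip(A::AbstractMatrix{T}) where {T}
    A_min = fill(typemax(T), size(A, 2))
    A_max = fill(typemin(T), size(A, 2))
    @turbo for j in axes(A, 2), i in axes(A, 1)
        A_min[j] = min(A_min[j], A[i, j])
        A_max[j] = max(A_max[j], A[i, j])
    end
    collect(zip(A_min, A_max))
end
```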
For reference -- your benchmark repeated on the following machine. Same breakpoint at which single-pass exceeds two-pass.