Merge pull request #92 from brycelelbach/cartesian_product_internal_iteration

Implement internal iteration for `cartesian_product`
tcbrindle · Aug 2, 2023 · d906839 · d906839
2 parents caa30bd + abba492
commit d906839
Show file tree

Hide file tree

Showing 10 changed files with 1,132 additions and 49 deletions.
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
@@ -6,4 +6,7 @@ add_executable(internal-iteration-benchmark internal_iteration_benchmark.cpp)
 target_link_libraries(internal-iteration-benchmark PUBLIC nanobench flux)
 
 add_executable(sort-benchmark sort_benchmark.cpp)
-target_link_libraries(sort-benchmark PUBLIC nanobench flux)
+target_link_libraries(sort-benchmark PUBLIC nanobench flux)
+
+add_executable(multidimensional-memset-benchmark multidimensional_memset_benchmark.cpp multidimensional_memset_benchmark_kernels.cpp)
+target_link_libraries(multidimensional-memset-benchmark PUBLIC nanobench flux)
diff --git a/benchmark/multidimensional_memset_benchmark.cpp b/benchmark/multidimensional_memset_benchmark.cpp
@@ -0,0 +1,93 @@
+
+// Copyright (c) 2021 Barry Revzin
+// Copyright (c) 2023 NVIDIA Corporation (reply-to: [email protected])
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <nanobench.h>
+
+#include <flux.hpp>
+
+#include <numeric>
+#include <ranges>
+#include <iostream>
+
+namespace an = ankerl::nanobench;
+
+// Kernels are placed in a separate translation unit to prevent compilers from
+// optimizing them based on the input that we'll be giving them and to make it
+// easier to study their compiled assembly.
+//
+// Every kernel receives a pointer `A` to a row-major N x M array of doubles,
+// i.e. element (i, j) lives at A[i * M + j].
+//
+// Full fill: every one of the N * M elements is set to 0.0.
+extern void memset_2d_reference(double* A, flux::distance_t N, flux::distance_t M);
+extern void memset_2d_std_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M);
+extern void memset_2d_flux_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M);
+// Diagonal fill: only the elements with i == j are set to 0.0.
+extern void memset_diagonal_2d_reference(double* A, flux::distance_t N, flux::distance_t M);
+extern void memset_diagonal_2d_std_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M);
+extern void memset_diagonal_2d_flux_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M);
+
+// Benchmark driver: times each 2D memset kernel with nanobench and verifies
+// the contents of the buffer after each kernel has been benchmarked.
+//
+// Usage: <program> [min_epoch_iterations]   (defaults to 40)
+//
+// A failed verification throws `false`. Nothing catches it, so the process
+// terminates abnormally instead of reporting timings for a broken kernel.
+int main(int argc, char** argv)
+{
+    // std::atoi yields 0 for non-numeric input and passes negative numbers
+    // straight through. minEpochIterations() takes an unsigned count, so a
+    // negative value would wrap around to an enormous number of iterations.
+    // Clamp to at least one iteration instead.
+    int const requested_iters = argc > 1 ? std::atoi(argv[1]) : 40;
+    if (requested_iters < 1)
+        std::cerr << "warning: iteration count must be a positive integer; using 1\n";
+    int const n_iters = requested_iters < 1 ? 1 : requested_iters;
+
+    constexpr flux::distance_t N = 1024;
+    constexpr flux::distance_t M = 2048;
+    std::vector<double> A(N * M);
+
+    // Shared harness: refill A with 0, 1, 2, ... so that `check` can tell
+    // elements a kernel wrote from elements it left alone, time `func`, then
+    // validate the resulting contents of A.
+    const auto run_benchmark =
+    [] (auto& bench, auto& A, auto N, auto M, auto name, auto func, auto check) {
+        std::iota(A.begin(), A.end(), 0);
+        bench.run(name, [&] { func(A.data(), N, M); });
+        check(A, N, M);
+    };
+
+    // Full-fill kernels.
+    {
+        // Passes only if every element of A is 0.0.
+        const auto check_2d = [] (auto& A, auto N, auto M) {
+            const auto it = std::ranges::find_if_not(A, [&] (auto e) { return e == 0.0; });
+            if (it != A.end())
+                throw false;
+        };
+
+        auto bench = an::Bench()
+            .minEpochIterations(n_iters)
+            .relative(true)
+            .performanceCounters(false);
+
+        const auto run_2d_benchmark_impl = [&] (auto name, auto func) {
+            run_benchmark(bench, A, N, M, name, func, check_2d);
+        };
+
+        // Uses the kernel's identifier as the benchmark name.
+        #define run_2d_benchmark(func) run_2d_benchmark_impl(#func, func)
+
+        run_2d_benchmark(memset_2d_reference);
+        run_2d_benchmark(memset_2d_std_cartesian_product_iota);
+        run_2d_benchmark(memset_2d_flux_cartesian_product_iota);
+
+        // Macros ignore block scope; remove the helper once it is no longer needed.
+        #undef run_2d_benchmark
+    }
+
+    // Diagonal-fill kernels.
+    {
+        // Passes only if the elements with i == j are 0.0 and every other
+        // element still holds the value i * M + j written by std::iota.
+        const auto check_diagonal_2d = [] (auto& A, auto N, auto M) {
+            for (auto i : std::views::iota(0, N))
+                for (auto j : std::views::iota(0, M)) {
+                    if (i == j) {
+                        if (A[i * M + j] != 0.0) throw false;
+                    } else {
+                        if (A[i * M + j] != i * M + j) throw false;
+                    }
+                }
+        };
+
+        auto bench = an::Bench()
+            .minEpochIterations(n_iters)
+            .relative(true)
+            .performanceCounters(false);
+
+        const auto run_diagonal_2d_benchmark_impl = [&] (auto name, auto func) {
+            run_benchmark(bench, A, N, M, name, func, check_diagonal_2d);
+        };
+
+        // Uses the kernel's identifier as the benchmark name.
+        #define run_diagonal_2d_benchmark(func) run_diagonal_2d_benchmark_impl(#func, func)
+
+        run_diagonal_2d_benchmark(memset_diagonal_2d_reference);
+        run_diagonal_2d_benchmark(memset_diagonal_2d_std_cartesian_product_iota_filter);
+        run_diagonal_2d_benchmark(memset_diagonal_2d_flux_cartesian_product_iota_filter);
+
+        // Macros ignore block scope; remove the helper once it is no longer needed.
+        #undef run_diagonal_2d_benchmark
+    }
+}
diff --git a/benchmark/multidimensional_memset_benchmark_kernels.cpp b/benchmark/multidimensional_memset_benchmark_kernels.cpp
@@ -0,0 +1,68 @@
+
+// Copyright (c) 2023 NVIDIA Corporation (reply-to: [email protected])
+//
+// Distributed under the Boost Software License, Version 1.0. (See accompanying
+// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
+
+#include <flux/op/cartesian_product.hpp>
+#include <flux/source/iota.hpp>
+#include <flux/op/for_each.hpp>
+#include <flux/op/filter.hpp>
+
+#include "ranges_cartesian_product.hpp"
+
+#include <ranges>
+#include <algorithm>
+
+// Reference kernel: two plain nested index loops that store 0.0 into
+// A[i * M + j] for every i in [0, N) and every j in [0, M).
+//
+// A must point to at least N * M doubles, laid out row-major with row length M.
+void memset_2d_reference(double* A, flux::distance_t N, flux::distance_t M)
+{
+    for (flux::distance_t i = 0; i != N; ++i)
+        for (flux::distance_t j = 0; j != M; ++j)
+            A[i * M + j] = 0.0;
+}
+
+// Standard-ranges kernel: stores 0.0 into A[i * M + j] for each (i, j) produced
+// by the cartesian product of iota(0, N) and iota(0, M), driven by
+// std::ranges::for_each.
+//
+// The body is wrapped in flux::unpack so it can be written with separate
+// (i, j) parameters (presumably it expands each product element into them).
+// A must point to at least N * M doubles.
+void memset_2d_std_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M)
+{
+    std::ranges::for_each(
+        std::views::cartesian_product(std::views::iota(0, N), std::views::iota(0, M)),
+        flux::unpack([&] (auto i, auto j) {
+            A[i * M + j] = 0.0;
+        }));
+}
+
+// Flux kernel: stores 0.0 into A[i * M + j] for each (i, j) produced by
+// flux::cartesian_product over flux::ints(0, N) and flux::ints(0, M), driven
+// by flux::for_each.
+//
+// The body is wrapped in flux::unpack so it can be written with separate
+// (i, j) parameters. A must point to at least N * M doubles.
+void memset_2d_flux_cartesian_product_iota(double* A, flux::distance_t N, flux::distance_t M)
+{
+    flux::for_each(
+        flux::cartesian_product(flux::ints(0, N), flux::ints(0, M)),
+        flux::unpack([&] (auto i, auto j) {
+            A[i * M + j] = 0.0;
+        }));
+}
+
+// Reference kernel for the diagonal case: visits every (i, j) with i in [0, N)
+// and j in [0, M), but stores 0.0 into A[i * M + j] only when i == j. All other
+// elements are left untouched.
+//
+// A must point to at least N * M doubles, laid out row-major with row length M.
+void memset_diagonal_2d_reference(double* A, flux::distance_t N, flux::distance_t M)
+{
+    for (flux::distance_t i = 0; i != N; ++i)
+        for (flux::distance_t j = 0; j != M; ++j)
+            if (i == j) A[i * M + j] = 0.0;
+}
+
+// Standard-ranges kernel for the diagonal case: takes the cartesian product of
+// iota(0, N) and iota(0, M), keeps only the pairs with i == j via
+// std::views::filter, and stores 0.0 into A[i * M + j] for each remaining pair
+// using std::ranges::for_each.
+//
+// Both the predicate and the body are wrapped in flux::unpack so they can be
+// written with separate (i, j) parameters. A must point to at least N * M
+// doubles.
+void memset_diagonal_2d_std_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M)
+{
+    std::ranges::for_each(
+        std::views::cartesian_product(std::views::iota(0, N), std::views::iota(0, M))
+            | std::views::filter(flux::unpack([] (auto i, auto j) { return i == j; })),
+        flux::unpack([&] (auto i, auto j) {
+            A[i * M + j] = 0.0;
+        }));
+}
+
+// Flux kernel for the diagonal case: takes flux::cartesian_product over
+// flux::ints(0, N) and flux::ints(0, M), keeps only the pairs with i == j via
+// .filter(), and stores 0.0 into A[i * M + j] for each remaining pair using
+// flux::for_each.
+//
+// Both the predicate and the body are wrapped in flux::unpack so they can be
+// written with separate (i, j) parameters. A must point to at least N * M
+// doubles.
+void memset_diagonal_2d_flux_cartesian_product_iota_filter(double* A, flux::distance_t N, flux::distance_t M)
+{
+    flux::for_each(
+        flux::cartesian_product(flux::ints(0, N), flux::ints(0, M))
+            .filter(flux::unpack([] (auto i, auto j) { return i == j; })),
+        flux::unpack([&] (auto i, auto j) {
+            A[i * M + j] = 0.0;
+        }));
+}
+