Skip to content

Commit

Permalink
[math] Use dynamic dispatch for highway SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
jwnimmer-tri committed Jul 2, 2024
1 parent af0551e commit 23b88b6
Show file tree
Hide file tree
Showing 18 changed files with 876 additions and 793 deletions.
75 changes: 75 additions & 0 deletions common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,57 @@ drake_cc_library(
],
)

drake_cc_library(
name = "hwy_dynamic",
srcs = ["hwy_dynamic.cc"],
hdrs = [
"hwy_dynamic.h",
"hwy_dynamic_impl.h",
],
defines = [
# Highway automatically compiles our code that uses it for multiple CPU
# targets, so that the code is tuned as best as possible for the user's
# CPU that it will run on.
#
# By default Highway compiles to a ~dozen different CPU targets, but
# in practice Drake only cares about a few of them. We'll disable the
# default list of targets and write down the CPUs we care about one by
# one so that the test / support burden is better matched to our needs.
#
# Highway represents target sets using a bitmask. Each target is given
# a single bit (e.g., '#define HWY_AVX2 (1LL << 9)'). To express a set
# of targets (e.g., to set HWY_DISABLED_TARGETS) we must binary-or the
# per-target constants together.
#
# Highway only offers a deny-list for targets, so we'll need to deny
# everything and then un-deny what we want to allow. Per De Morgan's
# law, an ENABLE list folded over OR is the same as the negation of the
# same list folded over NAND; to end up with a DISABLE list, we can
# just drop the outermost negation, i.e., ENABLE list folded over NAND.
"HWY_DISABLED_TARGETS=(" + "&".join([
"(~({}))".format(name)
for name in [
# We must always provide a portable fallback.
"HWY_BASELINE_SCALAR",
# x86: SIMD with 256-bit lanes and FMA.
"HWY_AVX2",
# x86: SIMD with the improved 512VL encodings.
# TODO(jwnimmer-tri) Enable this once we support opt-out of CPU
# targets via NPY_DISABLE_CPU_FEATURES environment variable.
# "HWY_AVX3",
# arm: SIMD that has fixed 256-bit lanes.
# TODO(jwnimmer-tri) Enable this once we can test it.
# "HWY_SVE_256",
]
]) + ")",
],
internal = True,
visibility = ["//:__subpackages__"],
deps = [
":essential",
],
)

drake_cc_library(
name = "is_approx_equal_abstol",
hdrs = ["is_approx_equal_abstol.h"],
Expand Down Expand Up @@ -844,6 +895,30 @@ drake_cc_googletest(
],
)

drake_cc_library(
name = "hwy_dynamic_test_array_mul",
testonly = True,
srcs = ["test/hwy_dynamic_test_array_mul.cc"],
hdrs = ["test/hwy_dynamic_test_array_mul.h"],
textual_hdrs = ["test/hwy_dynamic_test_array_mul.cc"],
visibility = ["//visibility:private"],
deps = [
":hwy_dynamic",
":string_container",
"@highway_internal//:hwy",
],
)

drake_cc_googletest(
name = "hwy_dynamic_test",
deps = [
":hwy_dynamic",
":hwy_dynamic_test_array_mul",
"//common/test_utilities:eigen_matrix_compare",
"@highway_internal//:hwy_test_util",
],
)

drake_cc_googletest(
name = "identifier_test",
deps = [
Expand Down
48 changes: 48 additions & 0 deletions common/hwy_dynamic.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
#include "drake/common/hwy_dynamic.h"

#include <mutex>
#include <vector>

#include "drake/common/drake_assert.h"
#include "drake/common/never_destroyed.h"

namespace drake {
namespace internal {
namespace {

struct Globals {
mutable std::mutex mutex;
std::vector<void (*)()> resets;
};

Globals& get_singleton() {
static never_destroyed<Globals> storage;
return storage.access();
}

} // namespace

void HwyDynamicRegisterResetFunction(void (*reset)()) {
DRAKE_DEMAND(reset != nullptr);
auto& singleton = get_singleton();
std::lock_guard<std::mutex> guard(singleton.mutex);
// Check for duplicates.
for (const auto& existing : singleton.resets) {
if (reset == existing) {
return;
}
}
// Not yet added; add it now.
singleton.resets.push_back(reset);
}

void HwyDynamicReset() {
const auto& singleton = get_singleton();
// No mutex guard here; this function is documented as not thread-safe.
for (const auto& reset : singleton.resets) {
reset();
}
}

} // namespace internal
} // namespace drake
30 changes: 30 additions & 0 deletions common/hwy_dynamic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
#pragma once

namespace drake {
namespace internal {

/* This file provides two singleton-like functions whose purpose is to assist
with unit tests that want to probe code that uses "hwy/highway.h" for SIMD.
The first function is called from non-test code to register hooks that will
allow tests to reset CPU detection back to its initial conditions.
The second function is called by tests to actually do the resetting. */

/* Anywhere in Drake that uses Highway for dynamic CPU dispatch should call this
helper function to register a reset handler to clear the latched CPU detection.
(Most developers never need to worry about this, because it's automatic when you
use the tools in `hwy_dynamic_impl.h`.) This function is safe to call from
multiple threads concurrently. It's a no-op to register the same handler pointer
multiple times, i.e., duplicates are ignored. */
void HwyDynamicRegisterResetFunction(void (*)());

/* (For testing only) Clears the latched CPU target detection by calling all of
the reset handlers registered using the HwyDynamicRegisterResetFunction() above.
This allows for testing multiple different CPU targets from within the same test
program. This function is NOT thread-safe; only use this in single-threaded
tests. */
void HwyDynamicReset();

} // namespace internal
} // namespace drake
101 changes: 101 additions & 0 deletions common/hwy_dynamic_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
#pragma once

#include <atomic>
#include <utility>

#include "drake/common/hwy_dynamic.h"

// This file should only ever be included from `*.cc` implementation files,
// and we always want its code to be private to that file, so we'll use an
// anonymous namespace in a header file, for simplicity.
namespace { // NOLINT(build/namespaces)

/* LateBoundFunction is a small wrapper class around a C-style raw function
pointer. (It doesn't support std::function objects.)
In a given process, the first time our Call() function is called, it will
latch-initialize the wrapped function pointer as follows:
- call the ChooseFunctor to select which function pointer to use,
- memorize that answer for next time;
- call the selected function pointer and return its result.
After the selected function pointer is memorized, future calls to Call() will
invoke it efficiently (with no conditional branching).
Note that the memorization is thread-safe, but not limited to at-most-once
semantics. Multiple threads might concurrently select and memorize the choice,
the ChooseFunctor might be called multiple times.
In support of unit testing, selections can be unlatched via HwyDynamicReset()
in "drake/common/hwy_dynamic.h".
@tparam ChooseFunctor thread-safe functor that selects the function pointer to
be wrapped. */
template <typename ChooseFunctor>
class LateBoundFunction {
public:
/* A function pointer type, determined by what the ChooseFunctor returns. */
using FunctionPointer = decltype(ChooseFunctor()());

/* Calls this trampoline. The first time we're called, we'll choose the best
target function and memorize it; subsequent calls will directly call into the
chosen target without any conditional checks. Caveat: our memorization does
not use the sequential consistency flavor of atomics, so the "first time" here
actually means "first time on this thread"; if other threads happen to be
bootstrapping at the same time, then in the extreme ChooseThenCall() could
end up being run once per CPU core. */
template <typename... Args>
__attribute__((always_inline)) static decltype(auto) Call(Args... args) {
// We use "consume" memory order here because we only care about memory
// consistency as viewed from the current thread, not other threads. It's
// safe for multiple threads to end up calling ChooseThenCall() in case they
// all happen to run at once, in which case they will all store identical
// values back into `function_`.
auto impl = function_.load(std::memory_order::consume);
return impl(std::forward<Args>(args)...);
}

private:
template <typename... Args>
__attribute__((cold)) static decltype(auto) ChooseThenCall(Args... args) {
// Add our reset handler into the global registry. (This only registers the
// function, it doesn't call it.)
drake::internal::HwyDynamicRegisterResetFunction(&Reset);
// Force highway to select a CPU target, if it hasn't already.
if (auto& target = hwy::GetChosenTarget(); !target.IsInitialized()) {
target.Update(hwy::SupportedTargets());
}
// Retrieve the CPU-specific function pointer from the table of pointers.
auto impl = ChooseFunctor()();
// Memorize the selected pointer for next time (or until a Reset()). We use
// "release" memory order here because we only care about memory consistency
// as viewed from the current thread.
function_.store(impl, std::memory_order::release);
// Call the actual function we want and return its result.
return impl(args...);
}

/* (For testing only) Clears the latched target detection. This allows for
testing multiple different targets from the same test program. This function
is NOT thread-safe; only use this in single-threaded tests. */
__attribute__((cold)) static void Reset() {
// For the test-only function, we use the default memory order (sequential
// consistency) because we might as well keep it simple, even though under
// "only use in single-threaded tests" the memory order doesn't matter.
function_.store(&LateBoundFunction::ChooseThenCall);
}

// Static globals must be trivially destructible.
static_assert(std::is_trivially_destructible_v<std::atomic<FunctionPointer>>);

// Note that the default value for this variable is provided out-of-line
// below; it does NOT default to null.
static std::atomic<FunctionPointer> function_;
};

template <typename ChooseFunctor>
std::atomic<decltype(ChooseFunctor()())>
LateBoundFunction<ChooseFunctor>::function_ =
&LateBoundFunction::ChooseThenCall;

} // namespace
73 changes: 73 additions & 0 deletions common/test/hwy_dynamic_test.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#include <Eigen/Dense>
#include <gmock/gmock.h>
#include <gtest/gtest.h>

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wold-style-cast"
#include "hwy/tests/hwy_gtest.h"
#pragma GCC diagnostic pop

#include "drake/common/hwy_dynamic.h"
#include "drake/common/test/hwy_dynamic_test_array_mul.h"
#include "drake/common/test_utilities/eigen_matrix_compare.h"

namespace drake {
namespace internal {
namespace {

using Eigen::ArrayXd;
using Eigen::VectorXd;

/* This hwy-infused test fixture replicates every test case to be run against
every target architecture variant (e.g., SSE4, AVX2, AVX512VL, etc). When run,
it filters the suite to only run tests that the current CPU can handle. */
class Fixture : public hwy::TestWithParamTarget {
protected:
void SetUp() override {
// Reset Drake's target selection.
drake::internal::HwyDynamicReset();
// Reset Highway's target selection.
hwy::TestWithParamTarget::SetUp();
// Clear old statistics from prior test cases.
GetMutableArrayMulCounters().clear();
}
};

HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Fixture);

TEST_P(Fixture, ArrayMulTest) {
const ArrayXd x0 = ArrayXd::LinSpaced(16, 1.0, 16.0);
const ArrayXd x1 = ArrayXd::LinSpaced(16, 0.0625, 1.0);
ArrayXd y1;

// Run the test logic twice, so that we'll be able to confirm that things
// which should happen once vs twice were correct.
for (int i = 0; i < 2; ++i) {
// Clear the prior output.
y1 = {};

// Run the calculation.
ArrayMul(x0, x1, &y1);

// Check the computed answer.
const ArrayXd expected = x0 * x1;
ASSERT_TRUE(CompareMatrices(VectorXd{y1}, VectorXd{expected}));
}

// Check that Highway chose something.
ASSERT_TRUE(hwy::GetChosenTarget().IsInitialized());

// We should see exactly 1 call to the target selector, and 2 calls to the
// target-specific impl function.
const std::string expected_impl =
fmt::format("ArrayMulImpl.{}", hwy::TargetName(GetParam()));
using Pair = std::pair<std::string, int>;
string_map<int>& counters = GetMutableArrayMulCounters();
EXPECT_THAT(counters,
testing::UnorderedElementsAre(Pair("ChooseBestArrayMul", 1),
Pair(expected_impl, 2)));
}

} // namespace
} // namespace internal
} // namespace drake
Loading

0 comments on commit 23b88b6

Please sign in to comment.