Skip to content

Commit 90832f4

Browse files
jwnimmer-triRussTedrake
authored andcommitted
[math] Use dynamic dispatch for highway SIMD (RobotLocomotion#21626)
1 parent bf11a7f commit 90832f4

18 files changed

+883
-793
lines changed

Diff for: common/BUILD.bazel

+75
Original file line numberDiff line numberDiff line change
@@ -345,6 +345,57 @@ drake_cc_library(
345345
],
346346
)
347347

348+
drake_cc_library(
349+
name = "hwy_dynamic",
350+
srcs = ["hwy_dynamic.cc"],
351+
hdrs = [
352+
"hwy_dynamic.h",
353+
"hwy_dynamic_impl.h",
354+
],
355+
defines = [
356+
# Highway automatically compiles our code that uses it for multiple CPU
357+
# targets, so that the code is tuned as best as possible for the user's
358+
# CPU that it will run on.
359+
#
360+
# By default Highway compiles to a ~dozen different CPU targets, but
361+
# in practice Drake only cares about a few of them. We'll disable the
362+
# default list of targets and write down the CPUs we care about one by
363+
# one so that the test / support burden is better matched to our needs.
364+
#
365+
# Highway represents target sets using a bitmask. Each target is given
366+
# a single bit (e.g., '#define HWY_AVX2 (1LL << 9)'). To express a set
367+
# of targets (e.g., to set HWY_DISABLED_TARGETS) we must binary-or the
368+
# per-target constants together.
369+
#
370+
# Highway only offers a deny-list for targets, so we'll need to deny
371+
# everything and then un-deny what we want to allow. Per De Morgan's
372+
# law, an ENABLE list folded over OR is the same as the negation of the
373+
# same list folded over NAND; to end up with a DISABLE list, we can
374+
# just drop the outermost negation, i.e., ENABLE list folded over NAND.
375+
"HWY_DISABLED_TARGETS=(" + "&".join([
376+
"(~({}))".format(name)
377+
for name in [
378+
# We must always provide a portable fallback.
379+
"HWY_BASELINE_SCALAR",
380+
# x86: SIMD with 256-bit lanes and FMA.
381+
"HWY_AVX2",
382+
# x86: SIMD with the improved 512VL encodings.
383+
# TODO(jwnimmer-tri) Enable this once we support opt-out of CPU
384+
# targets via NPY_DISABLE_CPU_FEATURES environment variable.
385+
# "HWY_AVX3",
386+
# arm: SIMD that has fixed 256-bit lanes.
387+
# TODO(jwnimmer-tri) Enable this once we can test it.
388+
# "HWY_SVE_256",
389+
]
390+
]) + ")",
391+
],
392+
internal = True,
393+
visibility = ["//:__subpackages__"],
394+
deps = [
395+
":essential",
396+
],
397+
)
398+
348399
drake_cc_library(
349400
name = "is_approx_equal_abstol",
350401
hdrs = ["is_approx_equal_abstol.h"],
@@ -844,6 +895,30 @@ drake_cc_googletest(
844895
],
845896
)
846897

898+
drake_cc_library(
899+
name = "hwy_dynamic_test_array_mul",
900+
testonly = True,
901+
srcs = ["test/hwy_dynamic_test_array_mul.cc"],
902+
hdrs = ["test/hwy_dynamic_test_array_mul.h"],
903+
textual_hdrs = ["test/hwy_dynamic_test_array_mul.cc"],
904+
visibility = ["//visibility:private"],
905+
deps = [
906+
":hwy_dynamic",
907+
":string_container",
908+
"@highway_internal//:hwy",
909+
],
910+
)
911+
912+
drake_cc_googletest(
913+
name = "hwy_dynamic_test",
914+
deps = [
915+
":hwy_dynamic",
916+
":hwy_dynamic_test_array_mul",
917+
"//common/test_utilities:eigen_matrix_compare",
918+
"@highway_internal//:hwy_test_util",
919+
],
920+
)
921+
847922
drake_cc_googletest(
848923
name = "identifier_test",
849924
deps = [

Diff for: common/hwy_dynamic.cc

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
#include "drake/common/hwy_dynamic.h"
2+
3+
#include <mutex>
4+
#include <vector>
5+
6+
#include "drake/common/drake_assert.h"
7+
#include "drake/common/never_destroyed.h"
8+
9+
namespace drake {
10+
namespace internal {
11+
namespace {
12+
13+
struct Globals {
14+
mutable std::mutex mutex;
15+
std::vector<void (*)()> resets;
16+
};
17+
18+
Globals& get_singleton() {
19+
static never_destroyed<Globals> storage;
20+
return storage.access();
21+
}
22+
23+
} // namespace
24+
25+
void HwyDynamicRegisterResetFunction(void (*reset)()) {
26+
DRAKE_DEMAND(reset != nullptr);
27+
auto& singleton = get_singleton();
28+
std::lock_guard<std::mutex> guard(singleton.mutex);
29+
// Check for duplicates.
30+
for (const auto& existing : singleton.resets) {
31+
if (reset == existing) {
32+
return;
33+
}
34+
}
35+
// Not yet added; add it now.
36+
singleton.resets.push_back(reset);
37+
}
38+
39+
void HwyDynamicReset() {
40+
const auto& singleton = get_singleton();
41+
// No mutex guard here; this function is documented as not thread-safe.
42+
for (const auto& reset : singleton.resets) {
43+
reset();
44+
}
45+
}
46+
47+
} // namespace internal
48+
} // namespace drake

Diff for: common/hwy_dynamic.h

+30
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
#pragma once
2+
3+
namespace drake {
4+
namespace internal {
5+
6+
/* This file provides two singleton-like functions whose purpose is to assist
7+
with unit tests that want to probe code that uses "hwy/highway.h" for SIMD.
8+
9+
The first function is called from non-test code to register hooks that will
10+
allow tests to reset CPU detection back to its initial conditions.
11+
12+
The second function is called by tests to actually do the resetting. */
13+
14+
/* Anywhere in Drake that uses Highway for dynamic CPU dispatch should call this
15+
helper function to register a reset handler to clear the latched CPU detection.
16+
(Most developers never need to worry about this, because it's automatic when you
17+
use the tools in `hwy_dynamic_impl.h`.) This function is safe to call from
18+
multiple threads concurrently. It's a no-op to register the same handler pointer
19+
multiple times, i.e., duplicates are ignored. */
20+
void HwyDynamicRegisterResetFunction(void (*)());
21+
22+
/* (For testing only) Clears the latched CPU target detection by calling all of
23+
the reset handlers registered using the HwyDynamicRegisterResetFunction() above.
24+
This allows for testing multiple different CPU targets from within the same test
25+
program. This function is NOT thread-safe; only use this in single-threaded
26+
tests. */
27+
void HwyDynamicReset();
28+
29+
} // namespace internal
30+
} // namespace drake

Diff for: common/hwy_dynamic_impl.h

+101
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
#pragma once
2+
3+
#include <atomic>
4+
#include <utility>
5+
6+
#include "drake/common/hwy_dynamic.h"
7+
8+
// This file should only ever be included from `*.cc` implementation files,
9+
// and we always want its code to be private to that file, so we'll use an
10+
// anonymous namespace in a header file, for simplicity.
11+
namespace { // NOLINT(build/namespaces)
12+
13+
/* LateBoundFunction is a small wrapper class around a C-style raw function
14+
pointer. (It doesn't support std::function objects.)
15+
16+
In a given process, the first time our Call() function is called, it will
17+
latch-initialize the wrapped function pointer as follows:
18+
- call the ChooseFunctor to select which function pointer to use,
19+
- memorize that answer for next time;
20+
- call the selected function pointer and return its result.
21+
22+
After the selected function pointer is memorized, future calls to Call() will
23+
invoke it efficiently (with no conditional branching).
24+
25+
Note that the memorization is thread-safe, but not limited to at-most-once
26+
semantics. Multiple threads might concurrently select and memorize the choice,
27+
the ChooseFunctor might be called multiple times.
28+
29+
In support of unit testing, selections can be unlatched via HwyDynamicReset()
30+
in "drake/common/hwy_dynamic.h".
31+
32+
@tparam ChooseFunctor thread-safe functor that selects the function pointer to
33+
be wrapped. */
34+
template <typename ChooseFunctor>
35+
class LateBoundFunction {
36+
public:
37+
/* A function pointer type, determined by what the ChooseFunctor returns. */
38+
using FunctionPointer = decltype(ChooseFunctor()());
39+
40+
/* Calls this trampoline. The first time we're called, we'll choose the best
41+
target function and memorize it; subsequent calls will directly call into the
42+
chosen target without any conditional checks. Caveat: our memorization does
43+
not use the sequential consistency flavor of atomics, so the "first time" here
44+
actually means "first time on this thread"; if other threads happen to be
45+
bootstrapping at the same time, then in the extreme ChooseThenCall() could
46+
end up being run once per CPU core. */
47+
template <typename... Args>
48+
__attribute__((always_inline)) static decltype(auto) Call(Args... args) {
49+
// We use "consume" memory order here because we only care about memory
50+
// consistency as viewed from the current thread, not other threads. It's
51+
// safe for multiple threads to end up calling ChooseThenCall() in case they
52+
// all happen to run at once, in which case they will all store identical
53+
// values back into `function_`.
54+
auto impl = function_.load(std::memory_order::consume);
55+
return impl(std::forward<Args>(args)...);
56+
}
57+
58+
private:
59+
template <typename... Args>
60+
__attribute__((cold)) static decltype(auto) ChooseThenCall(Args... args) {
61+
// Add our reset handler into the global registry. (This only registers the
62+
// function, it doesn't call it.)
63+
drake::internal::HwyDynamicRegisterResetFunction(&Reset);
64+
// Force highway to select a CPU target, if it hasn't already.
65+
if (auto& target = hwy::GetChosenTarget(); !target.IsInitialized()) {
66+
target.Update(hwy::SupportedTargets());
67+
}
68+
// Retrieve the CPU-specific function pointer from the table of pointers.
69+
auto impl = ChooseFunctor()();
70+
// Memorize the selected pointer for next time (or until a Reset()). We use
71+
// "release" memory order here because we only care about memory consistency
72+
// as viewed from the current thread.
73+
function_.store(impl, std::memory_order::release);
74+
// Call the actual function we want and return its result.
75+
return impl(args...);
76+
}
77+
78+
/* (For testing only) Clears the latched target detection. This allows for
79+
testing multiple different targets from the same test program. This function
80+
is NOT thread-safe; only use this in single-threaded tests. */
81+
__attribute__((cold)) static void Reset() {
82+
// For the test-only function, we use the default memory order (sequential
83+
// consistency) because we might as well keep it simple, even though under
84+
// "only use in single-threaded tests" the memory order doesn't matter.
85+
function_.store(&LateBoundFunction::ChooseThenCall);
86+
}
87+
88+
// Static globals must be trivially destructible.
89+
static_assert(std::is_trivially_destructible_v<std::atomic<FunctionPointer>>);
90+
91+
// Note that the default value for this variable is provided out-of-line
92+
// below; it does NOT default to null.
93+
static std::atomic<FunctionPointer> function_;
94+
};
95+
96+
template <typename ChooseFunctor>
97+
std::atomic<decltype(ChooseFunctor()())>
98+
LateBoundFunction<ChooseFunctor>::function_ =
99+
&LateBoundFunction::ChooseThenCall;
100+
101+
} // namespace

Diff for: common/test/hwy_dynamic_test.cc

+73
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,73 @@
1+
#include <Eigen/Dense>
2+
#include <gmock/gmock.h>
3+
#include <gtest/gtest.h>
4+
5+
#pragma GCC diagnostic push
6+
#pragma GCC diagnostic ignored "-Wold-style-cast"
7+
#include "hwy/tests/hwy_gtest.h"
8+
#pragma GCC diagnostic pop
9+
10+
#include "drake/common/hwy_dynamic.h"
11+
#include "drake/common/test/hwy_dynamic_test_array_mul.h"
12+
#include "drake/common/test_utilities/eigen_matrix_compare.h"
13+
14+
namespace drake {
15+
namespace internal {
16+
namespace {
17+
18+
using Eigen::ArrayXd;
19+
using Eigen::VectorXd;
20+
21+
/* This hwy-infused test fixture replicates every test case to be run against
22+
every target architecture variant (e.g., SSE4, AVX2, AVX512VL, etc). When run,
23+
it filters the suite to only run tests that the current CPU can handle. */
24+
class Fixture : public hwy::TestWithParamTarget {
25+
protected:
26+
void SetUp() override {
27+
// Reset Drake's target selection.
28+
drake::internal::HwyDynamicReset();
29+
// Reset Highway's target selection.
30+
hwy::TestWithParamTarget::SetUp();
31+
// Clear old statistics from prior test cases.
32+
GetMutableArrayMulCounters().clear();
33+
}
34+
};
35+
36+
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Fixture);
37+
38+
TEST_P(Fixture, ArrayMulTest) {
39+
const ArrayXd x0 = ArrayXd::LinSpaced(16, 1.0, 16.0);
40+
const ArrayXd x1 = ArrayXd::LinSpaced(16, 0.0625, 1.0);
41+
ArrayXd y1;
42+
43+
// Run the test logic twice, so that we'll be able to confirm that things
44+
// which should happen once vs twice were correct.
45+
for (int i = 0; i < 2; ++i) {
46+
// Clear the prior output.
47+
y1 = {};
48+
49+
// Run the calculation.
50+
ArrayMul(x0, x1, &y1);
51+
52+
// Check the computed answer.
53+
const ArrayXd expected = x0 * x1;
54+
ASSERT_TRUE(CompareMatrices(VectorXd{y1}, VectorXd{expected}));
55+
}
56+
57+
// Check that Highway chose something.
58+
ASSERT_TRUE(hwy::GetChosenTarget().IsInitialized());
59+
60+
// We should see exactly 1 call to the target selector, and 2 calls to the
61+
// target-specific impl function.
62+
const std::string expected_impl =
63+
fmt::format("ArrayMulImpl.{}", hwy::TargetName(GetParam()));
64+
using Pair = std::pair<std::string, int>;
65+
string_map<int>& counters = GetMutableArrayMulCounters();
66+
EXPECT_THAT(counters,
67+
testing::UnorderedElementsAre(Pair("ChooseBestArrayMul", 1),
68+
Pair(expected_impl, 2)));
69+
}
70+
71+
} // namespace
72+
} // namespace internal
73+
} // namespace drake

0 commit comments

Comments
 (0)