-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[math] Use dynamic dispatch for highway SIMD
- Loading branch information
1 parent
af0551e
commit 23b88b6
Showing
18 changed files
with
876 additions
and
793 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
#include "drake/common/hwy_dynamic.h" | ||
|
||
#include <mutex> | ||
#include <vector> | ||
|
||
#include "drake/common/drake_assert.h" | ||
#include "drake/common/never_destroyed.h" | ||
|
||
namespace drake { | ||
namespace internal { | ||
namespace { | ||
|
||
struct Globals { | ||
mutable std::mutex mutex; | ||
std::vector<void (*)()> resets; | ||
}; | ||
|
||
Globals& get_singleton() { | ||
static never_destroyed<Globals> storage; | ||
return storage.access(); | ||
} | ||
|
||
} // namespace | ||
|
||
void HwyDynamicRegisterResetFunction(void (*reset)()) { | ||
DRAKE_DEMAND(reset != nullptr); | ||
auto& singleton = get_singleton(); | ||
std::lock_guard<std::mutex> guard(singleton.mutex); | ||
// Check for duplicates. | ||
for (const auto& existing : singleton.resets) { | ||
if (reset == existing) { | ||
return; | ||
} | ||
} | ||
// Not yet added; add it now. | ||
singleton.resets.push_back(reset); | ||
} | ||
|
||
void HwyDynamicReset() { | ||
const auto& singleton = get_singleton(); | ||
// No mutex guard here; this function is documented as not thread-safe. | ||
for (const auto& reset : singleton.resets) { | ||
reset(); | ||
} | ||
} | ||
|
||
} // namespace internal | ||
} // namespace drake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
#pragma once | ||
|
||
namespace drake { | ||
namespace internal { | ||
|
||
/* This file provides two singleton-like functions whose purpose is to assist | ||
with unit tests that want to probe code that uses "hwy/highway.h" for SIMD. | ||
The first function is called from non-test code to register hooks that will | ||
allow tests to reset CPU detection back to its initial conditions. | ||
The second function is called by tests to actually do the resetting. */ | ||
|
||
/* Anywhere in Drake that uses Highway for dynamic CPU dispatch should call this | ||
helper function to register a reset handler to clear the latched CPU detection. | ||
(Most developers never need to worry about this, because it's automatic when you | ||
use the tools in `hwy_dynamic_impl.h`.) This function is safe to call from | ||
multiple threads concurrently. It's a no-op to register the same handler pointer | ||
multiple times, i.e., duplicates are ignored. */ | ||
void HwyDynamicRegisterResetFunction(void (*)()); | ||
|
||
/* (For testing only) Clears the latched CPU target detection by calling all of | ||
the reset handlers registered using the HwyDynamicRegisterResetFunction() above. | ||
This allows for testing multiple different CPU targets from within the same test | ||
program. This function is NOT thread-safe; only use this in single-threaded | ||
tests. */ | ||
void HwyDynamicReset(); | ||
|
||
} // namespace internal | ||
} // namespace drake |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
#pragma once | ||
|
||
#include <atomic> | ||
#include <utility> | ||
|
||
#include "drake/common/hwy_dynamic.h" | ||
|
||
// This file should only ever be included from `*.cc` implementation files, | ||
// and we always want its code to be private to that file, so we'll use an | ||
// anonymous namespace in a header file, for simplicity. | ||
namespace { // NOLINT(build/namespaces) | ||
|
||
/* LateBoundFunction is a small wrapper class around a C-style raw function | ||
pointer. (It doesn't support std::function objects.) | ||
In a given process, the first time our Call() function is called, it will | ||
latch-initialize the wrapped function pointer as follows: | ||
- call the ChooseFunctor to select which function pointer to use, | ||
- memorize that answer for next time; | ||
- call the selected function pointer and return its result. | ||
After the selected function pointer is memorized, future calls to Call() will | ||
invoke it efficiently (with no conditional branching). | ||
Note that the memorization is thread-safe, but not limited to at-most-once | ||
semantics. Multiple threads might concurrently select and memorize the choice, | ||
the ChooseFunctor might be called multiple times. | ||
In support of unit testing, selections can be unlatched via HwyDynamicReset() | ||
in "drake/common/hwy_dynamic.h". | ||
@tparam ChooseFunctor thread-safe functor that selects the function pointer to | ||
be wrapped. */ | ||
template <typename ChooseFunctor> | ||
class LateBoundFunction { | ||
public: | ||
/* A function pointer type, determined by what the ChooseFunctor returns. */ | ||
using FunctionPointer = decltype(ChooseFunctor()()); | ||
|
||
/* Calls this trampoline. The first time we're called, we'll choose the best | ||
target function and memorize it; subsequent calls will directly call into the | ||
chosen target without any conditional checks. Caveat: our memorization does | ||
not use the sequential consistency flavor of atomics, so the "first time" here | ||
actually means "first time on this thread"; if other threads happen to be | ||
bootstrapping at the same time, then in the extreme ChooseThenCall() could | ||
end up being run once per CPU core. */ | ||
template <typename... Args> | ||
__attribute__((always_inline)) static decltype(auto) Call(Args... args) { | ||
// We use "consume" memory order here because we only care about memory | ||
// consistency as viewed from the current thread, not other threads. It's | ||
// safe for multiple threads to end up calling ChooseThenCall() in case they | ||
// all happen to run at once, in which case they will all store identical | ||
// values back into `function_`. | ||
auto impl = function_.load(std::memory_order::consume); | ||
return impl(std::forward<Args>(args)...); | ||
} | ||
|
||
private: | ||
template <typename... Args> | ||
__attribute__((cold)) static decltype(auto) ChooseThenCall(Args... args) { | ||
// Add our reset handler into the global registry. (This only registers the | ||
// function, it doesn't call it.) | ||
drake::internal::HwyDynamicRegisterResetFunction(&Reset); | ||
// Force highway to select a CPU target, if it hasn't already. | ||
if (auto& target = hwy::GetChosenTarget(); !target.IsInitialized()) { | ||
target.Update(hwy::SupportedTargets()); | ||
} | ||
// Retrieve the CPU-specific function pointer from the table of pointers. | ||
auto impl = ChooseFunctor()(); | ||
// Memorize the selected pointer for next time (or until a Reset()). We use | ||
// "release" memory order here because we only care about memory consistency | ||
// as viewed from the current thread. | ||
function_.store(impl, std::memory_order::release); | ||
// Call the actual function we want and return its result. | ||
return impl(args...); | ||
} | ||
|
||
/* (For testing only) Clears the latched target detection. This allows for | ||
testing multiple different targets from the same test program. This function | ||
is NOT thread-safe; only use this in single-threaded tests. */ | ||
__attribute__((cold)) static void Reset() { | ||
// For the test-only function, we use the default memory order (sequential | ||
// consistency) because we might as well keep it simple, even though under | ||
// "only use in single-threaded tests" the memory order doesn't matter. | ||
function_.store(&LateBoundFunction::ChooseThenCall); | ||
} | ||
|
||
// Static globals must be trivially destructible. | ||
static_assert(std::is_trivially_destructible_v<std::atomic<FunctionPointer>>); | ||
|
||
// Note that the default value for this variable is provided out-of-line | ||
// below; it does NOT default to null. | ||
static std::atomic<FunctionPointer> function_; | ||
}; | ||
|
||
template <typename ChooseFunctor> | ||
std::atomic<decltype(ChooseFunctor()())> | ||
LateBoundFunction<ChooseFunctor>::function_ = | ||
&LateBoundFunction::ChooseThenCall; | ||
|
||
} // namespace |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,73 @@ | ||
#include <Eigen/Dense> | ||
#include <gmock/gmock.h> | ||
#include <gtest/gtest.h> | ||
|
||
#pragma GCC diagnostic push | ||
#pragma GCC diagnostic ignored "-Wold-style-cast" | ||
#include "hwy/tests/hwy_gtest.h" | ||
#pragma GCC diagnostic pop | ||
|
||
#include "drake/common/hwy_dynamic.h" | ||
#include "drake/common/test/hwy_dynamic_test_array_mul.h" | ||
#include "drake/common/test_utilities/eigen_matrix_compare.h" | ||
|
||
namespace drake { | ||
namespace internal { | ||
namespace { | ||
|
||
using Eigen::ArrayXd; | ||
using Eigen::VectorXd; | ||
|
||
/* This hwy-infused test fixture replicates every test case to be run against | ||
every target architecture variant (e.g., SSE4, AVX2, AVX512VL, etc). When run, | ||
it filters the suite to only run tests that the current CPU can handle. */ | ||
class Fixture : public hwy::TestWithParamTarget { | ||
protected: | ||
void SetUp() override { | ||
// Reset Drake's target selection. | ||
drake::internal::HwyDynamicReset(); | ||
// Reset Highway's target selection. | ||
hwy::TestWithParamTarget::SetUp(); | ||
// Clear old statistics from prior test cases. | ||
GetMutableArrayMulCounters().clear(); | ||
} | ||
}; | ||
|
||
HWY_TARGET_INSTANTIATE_TEST_SUITE_P(Fixture); | ||
|
||
TEST_P(Fixture, ArrayMulTest) { | ||
const ArrayXd x0 = ArrayXd::LinSpaced(16, 1.0, 16.0); | ||
const ArrayXd x1 = ArrayXd::LinSpaced(16, 0.0625, 1.0); | ||
ArrayXd y1; | ||
|
||
// Run the test logic twice, so that we'll be able to confirm that things | ||
// which should happen once vs twice were correct. | ||
for (int i = 0; i < 2; ++i) { | ||
// Clear the prior output. | ||
y1 = {}; | ||
|
||
// Run the calculation. | ||
ArrayMul(x0, x1, &y1); | ||
|
||
// Check the computed answer. | ||
const ArrayXd expected = x0 * x1; | ||
ASSERT_TRUE(CompareMatrices(VectorXd{y1}, VectorXd{expected})); | ||
} | ||
|
||
// Check that Highway chose something. | ||
ASSERT_TRUE(hwy::GetChosenTarget().IsInitialized()); | ||
|
||
// We should see exactly 1 call to the target selector, and 2 calls to the | ||
// target-specific impl function. | ||
const std::string expected_impl = | ||
fmt::format("ArrayMulImpl.{}", hwy::TargetName(GetParam())); | ||
using Pair = std::pair<std::string, int>; | ||
string_map<int>& counters = GetMutableArrayMulCounters(); | ||
EXPECT_THAT(counters, | ||
testing::UnorderedElementsAre(Pair("ChooseBestArrayMul", 1), | ||
Pair(expected_impl, 2))); | ||
} | ||
|
||
} // namespace | ||
} // namespace internal | ||
} // namespace drake |
Oops, something went wrong.