Skip to content

Commit

Permalink
[math] Use dynamic dispatch for highway SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
jwnimmer-tri committed Jun 26, 2024
1 parent 0053d9d commit 6e02f33
Show file tree
Hide file tree
Showing 15 changed files with 610 additions and 793 deletions.
39 changes: 39 additions & 0 deletions common/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,45 @@ drake_cc_library(
],
)

drake_cc_library(
name = "hwy_dynamic",
srcs = ["hwy_dynamic.cc"],
hdrs = [
"hwy_dynamic.h",
"hwy_dynamic_impl.h",
],
defines = [
# Highway automatically compiles our functions for multiple CPU
# targets, so that the code is tuned as best as possible for that
# target. However, some of that specialization is unncessary for our
# use of SIMD in Drake, so we opt-out of targets that do not offer any
# advantages vs their build time / code size.
"HWY_DISABLED_TARGETS=(" + "|".join([
# SSSE3 is not useful (HWY_MAX_BYTES is smaller than 32).
"HWY_SSSE3",
# SSE4 is not useful (HWY_MAX_BYTES is smaller than 32).
"HWY_SSE4",
# AVX3_ZEN4 is no better than AVX3 (we don't use BF16).
"HWY_AVX3_ZEN4",
# NEON is no better than NEON_WITHOUT_AES (we don't use AES).
"HWY_NEON",
# NEON_BF16 is no better than NEON (we don't use BF16).
"HWY_NEON_BF16",
# SVE is not useful (we can't effectively use scaleable vectors).
"HWY_SVE",
# SVE is not useful (we can't effectively use scaleable vectors).
"HWY_SVE2",
# SVE2_128 is not useful (HWY_MAX_BYTES is smaller than 32).
"HWY_SVE2_128",
]) + ")",
],
internal = True,
visibility = ["//:__subpackages__"],
deps = [
":essential",
],
)

drake_cc_library(
name = "is_approx_equal_abstol",
hdrs = ["is_approx_equal_abstol.h"],
Expand Down
41 changes: 41 additions & 0 deletions common/hwy_dynamic.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
#include "drake/common/hwy_dynamic.h"

#include <mutex>
#include <vector>

#include "drake/common/drake_assert.h"
#include "drake/common/never_destroyed.h"

namespace drake {
namespace internal {
namespace {

struct Globals {
mutable std::mutex mutex;
std::vector<void (*)()> resets;
};

Globals& get_singleton() {
static never_destroyed<Globals> storage;
return storage.access();
}

} // namespace

void HwyDynamicRegisterResetFunction(void (*reset)()) {
DRAKE_DEMAND(reset != nullptr);
auto& singleton = get_singleton();
std::lock_guard<std::mutex> guard(singleton.mutex);
singleton.resets.push_back(reset);
}

void HwyDynamicReset() {
const auto& singleton = get_singleton();
// No mutex guard here; this function is documented as not thread-safe.
for (const auto& reset : singleton.resets) {
reset();
}
}

} // namespace internal
} // namespace drake
24 changes: 24 additions & 0 deletions common/hwy_dynamic.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#pragma once

namespace drake {
namespace internal {

/* This file provides two singleton-like functions whose purpose is to assist
with unit tests that want to probe code that uses "hwy/highway.h" for SIMD. */

/* Anywhere in Drake that uses Highway for dynamic CPU dispatch should call this
helper function to register a reset handler to clear the latched CPU detction.
(Most developers never need to worry about this, because it's automatic when you
use the tools in `hwy_dynamic_impl.h`.) This function is safe to call from
multiple threads currently. */
void HwyDynamicRegisterResetFunction(void (*)());

/* (For testing only) Clears the latched CPU target detection by calling all of
the reset handlers registered using the HwyDynamicRegisterResetFunction() above.
This allows for testing multiple different CPU targets from within the same test
program. This function is NOT thread-safe; only use this in single-threaded
tests. */
void HwyDynamicReset();

} // namespace internal
} // namespace drake
84 changes: 84 additions & 0 deletions common/hwy_dynamic_impl.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
#pragma once

#include <atomic>
#include <utility>

#include "drake/common/hwy_dynamic.h"

// This file should only ever be included from `*.cc` implementation files,
// and we always want its code to be private to that file, so we'll use an
// anonymous namespace in a header file, for simplicity.
namespace { // NOLINT(build/namespaces)

/* LateBoundFunction is a small wrapper class around a C-style raw function
pointer. (It doesn't support std::function objects.)
In a given process, the first time our Call() function is called, it will
latch-initialize the wrapped function pointer as follows:
- call the ChooseFunctor to select a which function pointer to use,
- memorize that answer for next time;
- call the selected function pointer and return its result.
After the selected function pointer is memorized, future calls to Call() will
invoke it efficiently (with no conditional branching).
Note that the memorization is thread-safe, but not limited to at-most-once
semantics. Multiple threads might concurrently select and memorize the choice,
the ChooseFunctor might be called multiple times.
In support of unit testing, selections can be unlatched via HwyDynamicReset()
in "drake/common/hwy_dynamic.h".
@tparam ChooseFunctor thread-safe functor that selects the function pointer to
be wrapped. */
template <typename ChooseFunctor>
class LateBoundFunction {
public:
/* A function pointer type, determined by what the ChooseFunctor returns. */
using FunctionPointer = decltype(ChooseFunctor()());

/* Calls this trampoline. The first time we're called, we'll choose the best
target function and memorize it; subsequent calls will directly call into the
chosen target without any conditional checks. */
template <typename... Args>
__attribute__((always_inline)) static decltype(auto) Call(Args... args) {
auto impl = function_.load(std::memory_order::relaxed);
return impl(std::forward<Args>(args)...);
}

private:
template <typename... Args>
__attribute__((cold)) static decltype(auto) ChooseThenCall(Args... args) {
drake::internal::HwyDynamicRegisterResetFunction(&Reset);
hwy::GetChosenTarget().Update(hwy::SupportedTargets());
auto impl = ChooseFunctor()();
function_.store(impl, std::memory_order::relaxed);
return impl(args...);
}

/* (For testing only) Clears the latched target detection. This allows for
testing multiple different targets from the same test program. This function
is NOT thread-safe; only use this in single-threaded tests. */
__attribute__((cold)) static void Reset() {
// The memory order here doesn't really matter; this must only ever be
// called in a single-threaded context. Anyway we'll use still use relaxed
// to match the rest of this file.
function_.store(&LateBoundFunction::ChooseThenCall,
std::memory_order::relaxed);
}

// Static globals must be trivially destructible.
static_assert(std::is_trivially_destructible_v<std::atomic<FunctionPointer>>);

// All operations on this pointer must use memory_order::relaxed, which is
// zero-cost on the platforms we care about. Note that the default value for
// this variable is provided out-of-line below; it does NOT default to null.
static std::atomic<FunctionPointer> function_;
};

template <typename ChooseFunctor>
std::atomic<decltype(ChooseFunctor()())>
LateBoundFunction<ChooseFunctor>::function_ =
&LateBoundFunction::ChooseThenCall;

} // namespace
62 changes: 13 additions & 49 deletions math/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ drake_cc_package_library(
":discrete_lyapunov_equation",
":eigen_sparse_triplet",
":evenly_distributed_pts_on_sphere",
":fast_pose_composition_functions",
":geometric_transform",
":gradient",
":gray_code",
Expand Down Expand Up @@ -145,17 +144,21 @@ drake_cc_library(
"rigid_transform.cc",
"roll_pitch_yaw.cc",
"rotation_matrix.cc",
# TODO(jeremy.nimmer) For now, this filename is inconsistent with its
# header name. We plan to rename it after the editing churn finishes.
"fast_pose_composition_functions_avx2_fma.cc",
],
hdrs = [
"fast_pose_composition_functions.h",
"hopf_coordinate.h",
"quaternion.h",
"random_rotation.h",
"rigid_transform.h",
"roll_pitch_yaw.h",
"rotation_matrix.h",
],
deps = [
":fast_pose_composition_functions",
textual_hdrs = ["fast_pose_composition_functions_avx2_fma.cc"],
interface_deps = [
":vector3_util",
"//common:default_scalars",
"//common:drake_bool",
Expand All @@ -165,6 +168,10 @@ drake_cc_library(
"//common:unused",
"//math:gradient",
],
deps = [
"//common:hwy_dynamic",
"@highway_internal//:hwy",
],
)

drake_cc_library(
Expand Down Expand Up @@ -297,50 +304,6 @@ drake_cc_library(
],
)

# This setting governs when we'll compile with Intel AVX2 and FMA enabled.
# Compiling for Broadwell (or later) gets those instructions.
#
# Note that we have runtime detection of CPU support; this flag only affects
# what happens at build time, i.e., will the compiler support it.
config_setting(
name = "build_avx2_fma",
constraint_values = [
"@platforms//cpu:x86_64",
# On macOS, we opt-out of this feature (even for Apple hardware that
# supports it) to reduce our test matrix burden for the deprecated
# architecture.
"@platforms//os:linux",
],
)

drake_cc_library(
name = "fast_pose_composition_functions_avx2_fma",
srcs = ["fast_pose_composition_functions_avx2_fma.cc"],
hdrs = ["fast_pose_composition_functions_avx2_fma.h"],
copts = select({
":build_avx2_fma": ["-march=broadwell"],
"//conditions:default": [],
}),
internal = True,
visibility = ["//visibility:private"],
deps = [
"@highway_internal//:hwy",
],
)

drake_cc_library(
name = "fast_pose_composition_functions",
srcs = [
# TODO(jeremy.nimmer) For now, this filename is inconsistent with its
# library name. We will soon combine the two "fast pose" targets into
# one, using consistent naming.
"fast_pose_composition_functions_wrapper.cc",
],
hdrs = ["fast_pose_composition_functions.h"],
interface_deps = [],
deps = [":fast_pose_composition_functions_avx2_fma"],
)

# === test/ ===

drake_cc_googletest(
Expand Down Expand Up @@ -613,10 +576,11 @@ drake_cc_googletest(
drake_cc_googletest(
name = "fast_pose_composition_functions_test",
deps = [
":fast_pose_composition_functions",
":fast_pose_composition_functions_avx2_fma",
":geometric_transform",
"//common:essential",
"//common:hwy_dynamic",
"//common/test_utilities:eigen_matrix_compare",
"@highway_internal//:hwy_test_util",
],
)

Expand Down
Loading

0 comments on commit 6e02f33

Please sign in to comment.