Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Runtime detection of VEX variants of AVX512 #1586

Merged
merged 1 commit into from
Jun 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions crates/std_detect/src/detect/arch/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,11 @@ features! {
/// * `"avx512bf16"`
/// * `"avx512vp2intersect"`
/// * `"avx512fp16"`
/// * `"avxvnni"`
/// * `"avxifma"`
/// * `"avxneconvert"`
/// * `"avxvnniint8"`
/// * `"avxvnniint16"`
/// * `"f16c"`
/// * `"fma"`
/// * `"bmi1"`
Expand Down Expand Up @@ -172,6 +177,16 @@ features! {
/// AVX-512 P2INTERSECT
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] avx512fp16: "avx512fp16";
/// AVX-512 FP16 (FLOAT16 instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxifma: "avxifma";
/// AVX-IFMA (Integer Fused Multiply Add)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxneconvert: "avxneconvert";
/// AVX-NE-CONVERT (Exceptionless Convert)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnni: "avxvnni";
/// AVX-VNNI (Vector Neural Network Instructions)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint16: "avxvnniint16";
/// AVX-VNNI_INT8 (VNNI with 16-bit Integers)
@FEATURE: #[unstable(feature = "avx512_target_feature", issue = "44839")] avxvnniint8: "avxvnniint8";
/// AVX-VNNI_INT16 (VNNI with 8-bit integers)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] f16c: "f16c";
/// F16C (Conversions between IEEE-754 `binary16` and `binary32` formats)
@FEATURE: #[stable(feature = "simd_x86", since = "1.27.0")] fma: "fma";
Expand Down
30 changes: 19 additions & 11 deletions crates/std_detect/src/detect/cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,30 +9,30 @@ use core::sync::atomic::AtomicUsize;

/// Sets the `bit` of `x`.
#[inline]
const fn set_bit(x: u64, bit: u32) -> u64 {
const fn set_bit(x: u128, bit: u32) -> u128 {
x | 1 << bit
}

/// Tests the `bit` of `x`.
#[inline]
const fn test_bit(x: u64, bit: u32) -> bool {
const fn test_bit(x: u128, bit: u32) -> bool {
x & (1 << bit) != 0
}

/// Unset the `bit of `x`.
#[inline]
const fn unset_bit(x: u64, bit: u32) -> u64 {
const fn unset_bit(x: u128, bit: u32) -> u128 {
x & !(1 << bit)
}

/// Maximum number of features that can be cached.
const CACHE_CAPACITY: u32 = 62;
const CACHE_CAPACITY: u32 = 93;

/// This type is used to initialize the cache
// The derived `Default` implementation will initialize the field to zero,
// which is what we want.
#[derive(Copy, Clone, Default)]
pub(crate) struct Initializer(u64);
pub(crate) struct Initializer(u128);

// NOTE: the `debug_assert!` would catch that we do not add more Features than
// the one fitting our cache.
Expand Down Expand Up @@ -71,10 +71,15 @@ impl Initializer {
}

/// This global variable is a cache of the features supported by the CPU.
// Note: on x64, we only use the first slot
static CACHE: [Cache; 2] = [Cache::uninitialized(), Cache::uninitialized()];

/// Feature cache with capacity for `size_of::<usize::MAX>() * 8 - 1` features.
// Note: the third slot is only used in x86
// Another Slot can be added if needed without any change to `Initializer`
static CACHE: [Cache; 3] = [
Cache::uninitialized(),
Cache::uninitialized(),
Cache::uninitialized(),
];

/// Feature cache with capacity for `size_of::<usize>() * 8 - 1` features.
///
/// Note: 0 is used to represent an uninitialized cache, and (at least) the most
/// significant bit is set on any cache which has been initialized.
Expand Down Expand Up @@ -102,7 +107,7 @@ impl Cache {
if cached == 0 {
None
} else {
Some(test_bit(cached as u64, bit))
Some(test_bit(cached as u128, bit))
}
}

Expand Down Expand Up @@ -173,6 +178,7 @@ cfg_if::cfg_if! {
fn do_initialize(value: Initializer) {
CACHE[0].initialize((value.0) as usize & Cache::MASK);
CACHE[1].initialize((value.0 >> Cache::CAPACITY) as usize & Cache::MASK);
CACHE[2].initialize((value.0 >> 2 * Cache::CAPACITY) as usize & Cache::MASK);
}

// We only have to detect features once, and it's fairly costly, so hint to LLVM
Expand Down Expand Up @@ -205,8 +211,10 @@ fn detect_and_initialize() -> Initializer {
pub(crate) fn test(bit: u32) -> bool {
let (relative_bit, idx) = if bit < Cache::CAPACITY {
(bit, 0)
} else {
} else if bit < 2 * Cache::CAPACITY {
(bit - Cache::CAPACITY, 1)
} else {
(bit - 2 * Cache::CAPACITY, 2)
};
CACHE[idx]
.test(relative_bit)
Expand Down
30 changes: 21 additions & 9 deletions crates/std_detect/src/detect/os/x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -74,13 +74,17 @@ pub(crate) fn detect_features() -> cache::Initializer {
extended_features_ecx,
extended_features_edx,
extended_features_eax_leaf_1,
extended_features_edx_leaf_1,
) = if max_basic_leaf >= 7 {
let CpuidResult { ebx, ecx, edx, .. } = unsafe { __cpuid(0x0000_0007_u32) };
let CpuidResult { eax: eax_1, .. } =
unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1)
let CpuidResult {
eax: eax_1,
edx: edx_1,
..
} = unsafe { __cpuid_count(0x0000_0007_u32, 0x0000_0001_u32) };
(ebx, ecx, edx, eax_1, edx_1)
} else {
(0, 0, 0, 0) // CPUID does not support "Extended Features"
(0, 0, 0, 0, 0) // CPUID does not support "Extended Features"
};

// EAX = 0x8000_0000, ECX = 0: Get Highest Extended Function Supported
Expand Down Expand Up @@ -129,6 +133,10 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_edx, 26, Feature::sse2);
enable(extended_features_ebx, 29, Feature::sha);

enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);

enable(extended_features_ebx, 3, Feature::bmi1);
enable(extended_features_ebx, 8, Feature::bmi2);

Expand Down Expand Up @@ -165,8 +173,8 @@ pub(crate) fn detect_features() -> cache::Initializer {
let xcr0 = unsafe { _xgetbv(0) };
// Test `XCR0.SSE[1]` and `XCR0.AVX[2]` with the mask `0b110 == 6`:
let os_avx_support = xcr0 & 6 == 6;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 224`:
let os_avx512_support = xcr0 & 224 == 224;
// Test `XCR0.AVX-512[7:5]` with the mask `0b1110_0000 == 0xe0`:
let os_avx512_support = xcr0 & 0xe0 == 0xe0;

// Only if the OS and the CPU support saving/restoring the AVX
// registers we enable `xsave` support:
Expand Down Expand Up @@ -203,6 +211,13 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(proc_info_ecx, 28, Feature::avx);
enable(extended_features_ebx, 5, Feature::avx2);

// "Short" versions of AVX512 instructions
enable(extended_features_eax_leaf_1, 4, Feature::avxvnni);
enable(extended_features_eax_leaf_1, 23, Feature::avxifma);
enable(extended_features_edx_leaf_1, 4, Feature::avxvnniint8);
enable(extended_features_edx_leaf_1, 5, Feature::avxneconvert);
enable(extended_features_edx_leaf_1, 10, Feature::avxvnniint16);

// For AVX-512 the OS also needs to support saving/restoring
// the extended state, only then we enable AVX-512 support:
if os_avx512_support {
Expand All @@ -216,9 +231,6 @@ pub(crate) fn detect_features() -> cache::Initializer {
enable(extended_features_ebx, 31, Feature::avx512vl);
enable(extended_features_ecx, 1, Feature::avx512vbmi);
enable(extended_features_ecx, 6, Feature::avx512vbmi2);
enable(extended_features_ecx, 8, Feature::gfni);
enable(extended_features_ecx, 9, Feature::vaes);
enable(extended_features_ecx, 10, Feature::vpclmulqdq);
enable(extended_features_ecx, 11, Feature::avx512vnni);
enable(extended_features_ecx, 12, Feature::avx512bitalg);
enable(extended_features_ecx, 14, Feature::avx512vpopcntdq);
Expand Down
13 changes: 12 additions & 1 deletion crates/std_detect/tests/x86-specific.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#![cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#![allow(internal_features)]
#![feature(stdarch_internal)]
#![feature(stdarch_internal, avx512_target_feature)]

extern crate cupid;
#[macro_use]
Expand Down Expand Up @@ -68,6 +68,17 @@ fn dump() {
println!("adx: {:?}", is_x86_feature_detected!("adx"));
println!("rtm: {:?}", is_x86_feature_detected!("rtm"));
println!("movbe: {:?}", is_x86_feature_detected!("movbe"));
println!("avxvnni: {:?}", is_x86_feature_detected!("avxvnni"));
println!("avxvnniint8: {:?}", is_x86_feature_detected!("avxvnniint8"));
println!(
"avxneconvert: {:?}",
is_x86_feature_detected!("avxneconvert")
);
println!("avxifma: {:?}", is_x86_feature_detected!("avxifma"));
println!(
"avxvnniint16: {:?}",
is_x86_feature_detected!("avxvnniint16")
);
}

#[cfg(feature = "std_detect_env_override")]
Expand Down
Loading