From ea8c1c79d8780f7c4b9c7c9886b536b18650a719 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 20 Sep 2017 15:44:27 +0200 Subject: [PATCH 1/2] [runtime] initial run-time feature detection support --- Cargo.toml | 2 +- src/lib.rs | 4 +- src/macros.rs | 52 ++++++++++++ src/x86/mod.rs | 5 ++ src/x86/runtime.rs | 201 +++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 262 insertions(+), 2 deletions(-) create mode 100644 src/x86/runtime.rs diff --git a/Cargo.toml b/Cargo.toml index 9a6e7c95d7049..5dd9a2fec1b80 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -12,4 +12,4 @@ license = "MIT" [profile.release] debug = true -opt-level = 3 +opt-level = 3 \ No newline at end of file diff --git a/src/lib.rs b/src/lib.rs index e8c9c518e8c6d..1d7a80820efc6 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,7 +1,7 @@ #![allow(dead_code)] #![feature( const_fn, link_llvm_intrinsics, platform_intrinsics, repr_simd, simd_ffi, - target_feature, cfg_target_feature, i128_type + target_feature, cfg_target_feature, i128_type, asm, const_atomic_usize_new )] /// Platform independent SIMD vector types and operations. @@ -25,5 +25,7 @@ mod v128; mod v256; mod v512; mod v64; + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[macro_use] mod x86; diff --git a/src/macros.rs b/src/macros.rs index 339736e8694f0..7ba8c1e01b370 100644 --- a/src/macros.rs +++ b/src/macros.rs @@ -267,3 +267,55 @@ macro_rules! define_casts { )+ } } + +/// Is a feature supported by the host CPU? +/// +/// This macro performs run-time feature detection. It returns true if the host +/// CPU on which the binary is running supports a particular feature. +#[macro_export] +macro_rules! 
cfg_feature_enabled { + ($name:tt) => ( + { + #[cfg(target_feature = $name)] + { + true + } + #[cfg(not(target_feature = $name))] + { + __unstable_detect_feature!($name) + } + } + ) +} + +/// On ARM features are only detected at compile-time using +/// cfg(target_feature), so if this macro is executed the +/// feature is not supported. +#[cfg(any(target_arch = "arm", + target_arch = "aarch64"))] +#[macro_export] +#[doc(hidden)] +macro_rules! __unstable_detect_feature { + ("neon") => { false }; + ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) }; +} + +/// In all unsupported architectures using the macro is an error +#[cfg(not(any(target_arch = "x86", + target_arch = "x86_64", + target_arch = "arm", + target_arch = "aarch64")))] +#[macro_export] +#[doc(hidden)] +macro_rules! __unstable_detect_feature { + ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) }; +} + +#[cfg(test)] +mod tests { + #[cfg(target_arch = "x86_64")] + #[test] + fn test_macros() { + assert!(cfg_feature_enabled!("sse")); + } +} diff --git a/src/x86/mod.rs b/src/x86/mod.rs index 2cde200c086db..fac08b6f00fb9 100644 --- a/src/x86/mod.rs +++ b/src/x86/mod.rs @@ -11,6 +11,8 @@ pub use self::bmi::*; pub use self::bmi2::*; pub use self::tbm::*; +pub use self::runtime::{__Feature, __unstable_detect_feature}; + #[allow(non_camel_case_types)] pub type __m128i = ::v128::i8x16; #[allow(non_camel_case_types)] @@ -28,3 +30,6 @@ mod abm; mod bmi; mod bmi2; mod tbm; + +#[macro_use] +mod runtime; diff --git a/src/x86/runtime.rs b/src/x86/runtime.rs new file mode 100644 index 0000000000000..b2b315a98f828 --- /dev/null +++ b/src/x86/runtime.rs @@ -0,0 +1,201 @@ +//! This module implements minimal run-time feature detection for x86. +//! +//! The features are detected using the `detect_features` function below. This function +//! uses the CPUID instruction to read the feature flags from the CPU and encodes them in +//! 
a `usize` where each bit position represents whether a feature is available (bit is set) +//! or unavailable (bit is cleared). +//! +//! The enum `__Feature` is used to map bit positions to feature names, and the +//! `__unstable_detect_feature!` macro is used to map string literals (e.g. +//! "avx") to these bit positions (e.g. `__Feature::avx`). +//! +//! +//! The run-time feature detection is performed by the +//! `__unstable_detect_feature(__Feature) -> bool` function. On its first call, +//! this function queries the CPU for the available features and stores them in +//! a global `AtomicUsize` variable. The query is performed by just checking whether the +//! feature bit in this global variable is set or cleared. +use ::std::sync::atomic::{AtomicUsize, Ordering}; + +/// This macro maps the string-literal feature names to values of the +/// `__Feature` enum at compile-time. The feature names used are the same as +/// those of rustc `target_feature` and `cfg_target_feature` features. +/// +/// PLEASE: do not use this, it is an implementation detail subject to change. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[macro_export] +#[doc(hidden)] +macro_rules! 
__unstable_detect_feature { + ("sse") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse{}) }; + ("sse2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse2{}) }; + ("sse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse3{}) }; + ("ssse3") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::ssse3{}) }; + ("sse4.1") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_1{}) }; + ("sse4.2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::sse4_2{}) }; + ("avx") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx{}) }; + ("avx2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::avx2{}) }; + ("fma") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::fma{}) }; + ("bmi") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi{}) }; + ("bmi2") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::bmi2{}) }; + ("abm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) }; + ("lzcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::abm{}) }; + ("tbm") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::tbm{}) }; + ("popcnt") => { $crate::vendor::__unstable_detect_feature($crate::vendor::__Feature::popcnt{}) }; + ($t:tt) => { compile_error!(concat!("unknown target feature: ", $t)) }; +} + +/// X86 CPU Feature enum. Each variant denotes a position in a bitset for a +/// particular feature. +/// +/// PLEASE: do not use this, it is an implementation detail subject to change. 
+#[doc(hidden)] +#[allow(non_camel_case_types)] +#[repr(u8)] +pub enum __Feature { + /// SSE (Streaming SIMD Extensions) + sse = 0, + /// SSE2 (Streaming SIMD Extensions 2) + sse2 = 1, + /// SSE3 (Streaming SIMD Extensions 3) + sse3 = 2, + /// SSSE3 (Supplemental Streaming SIMD Extensions 3) + ssse3 = 3, + /// SSE4.1 (Streaming SIMD Extensions 4.1) + sse4_1 = 4, + /// SSE4.2 (Streaming SIMD Extensions 4.2) + sse4_2 = 5, + /// AVX (Advanced Vector Extensions) + avx = 6, + /// AVX2 (Advanced Vector Extensions 2) + avx2 = 7, + /// FMA (Fused Multiply Add) + fma = 8, + /// BMI1 (Bit Manipulation Instructions 1) + bmi = 9, + /// BMI1 (Bit Manipulation Instructions 2) + bmi2 = 10, + /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero Count) on Intel + abm = 11, + /// TBM (Trailing Bit Manipulation) + tbm = 12, + /// POPCNT (Population Count) + popcnt = 13, + + #[doc(hidden)] + __NonExhaustive +} + +fn set_bit(x: usize, bit: u32) -> usize { + debug_assert!(32 > bit); + x | 1 << bit +} + +fn test_bit(x: usize, bit: u32) -> bool { + debug_assert!(32 > bit); + x & (1 << bit) != 0 +} + +fn inv_test_bit(v: usize, idx: u32) -> bool { + debug_assert!(32 > idx); + ((v >> idx) & 1) != 0 +} + +/// Run-time feature detection on x86 works by using the CPUID instruction. +/// +/// The [CPUID Wikipedia page](https://en.wikipedia.org/wiki/CPUID) contains all +/// the information about which flags to set to query which values, and in which +/// registers these are reported. +/// +/// The definitive references are: +/// - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2: Instruction Set Reference, A-Z](http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf). +/// - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and System Instructions](http://support.amd.com/TechDocs/24594.pdf). 
+/// +fn detect_features() -> usize { + let ebx; + let ecx; + let edx; + + unsafe { + /// To obtain all feature flags we need two CPUID queries: + + /// 1. EAX=1, ECX=0: Queries "Processor Info and Feature Bits" + /// This gives us most of the CPU features in ECX and EDX (see below), + asm!("cpuid" + : "={ecx}"(ecx), "={edx}"(edx) + : "{eax}"(0x00000001u32), "{ecx}"(0 as u32) + : :); + + /// 2. EAX=7, ECX=0: Queries "Extended Features" + /// This gives us information about bmi,bmi2, and avx2 support (see below). + asm!("cpuid" + : "={ebx}"(ebx) + : "{eax}"(0x00000007u32), "{ecx}"(0 as u32) + : :); + } + + let mut value: usize = 0; + + // CPUID call with EAX=7, ECX=0 => Extended Features in EBX and ECX (unneeded): + if inv_test_bit(ebx, 3) { value = set_bit(value, __Feature::bmi as u32); } + if inv_test_bit(ebx, 5) { value = set_bit(value, __Feature::avx2 as u32); } + if inv_test_bit(ebx, 8) { value = set_bit(value, __Feature::bmi2 as u32); } + + // CPUID call with EAX=1 => feature bits in ECX and EDX: + if inv_test_bit(ecx, 0) { value = set_bit(value, __Feature::sse3 as u32); } + if inv_test_bit(ecx, 5) { value = set_bit(value, __Feature::abm as u32); } + if inv_test_bit(ecx, 9) { value = set_bit(value, __Feature::ssse3 as u32); } + if inv_test_bit(ecx, 12) { value = set_bit(value, __Feature::fma as u32); } + if inv_test_bit(ecx, 19) { value = set_bit(value, __Feature::sse4_1 as u32); } + if inv_test_bit(ecx, 20) { value = set_bit(value, __Feature::sse4_2 as u32); } + if inv_test_bit(ecx, 21) { value = set_bit(value, __Feature::tbm as u32); } + if inv_test_bit(ecx, 23) { value = set_bit(value, __Feature::popcnt as u32); } + if inv_test_bit(ecx, 28) { value = set_bit(value, __Feature::avx as u32); } + + if inv_test_bit(edx, 25) { value = set_bit(value, __Feature::sse as u32); } + if inv_test_bit(edx, 26) { value = set_bit(value, __Feature::sse2 as u32); } + + value +} + +/// This global variable is a bitset used to cache the features supported by the +/// CPU. 
+static FEATURES: AtomicUsize = AtomicUsize::new(::std::usize::MAX); + +/// Performs run-time feature detection. +/// +/// On its first invocation, it detects the CPU features and caches them in the +/// `FEATURES` global variable as an `AtomicUsize`. +/// +/// It uses the `__Feature` variant to index into this variable as a bitset. If +/// the bit is set, the feature is enabled, and otherwise it is disabled. +/// +/// PLEASE: do not use this, it is an implementation detail subject to change. +#[doc(hidden)] +pub fn __unstable_detect_feature(x: __Feature) -> bool { + if FEATURES.load(Ordering::Relaxed) == ::std::usize::MAX { + FEATURES.store(detect_features(), Ordering::Relaxed); + } + test_bit(FEATURES.load(Ordering::Relaxed), x as u32) +} + +#[cfg(test)] +mod tests { + #[test] + fn runtime_detection_x86_nocapture() { + println!("sse: {:?}", cfg_feature_enabled!("sse")); + println!("sse2: {:?}", cfg_feature_enabled!("sse2")); + println!("sse3: {:?}", cfg_feature_enabled!("sse3")); + println!("ssse3: {:?}", cfg_feature_enabled!("ssse3")); + println!("sse4.1: {:?}", cfg_feature_enabled!("sse4.1")); + println!("sse4.2: {:?}", cfg_feature_enabled!("sse4.2")); + println!("avx: {:?}", cfg_feature_enabled!("avx")); + println!("avx2: {:?}", cfg_feature_enabled!("avx2")); + println!("abm: {:?}", cfg_feature_enabled!("abm")); + println!("bmi: {:?}", cfg_feature_enabled!("bmi")); + println!("bmi2: {:?}", cfg_feature_enabled!("bmi2")); + println!("tbm: {:?}", cfg_feature_enabled!("tbm")); + println!("popcnt: {:?}", cfg_feature_enabled!("popcnt")); + println!("lzcnt: {:?}", cfg_feature_enabled!("lzcnt")); + println!("fma: {:?}", cfg_feature_enabled!("fma")); + } +} From 9f3c3331f09b123035c8f9a26af6a98df4f5ff77 Mon Sep 17 00:00:00 2001 From: gnzlbg Date: Wed, 20 Sep 2017 20:56:33 +0200 Subject: [PATCH 2/2] remove explicit discriminant values --- src/x86/runtime.rs | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/x86/runtime.rs 
b/src/x86/runtime.rs index b2b315a98f828..4071a1ce587dc 100644 --- a/src/x86/runtime.rs +++ b/src/x86/runtime.rs @@ -53,33 +53,33 @@ macro_rules! __unstable_detect_feature { #[repr(u8)] pub enum __Feature { /// SSE (Streaming SIMD Extensions) - sse = 0, + sse, /// SSE2 (Streaming SIMD Extensions 2) - sse2 = 1, + sse2, /// SSE3 (Streaming SIMD Extensions 3) - sse3 = 2, + sse3, /// SSSE3 (Supplemental Streaming SIMD Extensions 3) - ssse3 = 3, + ssse3, /// SSE4.1 (Streaming SIMD Extensions 4.1) - sse4_1 = 4, + sse4_1, /// SSE4.2 (Streaming SIMD Extensions 4.2) - sse4_2 = 5, + sse4_2, /// AVX (Advanced Vector Extensions) - avx = 6, + avx, /// AVX2 (Advanced Vector Extensions 2) - avx2 = 7, + avx2, /// FMA (Fused Multiply Add) - fma = 8, + fma, /// BMI1 (Bit Manipulation Instructions 1) - bmi = 9, + bmi, /// BMI1 (Bit Manipulation Instructions 2) - bmi2 = 10, + bmi2, /// ABM (Advanced Bit Manipulation) on AMD / LZCNT (Leading Zero Count) on Intel - abm = 11, + abm, /// TBM (Trailing Bit Manipulation) - tbm = 12, + tbm, /// POPCNT (Population Count) - popcnt = 13, + popcnt, #[doc(hidden)] __NonExhaustive