Merge pull request #12172 from ArchRobison/adr/subnormal

Add Julia routines for setting/getting "subnormals are zero" mode.
JuliaLang · Jul 19, 2015 · 4b757af · 4b757af
2 parents 2539fcf + 591e0a9
commit 4b757af
Show file tree

Hide file tree

Showing 6 changed files with 139 additions and 27 deletions.
diff --git a/base/exports.jl b/base/exports.jl
@@ -916,6 +916,8 @@ export
     get_rounding,
     set_rounding,
     with_rounding,
+    get_zero_subnormals,
+    set_zero_subnormals,
 
 # statistics
     cor,

diff --git a/base/rounding.jl b/base/rounding.jl
@@ -6,7 +6,8 @@ include("fenv_constants.jl")
 export
     RoundingMode, RoundNearest, RoundToZero, RoundUp, RoundDown, RoundFromZero,
     RoundNearestTiesAway, RoundNearestTiesUp,
-    get_rounding, set_rounding, with_rounding
+    get_rounding, set_rounding, with_rounding,
+    get_zero_subnormals, set_zero_subnormals
 
 ## rounding modes ##
 immutable RoundingMode{T} end
@@ -85,4 +86,7 @@ function _convert_rounding{T<:FloatingPoint}(::Type{T},x::Real,r::RoundingMode{:
     end
 end
 
+set_zero_subnormals(yes::Bool) = ccall(:jl_set_zero_subnormals,Int32,(Int8,),yes)==0
+get_zero_subnormals() = ccall(:jl_get_zero_subnormals,Int32,())!=0
+
 end #module
diff --git a/doc/manual/performance-tips.rst b/doc/manual/performance-tips.rst
@@ -761,6 +761,63 @@ resulting speedup depend very much on the hardware. You can examine
 the change in generated code by using Julia's :func:`code_native`
 function.
 
+Treat Subnormal Numbers as Zeros
+--------------------------------
+
+Subnormal numbers, formerly called `denormal numbers <https://en.wikipedia.org/wiki/Denormal_number>`_,
+are useful in many contexts, but incur a performance penalty on some hardware.
+A call :func:`set_zero_subnormals(true) <set_zero_subnormals>`
+grants permission for floating-point operations to treat subnormal
+inputs or outputs as zeros, which may improve performance on some hardware.
+A call :func:`set_zero_subnormals(false) <set_zero_subnormals>`
+enforces strict IEEE behavior for subnormal numbers.
+
+Below is an example where subnormals noticeably impact performance on some hardware::
+
+    function timestep{T}( b::Vector{T}, a::Vector{T}, Δt::T )
+        @assert length(a)==length(b)
+        n = length(b)
+        b[1] = 1                            # Boundary condition
+        for i=2:n-1
+            b[i] = a[i] + (a[i-1] - T(2)*a[i] + a[i+1]) * Δt
+        end
+        b[n] = 0                            # Boundary condition
+    end
+
+    function heatflow{T}( a::Vector{T}, nstep::Integer )
+        b = similar(a)
+        for t=1:div(nstep,2)                # Assume nstep is even
+            timestep(b,a,T(0.1))
+            timestep(a,b,T(0.1))
+        end
+    end
+
+    heatflow(zeros(Float32,10),2)           # Force compilation
+    for trial=1:6
+        a = zeros(Float32,1000)
+        set_zero_subnormals(iseven(trial))  # Odd trials use strict IEEE arithmetic
+        @time heatflow(a,1000)
+    end
+
+This example generates many subnormal numbers because the values in ``a`` become
+an exponentially decreasing curve, which slowly flattens out over time.
+
+Treating subnormals as zeros should be used with caution, because doing so
+breaks some identities, such as ``x-y==0`` implies ``x==y``::
+
+    julia> x=3f-38; y=2f-38;
+
+    julia> set_zero_subnormals(false); (x-y,x==y)
+    (1.0000001f-38,false)
+
+    julia> set_zero_subnormals(true); (x-y,x==y)
+    (0.0f0,false)
+
+In some applications, an alternative to zeroing subnormal numbers is
+to inject a tiny bit of noise.  For example, instead of
+initializing ``a`` with zeros, initialize it with::
+
+     a = rand(Float32,1000) * 1.f-9
 
 .. _man-code-warntype:
 

diff --git a/doc/stdlib/numbers.rst b/doc/stdlib/numbers.rst
@@ -310,6 +310,25 @@ General Number Functions and Constants
 
    See ``get_rounding`` for available rounding modes.
 
+.. function:: get_zero_subnormals() -> Bool
+
+   Returns ``false`` if operations on subnormal floating-point values
+   ("denormals") obey rules for IEEE arithmetic, and ``true`` if they
+   might be converted to zeros.
+
+.. function:: set_zero_subnormals(yes::Bool) -> Bool
+
+   If ``yes`` is ``false``, subsequent floating-point operations follow
+   rules for IEEE arithmetic on subnormal values ("denormals").
+   Otherwise, floating-point operations are permitted (but not required)
+   to convert subnormal inputs or outputs to zero.  Returns ``true``
+   unless ``yes==true`` but the hardware does not support zeroing of
+   subnormal numbers.
+
+   ``set_zero_subnormals(true)`` can speed up some computations on
+   some hardware. However, it can break identities such as
+   ``(x-y==0) == (x==y)``.
+
 Integers
 ~~~~~~~~
 

diff --git a/src/sys.c b/src/sys.c
@@ -430,43 +430,71 @@ DLLEXPORT void jl_cpuid(int32_t CPUInfo[4], int32_t InfoType)
 // -- set/clear the FZ/DAZ flags on x86 & x86-64 --
 #ifdef __SSE__
 
-DLLEXPORT uint8_t jl_zero_subnormals(uint8_t isZero)
-{
-    uint32_t flags = 0x00000000;
-    int32_t info[4];
-
-    jl_cpuid(info, 0);
-    if (info[0] >= 1) {
-        jl_cpuid(info, 0x00000001);
-        if ((info[3] & ((int)1 << 26)) != 0) {
-            // SSE2 supports both FZ and DAZ
-            flags = 0x00008040;
-        }
-        else if ((info[3] & ((int)1 << 25)) != 0) {
-            // SSE supports only the FZ flag
-            flags = 0x00008000;
+// Cache of information recovered from jl_cpuid.
+// In a multithreaded environment, there will be races on subnormal_flags,
+// but they are harmless idempotent races.  If we ever embrace C11, then
+// subnormal_flags should be declared atomic.
+static volatile int32_t subnormal_flags = 1;
+
+static int32_t get_subnormal_flags() {
+    uint32_t f = subnormal_flags;
+    if (f & 1) {
+        // CPU capabilities not yet inspected.
+        f = 0;
+        int32_t info[4];
+        jl_cpuid(info, 0);
+        if (info[0] >= 1) {
+            jl_cpuid(info, 0x00000001);
+            if (info[3] & (1 << 26)) {
+                // SSE2 supports both FZ and DAZ
+                f = 0x00008040;
+            } else if (info[3] & (1 << 25)) {
+                // SSE supports only the FZ flag
+                f = 0x00008000;
+            }
         }
+        subnormal_flags = f;
     }
+    return f;
+}
 
+// Returns non-zero if subnormals go to 0; zero otherwise.
+DLLEXPORT int32_t jl_get_zero_subnormals(int8_t isZero)
+{
+    uint32_t flags = get_subnormal_flags();
+    return _mm_getcsr() & flags;
+}
+
+// Return zero on success, non-zero on failure.
+DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
+{
+    uint32_t flags = get_subnormal_flags();
     if (flags) {
-        if (isZero) {
-            _mm_setcsr(_mm_getcsr() | flags);
-        }
-        else {
-            _mm_setcsr(_mm_getcsr() & ~flags);
-        }
-        return 1;
+        uint32_t state = _mm_getcsr();
+        if (isZero)
+            state |= flags;
+        else
+            state &= ~flags;
+        _mm_setcsr(state);
+        return 0;
+    } else {
+        // Report a failure only if user is trying to enable FTZ/DAZ.
+        return isZero;
     }
-    return 0;
 }
 
 #else
 
-DLLEXPORT uint8_t jl_zero_subnormals(uint8_t isZero)
+DLLEXPORT int32_t jl_get_zero_subnormals(int8_t isZero)
 {
     return 0;
 }
 
+DLLEXPORT int32_t jl_set_zero_subnormals(int8_t isZero)
+{
+    return isZero;
+}
+
 #endif
 
 // -- processor native alignment information --

diff --git a/test/math.jl b/test/math.jl
@@ -364,8 +364,10 @@ end
 @test_approx_eq quadgk(cos, 0,0.7,1, norm=abs)[1] sin(1)
 
 # Ensure subnormal flags functions don't segfault
-@test any(ccall("jl_zero_subnormals", UInt8, (UInt8,), 1) .== [0x00 0x01])
-@test any(ccall("jl_zero_subnormals", UInt8, (UInt8,), 0) .== [0x00 0x01])
+@test any(set_zero_subnormals(true) .== [false,true])
+@test any(get_zero_subnormals() .== [false,true])
+@test set_zero_subnormals(false)
+@test !get_zero_subnormals()
 
 # useful test functions for relative error
 err(z, x) = abs(z - x) / abs(x)