Add support for new float64 representation

Numbits can be constructed from float64 and can be losslessly converted back to float64. The current fast branchless conversion is possible due to a nerd-snipe of @Merovius. He also threw it at godbolt and gave it some scrutiny. Thanks, Axel!
timbray · Jul 20, 2024 · 6a81b27 · 6a81b27
1 parent 6fb7cd4
commit 6a81b27
Show file tree

Hide file tree

Showing 4 changed files with 275 additions and 0 deletions.
diff --git a/numbits-cmp1_19_test.go b/numbits-cmp1_19_test.go
@@ -0,0 +1,50 @@
+//go:build !go1.21
+
+package quamina
+
+// The code below is copied and slightly adapted from the standard library's
+// cmp package. That package was added in Go 1.21, so this fallback has to be
+// built by every older toolchain.
+
+// compare returns
+//
+//	-1 if x is less than y,
+//	 0 if x equals y,
+//	+1 if x is greater than y.
+//
+// For floating-point types, a NaN is considered less than any non-NaN,
+// a NaN is considered equal to a NaN, and -0.0 is equal to 0.0.
+func compare[T go_1_19_Ordered](x, y T) int {
+	xNaN := go_1_19_isNaN(x)
+	yNaN := go_1_19_isNaN(y)
+	// NaN cases are resolved explicitly because < and > report false
+	// whenever either operand is a NaN.
+	if xNaN && yNaN {
+		return 0
+	}
+	if xNaN || x < y {
+		return -1
+	}
+	if yNaN || x > y {
+		return +1
+	}
+	return 0
+}
+
+// go_1_19_Ordered is a constraint that permits any ordered type: any type
+// that supports the operators < <= >= >.
+// It is a local copy of the standard library's Ordered constraint and is
+// not updated automatically if future releases of Go add new ordered types.
+//
+// Note that floating-point types may contain NaN ("not-a-number") values.
+// An operator such as == or < will always report false when
+// comparing a NaN value with any other value, NaN or not.
+// See the compare function for a consistent way to compare NaN values.
+type go_1_19_Ordered interface {
+	~int | ~int8 | ~int16 | ~int32 | ~int64 |
+		~uint | ~uint8 | ~uint16 | ~uint32 | ~uint64 | ~uintptr |
+		~float32 | ~float64 |
+		~string
+}
+
+// go_1_19_isNaN reports whether x is a NaN without requiring the math package.
+// This will always return false if T is not floating-point.
+func go_1_19_isNaN[T go_1_19_Ordered](x T) bool {
+	// A NaN is the only value that compares unequal to itself.
+	return x != x
+}
diff --git a/numbits-cmp1_20_test.go b/numbits-cmp1_20_test.go
@@ -0,0 +1,11 @@
+//go:build go1.21
+
+package quamina
+
+import "cmp"
+
+// TODO: when support for Go versions before 1.21 is dropped, replace invocations with cmp.Compare directly.
+
+// compare forwards to cmp.Compare. The cmp package was added to the standard
+// library in Go 1.21, so this file must only be built by Go 1.21 or newer.
+//
+// It returns -1 if x is less than y, 0 if x equals y and +1 if x is greater
+// than y. For floating-point types a NaN is less than any non-NaN, equal to
+// another NaN, and -0.0 is equal to 0.0.
+func compare[T cmp.Ordered](x, y T) int {
+	return cmp.Compare(x, y)
+}
diff --git a/numbits.go b/numbits.go
@@ -0,0 +1,99 @@
+package quamina
+
+import (
+	"encoding/binary"
+	"math"
+)
+
+// Bit layout of a float64, most significant bit first:
+// 1 sign bit, 11 exponent bits, 52 mantissa bits.
+const (
+	maskSign     uint64 = 0x8000_0000_0000_0000
+	maskExponent uint64 = 0x7FF0_0000_0000_0000
+	maskMantissa uint64 = 0x000F_FFFF_FFFF_FFFF
+)
+
+// Numbits representation of some boundary values.
+// Each value is what NumbitsFromFloat64 produces for the named float64:
+// non-negative floats get their sign bit set, negative floats have all bits inverted.
+const (
+	// numbitsZero is 0.0: all float bits are zero, so only the sign bit gets set.
+	numbitsZero          = Numbits(maskSign)
+	// numbitsNegZero is -0.0: the inverted pattern, directly below numbitsZero.
+	numbitsNegZero       = numbitsZero - 1
+	// numbitsNegInf is -Infinity: sign and exponent bits inverted to zero, mantissa bits inverted to one.
+	numbitsNegInf        = Numbits(maskMantissa)
+	// numbitsPosInf is +Infinity: full exponent, empty mantissa, sign bit set.
+	numbitsPosInf        = Numbits(maskSign | maskExponent)
+	// numbitsNormalizedNaN is the single NaN value returned by Normalize; it sorts directly below -Infinity.
+	numbitsNormalizedNaN = numbitsNegInf - 1
+)
+
+// Numbits is an alternative binary representation of float64 numbers.
+// A Numbits value can be converted to and from [8]byte (see Bytes and
+// NumbitsFromBytes) and to and from string (see BinaryString and
+// NumbitsFromBinaryString).
+// All possible float64 values are representable as Numbits.
+//
+// The ordering of Numbits values differs from cmp.Compare for float64, though:
+//   - 0.0 and -0.0 are not equal.
+//   - NaNs are equal only if their representation as bits is equal.
+//   - NaNs can be either larger than Infinity
+//     or smaller than -Infinity (depending on the sign bit).
+//   - use Normalize() to align the ordering with cmp.Compare.
+type Numbits uint64
+
+// NumbitsFromFloat64 converts a float64 value to its Numbits representation.
+// The conversion is branch-free and is undone by Numbits.Float64.
+func NumbitsFromFloat64(f float64) Numbits {
+	bits := math.Float64bits(f)
+	// The arithmetic shift smears the sign bit over the whole word: all ones
+	// for a set sign bit, zero otherwise. OR-ing in the top bit gives the xor
+	// mask: invert every bit of a negative value, flip only the sign bit of
+	// a non-negative one.
+	flip := uint64(int64(bits)>>63) | (1 << 63)
+	return Numbits(bits ^ flip)
+}
+
+// NumbitsFromBytes converts a [8]byte value to its Numbits representation.
+// The bytes are read in big-endian order, matching what Numbits.Bytes returns.
+func NumbitsFromBytes(b [8]byte) Numbits {
+	return Numbits(binary.BigEndian.Uint64(b[:]))
+}
+
+// NumbitsFromBinaryString converts a string value created by BinaryString to its Numbits representation.
+// It uses the first 8 bytes from the string and panics if it is shorter.
+func NumbitsFromBinaryString(s string) Numbits {
+	// This code could use slice to array conversion, but at implementation time,
+	// quamina still supported Go 1.19. The feature was introduced in 1.20.
+	return Numbits(binary.BigEndian.Uint64([]byte(s[:8])))
+}
+
+// Float64 converts Numbits back to its float64 representation.
+// The conversion is branch-free and is the inverse of NumbitsFromFloat64.
+func (n Numbits) Float64() float64 {
+	bits := uint64(n)
+	// ^bits has its top bit set exactly when bits has it clear, so the
+	// arithmetic shift yields all ones for a clear top bit and zero otherwise.
+	// Resulting xor mask: top bit set -> flip only that bit back,
+	// top bit clear -> invert every bit.
+	flip := uint64(int64(^bits)>>63) | (1 << 63)
+	return math.Float64frombits(bits ^ flip)
+}
+
+// Normalize returns a value whose ordering is aligned with cmp.Compare.
+//
+// Only two kinds of input are changed: -0.0 becomes 0.0, and every NaN
+// becomes the same single NaN representation. Everything else is returned as is.
+func (n Numbits) Normalize() Numbits {
+	switch {
+	case n == numbitsNegZero:
+		return numbitsZero
+	case n < numbitsNegInf, n > numbitsPosInf:
+		// NaNs sit outside the [-Inf, +Inf] range on either side.
+		return numbitsNormalizedNaN
+	default:
+		return n
+	}
+}
+
+// IsFinite reports whether n is neither an infinity nor a NaN.
+func (n Numbits) IsFinite() bool {
+	// Infinities are the two bounds themselves, NaNs lie beyond them.
+	outside := n <= numbitsNegInf || n >= numbitsPosInf
+	return !outside
+}
+
+// Bytes retrieves a representation as [8]byte.
+// The returned bytes are in big-endian order, i.e. the most significant
+// byte of the underlying uint64 comes first.
+func (n Numbits) Bytes() [8]byte {
+	var b [8]byte
+	binary.BigEndian.PutUint64(b[:], uint64(n))
+	return b
+}
+
+// BinaryString retrieves a lexically ordered string representation.
+// The string holds the 8 raw bytes returned by Bytes; it is binary data,
+// not printable text.
+func (n Numbits) BinaryString() string {
+	b := n.Bytes()
+	return string(b[:])
+}
diff --git a/numbits_test.go b/numbits_test.go
@@ -0,0 +1,115 @@
+package quamina
+
+import (
+	"cmp"
+	"math"
+	"testing"
+)
+
+// Test fixtures. The binary literals are grouped as sign_exponent_mantissa
+// (1, 11 and 52 bits), with the mantissa split into a 4-bit group followed by six bytes.
+var (
+	// special case, compiler does not create it when writing -0.0
+	f64_negZero = math.Float64frombits(0b1_00000000000_0000_00000000_00000000_00000000_00000000_00000000_00000000)
+
+	// boundaries of floating point value ranges
+	// (lowest and highest bit pattern of the zero, subnormal, normal, infinity and NaN ranges)
+	f64_zero      = math.Float64frombits(0b0_00000000000_0000_00000000_00000000_00000000_00000000_00000000_00000000)
+	f64_subnormLo = math.Float64frombits(0b0_00000000000_0000_00000000_00000000_00000000_00000000_00000000_00000001)
+	f64_subnormHi = math.Float64frombits(0b0_00000000000_1111_11111111_11111111_11111111_11111111_11111111_11111111)
+	f64_normLoLo  = math.Float64frombits(0b0_00000000001_0000_00000000_00000000_00000000_00000000_00000000_00000000)
+	f64_normLoHi  = math.Float64frombits(0b0_00000000001_1111_11111111_11111111_11111111_11111111_11111111_11111111)
+	f64_normHiLo  = math.Float64frombits(0b0_11111111110_0000_00000000_00000000_00000000_00000000_00000000_00000000)
+	f64_normHiHi  = math.Float64frombits(0b0_11111111110_1111_11111111_11111111_11111111_11111111_11111111_11111111)
+	f64_inf       = math.Float64frombits(0b0_11111111111_0000_00000000_00000000_00000000_00000000_00000000_00000000)
+	f64_nanLo     = math.Float64frombits(0b0_11111111111_0000_00000000_00000000_00000000_00000000_00000000_00000001)
+	f64_nanHi     = math.Float64frombits(0b0_11111111111_1111_11111111_11111111_11111111_11111111_11111111_11111111)
+
+	// named values including boundaries
+	// Every entry of the literal map below is stored twice: once as given and
+	// once with the sign bit set under the key "negative <name>".
+	values = func(positive map[string]float64) map[string]float64 {
+		// this function mirrors the values to negative
+		const sign uint64 = 1 << 63
+		m2 := make(map[string]float64, len(positive)*2)
+		for n, v := range positive {
+			m2[n] = v
+			// set the sign bit on the raw bits so that zero and NaN get mirrored as well
+			m2["negative "+n] = math.Float64frombits(math.Float64bits(v) | sign)
+		}
+		return m2
+	}(map[string]float64{
+		"zero":                  f64_zero,
+		"subnormal;lo":          f64_subnormLo,
+		"subnormal;hi":          f64_subnormHi,
+		"normal;lo-exp,lo-mant": f64_normLoLo,
+		"normal;lo-exp,hi-mant": f64_normLoHi,
+		"normal;hi-exp,lo-mant": f64_normHiLo,
+		"normal;hi-exp,hi-mant": f64_normHiHi,
+		"infinity":              f64_inf,
+		"NaN;lo":                f64_nanLo,
+		"NaN;hi":                f64_nanHi,
+		"0.1":                   0.1,
+		"1.0":                   1.0,
+		"pi":                    math.Pi,
+	})
+)
+
+// TestNumbits checks, for every fixture value, that the conversions between
+// float64, Numbits, [8]byte and binary string round-trip, and that Normalize
+// and IsFinite classify NaN, infinity and zero as documented.
+func TestNumbits(t *testing.T) {
+	// roundtrips for various values - creation and conversion
+	normalNaN := NumbitsFromFloat64(math.NaN()).Normalize()
+	for n, f := range values {
+		t.Run(n, func(t *testing.T) {
+			got := NumbitsFromFloat64(f)
+			f2 := got.Float64()
+			if cmp.Compare(f2, f) != 0 {
+				t.Errorf("NumbitsFromFloat64(%v).Float64() = %v, want %v", f, f2, f)
+			}
+			// cmp.Compare treats -0.0 and 0.0 as equal and all NaNs as equal,
+			// so only a comparison of the raw bits proves the round trip is lossless.
+			if gotBits, wantBits := math.Float64bits(f2), math.Float64bits(f); gotBits != wantBits {
+				t.Errorf("NumbitsFromFloat64(%v).Float64() has bits %016x, want %016x", f, gotBits, wantBits)
+			}
+			if bin := got.Bytes(); NumbitsFromBytes(bin) != got {
+				t.Errorf("NumbitsFromBytes(%x) = %016x, want %016x", bin, uint64(NumbitsFromBytes(bin)), uint64(got))
+			}
+			if str := got.BinaryString(); NumbitsFromBinaryString(str) != got {
+				t.Errorf("NumbitsFromBinaryString(%x) = %016x, want %016x", str, uint64(NumbitsFromBinaryString(str)), uint64(got))
+			}
+			if norm := got.Normalize(); math.IsNaN(f) && norm != normalNaN {
+				t.Errorf("Normalize() for NaN = %016x, want %016x", uint64(norm), uint64(normalNaN))
+			}
+			if got.IsFinite() == (math.IsNaN(f) || math.IsInf(f, 0)) {
+				t.Errorf("IsFinite() = %v for %v", got.IsFinite(), f)
+			}
+		})
+	}
+	t.Run("neg-zero_to_zero", func(t *testing.T) {
+		negZero := NumbitsFromFloat64(f64_negZero)
+		normZero := negZero.Normalize()
+		if normZero == negZero {
+			t.Errorf("Normalize for -0.0 failed")
+		}
+		if want := NumbitsFromFloat64(f64_zero); normZero != want {
+			t.Errorf("Normalize() for -0.0 = %016x, want %016x", uint64(normZero), uint64(want))
+		}
+		if negZero.Float64() != 0 || normZero.Float64() != 0 {
+			t.Errorf("0.0 representation error")
+		}
+		// == cannot tell -0.0 from 0.0, so the sign is checked separately.
+		if !math.Signbit(negZero.Float64()) {
+			t.Errorf("-0.0 lost its sign on the round trip")
+		}
+		if math.Signbit(normZero.Float64()) {
+			t.Errorf("normalized zero still carries a sign")
+		}
+	})
+}
+
+// TestNumbits_Compare checks for every pair of fixture values that the order of
+// the float64 values is preserved by the round trip through Float64, by the
+// Numbits values themselves, and by their binary strings. Where the Numbits
+// order intentionally deviates from compare (NaN and signed zero), the values
+// are normalized first.
+func TestNumbits_Compare(t *testing.T) {
+	for n1, f1 := range values {
+		first := NumbitsFromFloat64(f1)
+		for n2, f2 := range values {
+			// work on copies so normalization does not leak into the next iteration
+			v1, v2 := first, NumbitsFromFloat64(f2)
+			order := compare(f1, f2)
+			if o := compare(v1.Float64(), v2.Float64()); order != o {
+				t.Errorf("%v->%v: comparison after Float64() failed: want %v, got %v", n1, n2, order, o)
+			}
+			bothZero := f1 == 0 && f2 == 0
+			if bothZero || math.IsNaN(f1) {
+				v1 = v1.Normalize()
+			}
+			if bothZero || math.IsNaN(f2) {
+				v2 = v2.Normalize()
+			}
+			b1, b2 := v1.BinaryString(), v2.BinaryString()
+			if o := compare(v1, v2); order != o {
+				t.Errorf("%v->%v: direct comparison of Numbits failed: want %v, got %v for %x -> %x", n1, n2, order, o, b1, b2)
+			}
+			if o := compare(b1, b2); order != o {
+				t.Errorf("%v->%v: comparison after BinaryString() failed: want %v, got %v for %x -> %x", n1, n2, order, o, b1, b2)
+			}
+		}
+	}
+}