Merge branch 'main' into dependabot/github_actions/github/codeql-action-3.26.5
timbray · Aug 28, 2024 · 0e6f9d6 · 0e6f9d6
2 parents 8781c8f + cd8d31a
commit 0e6f9d6
Show file tree

Hide file tree

Showing 12 changed files with 756 additions and 703 deletions.
diff --git a/PATTERNS.md b/PATTERNS.md
@@ -58,16 +58,8 @@ Thus, the following Pattern would match both JSON events above:
 
 ### Numeric Values
 
-It would be convenient if Quamina knew, for matching purposes, that 35,
-35.00, and 3.5e1 were all the same number.
-
-In many cases, Quamina can manage this. Specifically, for numbers that:
-
-* are between -5.0e9 and 5.0e9 inclusive.
-* have five or fewer fractional digits.
-
-Numbers which do not meet these criteria will be treated as strings, which
-usually produces good results.
+Quamina can match numeric values with precision and range exactly the same as that provided by 
+Go's `float64` data type, which is said to conform to IEEE 754 `binary64`.
 
 ## Extended Patterns
 An **Extended Pattern** **MUST** be a JSON object containing

diff --git a/README.md b/README.md
@@ -150,9 +150,6 @@ The `"exists":true` and `"exists":false` patterns
 have corner cases; details are covered in
 [Patterns in Quamina](PATTERNS.md).
 
-Quamina can match numeric values correctly, subject to 
-certain limits; details are in [Patterns in Quamina](PATTERNS.md).
-
 ## Flattening and Matching
 
 The first step in finding matches for an Event is
@@ -386,3 +383,5 @@ colonies before slavery was abolished.
 @embano1: CI/CD and project structure.
 
 @yosiat: Flattening optimization.
+
+@arnehormann: compact high-precision number representation.
diff --git a/case_folding.go b/case_folding.go
diff --git a/core_matcher.go b/core_matcher.go
@@ -149,7 +149,13 @@ func (m *coreMatcher) deletePatterns(_ X) error {
 // This is a leftover from previous times, is only used by tests, but it's used by a *lot*
 // and it's a convenient API for testing.
 func (m *coreMatcher) matchesForJSONEvent(event []byte) ([]X, error) {
-	fields, _ := newJSONFlattener().Flatten(event, m.getSegmentsTreeTracker())
+	return m.matchesForJSONWithFlattener(event, newJSONFlattener())
+}
+
+// matchesForJSONWithFlattener is the routine to call if your test is a benchmark: call newJSONFlattener()
+// once and pass the result in, because newJSONFlattener() is fairly heavyweight and you want it out of the benchmark loop
+func (m *coreMatcher) matchesForJSONWithFlattener(event []byte, f Flattener) ([]X, error) {
+	fields, _ := f.Flatten(event, m.getSegmentsTreeTracker())
 	return m.matchesForFields(fields)
 }
 

diff --git a/flatten_json.go b/flatten_json.go
@@ -214,7 +214,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 			}
 
 			var val []byte
-			isQNumber := false
+			isNumber := false
 			switch ch {
 			case '"':
 				if fj.skipping > 0 || !memberIsUsed {
@@ -233,7 +233,10 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 				val, err = fj.readLiteral(nullBytes)
 				isLeaf = true
 			case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
-				val, isQNumber, err = fj.readNumber()
+				val, err = fj.readNumber()
+				if err == nil {
+					isNumber = true
+				}
 				isLeaf = true
 			case '[':
 				if !pathNode.IsSegmentUsed(memberName) {
@@ -296,7 +299,7 @@ func (fj *flattenJSON) readObject(pathNode SegmentsTreeTracker) error {
 			}
 			if val != nil {
 				if memberIsUsed {
-					fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isQNumber)
+					fj.storeObjectMemberField(pathNode.PathForSegment(memberName), arrayTrail, val, isNumber)
 					fieldsCount--
 				}
 			}
@@ -340,7 +343,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 	for {
 		ch := fj.ch()
 		var val []byte // resets on each loop
-		isQNumber := false
+		isNumber := false
 		switch state {
 		case fjInArrayState:
 			// bypass space before element value. A bit klunky but allows for immense simplification
@@ -365,7 +368,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 				val, err = fj.readLiteral(nullBytes)
 				isLeaf = true
 			case '-', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
-				val, isQNumber, err = fj.readNumber()
+				val, err = fj.readNumber()
+				if err == nil {
+					isNumber = true
+				}
 				isLeaf = true
 			case '{':
 				if fj.skipping == 0 {
@@ -398,7 +404,7 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
 			if val != nil {
 				if fj.skipping == 0 {
 					fj.stepOneArrayElement()
-					fj.storeArrayElementField(pathName, val, isQNumber)
+					fj.storeArrayElementField(pathName, val, isNumber)
 				}
 			}
 			state = fjAfterValueState
@@ -427,13 +433,10 @@ func (fj *flattenJSON) readArray(pathName []byte, pathNode SegmentsTreeTracker)
  *  these higher-level funcs are going to advance the pointer after each invocation
  */
 
-func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
+func (fj *flattenJSON) readNumber() ([]byte, error) {
 	// points at the first character in the number
 	numStart := fj.eventIndex
 	state := fjNumberStartState
-	isQNumber := false
-	fracStart := 0
-	expStart := 0
 	for {
 		ch := fj.ch()
 		switch state {
@@ -450,38 +453,33 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
 				// no-op
 			case '.':
 				state = fjNumberFracState
-				fracStart = fj.eventIndex + 1
 			case 'e', 'E':
 				state = fjNumberAfterEState
-				expStart = fj.eventIndex + 1
 			case ',', ']', '}', ' ', '\t', '\n', '\r':
 				fj.eventIndex--
-				return fj.event[numStart : fj.eventIndex+1], true, nil
+				return fj.event[numStart : fj.eventIndex+1], nil
 			default:
-				return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
+				return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
 			}
 		case fjNumberFracState:
 			switch ch {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
 			case ',', ']', '}', ' ', '\t', '\n', '\r':
-				fractionalDigits := (expStart - 1) - fracStart
-				isQNumber = fractionalDigits <= MaxFractionalDigits
 				fj.eventIndex--
 				bytes := fj.event[numStart : fj.eventIndex+1]
-				return bytes, isQNumber, nil
+				return bytes, nil
 			case 'e', 'E':
 				state = fjNumberAfterEState
-				expStart = fj.eventIndex + 1
 			default:
-				return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
+				return nil, fj.error(fmt.Sprintf("illegal char '%c' in number", ch))
 			}
 		case fjNumberAfterEState:
 			switch ch {
 			case '-', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
 			default:
-				return nil, false, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
+				return nil, fj.error(fmt.Sprintf("illegal char '%c' after 'e' in number", ch))
 			}
 			state = fjNumberExpState
 
@@ -490,27 +488,14 @@ func (fj *flattenJSON) readNumber() ([]byte, bool, error) {
 			case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
 				// no-op
 			case ',', ']', '}', ' ', '\t', '\n', '\r':
-				fractionalDigits := 0
-				if fracStart != 0 {
-					fractionalDigits = (expStart - 1) - fracStart
-					if fractionalDigits > MaxFractionalDigits {
-						if expStart != 0 {
-							exp, err := strconv.ParseInt(string(fj.event[expStart:fj.eventIndex]), 10, 32)
-							if err == nil {
-								fractionalDigits -= int(exp)
-							}
-						}
-					}
-				}
-				isQNumber = fractionalDigits <= MaxFractionalDigits
 				fj.eventIndex--
-				return fj.event[numStart : fj.eventIndex+1], isQNumber, nil
+				return fj.event[numStart : fj.eventIndex+1], nil
 			default:
-				return nil, false, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
+				return nil, fj.error(fmt.Sprintf("illegal char '%c' in exponent", ch))
 			}
 		}
 		if fj.step() != nil {
-			return nil, false, fj.error("event truncated in number")
+			return nil, fj.error("event truncated in number")
 		}
 	}
 }
@@ -811,6 +796,8 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
 				if hexDigitCount == 4 {
 					hexString := string(fj.event[from-3 : from+1])
 					r, _ := strconv.ParseUint(hexString, 16, 16)
+					// parsing 4 hex digits can't overflow a uint16
+					//nolint:gosec
 					codepoints = append(codepoints, uint16(r))
 					state = fjStartEscapeState
 				}
@@ -831,14 +818,14 @@ func (fj *flattenJSON) readHexUTF16(from int) ([]byte, int, error) {
 // its own snapshot of the array-trail data, because it'll be different for each array element
 // NOTE: The profiler says this is the most expensive function in the whole matchesForJSONEvent universe, presumably
 // because of the necessity to construct a new arrayTrail for each element.
-func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isQNumber bool) {
-	f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsQNumber: isQNumber}
+func (fj *flattenJSON) storeArrayElementField(path []byte, val []byte, isNumber bool) {
+	f := Field{Path: path, ArrayTrail: make([]ArrayPos, len(fj.arrayTrail)), Val: val, IsNumber: isNumber}
 	copy(f.ArrayTrail, fj.arrayTrail)
 	fj.fields = append(fj.fields, f)
 }
 
-func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isQNumber bool) {
-	fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsQNumber: isQNumber})
+func (fj *flattenJSON) storeObjectMemberField(path []byte, arrayTrail []ArrayPos, val []byte, isNumber bool) {
+	fj.fields = append(fj.fields, Field{Path: path, ArrayTrail: arrayTrail, Val: val, IsNumber: isNumber})
 }
 
 func (fj *flattenJSON) enterArray() {

diff --git a/flattener.go b/flattener.go
@@ -58,5 +58,5 @@ type Field struct {
 	Path       []byte
 	Val        []byte
 	ArrayTrail []ArrayPos
-	IsQNumber  bool
+	IsNumber   bool
 }
diff --git a/numbers.go b/numbers.go
@@ -1,121 +1,54 @@
 package quamina
 
 import (
-	"encoding/binary"
 	"errors"
+	"fmt"
 	"strconv"
 )
 
 // You can't easily build automata to compare numbers based on either the decimal notation found
-// in text data or the internal floating-point bits. Therefore, for a restricted subset of numbers,
-// we define a 7-byte (14 hex digit) representation that facilitates building automata to support
-// equality and ordering comparison.
-//
-// The representation supports 10**15 numbers. The first three are:
-// decimal: -5_000_000_000, -4_999_999_999.99999, -4_999_999_999.99998, ...
-// 14-byte: 00000000000000,       00000000000009,       00000000000014
-// and the last three are
-// decimal: .., 4_999_999_999.99998, 4_999_999_999.99999,  5_000_000_000
-// 14-byte:          2386F26FC0FFEC,      2386F26FC0FFF6, 2386F26FC10000
-//
-// In English: all numbers that are between negative and positive 5 billion inclusive, with up to five
-// digits after the decimal point.
-// These numbers have fifteen decimal digits of precision, which is what double floats can offer.
-// They include most numbers that are used in practice, including prices, occurrence counts, size
-// measurements, and so on.
-// Examples of numbers that do NOT meet these criteria include AWS account numbers, some telephone
-// numbers, and cryptographic keys/signatures. For these, treatment as strings seems to produce
-// satisfactory results for equality testing.
+// in text data or the internal floating-point bits. Therefore, we map floating-point numbers
+// (which is what JSON numbers basically are) to comparable slices of 7-bit bytes which preserve the
+// numbers' ordering. Versions of Quamina up to 1.3 used a home-grown format which used 14 hex digits
+// to represent a subset of numbers. This has now been replaced by Arne Hormann's "numbits"
+// construct, see numbits.go. It uses up to 10 base128 bytes to represent the entire range of float64 numbers.
+// Both this file and numbits.go are very short, but I'm keeping them separated because someone might
+// figure out a still-better serialization of numbers and then this part wouldn't have to change.
 // In Quamina these are called "Q numbers".
-// How It's Done
+
 // There is considerable effort to track, at the NFA level, which NFAs are built to match field values
-// that are Q numbers; see vmFields.hasQNumbers. Similarly, the JSONFlattener, since it has to
+// that are Q numbers; see vmFields.hasNumbers. Similarly, the JSONFlattener, since it has to
 // look at all the digits in a number in order to parse it, can keep track of whether it can be made
 // a Q number. The key benefit of this is in valueMatcher.transitionOn, which incurs the cost of
 // making a Q number only if it is known that the valueMatcher's NFA can benefit from it and
 // that the number in the incoming event can in fact be made a Q number.
 
-const (
-	TenE6               = 1e6
-	FiveBillion         = 5e9
-	Hexes               = "0123456789ABCDEF"
-	MaxFractionalDigits = 5
-)
-
 type qNumber []byte
 
 // qNumFromBytes works out whether a string representing a number falls within the
 // limits imposed for Q numbers. It is heavily optimized and relies on  the form
 // of the number already having been validated, e.g. by flattenJSON().
 func qNumFromBytes(bytes []byte) (qNumber, error) {
-	// shortcut: The shorest number with more than 5 fractional digits is like 0.123456
-	if len(bytes) < 8 {
-		numeric, err := strconv.ParseFloat(string(bytes), 64)
-		if err != nil {
-			return nil, errors.New("not a float") // should never happen, json parser upstream
-		}
-		return qNumFromFloat(numeric)
-	}
-	// compute number of fractional digits. The loop below relies on the fact that anything between '.' and either
-	// 'e' or the end of the string must be a digit, as must anything between 'e' and the end of the string.
-	//. NOTE: This will be fooled by "35.000000"
-	fracStart := 0
-	expStart := 0
-	index := 0
-	var utf8Byte byte
-	fractionalDigits := 0
-ForEachByte:
-	for index, utf8Byte = range bytes {
-		switch utf8Byte {
-		case '.':
-			fracStart = index + 1
-		case 'e', 'E':
-			expStart = index + 1
-			break ForEachByte
-		}
-	}
-	if fracStart != 0 {
-		fractionalDigits = index - fracStart
-	}
-	// if too many fractional digits, perhaps the exponent will push the '.' to the right
-	if fractionalDigits > MaxFractionalDigits {
-		if expStart != 0 {
-			exp, err := strconv.ParseInt(string(bytes[expStart:]), 10, 32)
-			if err == nil {
-				fractionalDigits -= int(exp)
-			}
-		}
-	}
-	if fractionalDigits > MaxFractionalDigits {
-		return nil, errors.New("more than 5 fractional digits")
-	}
-
 	numeric, err := strconv.ParseFloat(string(bytes), 64)
 	if err != nil {
-		return nil, errors.New("not a float") // shouldn't happen, upstream parser should prvent
+		return nil, errors.New("not a float") // should never happen, json parser upstream
 	}
-	return qNumFromFloat(numeric)
+	return qNumFromFloat(numeric), nil
 }
 
-func qNumFromFloat(f float64) (qNumber, error) {
-	if f < -FiveBillion || f > FiveBillion {
-		return nil, errors.New("value must be between -5e9 and +5e9 inclusive")
-	}
-	value := uint64(TenE6 * (FiveBillion + f))
-	return toHexStringSkippingFirstByte(value), nil
+// qNumFromFloat is here mostly to support testing
+func qNumFromFloat(f float64) qNumber {
+	return numbitsFromFloat64(f).toQNumber()
 }
 
-func toHexStringSkippingFirstByte(value uint64) []byte {
-	var buf [8]byte
-	binary.BigEndian.PutUint64(buf[:], value)
-	var outputChars [14]byte
-	for i, utf8Byte := range buf {
-		if i == 0 {
-			continue
+// for debugging
+func (q qNumber) String() string {
+	ret := ""
+	for i, b := range q {
+		if i != 0 {
+			ret += "-"
 		}
-		pos := (i - 1) * 2
-		outputChars[pos] = Hexes[utf8Byte>>4]
-		outputChars[pos+1] = Hexes[buf[i]&0xf]
+		ret += fmt.Sprintf("%02x", b)
 	}
-	return outputChars[:]
+	return ret
 }