Skip to content

Commit

Permalink
KT-72492: Optimize String.toFloatOrNull()
Browse files Browse the repository at this point in the history
The existing implementation used a regular expression which caused memory
allocations, which are expensive on mobile devices. In addition, a custom
parser can outperform regular expressions.

The new implementation is compatible with the original regular expression
and performs ~22x faster on OpenJDK 22 with a 2021 MacBook Pro M1 Pro:

Benchmark                      Mode  Cnt    Score   Error   Units
KotlinBenchmark.customParser  thrpt       482.020          ops/ms
KotlinBenchmark.regex         thrpt        21.471          ops/ms

On a Pixel 6 running Android 14, the new implementation is ~225x faster:

    8,595,686   ns       10428 allocs    Trace    ColorBenchmark.isFloatRegex
       37,755   ns           0 allocs    Trace    ColorBenchmark.isFloat

It also has the benefit of never allocating anything (vs ~940 allocations
per invocation for the existing implementation).

^KT-72492

Co-authored-by: Romain Guy <[email protected]>
Co-authored-by: Jake Wharton <[email protected]>
Co-authored-by: Filipp Zhinkin <[email protected]>

Merge-request: KT-MR-18243
Merged-by: Filipp Zhinkin <[email protected]>
  • Loading branch information
romainguy authored and qodana-bot committed Nov 21, 2024
1 parent 79165c7 commit 3675473
Show file tree
Hide file tree
Showing 2 changed files with 472 additions and 25 deletions.
260 changes: 237 additions & 23 deletions libraries/stdlib/jvm/src/kotlin/text/StringNumberConversionsJVM.kt
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright 2010-2018 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Copyright 2010-2024 JetBrains s.r.o. and Kotlin Programming Language contributors.
* Use of this source code is governed by the Apache 2.0 license that can be found in the license/LICENSE.txt file.
*/

Expand Down Expand Up @@ -239,35 +239,249 @@ public fun String.toBigDecimalOrNull(): java.math.BigDecimal? =
public fun String.toBigDecimalOrNull(mathContext: java.math.MathContext): java.math.BigDecimal? =
screenFloatValue(this) { it.toBigDecimal(mathContext) }

/**
* Recommended floating point number validation RegEx from the javadoc of `java.lang.Double.valueOf(String)`
*/
private object ScreenFloatValueRegEx {
@JvmField val value = run {
val Digits = "(\\p{Digit}+)"
val HexDigits = "(\\p{XDigit}+)"
val Exp = "[eE][+-]?$Digits"
/**
 * Applies [parse] to [str] only when [str] passes [isValidFloat].
 *
 * Returns `null` when the string is rejected by the validator, or when a
 * [NumberFormatException] is thrown while validating or parsing.
 */
private inline fun <T> screenFloatValue(str: String, parse: (String) -> T): T? {
    try {
        // Reject early so that the parser is never invoked on a malformed string
        if (!isValidFloat(str)) return null
        return parse(str)
    } catch (_: NumberFormatException) { // overflow
        return null
    }
}

/**
 * Returns `true` if [s] has the shape of a floating point number, using the same grammar as the
 * validation regular expression recommended by the javadoc of `java.lang.Double.valueOf(String)`.
 *
 * Leading and trailing characters with a code `<= 0x20` are ignored. What remains must be an
 * optional sign (`+` or `-`) followed by one of:
 *
 * 1. A decimal number:
 *    - With an integer part only: `1234`
 *    - With an integer part followed by the decimal point: `1234.`
 *    - With integer and fractional parts: `1234.4678`
 *    - With a fractional part only: `.4678`
 *
 *    Optionally followed by a signed exponent: `e` or `E`, then optionally signed digits
 *    (`+12`, `-12`, `12`).
 *    Optionally followed by a suffix: `f`, `F`, `d`, or `D` (for instance `12.34f` or `.34D`).
 *
 * 2. A hexadecimal number:
 *    - With an integer part only: `0x12ab`
 *    - With an integer part followed by the decimal point: `0x12ab.`
 *    - With integer and fractional parts: `0x12ab.CD78`
 *    - With a fractional part only: `0x.CD78`
 *
 *    Always followed by a signed exponent: `p` or `P`, then optionally signed decimal digits
 *    (`+12`, `-12`, `12`).
 *    Optionally followed by a suffix: `f`, `F`, `d`, or `D` (for instance `0xAB.01P1f` or `0x.34P0D`).
 *
 * 3. One of the named constants `NaN` or `Infinity`.
 *
 * Implementation note: case-insensitive comparisons of ASCII letters are done by setting the 0x20
 * bit of the character code, which turns an upper case ASCII letter into its lower case counterpart.
 * This is encapsulated in the `asciiLetterToLowerCaseCode()` extension function.
 */
private fun isValidFloat(s: String): Boolean {
    var start = 0
    var endInclusive = s.length - 1

    // Skip leading spaces
    start = s.advanceWhile(start, endInclusive) { it.code <= 0x20 }

    // Empty/whitespace string
    if (start > endInclusive) return false

    // Skip trailing spaces
    endInclusive = s.backtrackWhile(start, endInclusive) { it.code <= 0x20 }

    // Number starts with a positive or negative sign
    if (s[start] == '+' || s[start] == '-') start++
    // If we have nothing after the sign, the string is invalid
    if (start > endInclusive) return false

    var isHex = false

    // Might be a hex string
    if (s[start] == '0') {
        start++
        // A "0" on its own is valid
        if (start > endInclusive) return true

        // Test for [xX] to see if we truly have a hex string
        if (s[start].asciiLetterToLowerCaseCode() == 'x'.code) {
            start++

            start = s.advanceAndValidateMantissa(start, endInclusive, true) { it.isAsciiDigit() || it.isHexLetter() }

            // A hex string must have an exponent, the string is invalid if we only found an
            // integer and/or fractional part
            if (start == -1 || start > endInclusive) return false

            isHex = true
        } else {
            // Rewind the 0 we just parsed to make things easier below and try to parse a
            // non-hexadecimal string representation of a float
            start--
        }
    }

    // Parse a non-hexadecimal representation
    if (!isHex) {
        start = s.advanceAndValidateMantissa(start, endInclusive, false) { it.isAsciiDigit() }

        // We couldn't validate the mantissa, stop here
        if (start == -1) return false

        // If we have validated the mantissa, we can stop here if we've run out of characters
        if (start > endInclusive) return true
    }

    // Look for an exponent:
    // - Mandatory for hexadecimal strings (marked by a p or P)
    // - Optional for "regular" strings (marked by an e or E)
    var l = s[start++].asciiLetterToLowerCaseCode()
    if (l != if (isHex) 'p'.code else 'e'.code) {
        // We're here if the exponent character is not valid, but if the string is a "regular"
        // string, it could be a valid f/F/d/D suffix, so check for that (it must be the last
        // character too)
        return !isHex && (l == 'f'.code || l == 'd'.code) && start > endInclusive
    }

    // An exponent must be followed by digits
    if (start > endInclusive) return false

    // There may be a sign prefix before the exponent digits
    if (s[start] == '+' || s[start] == '-') {
        start++
        if (start > endInclusive) return false
    }

    // Look for digits after the exponent and its optional sign
    val exponentDigitsStart = start
    start = s.advanceWhile(start, endInclusive) { it.isAsciiDigit() }

    // At least one exponent digit is required: without this check a string such as "1ef"
    // would have its suffix mistaken for a complete exponent
    if (start == exponentDigitsStart) return false

    // The last suffix is optional, the string is valid here
    if (start > endInclusive) return true

    // We may have an optional fFdD suffix
    if (start == endInclusive) {
        l = s[start].asciiLetterToLowerCaseCode()
        return l == 'f'.code || l == 'd'.code
    }

    // Anything left is invalid
    return false
}

private inline fun <T> screenFloatValue(str: String, parse: (String) -> T): T? {
return try {
if (ScreenFloatValueRegEx.value.matches(str))
parse(str)
else
null
} catch (e: NumberFormatException) { // overflow
/**
 * Picks the named float constant that could fit in the range delimited by the [start] and
 * [endInclusive] indices, based solely on the number of characters in that range: a range of
 * 3 characters yields "NaN" and a range of 8 characters yields "Infinity".
 *
 * Any other range length yields `null`, since neither constant can occupy it exactly.
 * The content of the range is not inspected here; callers must compare it against the
 * returned constant.
 */
@kotlin.internal.InlineOnly
private inline fun guessNamedFloatConstant(start: Int, endInclusive: Int): String? {
    // + 1 because the end index is an inclusive one
    return when (endInclusive - start + 1) {
        3 -> "NaN" // "NaN".length == 3
        8 -> "Infinity" // "Infinity".length == 8
        else -> null // Too many or too few characters, there's no valid constant
    }
}

/**
 * Returns `true` if this character is one of the ASCII digits '0' through '9'.
 */
@kotlin.internal.InlineOnly
private inline fun Char.isAsciiDigit(): Boolean {
    // Characters below '0' produce a negative offset; masking with 0xFFFF turns it into a
    // large positive value so that a single comparison covers both bounds
    val offsetFromZero = (code - '0'.code) and 0xFFFF
    return offsetFromZero < 10
}

/**
 * Returns `true` if this character is one of the letters 'a' through 'f', in either case.
 */
@kotlin.internal.InlineOnly
private inline fun Char.isHexLetter(): Boolean {
    // Characters that fold below 'a' produce a negative offset; masking with 0xFFFF turns it
    // into a large positive value so that a single comparison covers both bounds
    val offsetFromA = (asciiLetterToLowerCaseCode() - 'a'.code) and 0xFFFF
    return offsetFromA < 6
}

/**
 * Returns the code of this character with the 0x20 bit set.
 *
 * The codes of the upper- and lower-case forms of an ASCII letter differ by exactly 32 (0x20),
 * so for a character in the 'A'..'Z' range the result is the code of its lower-case counterpart,
 * and for a character in the 'a'..'z' range the result is its own code.
 *
 * For any character outside the 'A'..'Z' range the result is not a meaningful case conversion.
 * This is not a general purpose case transformation: it is meant to be used together with a
 * comparison, like `'R'.asciiLetterToLowerCaseCode() == 'r'.code`.
 */
@kotlin.internal.InlineOnly
private inline fun Char.asciiLetterToLowerCaseCode(): Int {
    return code or 0x20
}

/**
 * Moves forward from [start] for as long as the index does not exceed [endInclusive] and the
 * character at that index satisfies [predicate].
 *
 * Returns the index of the first character that does not satisfy [predicate], or
 * [endInclusive] + 1 if every character in the range satisfies it. If [start] is already
 * greater than [endInclusive], [start] is returned unchanged.
 */
@kotlin.internal.InlineOnly
private inline fun String.advanceWhile(start: Int, endInclusive: Int, predicate: (Char) -> Boolean): Int {
    var index = start
    while (index <= endInclusive && predicate(this[index])) {
        index++
    }
    return index
}

/**
 * Moves backward from [endInclusive] for as long as the index is greater than [start] and the
 * character at that index satisfies [predicate].
 *
 * Returns the index of the last character that does not satisfy [predicate]. The result is
 * never smaller than [start]: the character at [start] itself is not tested.
 */
@kotlin.internal.InlineOnly
private inline fun String.backtrackWhile(start: Int, endInclusive: Int, predicate: (Char) -> Boolean): Int {
    var index = endInclusive
    while (index > start && predicate(this[index])) {
        index--
    }
    return index
}

/**
 * Validates the mantissa found in the substring defined by the [start] and [endInclusive] indices.
 *
 * A mantissa is an integer part, a fractional part introduced by a `.`, or both, where the
 * characters of each part satisfy [predicate]. When [hexFormat] is `false`, the substring may
 * alternatively consist of exactly one of the named constants "NaN" or "Infinity".
 *
 * @param start index of the first character of the mantissa.
 * @param endInclusive index of the last character that may be examined.
 * @param hexFormat `true` when validating a hexadecimal mantissa. A hexadecimal mantissa made of an
 * integer part that runs to [endInclusive] is rejected, since a hexadecimal string requires an
 * exponent, and named constants are not accepted.
 * @param predicate tells whether a character is a valid digit of the mantissa.
 * @return the index of the first character after the mantissa, or [endInclusive] + 1 when a named
 * constant was matched, or -1 if no valid mantissa was found.
 */
@kotlin.internal.InlineOnly
private inline fun String.advanceAndValidateMantissa(start: Int, endInclusive: Int, hexFormat: Boolean, predicate: (Char) -> Boolean): Int {
    // Look for the digits of the integer part
    var index = advanceWhile(start, endInclusive, predicate)
    val hasIntegerPart = index != start

    // A hex string must have an exponent, the string is invalid if we only found an
    // integer part, but a non-hex string is valid if there's only an integer part
    if (index > endInclusive) return if (hexFormat) -1 else index

    val hasDecimalPoint = this[index] == '.'
    var hasFractionalPart = false
    if (hasDecimalPoint) {
        index++

        // Look for the digits of the fractional part
        val fractionStart = index
        index = advanceWhile(index, endInclusive, predicate)
        hasFractionalPart = index != fractionStart
    }

    // Both hex and non-hex strings must have an integer part, or a fractional part, or both
    if (hasIntegerPart || hasFractionalPart) return index

    // Only non-finite constants remain as valid candidates. They don't exist in hex format, and
    // they can't follow a decimal point: without this check, ".NaN" would be accepted because
    // the index has already moved past the '.'
    if (hexFormat || hasDecimalPoint) return -1

    val constant = guessNamedFloatConstant(index, endInclusive) ?: return -1

    // The constant was guessed from the length of the remaining range, so a match at the current
    // index means the range contains exactly the constant: advance to after it
    return if (startsWith(constant, index)) endInclusive + 1 else -1
}
Loading

0 comments on commit 3675473

Please sign in to comment.