Improve the formatting API.

This addresses #96 and #97: it fixes the lack of processing of consecutive digit separators by enhancing the internal logic, adds logic for internal and first digit separators to simplify the logic and improve performance, fixes the unit tests, and makes the errors consistent by adding checks when formatting is enabled to ensure the correct logic is used. Closes #96 Closes #97
Alexhuszagh · Sep 24, 2024 · 6d82e08 · 6d82e08
1 parent c102122
commit 6d82e08
Show file tree

Hide file tree

Showing 19 changed files with 2,292 additions and 262 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -7,11 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Higher performance when parsing floats with digit separators.
+
 ### Fixed
 
 - Inlining inconsistency between public API methods (credit to @zheland)
 - Incorrectly accepting leading zeros when `no_integer_leading_zeros` was enabled.
 - Have consistent errors when an invalid leading digit is found for floating point numbers to always be `Error::InvalidDigit`.
+- Incorrect parsing of consecutive digit separators.
+- Inaccuracies when parsing digit separators at various positions leading to incorrect errors being returned.
 
 ## [1.0.1] 2024-09-16
 

diff --git a/ci/comprehensive.sh b/ci/comprehensive.sh
@@ -18,6 +18,7 @@ run_tests() {
     cd "${home}"
     cd lexical-parse-float/etc/correctness
     cargo run "${@}" --release --bin test-parse-golang
+    cargo run "${@}" --release --bin test-parse-golang --features digit-separator
     cargo run "${@}" --release --bin test-parse-unittests
 
     # Test the write-float correctness tests.

diff --git a/clippy.toml b/clippy.toml
@@ -5,6 +5,8 @@ disallowed-macros = [
     { path = "std::println", reason = "no IO allowed" },
     { path = "std::format", reason = "no string allocation allowed" },
     { path = "std::debug", reason = "debugging macros should not be present in any release" },
+    # NOTE: unimplemented is fine because this can be for intentionally disabled methods
+    { path = "std::todo", reason = "should never have TODO macros in releases" },
 ]
 disallowed-methods = [
     { path = "std::io::stdout", reason = "no IO allowed" },

diff --git a/lexical-parse-float/etc/correctness/Cargo.toml b/lexical-parse-float/etc/correctness/Cargo.toml
@@ -10,19 +10,27 @@ path = "../.."
 default-features = false
 features = []
 
+[dependencies.lexical-util]
+path = "../../../lexical-util"
+default-features = false
+features = []
+
 [dependencies]
 rand = "0.8"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 toml = "0.8"
 rand_isaac = ">=0.3.0"
+regex = { version = ">=1.10.6", optional = true}
+lazy_static = { version = ">=1.5.0", optional = true }
 
 [features]
-std = ["lexical-parse-float/std"]
-power-of-two = ["lexical-parse-float/power-of-two"]
-radix = ["lexical-parse-float/radix"]
-format = ["lexical-parse-float/format"]
+std = ["lexical-parse-float/std", "lexical-util/std"]
+power-of-two = ["lexical-parse-float/power-of-two", "lexical-util/power-of-two"]
+radix = ["lexical-parse-float/radix", "lexical-util/radix"]
+format = ["lexical-parse-float/format", "lexical-util/format"]
 compact = ["lexical-parse-float/compact"]
+digit-separator = ["format", "regex", "lazy_static"]
 
 [workspace]
 

diff --git a/lexical-parse-float/etc/correctness/test-parse-golang/main.rs b/lexical-parse-float/etc/correctness/test-parse-golang/main.rs
@@ -1,10 +1,70 @@
 // Copyright 2021, Alex Huszagh. Unlicensed.
 // See https://unlicense.org/
 
-use lexical_parse_float::FromLexical;
+#![allow(unused_imports)]
+
+use lexical_parse_float::{FromLexicalWithOptions, Options};
+use lexical_util::format::{NumberFormatBuilder, STANDARD};
+use rand::{Rng, SeedableRng};
+use rand_isaac::Isaac64Rng;
+use core::{num, str};
 use std::collections::HashMap;
 
-fn run_test(line: &str) {
+#[allow(dead_code)]
+pub const ISAAC_SEED: [u8; 32] = [
+    49, 52, 49, 53, 57, 50, 54, 53, 51, 53, 56, 57, 55, 57, 51, 50, 51, 56, 52, 54, 50, 54, 52, 51,
+    51, 56, 51, 50, 55, 57, 53, 48,
+];
+
+#[cfg(feature = "digit-separator")]
+lazy_static::lazy_static! {
+    static ref SIGN: regex::Regex = regex::Regex::new("(_+)([+-])").unwrap();
+}
+
+#[cfg(feature = "digit-separator")]
+fn run_test<Random: Rng>(line: &str, rng: &mut Random) {
+    const FMT: u128 = NumberFormatBuilder::new()
+        .digit_separator(num::NonZeroU8::new(b'_'))
+        .leading_digit_separator(true)
+        .internal_digit_separator(true)
+        .trailing_digit_separator(true)
+        .consecutive_digit_separator(true)
+        .build();
+
+    // Tests have the following format:
+    //      hhhh ssssssss dddddddddddddddddd ....
+    // The `hhhh` part is the hexadecimal representation for f16,
+    // the `ssssssss` part is the hexadecimal representation of f32,
+    // the `dddddddddddddddddd` is the hex representation of f64,
+    // and the remaining bytes are the string to parse.
+    let hex32 = line[5..13].to_lowercase();
+    let hex64 = line[14..30].to_lowercase();
+    let string = &line[31..];
+    let options = Options::new();
+
+    // now we want to add the number of digit separators we'll use
+    let count = rng.gen_range(1..=4);
+    let mut vec = string.as_bytes().to_vec();
+    let length = vec.len();
+    for _ in 0..count {
+        let idx = rng.gen_range(0..length);
+        vec.insert(idx, b'_');
+    }
+    // we need to make sure that our digit separators are in the correct location
+    // that is, they cannot be before a `+-` symbol
+    let string = str::from_utf8(&vec).unwrap();
+    let valid = SIGN.replace(string, "${2}${1}");
+
+    let float32 = f32::from_lexical_with_options::<FMT>(valid.as_bytes(), &options).unwrap();
+    let float64 = f64::from_lexical_with_options::<FMT>(valid.as_bytes(), &options).unwrap();
+    assert_eq!(hex32, format!("{:0>8x}", float32.to_bits()));
+    assert_eq!(hex64, format!("{:0>16x}", float64.to_bits()));
+}
+
+#[cfg(not(feature = "digit-separator"))]
+fn run_test<Random: Rng>(line: &str, _: &mut Random) {
+    const FMT: u128 = STANDARD;
+
     // Tests have the following format:
     //      hhhh ssssssss dddddddddddddddddd ....
     // The `hhhh` part is the hexadecimal representation for f16,
@@ -14,9 +74,10 @@ fn run_test(line: &str) {
     let hex32 = line[5..13].to_lowercase();
     let hex64 = line[14..30].to_lowercase();
     let string = &line[31..];
+    let options = Options::new();
 
-    let float32 = f32::from_lexical(string.as_bytes()).unwrap();
-    let float64 = f64::from_lexical(string.as_bytes()).unwrap();
+    let float32 = f32::from_lexical_with_options::<FMT>(string.as_bytes(), &options).unwrap();
+    let float64 = f64::from_lexical_with_options::<FMT>(string.as_bytes(), &options).unwrap();
     assert_eq!(hex32, format!("{:0>8x}", float32.to_bits()));
     assert_eq!(hex64, format!("{:0>16x}", float64.to_bits()));
 }
@@ -68,13 +129,14 @@ fn main() {
     ]);
 
     // Unfortunately, randomize the data with miri is too expensive so we just use it normally.
+    let mut rng = Isaac64Rng::from_seed(ISAAC_SEED);
     for (&filename, data) in tests.iter() {
         println!("Running Test: {}", filename);
         for (count, line) in data.lines().enumerate() {
             if cfg!(miri) && count % 10 == 0 {
                 println!("Running test {count} for conversion tests.");
             }
-            run_test(line);
+            run_test(line, &mut rng);
             if cfg!(miri) && count > 3000 {
                 break;
             }

diff --git a/lexical-parse-float/etc/correctness/test-parse-random/rand-f64.rs b/lexical-parse-float/etc/correctness/test-parse-random/rand-f64.rs
@@ -16,10 +16,10 @@ use rand_isaac::Isaac64Rng;
 use std::mem::transmute;
 
 fn main() {
-    let mut rnd = Isaac64Rng::from_seed(ISAAC_SEED);
+    let mut rng = Isaac64Rng::from_seed(ISAAC_SEED);
     let mut i = 0;
     while i < 10_000_000 {
-        let bits = rnd.next_u64();
+        let bits = rng.next_u64();
         let x: f64 = unsafe { transmute(bits) };
         if x.is_finite() {
             validate(&format!("{:e}", x));