Merge pull request #161 from Alexhuszagh/issue_96

Improve the formatting API.
Alexhuszagh · Sep 24, 2024 · eb3eb29 · eb3eb29
2 parents c102122 + df828cd
commit eb3eb29
Show file tree

Hide file tree

Showing 19 changed files with 2,299 additions and 262 deletions.
diff --git a/CHANGELOG b/CHANGELOG
@@ -7,11 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+### Changed
+
+- Higher performance when parsing floats with digit separators.
+
 ### Fixed
 
 - Inlining inconsistency between public API methods (credit to @zheland)
 - Incorrectly accepting leading zeros when `no_integer_leading_zeros` was enabled.
 - Have consistent errors when an invalid leading digit is found for floating point numbers to always be `Error::InvalidDigit`.
+- Incorrect parsing of consecutive digit separators.
+- Inaccuracies when parsing digit separators at various positions leading to incorrect errors being returned.
 
 ## [1.0.1] 2024-09-16
 

diff --git a/ci/comprehensive.sh b/ci/comprehensive.sh
@@ -18,6 +18,7 @@ run_tests() {
     cd "${home}"
     cd lexical-parse-float/etc/correctness
     cargo run "${@}" --release --bin test-parse-golang
+    cargo run "${@}" --release --bin test-parse-golang --features digit-separator
     cargo run "${@}" --release --bin test-parse-unittests
 
     # Test the write-float correctness tests.

diff --git a/clippy.toml b/clippy.toml
@@ -5,6 +5,8 @@ disallowed-macros = [
     { path = "std::println", reason = "no IO allowed" },
     { path = "std::format", reason = "no string allocation allowed" },
     { path = "std::debug", reason = "debugging macros should not be present in any release" },
+    # NOTE: unimplemented is fine because this can be for intentionally disabled methods
+    { path = "std::todo", reason = "should never have TODO macros in releases" },
 ]
 disallowed-methods = [
     { path = "std::io::stdout", reason = "no IO allowed" },

diff --git a/lexical-parse-float/etc/correctness/Cargo.toml b/lexical-parse-float/etc/correctness/Cargo.toml
@@ -10,19 +10,27 @@ path = "../.."
 default-features = false
 features = []
 
+[dependencies.lexical-util]
+path = "../../../lexical-util"
+default-features = false
+features = []
+
 [dependencies]
 rand = "0.8"
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1.0"
 toml = "0.8"
 rand_isaac = ">=0.3.0"
+regex = { version = ">=1.10.6", optional = true}
+lazy_static = { version = ">=1.5.0", optional = true }
 
 [features]
-std = ["lexical-parse-float/std"]
-power-of-two = ["lexical-parse-float/power-of-two"]
-radix = ["lexical-parse-float/radix"]
-format = ["lexical-parse-float/format"]
+std = ["lexical-parse-float/std", "lexical-util/std"]
+power-of-two = ["lexical-parse-float/power-of-two", "lexical-util/power-of-two"]
+radix = ["lexical-parse-float/radix", "lexical-util/radix"]
+format = ["lexical-parse-float/format", "lexical-util/format"]
 compact = ["lexical-parse-float/compact"]
+digit-separator = ["format", "regex", "lazy_static"]
 
 [workspace]
 

diff --git a/lexical-parse-float/etc/correctness/test-parse-golang/main.rs b/lexical-parse-float/etc/correctness/test-parse-golang/main.rs
@@ -1,10 +1,70 @@
 // Copyright 2021, Alex Huszagh. Unlicensed.
 // See https://unlicense.org/
 
-use lexical_parse_float::FromLexical;
+#![allow(unused_imports)]
+
+use lexical_parse_float::{FromLexicalWithOptions, Options};
+use lexical_util::format::{NumberFormatBuilder, STANDARD};
+use rand::{Rng, SeedableRng};
+use rand_isaac::Isaac64Rng;
+use core::{num, str};
 use std::collections::HashMap;
 
-fn run_test(line: &str) {
+/// Fixed 32-byte seed handed to the ISAAC RNG so every run draws the same
+/// pseudo-random sequence.
+///
+/// The bytes are the ASCII digits `14159265358979323846264338327950`.
+#[allow(dead_code)]
+pub const ISAAC_SEED: [u8; 32] = *b"14159265358979323846264338327950";
+
+// Lazily-compiled pattern matching a run of one or more `_` digit separators
+// that is immediately followed by a `+` or `-` sign. Capture group 1 is the
+// separator run and capture group 2 is the sign character.
+#[cfg(feature = "digit-separator")]
+lazy_static::lazy_static! {
+    static ref SIGN: regex::Regex = {
+        let pattern = r"(_+)([+-])";
+        regex::Regex::new(pattern).unwrap()
+    };
+}
+
+/// Parses one line of the conversion test data after inserting random `_`
+/// digit separators into the number, then checks the parsed `f32` and `f64`
+/// bit patterns against the expected hex values stored on that line.
+///
+/// * `line` - a test-data line in the fixed-width layout described below.
+/// * `rng` - source of randomness for the number and position of separators.
+///
+/// # Panics
+///
+/// Panics if `line` is shorter than the fixed-width prefix, if the string to
+/// parse is empty, if either parse fails, or if a parsed bit pattern differs
+/// from the expected hex value.
+#[cfg(feature = "digit-separator")]
+fn run_test<Random: Rng>(line: &str, rng: &mut Random) {
+    // Accept `_` as a digit separator in leading, internal and trailing
+    // positions, including consecutive runs.
+    const FMT: u128 = NumberFormatBuilder::new()
+        .digit_separator(num::NonZeroU8::new(b'_'))
+        .leading_digit_separator(true)
+        .internal_digit_separator(true)
+        .trailing_digit_separator(true)
+        .consecutive_digit_separator(true)
+        .build();
+
+    // Tests have the following format:
+    //      hhhh ssssssss dddddddddddddddddd ....
+    // The `hhhh` part is the hexadecimal representation for f16,
+    // the `ssssssss` part is the hexadecimal representation of f32,
+    // the `dddddddddddddddddd` is the hex representation of f64,
+    // and the remaining bytes are the string to parse.
+    let hex32 = line[5..13].to_lowercase();
+    let hex64 = line[14..30].to_lowercase();
+    let string = &line[31..];
+    let options = Options::new();
+
+    // now we want to add the number of digit separators we'll use
+    let count = rng.gen_range(1..=4);
+    let mut vec = string.as_bytes().to_vec();
+    // The insertion index is always drawn from the ORIGINAL length, so a
+    // separator is never appended after the last original byte.
+    let length = vec.len();
+    for _ in 0..count {
+        let idx = rng.gen_range(0..length);
+        vec.insert(idx, b'_');
+    }
+    // we need to make sure that our digit separators are in the correct location
+    // that is, they cannot be before a `+-` symbol
+    let string = str::from_utf8(&vec).unwrap();
+    // A number can carry more than one sign (mantissa and exponent), and the
+    // random insertion may put separators in front of each of them. Rewrite
+    // every `_+<sign>` occurrence, not just the first, by moving the
+    // separator run to just after the sign.
+    let valid = SIGN.replace_all(string, "${2}${1}");
+
+    let float32 = f32::from_lexical_with_options::<FMT>(valid.as_bytes(), &options).unwrap();
+    let float64 = f64::from_lexical_with_options::<FMT>(valid.as_bytes(), &options).unwrap();
+    assert_eq!(hex32, format!("{:0>8x}", float32.to_bits()));
+    assert_eq!(hex64, format!("{:0>16x}", float64.to_bits()));
+}
+
+#[cfg(not(feature = "digit-separator"))]
+fn run_test<Random: Rng>(line: &str, _: &mut Random) {
+    const FMT: u128 = STANDARD;
+
     // Tests have the following format:
     //      hhhh ssssssss dddddddddddddddddd ....
     // The `hhhh` part is the hexadecimal representation for f16,
@@ -14,9 +74,10 @@ fn run_test(line: &str) {
     let hex32 = line[5..13].to_lowercase();
     let hex64 = line[14..30].to_lowercase();
     let string = &line[31..];
+    let options = Options::new();
 
-    let float32 = f32::from_lexical(string.as_bytes()).unwrap();
-    let float64 = f64::from_lexical(string.as_bytes()).unwrap();
+    let float32 = f32::from_lexical_with_options::<FMT>(string.as_bytes(), &options).unwrap();
+    let float64 = f64::from_lexical_with_options::<FMT>(string.as_bytes(), &options).unwrap();
     assert_eq!(hex32, format!("{:0>8x}", float32.to_bits()));
     assert_eq!(hex64, format!("{:0>16x}", float64.to_bits()));
 }
@@ -68,13 +129,14 @@ fn main() {
     ]);
 
     // Unfortunately, randomize the data with miri is too expensive so we just use it normally.
+    let mut rng = Isaac64Rng::from_seed(ISAAC_SEED);
     for (&filename, data) in tests.iter() {
         println!("Running Test: {}", filename);
         for (count, line) in data.lines().enumerate() {
             if cfg!(miri) && count % 10 == 0 {
                 println!("Running test {count} for conversion tests.");
             }
-            run_test(line);
+            run_test(line, &mut rng);
             if cfg!(miri) && count > 3000 {
                 break;
             }

diff --git a/lexical-parse-float/etc/correctness/test-parse-random/rand-f64.rs b/lexical-parse-float/etc/correctness/test-parse-random/rand-f64.rs
@@ -16,10 +16,10 @@ use rand_isaac::Isaac64Rng;
 use std::mem::transmute;
 
 fn main() {
-    let mut rnd = Isaac64Rng::from_seed(ISAAC_SEED);
+    let mut rng = Isaac64Rng::from_seed(ISAAC_SEED);
     let mut i = 0;
     while i < 10_000_000 {
-        let bits = rnd.next_u64();
+        let bits = rng.next_u64();
         let x: f64 = unsafe { transmute(bits) };
         if x.is_finite() {
             validate(&format!("{:e}", x));