diff --git a/arrow-cast/src/parse.rs b/arrow-cast/src/parse.rs index b266cc4aa360..a23f421c34db 100644 --- a/arrow-cast/src/parse.rs +++ b/arrow-cast/src/parse.rs @@ -741,32 +741,27 @@ fn parse_e_notation( let mut exp: i16 = 0; let base = T::Native::usize_as(10); - let mut exp_start: bool = false; // e has a plus sign let mut pos_shift_direction: bool = true; - // skip to point or exponent index - let mut bs; - if fractionals > 0 { - // it's a fraction, so the point index needs to be skipped, so +1 - bs = s.as_bytes().iter().skip(index + fractionals as usize + 1); - } else { - // it's actually an integer that is already written into the result, so let's skip on to e - bs = s.as_bytes().iter().skip(index); - } + // skip to the exponent index directly or just after any processed fractionals + let mut bs = s.as_bytes().iter().skip(index + fractionals as usize); - while let Some(b) = bs.next() { + // This function is only called from `parse_decimal`, in which we skip parsing any fractionals + // after we reach `scale` digits, not knowing ahead of time whether the decimal contains an + // e-notation or not. + // So once we do hit into an e-notation, and drop down into this function, we need to parse the + // remaining unprocessed fractionals too, since otherwise we might lose precision. + for b in bs.by_ref() { match b { b'0'..=b'9' => { result = result.mul_wrapping(base); result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); - if fractionals > 0 { - fractionals += 1; - } + fractionals += 1; digits += 1; } - &b'e' | &b'E' => { - exp_start = true; + b'e' | b'E' => { + break; } _ => { return Err(ArrowError::ParseError(format!( @@ -774,39 +769,29 @@ fn parse_e_notation( ))); } }; + } - if exp_start { - pos_shift_direction = match bs.next() { - Some(&b'-') => false, - Some(&b'+') => true, - Some(b) => { - if !b.is_ascii_digit() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - - exp *= 10; - exp += (b - b'0') as i16; - - true - } - None => { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } - }; - - for b in bs.by_ref() { - if !b.is_ascii_digit() { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } + // parse the exponent itself + let mut signed = false; + for b in bs { + match b { + b'-' if !signed => { + pos_shift_direction = false; + signed = true; + } + b'+' if !signed => { + pos_shift_direction = true; + signed = true; + } + b if b.is_ascii_digit() => { exp *= 10; exp += (b - b'0') as i16; } + _ => { + return Err(ArrowError::ParseError(format!( + "can't parse the string value {s} to decimal" + ))); + } } } @@ -850,7 +835,11 @@ fn parse_e_notation( } /// Parse the string format decimal value to i128/i256 format and checking the precision and scale. -/// The result value can't be out of bounds. +/// Expected behavior: +/// - The result value can't be out of bounds. +/// - When parsing a decimal with scale 0, all fractional digits will be discarded. The final +/// fractional digits may be a subset or a superset of the digits after the decimal point when +/// e-notation is used. pub fn parse_decimal( s: &str, precision: u8, @@ -862,18 +851,24 @@ pub fn parse_decimal( let base = T::Native::usize_as(10); let bs = s.as_bytes(); - let (signed, negative) = match bs.first() { - Some(b'-') => (true, true), - Some(b'+') => (true, false), - _ => (false, false), - }; - if bs.is_empty() || signed && bs.len() == 1 { + if !bs + .last() + .is_some_and(|b| b.is_ascii_digit() || (b == &b'.' && s.len() > 1)) + { + // If the last character is not a digit (or a decimal point prefixed with some digits), then + // it's not a valid decimal. return Err(ArrowError::ParseError(format!( "can't parse the string value {s} to decimal" ))); } + let (signed, negative) = match bs.first() { + Some(b'-') => (true, true), + Some(b'+') => (true, false), + _ => (false, false), + }; + // Iterate over the raw input bytes, skipping the sign if any let mut bs = bs.iter().enumerate().skip(signed as usize); @@ -903,7 +898,7 @@ pub fn parse_decimal( digits as u16, fractionals as i16, result, - point_index, + point_index + 1, precision as u16, scale as i16, )?; @@ -916,7 +911,7 @@ pub fn parse_decimal( "can't parse the string value {s} to decimal" ))); } - if fractionals == scale && scale != 0 { + if fractionals == scale { // We have processed all the digits that we need. All that // is left is to validate that the rest of the string contains // valid digits. @@ -931,13 +926,6 @@ pub fn parse_decimal( if is_e_notation { break; } - - // Fail on "." - if digits == 0 { - return Err(ArrowError::ParseError(format!( - "can't parse the string value {s} to decimal" - ))); - } } b'e' | b'E' => { result = parse_e_notation::( @@ -2613,6 +2601,9 @@ mod tests { "1.1e.12", "1.23e+3.", "1.23e+3.1", + "1e", + "1e+", + "1e-", ]; for s in can_not_parse_tests { let result_128 = parse_decimal::(s, 20, 3); @@ -2636,6 +2627,7 @@ mod tests { ("12345678908765.123456", 3), ("123456789087651234.56e-4", 3), ("1234560000000", 0), + ("12345678900.0", 0), ("1.23456e12", 0), ]; for (s, scale) in overflow_parse_tests { @@ -2752,6 +2744,45 @@ mod tests { let result = parse_decimal::(s, 76, scale); assert_eq!(i, result.unwrap()); } + + let zero_scale_tests = [ + (".123", 0, 3), + ("0.123", 0, 3), + ("1.0", 1, 3), + ("1.2", 1, 3), + ("1.00", 1, 3), + ("1.23", 1, 3), + ("1.000", 1, 3), + ("1.123", 1, 3), + ("123.0", 123, 3), + ("123.4", 123, 3), + ("123.00", 123, 3), + ("123.45", 123, 3), + ("123.000000000000000000004", 123, 3), + ("0.123e2", 12, 3), + ("0.123e4", 1230, 10), + ("1.23e4", 12300, 10), + ("12.3e4", 123000, 10), + ("123e4", 1230000, 10), + ( + "20000000000000000000000000000000000002.0", + 20000000000000000000000000000000000002, + 38, + ), + ]; + for (s, i, precision) in zero_scale_tests { + let result_128 = parse_decimal::(s, precision, 0).unwrap(); + assert_eq!(i, result_128); + } + + let can_not_parse_zero_scale = [".", "blag", "", "+", "-", "e"]; + for s in can_not_parse_zero_scale { + let result_128 = parse_decimal::(s, 5, 0); + assert_eq!( + format!("Parser error: can't parse the string value {s} to decimal"), + result_128.unwrap_err().to_string(), + ); + } } #[test]