-
Notifications
You must be signed in to change notification settings - Fork 1.1k
fix: liberal parsing of zero scale decimals #8700
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
780ae35
fef1ddd
3a5d0dc
f4f3ff5
c3154aa
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -741,72 +741,57 @@ fn parse_e_notation<T: DecimalType>( | |
| let mut exp: i16 = 0; | ||
| let base = T::Native::usize_as(10); | ||
|
|
||
| let mut exp_start: bool = false; | ||
| // e has a plus sign | ||
| let mut pos_shift_direction: bool = true; | ||
|
|
||
| // skip to point or exponent index | ||
| let mut bs; | ||
| if fractionals > 0 { | ||
| // it's a fraction, so the point index needs to be skipped, so +1 | ||
| bs = s.as_bytes().iter().skip(index + fractionals as usize + 1); | ||
| } else { | ||
| // it's actually an integer that is already written into the result, so let's skip on to e | ||
| bs = s.as_bytes().iter().skip(index); | ||
| } | ||
| // skip to the exponent index directly or just after any processed fractionals | ||
| let mut bs = s.as_bytes().iter().skip(index + fractionals as usize); | ||
|
|
||
| while let Some(b) = bs.next() { | ||
| // This function is only called from `parse_decimal`, in which we skip parsing any fractionals | ||
| // after we reach `scale` digits, not knowing ahead of time whether the decimal contains an | ||
| // e-notation or not. | ||
| // So once we do hit into an e-notation, and drop down into this function, we need to parse the | ||
| // remaining unprocessed fractionals too, since otherwise we might lose precision. | ||
| for b in bs.by_ref() { | ||
| match b { | ||
| b'0'..=b'9' => { | ||
| result = result.mul_wrapping(base); | ||
| result = result.add_wrapping(T::Native::usize_as((b - b'0') as usize)); | ||
| if fractionals > 0 { | ||
| fractionals += 1; | ||
| } | ||
| fractionals += 1; | ||
| digits += 1; | ||
| } | ||
| &b'e' | &b'E' => { | ||
| exp_start = true; | ||
| b'e' | b'E' => { | ||
| break; | ||
| } | ||
| _ => { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| }; | ||
| } | ||
|
|
||
| if exp_start { | ||
| pos_shift_direction = match bs.next() { | ||
| Some(&b'-') => false, | ||
| Some(&b'+') => true, | ||
| Some(b) => { | ||
| if !b.is_ascii_digit() { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
|
|
||
| exp *= 10; | ||
| exp += (b - b'0') as i16; | ||
|
|
||
| true | ||
| } | ||
| None => { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| }; | ||
|
|
||
| for b in bs.by_ref() { | ||
| if !b.is_ascii_digit() { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| // parse the exponent itself | ||
| let mut signed = false; | ||
| for b in bs { | ||
| match b { | ||
| b'-' if !signed => { | ||
| pos_shift_direction = false; | ||
| signed = true; | ||
| } | ||
| b'+' if !signed => { | ||
| pos_shift_direction = true; | ||
| signed = true; | ||
| } | ||
| b if b.is_ascii_digit() => { | ||
| exp *= 10; | ||
| exp += (b - b'0') as i16; | ||
| } | ||
| _ => { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
@@ -850,7 +835,11 @@ fn parse_e_notation<T: DecimalType>( | |
| } | ||
|
|
||
| /// Parse the string format decimal value to i128/i256 format and checking the precision and scale. | ||
| /// The result value can't be out of bounds. | ||
| /// Expected behavior: | ||
| /// - The result value can't be out of bounds. | ||
| /// - When parsing a decimal with scale 0, all fractional digits will be discarded. The final | ||
| /// fractional digits may be a subset or a superset of the digits after the decimal point when | ||
| /// e-notation is used. | ||
| pub fn parse_decimal<T: DecimalType>( | ||
| s: &str, | ||
| precision: u8, | ||
|
|
@@ -862,18 +851,24 @@ pub fn parse_decimal<T: DecimalType>( | |
| let base = T::Native::usize_as(10); | ||
|
|
||
| let bs = s.as_bytes(); | ||
| let (signed, negative) = match bs.first() { | ||
| Some(b'-') => (true, true), | ||
| Some(b'+') => (true, false), | ||
| _ => (false, false), | ||
| }; | ||
|
|
||
| if bs.is_empty() || signed && bs.len() == 1 { | ||
| if !bs | ||
| .last() | ||
| .is_some_and(|b| b.is_ascii_digit() || (b == &b'.' && s.len() > 1)) | ||
| { | ||
| // If the last character is not a digit (or a decimal point prefixed with some digits), then | ||
| // it's not a valid decimal. | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
|
|
||
| let (signed, negative) = match bs.first() { | ||
| Some(b'-') => (true, true), | ||
| Some(b'+') => (true, false), | ||
| _ => (false, false), | ||
| }; | ||
|
|
||
| // Iterate over the raw input bytes, skipping the sign if any | ||
| let mut bs = bs.iter().enumerate().skip(signed as usize); | ||
|
|
||
|
|
@@ -903,7 +898,7 @@ pub fn parse_decimal<T: DecimalType>( | |
| digits as u16, | ||
| fractionals as i16, | ||
| result, | ||
| point_index, | ||
| point_index + 1, | ||
| precision as u16, | ||
| scale as i16, | ||
| )?; | ||
|
|
@@ -916,7 +911,7 @@ pub fn parse_decimal<T: DecimalType>( | |
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| if fractionals == scale && scale != 0 { | ||
| if fractionals == scale { | ||
| // We have processed all the digits that we need. All that | ||
| // is left is to validate that the rest of the string contains | ||
| // valid digits. | ||
|
|
@@ -931,13 +926,6 @@ pub fn parse_decimal<T: DecimalType>( | |
| if is_e_notation { | ||
| break; | ||
| } | ||
|
|
||
| // Fail on "." | ||
| if digits == 0 { | ||
| return Err(ArrowError::ParseError(format!( | ||
| "can't parse the string value {s} to decimal" | ||
| ))); | ||
| } | ||
| } | ||
| b'e' | b'E' => { | ||
| result = parse_e_notation::<T>( | ||
|
|
@@ -2613,6 +2601,9 @@ mod tests { | |
| "1.1e.12", | ||
| "1.23e+3.", | ||
| "1.23e+3.1", | ||
| "1e", | ||
| "1e+", | ||
| "1e-", | ||
| ]; | ||
| for s in can_not_parse_tests { | ||
| let result_128 = parse_decimal::<Decimal128Type>(s, 20, 3); | ||
|
|
@@ -2636,6 +2627,7 @@ mod tests { | |
| ("12345678908765.123456", 3), | ||
| ("123456789087651234.56e-4", 3), | ||
| ("1234560000000", 0), | ||
| ("12345678900.0", 0), | ||
| ("1.23456e12", 0), | ||
| ]; | ||
| for (s, scale) in overflow_parse_tests { | ||
|
|
@@ -2752,6 +2744,45 @@ mod tests { | |
| let result = parse_decimal::<Decimal256Type>(s, 76, scale); | ||
| assert_eq!(i, result.unwrap()); | ||
| } | ||
|
|
||
| let zero_scale_tests = [ | ||
gruuya marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| (".123", 0, 3), | ||
| ("0.123", 0, 3), | ||
| ("1.0", 1, 3), | ||
| ("1.2", 1, 3), | ||
| ("1.00", 1, 3), | ||
| ("1.23", 1, 3), | ||
| ("1.000", 1, 3), | ||
| ("1.123", 1, 3), | ||
| ("123.0", 123, 3), | ||
| ("123.4", 123, 3), | ||
| ("123.00", 123, 3), | ||
| ("123.45", 123, 3), | ||
| ("123.000000000000000000004", 123, 3), | ||
| ("0.123e2", 12, 3), | ||
| ("0.123e4", 1230, 10), | ||
| ("1.23e4", 12300, 10), | ||
| ("12.3e4", 123000, 10), | ||
| ("123e4", 1230000, 10), | ||
| ( | ||
| "20000000000000000000000000000000000002.0", | ||
| 20000000000000000000000000000000000002, | ||
| 38, | ||
| ), | ||
| ]; | ||
| for (s, i, precision) in zero_scale_tests { | ||
| let result_128 = parse_decimal::<Decimal128Type>(s, precision, 0).unwrap(); | ||
| assert_eq!(i, result_128); | ||
| } | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you also add some error tests, for example
Should both fail to parse Also how about these values? Should they be valid? "1e", "1e+", "1e-", ".", and ".123"
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks, these cases turned out to be very valuable. For one thing if matches!(s, "" | "-" | "+" | ".") {
return Err(ArrowError::ParseError(format!(
"can't parse the string value {s} to decimal"
)));
}I found this is a lot easier to grok, since previously these validations were scattered throughout For another thing, the test cases To make these consistent I think we should error out on all of them (that's what Postgres does), at least until there's a worthwhile precedent to parse some of these. So if the
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think that sounds good to me. I tested some other systems and they also can't parse these examples D select '1e'::decimal;
Conversion Error:
Could not convert string "1e" to DECIMAL(18,3)
LINE 1: select '1e'::decimal;spark also doesn't parse them spark-sql (default)> SELECT cast ('1' as DECIMAL(5,3));
1.000
Time taken: 0.048 seconds, Fetched 1 row(s)
spark-sql (default)> SELECT cast ('1e' as DECIMAL(5,3));
NULL
Time taken: 0.052 seconds, Fetched 1 row(s)
spark-sql (default)> SELECT cast ('e' as DECIMAL(5,3));
NULL
Time taken: 0.034 seconds, Fetched 1 row(s)
spark-sql (default)> SELECT cast ('1e+' as DECIMAL(5,3));
NULL
Time taken: 0.033 seconds, Fetched 1 row(s)
spark-sql (default)> SELECT cast ('1e' as DECIMAL(5,0));
NULL
Time taken: 0.038 seconds, Fetched 1 row(s)
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Nice, I've appended these cases to unsupported check and test cases. |
||
|
|
||
| let can_not_parse_zero_scale = [".", "blag", "", "+", "-", "e"]; | ||
| for s in can_not_parse_zero_scale { | ||
| let result_128 = parse_decimal::<Decimal128Type>(s, 5, 0); | ||
| assert_eq!( | ||
| format!("Parser error: can't parse the string value {s} to decimal"), | ||
| result_128.unwrap_err().to_string(), | ||
| ); | ||
| } | ||
| } | ||
|
|
||
| #[test] | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we're refactoring perhaps we should change the type of fractions to be unsigned? It was a little confusing to have it be signed to me as I had to verify that your refactor to collapse the if/else made sense since fractionals was never negative (and the
+ 1in the if block is accounted for by adjustingindexat the callsite)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
That makes sense, though this leads to some cascading casting and type changes, as we use
fractionalsto compute/compare with other variables, notably withscalewhich isi8.So we can then make
scalean unsigned type too, since it looks like negative scale is not supported right now.However I'd rather retain the possibility of supporting negative scales at some point, so I went with changing the
scaletype toi16instead. That wayfractionals: u8can be safely cast toi16for comparison/arithmetic (plusparse_e_notationalready expectsscaleto bei16).There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Hmm,
scalebeingi8is more widespread than I thought, I reverted the commit that changes it.I don't think we can seamlessly change
fractionalsto an unsigned type then.