Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 2 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ atoi_simd = "0.16"
atomic-waker = "1"
avro-schema = { version = "0.3" }
base64 = "0.22.0"
bigdecimal = "0.4.8"
bincode = { version = "2.0", features = ["serde", "std"] }
bitflags = "2"
boxcar = "0.2.12"
Expand Down Expand Up @@ -59,6 +60,7 @@ libm = "0.2"
memchr = "2.6"
memmap = { package = "memmap2", version = "0.9" }
ndarray = { version = "0.16", default-features = false }
num-bigint = "0.4.6"
num-traits = "0.2"
numpy = "0.25"
object_store = { version = "0.12", default-features = false, features = ["fs"] }
Expand Down
304 changes: 1 addition & 303 deletions crates/polars-arrow/src/compute/decimal.rs
Original file line number Diff line number Diff line change
@@ -1,313 +1,11 @@
use num_traits::Euclid;
use polars_utils::relaxed_cell::RelaxedCell;

/// Global flag recording whether trailing zeros of decimals should be trimmed.
/// Starts out as `false`.
// NOTE(review): the consumers of this flag are not visible in this file —
// presumably callers forward it as the `trim_zeros` argument of
// `DecimalFmtBuffer::format`; confirm against the call sites.
static TRIM_DECIMAL_ZEROS: RelaxedCell<bool> = RelaxedCell::new_bool(false);

/// Returns the current value of the global `TRIM_DECIMAL_ZEROS` flag.
pub fn get_trim_decimal_zeros() -> bool {
TRIM_DECIMAL_ZEROS.load()
}

/// Updates the global `TRIM_DECIMAL_ZEROS` flag.
///
/// Passing `None` resets the flag to `false`, exactly like `Some(false)`.
pub fn set_trim_decimal_zeros(trim: Option<bool>) {
    // `bool::default()` is `false`, so an absent value turns trimming off.
    let enabled = trim.unwrap_or_default();
    TRIM_DECIMAL_ZEROS.store(enabled)
}

/// Infers the scale of a decimal literal, i.e. the number of bytes that follow
/// the first `b'.'` separator.
///
/// The input is assumed to be a well-formed decimal number; no validation is
/// performed. If no separator is present, the scale is 0.
///
/// The count is converted with a truncating `as u8` cast, so inputs with more
/// than 255 bytes after the separator wrap around modulo 256.
pub fn infer_scale(bytes: &[u8]) -> u8 {
    match bytes.iter().position(|&byte| byte == b'.') {
        // `dot < bytes.len()`, so the subtraction cannot underflow.
        Some(dot) => (bytes.len() - dot - 1) as u8,
        None => 0,
    }
}

/// Deserialize bytes to a single i128 representing a decimal, at a specified
/// precision (optional) and scale (required). The number is checked to ensure
/// it fits within the specified precision and scale. Consistent with float
/// parsing, no decimal separator is required (eg "500", "500.", and "500.0" are
/// all accepted); this allows mixed integer/decimal sequences to be parsed as
/// decimals. All trailing zeros are assumed to be significant, whether or not
/// a separator is present: 1200 requires precision >= 4, while 1200.200
/// requires precision >= 7 and scale >= 3. Returns None if the number is not
/// well-formed, or does not fit. Only b'.' is allowed as a decimal separator
/// (issue #6698).
///
/// # Arguments
/// * `bytes` - textual decimal, optionally prefixed with `+` or `-`.
/// * `precision` - maximum number of digits of the scaled result. `None`
///   disables the precision check; values above 38 are clamped to 38.
/// * `scale` - number of fractional digits in the returned integer. A scale
///   larger than the (clamped) precision yields `None`.
///
/// # Returns
/// The value multiplied by `10^scale`, or `None` on malformed input, overflow,
/// or when the magnitude of the result needs more than `precision` digits.
///
/// Fractional digits beyond `scale` are dropped (truncated, not rounded) and
/// are not inspected, so they are not validated either.
#[inline]
pub fn deserialize_decimal(bytes: &[u8], precision: Option<u8>, scale: u8) -> Option<i128> {
    let precision_digits = precision.unwrap_or(38).min(38) as usize;
    if scale as usize > precision_digits {
        return None;
    }

    let separator = bytes.iter().position(|b| *b == b'.').unwrap_or(bytes.len());
    let (mut int, mut frac) = bytes.split_at(separator);
    if frac.len() <= 1 || scale == 0 {
        // Only integer fast path: there are no fractional digits to keep, so
        // the (signed) integer part is parsed in one go.
        let n: i128 = atoi_simd::parse(int).ok()?;
        let ret = n.checked_mul(POW10[scale as usize] as i128)?;
        // Compare the magnitude, not the signed value: a negative number must
        // obey the same precision limit as its positive counterpart, exactly
        // as in the fractional path below.
        if precision.is_some() && ret.unsigned_abs() >= POW10[precision_digits] {
            return None;
        }
        return Some(ret);
    }

    // Skip period.
    frac = &frac[1..];

    // Skip sign.
    let negative = match bytes.first() {
        Some(s @ (b'+' | b'-')) => {
            int = &int[1..];
            *s == b'-'
        },
        _ => false,
    };

    // Truncate trailing digits that extend beyond the scale. When fewer
    // fractional digits than `scale` are present, `frac_scale` is the number
    // of zeros that must be appended to reach the requested scale.
    let frac_scale = if scale as usize <= frac.len() {
        frac = &frac[..scale as usize];
        0
    } else {
        scale as usize - frac.len()
    };

    // Parse and combine parts. An empty integer part (".5") counts as zero.
    let pint: u128 = if int.is_empty() {
        0
    } else {
        atoi_simd::parse_pos(int).ok()?
    };
    let pfrac: u128 = atoi_simd::parse_pos(frac).ok()?;

    let ret = pint
        .checked_mul(POW10[scale as usize])?
        .checked_add(pfrac.checked_mul(POW10[frac_scale])?)?;
    if precision.is_some() && ret >= POW10[precision_digits] {
        return None;
    }
    if negative {
        // The magnitude of i128::MIN is 2^127, one more than i128::MAX.
        if ret > (1 << 127) {
            None
        } else {
            Some(ret.wrapping_neg() as i128)
        }
    } else {
        ret.try_into().ok()
    }
}

/// Capacity in bytes of a `DecimalFmtBuffer`.
///
/// `DecimalFmtBuffer::format` writes at most a sign, 39 digits (the longest
/// `u128` magnitude of an `i128`) and one decimal point, i.e. 41 bytes, so 48
/// leaves headroom.
const MAX_DECIMAL_LEN: usize = 48;

/// Fixed-size, allocation-free scratch buffer used to render a decimal
/// (an `i128` plus a scale) as text.
#[derive(Clone, Copy)]
pub struct DecimalFmtBuffer {
// Raw output bytes; only `data[..len]` holds the formatted text.
data: [u8; MAX_DECIMAL_LEN],
// Number of bytes of `data` produced by the most recent `format` call.
len: usize,
}

impl Default for DecimalFmtBuffer {
fn default() -> Self {
Self::new()
}
}

impl DecimalFmtBuffer {
#[inline]
pub const fn new() -> Self {
Self {
data: [0; MAX_DECIMAL_LEN],
len: 0,
}
}

pub fn format(&mut self, x: i128, scale: usize, trim_zeros: bool) -> &str {
let factor = POW10[scale];
let mut itoa_buf = itoa::Buffer::new();

self.len = 0;
let (div, rem) = x.unsigned_abs().div_rem_euclid(&factor);
if x < 0 {
self.data[0] = b'-';
self.len += 1;
}

let div_fmt = itoa_buf.format(div);
self.data[self.len..self.len + div_fmt.len()].copy_from_slice(div_fmt.as_bytes());
self.len += div_fmt.len();

if scale == 0 {
return unsafe { std::str::from_utf8_unchecked(&self.data[..self.len]) };
}

self.data[self.len] = b'.';
self.len += 1;

let rem_fmt = itoa_buf.format(rem + factor); // + factor adds leading 1 where period would be.
self.data[self.len..self.len + rem_fmt.len() - 1].copy_from_slice(&rem_fmt.as_bytes()[1..]);
self.len += rem_fmt.len() - 1;

if trim_zeros {
while self.data.get(self.len - 1) == Some(&b'0') {
self.len -= 1;
}
if self.data.get(self.len - 1) == Some(&b'.') {
self.len -= 1;
}
}

unsafe { std::str::from_utf8_unchecked(&self.data[..self.len]) }
}
}

/// Powers of ten: `POW10[i] == 10^i` for `i` in `0..=38`.
///
/// `10^38` is the largest power of ten that fits in a `u128`. The table is
/// built at compile time, so an overflow would be a compile error.
const POW10: [u128; 39] = {
    let mut table = [1u128; 39];
    let mut exp = 1;
    while exp < table.len() {
        table[exp] = table[exp - 1] * 10;
        exp += 1;
    }
    table
};

#[cfg(test)]
mod test {
    use super::*;

    /// Parses `input` with the given precision and scale and compares the
    /// outcome against `expected`.
    #[track_caller]
    fn check(input: &[u8], precision: Option<u8>, scale: u8, expected: Option<i128>) {
        assert_eq!(
            deserialize_decimal(input, precision, scale),
            expected,
            "input: {:?}, precision: {:?}, scale: {}",
            String::from_utf8_lossy(input),
            precision,
            scale
        );
    }

    #[test]
    fn test_decimal() {
        let precision = Some(8);

        // Well-formed inputs, including explicit signs and short fractions.
        for &(text, expected) in &[
            ("12.09", 1209),
            ("1200.90", 120090),
            ("143.9", 14390),
            ("+000000.5", 50),
            ("-0.5", -50),
            ("-1.5", -150),
        ] {
            check(text.as_bytes(), precision, 2, Some(expected));
        }

        // A scale larger than the precision is rejected; without a precision
        // the value is simply padded out to the requested scale.
        check(b"0.01", precision, 20, None);
        check(b"0.01", None, 20, Some(1000000000000000000));

        // Malformed inputs.
        for text in ["12ABC.34", "1ABC2.34", "12.3ABC4", "12.3.ABC4", "12.-3", ""] {
            check(text.as_bytes(), precision, 5, None);
        }

        // The separator and either side of it are optional.
        for &(text, expected) in &[("5.", 500000i128), ("5", 500000i128), (".5", 50000i128)] {
            check(text.as_bytes(), precision, 5, Some(expected));
        }

        // Precision and scale fitting:
        for &(prec, scale, expected) in &[
            (None, 0, Some(1200)),
            (Some(4), 0, Some(1200)),
            (Some(3), 0, None),
            (Some(4), 1, None),
        ] {
            check(b"1200", prec, scale, expected);
        }

        for &(prec, scale, expected) in &[
            (None, 0, Some(1200)),        // truncate scale
            (None, 3, Some(1200010)),     // exact scale
            (None, 6, Some(1200010000)),  // excess scale
            (Some(7), 0, Some(1200)),     // sufficient precision and truncate scale
            (Some(7), 3, Some(1200010)),  // exact precision and scale
            (Some(10), 6, Some(1200010000)), // exact precision, excess scale
            (Some(5), 6, None),           // insufficient precision, excess scale
            (Some(5), 3, None),           // insufficient precision, exact scale
            (Some(12), 5, Some(120001000)), // excess precision, excess scale
            (None, 35, Some(120001000000000000000000000000000000000)),
            (None, 36, None),
            (Some(38), 35, None), // scale causes insufficient precision
        ] {
            check(b"1200.010", prec, scale, expected);
        }
    }
}
7 changes: 5 additions & 2 deletions crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -334,14 +334,17 @@ impl ArrowDataType {
pub fn underlying_physical_type(&self) -> ArrowDataType {
use ArrowDataType::*;
match self {
Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
Date64
Decimal32(_, _) | Date32 | Time32(_) | Interval(IntervalUnit::YearMonth) => Int32,
Decimal64(_, _)
| Date64
| Timestamp(_, _)
| Time64(_)
| Duration(_)
| Interval(IntervalUnit::DayTime) => Int64,
Interval(IntervalUnit::MonthDayNano) => unimplemented!(),
Binary => Binary,
Decimal(_, _) => Int128,
Decimal256(_, _) => unimplemented!(),
List(field) => List(Box::new(Field {
dtype: field.dtype.underlying_physical_type(),
..*field.clone()
Expand Down
8 changes: 8 additions & 0 deletions crates/polars-arrow/src/types/native.rs
Original file line number Diff line number Diff line change
Expand Up @@ -710,6 +710,14 @@ impl i256 {
}
}

/// Fallible conversion from the `i256` wrapper to a native `i128`.
///
/// Delegates to the `try_into` conversion of the wrapped value (`value.0`);
/// no additional checks are performed here.
impl TryFrom<i256> for i128 {
// Error reported when the conversion fails — presumably because the value
// does not fit in an `i128`; confirm against the inner type's conversion.
type Error = core::num::TryFromIntError;

fn try_from(value: i256) -> Result<Self, Self::Error> {
value.0.try_into()
}
}

impl IsNull for i256 {
const HAS_NULLS: bool = false;
type Inner = i256;
Expand Down
2 changes: 2 additions & 0 deletions crates/polars-compute/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,8 @@ strength_reduce = { workspace = true }
strum_macros = { workspace = true }

[dev-dependencies]
bigdecimal = { workspace = true }
num-bigint = { workspace = true }
rand = { workspace = true }

arrow = { workspace = true, features = ["proptest"] }
Expand Down
Loading
Loading