Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
97 commits
Select commit Hold shift + click to select a range
9250bfe
Fix off-by-one-error.
orlp May 30, 2025
dddb355
Some more work, most notably Arrow round-trip.
orlp May 30, 2025
b53c94e
Implement casts.
orlp Jun 2, 2025
7faaf92
Simplify SeriesBuilder.
orlp Jun 3, 2025
ca8aa9b
Fix tests.
orlp Jun 3, 2025
ff46cc6
Simplify from_chunks_list_dtype.
orlp Jun 3, 2025
e4708cf
Fix arr_to_any_value.
orlp Jun 3, 2025
c2ab3a0
Series::from_physical_unchecked.
orlp Jun 3, 2025
f9dbbfd
Simplify any_values_to_categorical/enum.
orlp Jun 3, 2025
05c6e66
Series downcasting.
orlp Jun 3, 2025
5cef45d
Misc fixes.
orlp Jun 3, 2025
d52f929
Fix fmt.
orlp Jun 3, 2025
5416bcd
Add full_null_with_dtype.
orlp Jun 3, 2025
5f89f25
new_logical should be unsafe.
orlp Jun 3, 2025
7baa829
Append/extend.
orlp Jun 3, 2025
60d9753
Sort.
orlp Jun 3, 2025
4ac089c
SeriesTrait.
orlp Jun 3, 2025
a3e42e7
Various misc fixes.
orlp Jun 3, 2025
1c489b7
Inner compare.
orlp Jun 3, 2025
e8ea6da
Min/max.
orlp Jun 3, 2025
742b42d
WIP
orlp Jun 4, 2025
ba4d855
Remove group_tuples_perfect.
orlp Jun 4, 2025
e103750
Misc.
orlp Jun 5, 2025
f9a3688
Add conversion to arrow (no metadata yet).
orlp Jun 11, 2025
6b72e6b
Remove Categorical list builders.
orlp Jun 11, 2025
e73f6e9
Add CategoricalChunkedBuilder.
orlp Jun 11, 2025
92c5fe5
Fix matches_schema_type.
orlp Jun 24, 2025
0690332
Comparison kernel WIP.
orlp Jun 24, 2025
be15c92
Fix comparison.
orlp Jun 24, 2025
a251a1d
Dispatch comparisons.
orlp Jun 24, 2025
6d52959
Remove categorical row encoding context.
orlp Jun 24, 2025
55abe2a
Round-tripping with name, namespace, dtype identity.
orlp Jun 26, 2025
d67f5b4
Fix TakeChunked.
orlp Jun 26, 2025
cbdc868
Remove unnecessary categorical checks.
orlp Jun 26, 2025
b58eb5f
Fix enum <-> enum comparisons to use physical ordering.
orlp Jun 26, 2025
f36d63a
Arg min/max.
orlp Jun 26, 2025
3da02a3
Implement cut.
orlp Jun 26, 2025
0c41b65
Add is_in.
orlp Jun 26, 2025
c524a81
Fix some Rust tests.
orlp Jun 26, 2025
966fbd8
Fix polars-parquet.
orlp Jun 26, 2025
0b01731
Fix CSV categorical support.
orlp Jun 27, 2025
0d5d760
Fix polars-plan.
orlp Jun 27, 2025
35c3773
Fix polars-expr.
orlp Jun 27, 2025
5445aa1
Fix polars-mem-engine.
orlp Jun 27, 2025
6ad17cf
Fix polars-stream.
orlp Jun 27, 2025
42d6d7f
Fix polars-lazy.
orlp Jun 27, 2025
75b3e54
Fix polars crate.
orlp Jun 27, 2025
c6efd1c
Fix polars-python.
orlp Jun 27, 2025
6ad60fd
Fix unused import/variable warnings.
orlp Jun 27, 2025
981d099
Move categorical internals to polars-dtype.
orlp Jun 30, 2025
309ba10
Implement row encoding for categorical.
orlp Jun 30, 2025
3df0488
Remove old files.
orlp Jun 30, 2025
fe2c4f3
Fix Enum/Categorical construction.
orlp Jun 30, 2025
1270e42
Fix wrong physical type in conversion.
orlp Jun 30, 2025
5cba524
Fix AnyValue no-op cast.
orlp Jun 30, 2025
15406b4
More AnyValue categorical casts.
orlp Jun 30, 2025
5835556
Make converting from arrow more robust.
orlp Jul 1, 2025
fc5c38e
Transition to _PL_ENUM_VALUES2 to ensure older versions of Polars don…
orlp Jul 1, 2025
8a680d1
Fix warnings.
orlp Jul 1, 2025
919d510
Fix merge_sorted tests.
orlp Jul 1, 2025
15fc135
Fix from_buffers test.
orlp Jul 1, 2025
369e03d
Remove Deref on Logical in favor of explicit physical().
orlp Jul 1, 2025
52872e6
Fix _missing Categorical comparison kernel.
orlp Jul 1, 2025
7945e92
Fix tests in test_categorical.
orlp Jul 1, 2025
2feedd7
Remove test_string_cache.
orlp Jul 1, 2025
94a9e4c
Fix incorrect cast.
orlp Jul 2, 2025
09bbd62
Fix test_roundtrip.
orlp Jul 2, 2025
06e140b
Relax int <-> Enum/Categorical casting.
orlp Jul 2, 2025
6e4ac37
Make Enum <-> String comparison physically ordered.
orlp Jul 2, 2025
8b91f93
AnyValue to Categorical should respect strict.
orlp Jul 2, 2025
cf5c974
Fix test error messages.
orlp Jul 2, 2025
d6f7843
Fix test_cast.
orlp Jul 2, 2025
2986ac9
Change default to lexical ordering.
orlp Jul 2, 2025
e791631
Fix miscellaneous tests.
orlp Jul 2, 2025
deee61a
Update Cargo.lock.
orlp Jul 2, 2025
9eab52c
Allow get_buffers regardless of local categorical.
orlp Jul 2, 2025
62b0994
Fix more misc. tests.
orlp Jul 2, 2025
e0fac07
Fix to/from pandas tests.
orlp Jul 2, 2025
ae06956
Fix categorical append_owned.
orlp Jul 2, 2025
85259de
Fix last few tests.
orlp Jul 2, 2025
ef237d0
Fmt.
orlp Jul 2, 2025
464c6ab
Clippy.
orlp Jul 2, 2025
37f72b3
Feature flags.
orlp Jul 2, 2025
7efba9f
Dprint check.
orlp Jul 2, 2025
fcffdad
Rename NewCategorical/Enum to Categorical/Enum.
orlp Jul 2, 2025
9a65da7
Fmt.
orlp Jul 2, 2025
345b713
Mypy.
orlp Jul 2, 2025
a094c82
Bump DSL version.
orlp Jul 2, 2025
181fbad
Fix doctests.
orlp Jul 2, 2025
6d62214
Mypy.
orlp Jul 3, 2025
bf97989
Fix lexical categorical row-encoding.
orlp Jul 3, 2025
83a6b1e
Fix some more tests.
orlp Jul 3, 2025
ca14f71
Fmt.
orlp Jul 3, 2025
f54459b
Fix scatter.
orlp Jul 3, 2025
1b5bbe0
Remove remaining physical ordering references.
orlp Jul 3, 2025
c049a8c
Mypy.
orlp Jul 3, 2025
0644413
Skip get_categories doctest.
orlp Jul 3, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ zstd = "0.13"
polars = { version = "0.49.1", path = "crates/polars", default-features = false }
polars-compute = { version = "0.49.1", path = "crates/polars-compute", default-features = false }
polars-core = { version = "0.49.1", path = "crates/polars-core", default-features = false }
polars-dtype = { version = "0.49.1", path = "crates/polars-dtype", default-features = false }
polars-dylib = { version = "0.49.1", path = "crates/polars-dylib", default-features = false }
polars-error = { version = "0.49.1", path = "crates/polars-error", default-features = false }
polars-expr = { version = "0.49.1", path = "crates/polars-expr", default-features = false }
Expand Down
7 changes: 7 additions & 0 deletions crates/polars-arrow/src/bitmap/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,13 @@ impl BitmapBuilder {
self.bit_cap = words_available * 64;
}

/// Resets the builder to an empty state.
///
/// Empties the byte storage and zeroes the pending word buffer, the bit
/// length and the cached set-bit count. `bit_cap` is intentionally not
/// modified here.
pub fn clear(&mut self) {
    self.bytes.clear();
    self.set_bits_in_bytes = 0;
    self.bit_len = 0;
    self.buf = 0;
}

#[inline(always)]
pub fn push(&mut self, x: bool) {
self.reserve(1);
Expand Down
15 changes: 11 additions & 4 deletions crates/polars-arrow/src/datatypes/field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,15 @@ use serde::{Deserialize, Serialize};

use super::{ArrowDataType, Metadata};

pub static DTYPE_ENUM_VALUES: &str = "_PL_ENUM_VALUES";
pub static DTYPE_CATEGORICAL: &str = "_PL_CATEGORICAL";
// These two have the same encoding, but because older versions of Polars
// were unable to read non-u32-key arrow dictionaries while _PL_ENUM_VALUES
// is set we switched to a new version.
pub static DTYPE_ENUM_VALUES_LEGACY: &str = "_PL_ENUM_VALUES";
pub static DTYPE_ENUM_VALUES_NEW: &str = "_PL_ENUM_VALUES2";

// These have different encodings.
pub static DTYPE_CATEGORICAL_LEGACY: &str = "_PL_CATEGORICAL";
pub static DTYPE_CATEGORICAL_NEW: &str = "_PL_CATEGORICAL2";

/// Represents Arrow's metadata of a "column".
///
Expand Down Expand Up @@ -71,15 +78,15 @@ impl Field {

pub fn is_enum(&self) -> bool {
if let Some(md) = &self.metadata {
md.get(DTYPE_ENUM_VALUES).is_some()
md.get(DTYPE_ENUM_VALUES_LEGACY).is_some() || md.get(DTYPE_ENUM_VALUES_NEW).is_some()
} else {
false
}
}

pub fn is_categorical(&self) -> bool {
if let Some(md) = &self.metadata {
md.get(DTYPE_CATEGORICAL).is_some()
md.get(DTYPE_CATEGORICAL_LEGACY).is_some() || md.get(DTYPE_CATEGORICAL_NEW).is_some()
} else {
false
}
Expand Down
5 changes: 4 additions & 1 deletion crates/polars-arrow/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ mod schema;
use std::collections::BTreeMap;
use std::sync::Arc;

pub use field::{DTYPE_CATEGORICAL, DTYPE_ENUM_VALUES, Field};
pub use field::{
DTYPE_CATEGORICAL_LEGACY, DTYPE_CATEGORICAL_NEW, DTYPE_ENUM_VALUES_LEGACY,
DTYPE_ENUM_VALUES_NEW, Field,
};
pub use physical_type::*;
use polars_utils::pl_str::PlSmallStr;
pub use schema::{ArrowSchema, ArrowSchemaRef};
Expand Down
7 changes: 5 additions & 2 deletions crates/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ description = "Core of the Polars DataFrame library"

[dependencies]
polars-compute = { workspace = true, features = ["gather"] }
polars-dtype = { workspace = true }
polars-error = { workspace = true }
polars-row = { workspace = true }
polars-schema = { workspace = true }
Expand All @@ -33,7 +34,7 @@ rand = { workspace = true, optional = true, features = ["small_rng", "std"] }
rand_distr = { workspace = true, optional = true }
rayon = { workspace = true }
regex = { workspace = true, optional = true }
schemars = { workspace = true, optional = true }
schemars = { workspace = true, optional = true, features = ["uuid1"] }
# activate if you want serde support for Series and DataFrames
serde = { workspace = true, optional = true }
serde_json = { workspace = true, optional = true }
Expand Down Expand Up @@ -132,6 +133,7 @@ serde = [
"polars-schema/serde",
"polars-utils/serde",
"polars-compute/serde",
"polars-dtype/serde",
"arrow/io_ipc",
"arrow/io_ipc_compression",
"serde_json",
Expand All @@ -140,9 +142,10 @@ serde-lazy = ["serde", "arrow/serde", "indexmap/serde", "chrono/serde"]
dsl-schema = [
"serde",
"dep:schemars",
"polars-compute/dsl-schema",
"polars-dtype/dsl-schema",
"polars-schema/dsl-schema",
"polars-utils/dsl-schema",
"polars-compute/dsl-schema",
]

docs-selection = [
Expand Down
82 changes: 82 additions & 0 deletions crates/polars-core/src/chunked_array/builder/categorical.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
use arrow::bitmap::BitmapBuilder;

use crate::prelude::*;

/// Incremental builder for a `CategoricalChunked<T>` of a fixed
/// Categorical or Enum dtype.
///
/// Category codes and their validity are accumulated one value at a time
/// and turned into the final array by `finish`.
pub struct CategoricalChunkedBuilder<T: PolarsCategoricalType> {
/// Name handed to the physical array when the builder is finished.
name: PlSmallStr,
/// The Categorical/Enum dtype this builder was created with.
dtype: DataType,
/// Mapping cloned out of `dtype` on construction; used to look up and
/// (for non-Enum dtypes) insert category strings.
mapping: Arc<CategoricalMapping>,
/// `true` when `dtype` is `DataType::Enum`; in that case unknown strings
/// are rejected instead of inserted.
is_enum: bool,
/// Physical category codes, one per appended value (nulls push a default).
cats: Vec<T::Native>,
/// Validity bits, one per appended value (`false` for nulls).
validity: BitmapBuilder,
}

impl<T: PolarsCategoricalType> CategoricalChunkedBuilder<T> {
    /// Creates an empty builder for the given Categorical or Enum `dtype`.
    ///
    /// # Panics
    ///
    /// Panics if `dtype` is neither `DataType::Categorical` nor
    /// `DataType::Enum`.
    pub fn new(name: PlSmallStr, dtype: DataType) -> Self {
        let (is_enum, mapping) = match &dtype {
            DataType::Enum(_, mapping) => (true, mapping.clone()),
            DataType::Categorical(_, mapping) => (false, mapping.clone()),
            _ => panic!("non-Categorical/Enum dtype in CategoricalChunkedbuilder"),
        };
        Self {
            name,
            dtype,
            mapping,
            is_enum,
            cats: Vec::new(),
            validity: BitmapBuilder::new(),
        }
    }

    /// Returns the dtype this builder was constructed with.
    pub fn dtype(&self) -> &DataType {
        &self.dtype
    }

    /// Reserves room for `len` more values in both the code buffer and the
    /// validity bitmap.
    pub fn reserve(&mut self, len: usize) {
        self.cats.reserve(len);
        self.validity.reserve(len);
    }

    /// Appends the category `cat`, interpreted relative to `mapping`.
    ///
    /// If `mapping` is the very same allocation as the builder's own mapping
    /// the code is pushed directly. Otherwise the code is resolved to its
    /// string through `mapping` and appended via [`Self::append_str`]; a code
    /// with no string in `mapping` is appended as null.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`Self::append_str`].
    pub fn append_cat(
        &mut self,
        cat: CatSize,
        mapping: &Arc<CategoricalMapping>,
    ) -> PolarsResult<()> {
        // Fast path: identical mapping, so the code can be reused verbatim.
        if Arc::ptr_eq(&self.mapping, mapping) {
            self.cats.push(T::Native::from_cat(cat));
            self.validity.push(true);
            return Ok(());
        }

        match mapping.cat_to_str(cat) {
            Some(s) => self.append_str(s),
            None => {
                self.append_null();
                Ok(())
            },
        }
    }

    /// Appends the category for the string `val`.
    ///
    /// For an Enum dtype the string must already be present in the mapping;
    /// for a Categorical dtype it is inserted into the mapping.
    ///
    /// # Errors
    ///
    /// Returns a `ComputeError` if the dtype is an Enum that does not contain
    /// `val`, or propagates the error from inserting into the mapping.
    pub fn append_str(&mut self, val: &str) -> PolarsResult<()> {
        let code = if self.is_enum {
            match self.mapping.get_cat(val) {
                Some(existing) => existing,
                None => {
                    return Err(polars_err!(ComputeError: "attempted to insert '{val}' into Enum which does not contain this string"));
                },
            }
        } else {
            self.mapping.insert_cat(val)?
        };
        self.cats.push(T::Native::from_cat(code));
        self.validity.push(true);
        Ok(())
    }

    /// Appends a null: a default code is pushed as a placeholder and the
    /// validity bit is cleared.
    pub fn append_null(&mut self) {
        self.cats.push(Default::default());
        self.validity.push(false);
    }

    /// Consumes the builder and produces the finished categorical array.
    pub fn finish(self) -> CategoricalChunked<T> {
        let Self {
            name,
            dtype,
            cats,
            validity,
            ..
        } = self;
        // NOTE(review): presumably sound because every valid code pushed above
        // either came with the builder's own mapping or was obtained from it
        // via `get_cat`/`insert_cat` — confirm against the contract of
        // `from_cats_and_dtype_unchecked`.
        unsafe {
            let validity = validity.into_opt_validity();
            let phys = ChunkedArray::from_vec_validity(name, cats, validity);
            CategoricalChunked::from_cats_and_dtype_unchecked(phys, dtype)
        }
    }
}
46 changes: 28 additions & 18 deletions crates/polars-core/src/chunked_array/builder/list/anonymous.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ pub struct AnonymousListBuilder<'a> {
name: PlSmallStr,
builder: AnonymousBuilder<'a>,
fast_explode: bool,
inner_dtype: DtypeMerger,
inner_dtype: Option<DataType>,
}

impl Default for AnonymousListBuilder<'_> {
Expand All @@ -19,7 +19,7 @@ impl<'a> AnonymousListBuilder<'a> {
name,
builder: AnonymousBuilder::new(capacity),
fast_explode: true,
inner_dtype: DtypeMerger::new(inner_dtype),
inner_dtype,
}
}

Expand Down Expand Up @@ -59,13 +59,18 @@ impl<'a> AnonymousListBuilder<'a> {
}

pub fn append_series(&mut self, s: &'a Series) -> PolarsResult<()> {
match s.dtype() {
// Empty arrays tend to be null type and thus differ
// if we would push it the concat would fail.
DataType::Null if s.is_empty() => self.append_empty(),
dt => self.inner_dtype.update(dt)?,
match (s.dtype(), &self.inner_dtype) {
    // Null-typed series carry no dtype information; keep the tracked dtype as is.
    (DataType::Null, _) => {},
    // The first concrete dtype seen becomes the expected inner dtype.
    (dt, None) => self.inner_dtype = Some(dt.clone()),
    // Later series must agree with the recorded dtype. Only a mismatch is an
    // error: bailing unconditionally here would reject every second
    // non-null series even when its dtype is identical.
    (dt, Some(set_dt)) => {
        polars_ensure!(dt == set_dt, ComputeError: "dtypes don't match, got {dt}, expected: {set_dt}")
    },
}
if s.is_empty() {
self.append_empty();
} else {
self.builder.push_multiple(s.chunks());
}
self.builder.push_multiple(s.chunks());
Ok(())
}

Expand All @@ -76,19 +81,18 @@ impl<'a> AnonymousListBuilder<'a> {
ListChunked::full_null_with_dtype(
slf.name.clone(),
0,
&slf.inner_dtype.materialize().unwrap_or(DataType::Null),
&slf.inner_dtype.unwrap_or(DataType::Null),
)
} else {
let inner_dtype = slf.inner_dtype.materialize();

let inner_dtype_physical = inner_dtype
let inner_dtype_physical = self
.inner_dtype
.as_ref()
.map(|dt| dt.to_physical().to_arrow(CompatLevel::newest()));
let arr = slf.builder.finish(inner_dtype_physical.as_ref()).unwrap();

let list_dtype_logical = match inner_dtype {
let list_dtype_logical = match &self.inner_dtype {
None => DataType::from_arrow_dtype(arr.dtype()),
Some(dt) => DataType::List(Box::new(dt)),
Some(dt) => DataType::List(Box::new(dt.clone())),
};

let mut ca = ListChunked::with_chunk(PlSmallStr::EMPTY, arr);
Expand All @@ -105,7 +109,7 @@ pub struct AnonymousOwnedListBuilder {
name: PlSmallStr,
builder: AnonymousBuilder<'static>,
owned: Vec<Series>,
inner_dtype: DtypeMerger,
inner_dtype: Option<DataType>,
fast_explode: bool,
}

Expand All @@ -117,11 +121,17 @@ impl Default for AnonymousOwnedListBuilder {

impl ListBuilderTrait for AnonymousOwnedListBuilder {
fn append_series(&mut self, s: &Series) -> PolarsResult<()> {
match (s.dtype(), &self.inner_dtype) {
(DataType::Null, _) => {},
(dt, None) => self.inner_dtype = Some(dt.clone()),
(dt, Some(set_dt)) => {
polars_ensure!(dt == set_dt, ComputeError: "dtypes don't match, got {dt}, expected: {set_dt}")
},
}
if s.is_empty() {
self.append_empty();
} else {
unsafe {
self.inner_dtype.update(s.dtype())?;
self.builder
.push_multiple(&*(s.chunks().as_ref() as *const [ArrayRef]));
}
Expand All @@ -138,7 +148,7 @@ impl ListBuilderTrait for AnonymousOwnedListBuilder {
}

fn finish(&mut self) -> ListChunked {
let inner_dtype = std::mem::take(&mut self.inner_dtype).materialize();
let inner_dtype = std::mem::take(&mut self.inner_dtype);
// Don't use self from here on out.
let slf = std::mem::take(self);
let inner_dtype_physical = inner_dtype
Expand Down Expand Up @@ -166,7 +176,7 @@ impl AnonymousOwnedListBuilder {
name,
builder: AnonymousBuilder::new(capacity),
owned: Vec::with_capacity(capacity),
inner_dtype: DtypeMerger::new(inner_dtype),
inner_dtype,
fast_explode: true,
}
}
Expand Down
Loading
Loading