Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion crates/polars-core/src/chunked_array/builder/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,12 @@ mod test {
builder.append_null();

let out = builder.finish();
let out = out.explode(false).unwrap();
let out = out
.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})
.unwrap();
assert_eq!(out.len(), 7);
assert_eq!(out.get(6).unwrap(), AnyValue::Null);
}
Expand Down
21 changes: 21 additions & 0 deletions crates/polars-core/src/chunked_array/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -630,6 +630,27 @@ impl ListChunked {
}
}

/// Returns `true` when at least one chunk contains a zero-length list whose
/// validity bit is set (or any zero-length list when the chunk has no
/// validity bitmap).
///
/// Zero-length entries that are masked out by the validity bitmap are not
/// counted.
pub fn has_empty_lists(&self) -> bool {
    self.downcast_iter()
        // Chunks without rows cannot contribute an empty list.
        .filter(|chunk| !chunk.is_empty())
        .any(|chunk| {
            let list_lengths = chunk.offsets().lengths();
            match chunk.validity() {
                // No bitmap: every zero-length entry counts.
                None => list_lengths.into_iter().any(|len| len == 0),
                // With a bitmap: only zero-length entries that are valid count.
                Some(bitmap) => list_lengths.enumerate().any(|(row, len)| {
                    // SAFETY: relies on the bitmap holding one bit per list entry,
                    // i.e. `row` is below the bitmap length — presumed array
                    // invariant, TODO confirm.
                    len == 0 && unsafe { bitmap.get_bit_unchecked(row) }
                }),
            }
        })
}

pub fn has_masked_out_values(&self) -> bool {
for arr in self.downcast_iter() {
if arr.is_empty() {
Expand Down
54 changes: 36 additions & 18 deletions crates/polars-core/src/chunked_array/ops/explode.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use crate::prelude::*;
use crate::series::implementations::null::NullChunked;

pub(crate) trait ExplodeByOffsets {
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series;
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series;
}

unsafe fn unset_nulls(
Expand All @@ -34,7 +34,7 @@ impl<T> ExplodeByOffsets for ChunkedArray<T>
where
T: PolarsIntegerType,
{
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series {
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
debug_assert_eq!(self.chunks.len(), 1);
let arr = self.downcast_iter().next().unwrap();

Expand Down Expand Up @@ -67,7 +67,7 @@ where

for &o in &offsets[1..] {
let o = o as usize;
if !skip_empty && o == last {
if options.empty_as_null && o == last {
if start != last {
#[cfg(debug_assertions)]
new_values.extend_from_slice(&values[start..last]);
Expand Down Expand Up @@ -114,7 +114,7 @@ where
} else {
for &o in &offsets[1..] {
let o = o as usize;
if !skip_empty && o == last {
if options.empty_as_null && o == last {
if start != last {
unsafe { new_values.extend_from_slice(values.get_unchecked(start..last)) };
}
Expand Down Expand Up @@ -150,39 +150,39 @@ where
}

impl ExplodeByOffsets for Float32Chunked {
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series {
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
self.apply_as_ints(|s| {
let ca = s.u32().unwrap();
ca.explode_by_offsets(offsets, skip_empty)
ca.explode_by_offsets(offsets, options)
})
}
}
impl ExplodeByOffsets for Float64Chunked {
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series {
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
self.apply_as_ints(|s| {
let ca = s.u64().unwrap();
ca.explode_by_offsets(offsets, skip_empty)
ca.explode_by_offsets(offsets, options)
})
}
}

impl ExplodeByOffsets for NullChunked {
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series {
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
let mut last_offset = offsets[0];

let mut len = 0;
for &offset in &offsets[1..] {
// If offset == last_offset we have an empty list. A new row is inserted for it only when
// empty lists are emitted as null; in that case we increase the length by at least 1.
len += std::cmp::max(offset - last_offset, i64::from(!skip_empty)) as usize;
len += std::cmp::max(offset - last_offset, i64::from(options.empty_as_null)) as usize;
last_offset = offset;
}
NullChunked::new(self.name.clone(), len).into_series()
}
}

impl ExplodeByOffsets for BooleanChunked {
fn explode_by_offsets(&self, offsets: &[i64], skip_empty: bool) -> Series {
fn explode_by_offsets(&self, offsets: &[i64], options: ExplodeOptions) -> Series {
debug_assert_eq!(self.chunks.len(), 1);
let arr = self.downcast_iter().next().unwrap();

Expand All @@ -193,7 +193,7 @@ impl ExplodeByOffsets for BooleanChunked {
let mut last = start;
for &o in &offsets[1..] {
let o = o as usize;
if !skip_empty && o == last {
if options.empty_as_null && o == last {
if start != last {
let vals = arr.slice_typed(start, last - start);

Expand Down Expand Up @@ -283,12 +283,18 @@ mod test {
assert!(ca._can_fast_explode());

// normal explode
let exploded = ca.explode(false)?;
let exploded = ca.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
assert_eq!(out, &[1, 2, 3, 3, 1, 2]);

// sliced explode
let exploded = ca.slice(0, 1).explode(false)?;
let exploded = ca.slice(0, 1).explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
let out: Vec<_> = exploded.i32()?.into_no_null_iter().collect();
assert_eq!(out, &[1, 2, 3, 3]);

Expand All @@ -310,7 +316,10 @@ mod test {
.unwrap();

let ca = builder.finish();
let exploded = ca.explode(false)?;
let exploded = ca.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
assert_eq!(
Vec::from(exploded.i32()?),
&[Some(1), Some(2), None, Some(3)]
Expand All @@ -335,7 +344,10 @@ mod test {
.unwrap();

let ca = builder.finish();
let exploded = ca.explode(false)?;
let exploded = ca.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
assert_eq!(
Vec::from(exploded.i32()?),
&[Some(1), None, Some(2), None, Some(3), Some(4)]
Expand Down Expand Up @@ -381,7 +393,10 @@ mod test {
.unwrap();

let ca = builder.finish();
let exploded = ca.explode(false)?;
let exploded = ca.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
assert_eq!(
Vec::from(exploded.str()?),
&[Some("abc"), None, Some("de"), None, Some("fg"), None]
Expand All @@ -406,7 +421,10 @@ mod test {
.unwrap();

let ca = builder.finish();
let exploded = ca.explode(false)?;
let exploded = ca.explode(ExplodeOptions {
empty_as_null: true,
keep_nulls: true,
})?;
assert_eq!(
Vec::from(exploded.bool()?),
&[Some(true), None, Some(false), None, Some(true), Some(true)]
Expand Down
54 changes: 42 additions & 12 deletions crates/polars-core/src/chunked_array/ops/explode_and_offsets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ impl ListChunked {
values: ArrayRef,
offsets: &[i64],
offsets_buf: OffsetsBuffer<i64>,
skip_empty: bool,
options: ExplodeOptions,
) -> (Series, OffsetsBuffer<i64>) {
// SAFETY: inner_dtype should be correct
let values = unsafe {
Expand All @@ -25,16 +25,16 @@ impl ListChunked {
let mut values = match values.dtype() {
DataType::Boolean => {
let t = values.bool().unwrap();
ExplodeByOffsets::explode_by_offsets(t, offsets, skip_empty).into_series()
ExplodeByOffsets::explode_by_offsets(t, offsets, options).into_series()
},
DataType::Null => {
let t = values.null().unwrap();
ExplodeByOffsets::explode_by_offsets(t, offsets, skip_empty).into_series()
ExplodeByOffsets::explode_by_offsets(t, offsets, options).into_series()
},
dtype => {
with_match_physical_numeric_polars_type!(dtype, |$T| {
let t: &ChunkedArray<$T> = values.as_ref().as_ref();
ExplodeByOffsets::explode_by_offsets(t, offsets, skip_empty).into_series()
ExplodeByOffsets::explode_by_offsets(t, offsets, options).into_series()
})
},
};
Expand All @@ -55,7 +55,10 @@ impl ChunkExplode for ListChunked {
Ok(offsets)
}

fn explode_and_offsets(&self, skip_empty: bool) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
fn explode_and_offsets(
&self,
options: ExplodeOptions,
) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
// A list array's memory layout is actually already 'exploded', so we can just take the
// values array of the list. And we also return a slice of the offsets. This slice can be
// used to find the old list layout or indexes to expand a DataFrame in the same manner as
Expand All @@ -66,7 +69,10 @@ impl ChunkExplode for ListChunked {
let offsets = listarr.offsets().as_slice();
let mut values = listarr.values().clone();

let (mut s, offsets) = if ca._can_fast_explode() {
let (mut s, offsets) = if ca._can_fast_explode()
&& (!options.keep_nulls || !ca.has_nulls())
Comment thread
coastalwhite marked this conversation as resolved.
&& (!options.empty_as_null || !ca.has_empty_lists())
{
// ensure that the value array is sliced
// as a list only slices its offsets on a slice operation

Expand Down Expand Up @@ -112,7 +118,7 @@ impl ChunkExplode for ListChunked {
let inner_phys = self.inner_dtype().to_physical();
if inner_phys.is_primitive_numeric() || inner_phys.is_null() || inner_phys.is_bool()
{
return Ok(self.explode_specialized(values, offsets, offsets_buf, skip_empty));
return Ok(self.explode_specialized(values, offsets, offsets_buf, options));
}
// Use gather
let mut indices =
Expand All @@ -127,7 +133,7 @@ impl ChunkExplode for ListChunked {
let start = previous as IdxSize;
let end = offset as IdxSize;

if !skip_empty && len == 0 {
if options.empty_as_null && len == 0 {
indices.push_null();
} else {
indices.extend_trusted_len_values(start..end);
Expand Down Expand Up @@ -156,13 +162,13 @@ impl ChunkExplode for ListChunked {
// SAFETY: we are within bounds
if unsafe { validity.get_bit_unchecked(i) } {
// explode expects null value if sublist is empty.
if !skip_empty && len == 0 {
if options.empty_as_null && len == 0 {
indices.push_null();
} else {
indices.extend_trusted_len_values(start..end);
}
current_offset += len;
} else {
} else if options.keep_nulls {
indices.push_null();
}
previous = offset;
Expand Down Expand Up @@ -236,7 +242,31 @@ impl ChunkExplode for ArrayChunked {
Ok(offsets)
}

fn explode_and_offsets(&self, _skip_empty: bool) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
fn explode_and_offsets(
&self,
options: ExplodeOptions,
) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {
if self.width() == 0 {
let mut num_nulls = 0;
if options.empty_as_null {
num_nulls += self.len() - self.null_count();
}
if options.keep_nulls {
num_nulls += self.null_count();
}
let offsets = (0..num_nulls as i64 + 1).collect::<Vec<i64>>();
// SAFETY: monotonically increasing
let offsets = unsafe { OffsetsBuffer::new_unchecked(offsets.into()) };
let s = Column::new_scalar(
self.name().clone(),
Scalar::null(self.inner_dtype().clone()),
num_nulls,
)
.take_materialized_series();

return Ok((s, offsets));
}

let ca = self.rechunk();
let arr = ca.downcast_iter().next().unwrap();
// fast-path for non-null array.
Expand Down Expand Up @@ -278,7 +308,7 @@ impl ChunkExplode for ArrayChunked {
let end = start + width as IdxSize;
indices.extend_trusted_len_values(start..end);
current_offset += width as i64;
} else {
} else if options.keep_nulls {
indices.push_null();
}
offsets.push(current_offset);
Expand Down
17 changes: 14 additions & 3 deletions crates/polars-core/src/chunked_array/ops/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -81,13 +81,24 @@ pub trait ChunkAnyValue {
fn get_any_value(&self, index: usize) -> PolarsResult<AnyValue<'_>>;
}

/// Options that control which rows an explode operation emits for lists that
/// carry no values.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
#[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))]
#[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))]
pub struct ExplodeOptions {
/// When `true`, an empty list contributes a single null to the exploded
/// values; when `false`, it contributes nothing.
pub empty_as_null: bool,
/// When `true`, a null list (validity bit unset) contributes a single null
/// to the exploded values; when `false`, it is skipped.
pub keep_nulls: bool,
}

/// Explode/flatten a List or String Series
pub trait ChunkExplode {
fn explode(&self, skip_empty: bool) -> PolarsResult<Series> {
self.explode_and_offsets(skip_empty).map(|t| t.0)
fn explode(&self, options: ExplodeOptions) -> PolarsResult<Series> {
self.explode_and_offsets(options).map(|t| t.0)
}
fn offsets(&self) -> PolarsResult<OffsetsBuffer<i64>>;
fn explode_and_offsets(&self, skip_empty: bool) -> PolarsResult<(Series, OffsetsBuffer<i64>)>;
fn explode_and_offsets(
&self,
options: ExplodeOptions,
) -> PolarsResult<(Series, OffsetsBuffer<i64>)>;
}

pub trait ChunkBytes {
Expand Down
4 changes: 2 additions & 2 deletions crates/polars-core/src/frame/column/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1250,9 +1250,9 @@ impl Column {
}
}

pub fn explode(&self, skip_empty: bool) -> PolarsResult<Column> {
pub fn explode(&self, options: ExplodeOptions) -> PolarsResult<Column> {
self.as_materialized_series()
.explode(skip_empty)
.explode(options)
.map(Column::from)
}
pub fn implode(&self) -> PolarsResult<ListChunked> {
Expand Down
Loading
Loading