diff --git a/crates/polars-core/src/chunked_array/ops/explode.rs b/crates/polars-core/src/chunked_array/ops/explode.rs index 8714cc0383d5..4bf7ac6ef274 100644 --- a/crates/polars-core/src/chunked_array/ops/explode.rs +++ b/crates/polars-core/src/chunked_array/ops/explode.rs @@ -223,7 +223,12 @@ impl ExplodeByOffsets for BooleanChunked { } /// Convert Arrow array offsets to indexes of the original list -pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec { +pub(crate) fn offsets_to_indexes( + offsets: &[i64], + capacity: usize, + options: ExplodeOptions, + validity: Option<&Bitmap>, +) -> Vec { if offsets.is_empty() { return vec![]; } @@ -231,25 +236,57 @@ pub(crate) fn offsets_to_indexes(offsets: &[i64], capacity: usize) -> Vec= capacity { - // significant speed-up in edge cases with many offsets, - // no measurable overhead in typical case due to branch prediction - break; - } + match validity { + None => { + for (offset_start, offset_end) in offsets.iter().zip(offsets[1..].iter()) { + if idx.len() >= capacity { + // significant speed-up in edge cases with many offsets, + // no measurable overhead in typical case due to branch prediction + break; + } - if offset_start == offset_end { - // if the previous offset is equal to the current offset, we have an empty - // list and we duplicate the previous index - idx.push(last_idx); - } else { - let width = (offset_end - offset_start) as usize; - for _ in 0..width { - idx.push(last_idx); + if offset_start == offset_end { + if options.empty_as_null { + // if the previous offset is equal to the current offset, we have an empty + // list and we duplicate the previous index + idx.push(last_idx); + } + } else { + let width = (offset_end - offset_start) as usize; + for _ in 0..width { + idx.push(last_idx); + } + } + + last_idx += 1; } - } + }, + Some(validity) => { + for ((offset_start, offset_end), is_valid) in + offsets.iter().zip(offsets[1..].iter()).zip(validity.iter()) + { + if idx.len() >= capacity { + // significant speed-up in edge cases with many offsets, + // no measurable overhead in typical case due to branch prediction + break; + } - last_idx += 1; + if offset_start == offset_end { + if (is_valid && options.empty_as_null) || (!is_valid && options.keep_nulls) { + // if the previous offset is equal to the current offset, we have an empty + // list and we duplicate the previous index + idx.push(last_idx); + } + } else { + let width = (offset_end - offset_start) as usize; + for _ in 0..width { + idx.push(last_idx); + } + } + + last_idx += 1; + } + }, } // take the remaining values @@ -436,14 +473,30 @@ mod test { #[test] fn test_row_offsets() { let offsets = &[0, 1, 2, 2, 3, 4, 4]; - let out = offsets_to_indexes(offsets, 6); + let out = offsets_to_indexes( + offsets, + 6, + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + None, + ); assert_eq!(out, &[0, 1, 2, 3, 4, 5]); } #[test] fn test_empty_row_offsets() { let offsets = &[0, 0]; - let out = offsets_to_indexes(offsets, 0); + let out = offsets_to_indexes( + offsets, + 0, + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + None, + ); let expected: Vec = Vec::new(); assert_eq!(out, expected); } @@ -451,14 +504,30 @@ mod test { #[test] fn test_row_offsets_over_capacity() { let offsets = &[0, 1, 1, 2, 2]; - let out = offsets_to_indexes(offsets, 2); + let out = offsets_to_indexes( + offsets, + 2, + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + None, + ); assert_eq!(out, &[0, 1]); } #[test] fn test_row_offsets_nonzero_first_offset() { let offsets = &[3, 6, 8]; - let out = offsets_to_indexes(offsets, 10); + let out = offsets_to_indexes( + offsets, + 10, + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + None, + ); assert_eq!(out, &[0, 0, 0, 1, 1, 2, 2, 2, 2, 2]); } } diff --git a/crates/polars-core/src/chunked_array/ops/mod.rs b/crates/polars-core/src/chunked_array/ops/mod.rs index 0a3664acd2a3..b3c11ed23e84 100644 --- a/crates/polars-core/src/chunked_array/ops/mod.rs +++ b/crates/polars-core/src/chunked_array/ops/mod.rs @@ -85,7 +85,9 @@ pub trait ChunkAnyValue { #[cfg_attr(feature = "serde", derive(serde::Serialize, serde::Deserialize))] #[cfg_attr(feature = "dsl-schema", derive(schemars::JsonSchema))] pub struct ExplodeOptions { + /// Explode an empty list into a `null`. pub empty_as_null: bool, + /// Explode a `null` into a `null`. pub keep_nulls: bool, } diff --git a/crates/polars-core/src/frame/explode.rs b/crates/polars-core/src/frame/explode.rs index 622243a14781..8ef5d124b09e 100644 --- a/crates/polars-core/src/frame/explode.rs +++ b/crates/polars-core/src/frame/explode.rs @@ -9,17 +9,14 @@ use crate::chunked_array::ops::explode::offsets_to_indexes; use crate::prelude::*; use crate::series::IsSorted; -fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer)> { +fn get_exploded( + series: &Series, + options: ExplodeOptions, +) -> PolarsResult<(Series, OffsetsBuffer)> { match series.dtype() { - DataType::List(_) => series.list().unwrap().explode_and_offsets(ExplodeOptions { - empty_as_null: true, - keep_nulls: true, - }), + DataType::List(_) => series.list().unwrap().explode_and_offsets(options), #[cfg(feature = "dtype-array")] - DataType::Array(_, _) => series.array().unwrap().explode_and_offsets(ExplodeOptions { - empty_as_null: true, - keep_nulls: true, - }), + DataType::Array(_, _) => series.array().unwrap().explode_and_offsets(options), _ => polars_bail!(opq = explode, series.dtype()), } } @@ -35,15 +32,16 @@ pub struct UnpivotArgsIR { } impl DataFrame { - pub fn explode_impl(&self, mut columns: Vec) -> PolarsResult { + pub fn explode_impl( + &self, + mut columns: Vec, + options: ExplodeOptions, + ) -> PolarsResult { polars_ensure!(!columns.is_empty(), InvalidOperation: "no columns provided in explode"); let mut df = self.clone(); if self.is_empty() { for s in &columns { - df.with_column(s.as_materialized_series().explode(ExplodeOptions { - empty_as_null: true, - keep_nulls: true, - })?)?; + df.with_column(s.as_materialized_series().explode(options)?)?; } return Ok(df); } @@ -66,8 +64,7 @@ impl DataFrame { let exploded_columns = POOL.install(|| { columns .par_iter() - .map(Column::as_materialized_series) - .map(get_exploded) + .map(|c| get_exploded(c.as_materialized_series(), options)) .map(|s| s.map(|(s, o)| (Column::from(s), o))) .collect::>>() })?; @@ -110,9 +107,15 @@ impl DataFrame { Ok(()) }; let process_first = || { + let validity = columns[0].rechunk_validity(); let (exploded, offsets) = &exploded_columns[0]; - let row_idx = offsets_to_indexes(offsets.as_slice(), exploded.len()); + let row_idx = offsets_to_indexes( + offsets.as_slice(), + exploded.len(), + options, + validity.as_ref(), + ); let mut row_idx = IdxCa::from_vec(PlSmallStr::EMPTY, row_idx); row_idx.set_sorted_flag(IsSorted::Ascending); @@ -191,7 +194,7 @@ impl DataFrame { /// | 2 | 3 | 1 | /// +-----+-----+-----+ /// ``` - pub fn explode(&self, columns: I) -> PolarsResult + pub fn explode(&self, columns: I, options: ExplodeOptions) -> PolarsResult where I: IntoIterator, S: Into, @@ -199,7 +202,7 @@ impl DataFrame { // We need to sort the column by order of original occurrence. Otherwise the insert by index // below will panic let columns = self.select_columns(columns)?; - self.explode_impl(columns) + self.explode_impl(columns, options) } } @@ -219,7 +222,15 @@ mod test { let s0 = Column::new(PlSmallStr::from_static("B"), [1, 2, 3]); let s1 = Column::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0, s1]).unwrap(); - let exploded = df.explode(["foo"]).unwrap(); + let exploded = df + .explode( + ["foo"], + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) + .unwrap(); assert_eq!(exploded.shape(), (9, 3)); assert_eq!( exploded @@ -266,7 +277,13 @@ mod test { let s1 = Column::new(PlSmallStr::from_static("C"), [1, 1, 1]); let df = DataFrame::new(vec![list, s0.clone(), s1.clone()])?; - let out = df.explode(["foo"])?; + let out = df.explode( + ["foo"], + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + )?; let expected = df![ "foo" => [Some(1), Some(2), Some(3), Some(1), Some(1), Some(1), None], "B" => [1, 1, 1, 2, 2, 2, 3], @@ -284,7 +301,13 @@ mod test { ], ); let df = DataFrame::new(vec![list, s0, s1])?; - let out = df.explode(["foo"])?; + let out = df.explode( + ["foo"], + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + )?; let expected = df![ "foo" => [Some(1), Some(2), Some(3), None, Some(1), Some(1), Some(1)], "B" => [1, 1, 1, 2, 3, 3, 3], @@ -303,7 +326,13 @@ mod test { let list = Column::new(PlSmallStr::from_static("foo"), &[s0, s1]); let df = DataFrame::new(vec![list])?; - let out = df.explode(["foo"])?; + let out = df.explode( + ["foo"], + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + )?; let out = out .column("foo")? .as_materialized_series() diff --git a/crates/polars-lazy/src/frame/mod.rs b/crates/polars-lazy/src/frame/mod.rs index ab01c955b2a3..f09cec0e8baa 100644 --- a/crates/polars-lazy/src/frame/mod.rs +++ b/crates/polars-lazy/src/frame/mod.rs @@ -1769,16 +1769,21 @@ impl LazyFrame { } /// Apply explode operation. [See eager explode](polars_core::frame::DataFrame::explode). - pub fn explode(self, columns: Selector) -> LazyFrame { - self.explode_impl(columns, false) + pub fn explode(self, columns: Selector, options: ExplodeOptions) -> LazyFrame { + self.explode_impl(columns, options, false) } /// Apply explode operation. [See eager explode](polars_core::frame::DataFrame::explode). - fn explode_impl(self, columns: Selector, allow_empty: bool) -> LazyFrame { + fn explode_impl( + self, + columns: Selector, + options: ExplodeOptions, + allow_empty: bool, + ) -> LazyFrame { let opt_state = self.get_opt_state(); let lp = self .get_plan_builder() - .explode(columns, allow_empty) + .explode(columns, options, allow_empty) .build(); Self::from_logical_plan(lp, opt_state) } @@ -2162,8 +2167,14 @@ impl LazyGroupBy { .filter_map(|expr| expr_output_name(expr).ok()) .collect::>(); - self.agg([all().as_expr().head(n)]) - .explode_impl(all() - by_name(keys.iter().cloned(), false), true) + self.agg([all().as_expr().head(n)]).explode_impl( + all() - by_name(keys.iter().cloned(), false), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + true, + ) } /// Return last n rows of each group @@ -2174,8 +2185,14 @@ impl LazyGroupBy { .filter_map(|expr| expr_output_name(expr).ok()) .collect::>(); - self.agg([all().as_expr().tail(n)]) - .explode_impl(all() - by_name(keys.iter().cloned(), false), true) + self.agg([all().as_expr().tail(n)]).explode_impl( + all() - by_name(keys.iter().cloned(), false), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + true, + ) } /// Apply a function over the groups as a new DataFrame. diff --git a/crates/polars-lazy/src/tests/aggregations.rs b/crates/polars-lazy/src/tests/aggregations.rs index e696038858c1..7c66461c13ed 100644 --- a/crates/polars-lazy/src/tests/aggregations.rs +++ b/crates/polars-lazy/src/tests/aggregations.rs @@ -418,7 +418,13 @@ fn test_shift_elementwise_issue_2509() -> PolarsResult<()> { .sort(["x"], Default::default()) .collect()?; - let out = out.explode(["sum"])?; + let out = out.explode( + ["sum"], + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + )?; let out = out.column("sum")?; assert_eq!(out.get(0)?, AnyValue::Int32(10)); assert_eq!(out.get(1)?, AnyValue::Int32(20)); diff --git a/crates/polars-lazy/src/tests/logical.rs b/crates/polars-lazy/src/tests/logical.rs index dd642b29824e..1f19f8b3fc03 100644 --- a/crates/polars-lazy/src/tests/logical.rs +++ b/crates/polars-lazy/src/tests/logical.rs @@ -25,7 +25,13 @@ fn test_duration() -> PolarsResult<()> { (col("date") - col("date").first()).alias("date"), (col("datetime") - col("datetime").first()).alias("datetime"), ]) - .explode(by_name(["date", "datetime"], true)) + .explode( + by_name(["date", "datetime"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .collect()?; let column = out.column("date")?; diff --git a/crates/polars-lazy/src/tests/queries.rs b/crates/polars-lazy/src/tests/queries.rs index c49e8d7b9a99..930df213b836 100644 --- a/crates/polars-lazy/src/tests/queries.rs +++ b/crates/polars-lazy/src/tests/queries.rs @@ -287,7 +287,13 @@ fn test_lazy_query_4() -> PolarsResult<()> { .apply(|s: Column| &s - &(s.shift(1)), |_, f| Ok(f.clone())) .alias("diff_cases"), ]) - .explode(by_name(["day", "diff_cases"], true)) + .explode( + by_name(["day", "diff_cases"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .join( base_df, [col("uid"), col("day")], @@ -1107,7 +1113,13 @@ fn test_multiple_explode() -> PolarsResult<()> { .lazy() .group_by([col("a")]) .agg([col("b").alias("b_list"), col("c").alias("c_list")]) - .explode(by_name(["c_list", "b_list"], true)) + .explode( + by_name(["c_list", "b_list"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .collect()?; assert_eq!(out.shape(), (5, 3)); diff --git a/crates/polars-lazy/src/tests/schema.rs b/crates/polars-lazy/src/tests/schema.rs index da4e424d2917..544243008c7c 100644 --- a/crates/polars-lazy/src/tests/schema.rs +++ b/crates/polars-lazy/src/tests/schema.rs @@ -11,7 +11,13 @@ fn test_schema_update_after_projection_pd() -> PolarsResult<()> { let q = df .lazy() .with_column(col("a").implode()) - .explode(by_name(["a"], true)) + .explode( + by_name(["a"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .select([cols(["a", "b"]).as_expr()]); // run optimizations diff --git a/crates/polars-plan/dsl-schema-hashes.json b/crates/polars-plan/dsl-schema-hashes.json index 62c1a7739552..5931104c0138 100644 --- a/crates/polars-plan/dsl-schema-hashes.json +++ b/crates/polars-plan/dsl-schema-hashes.json @@ -42,7 +42,7 @@ "DeletionFilesList": "9082ea060ebc1bc0b04499d09aa75f5d98b4f37939831d6364e31f2472d957c7", "Dimension": "68880cdb10230df6c8c1632b073c80bd8ceb5c56a368c0cb438431ca9f3d3b31", "DistinctOptionsDSL": "41be5ec69ef9a614f2b36ac5deadfecdea5cca847ae1ada9d4bc626ff52a5b38", - "DslFunction": "0b5024fa450d86e8d5f7e79fb3944b66ad226e39ebfd107f3c7f551b204779ae", + "DslFunction": "221f1a46a043c8ed54f57be981bf24509f04f5f91f0f08e0acc180d96f842ebf", "DslPlan": "f9d1c8f632172a24dbaf76e56b2efab237937d2a2c615858d6f196932ada98f6", "Duration": "44999d59023085cbb592ce94b30d34f9b983081fc72bd6435a49bdf0869c0074", "DynListLiteralValue": "2266a553cb4a943f7097f24539eaa802453cf8742675996215235bd682dec0e8", diff --git a/crates/polars-plan/src/dsl/builder_dsl.rs b/crates/polars-plan/src/dsl/builder_dsl.rs index 2722b5578580..4ac6ce35ae38 100644 --- a/crates/polars-plan/src/dsl/builder_dsl.rs +++ b/crates/polars-plan/src/dsl/builder_dsl.rs @@ -293,11 +293,12 @@ impl DslBuilder { .into() } - pub fn explode(self, columns: Selector, allow_empty: bool) -> Self { + pub fn explode(self, columns: Selector, options: ExplodeOptions, allow_empty: bool) -> Self { DslPlan::MapFunction { input: Arc::new(self.0), function: DslFunction::Explode { columns, + options, allow_empty, }, } diff --git a/crates/polars-plan/src/plans/builder_ir.rs b/crates/polars-plan/src/plans/builder_ir.rs index 2a823cc5e0a0..bac26169c6c5 100644 --- a/crates/polars-plan/src/plans/builder_ir.rs +++ b/crates/polars-plan/src/plans/builder_ir.rs @@ -258,11 +258,12 @@ impl<'a> IRBuilder<'a> { } // call this if the schema needs to be updated - pub fn explode(self, columns: Arc<[PlSmallStr]>) -> Self { + pub fn explode(self, columns: Arc<[PlSmallStr]>, options: ExplodeOptions) -> Self { let lp = IR::MapFunction { input: self.root, function: FunctionIR::Explode { columns, + options, schema: Default::default(), }, }; diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir/mod.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir/mod.rs index f7f8b615b98b..43a6f00b6cab 100644 --- a/crates/polars-plan/src/plans/conversion/dsl_to_ir/mod.rs +++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir/mod.rs @@ -961,6 +961,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult match function { DslFunction::Explode { columns, + options, allow_empty, } => { let columns = columns.into_columns(&input_schema, &Default::default())?; @@ -970,6 +971,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult } let function = FunctionIR::Explode { columns: columns.into_iter().collect(), + options, schema: Default::default(), }; let ir = IR::MapFunction { input, function }; diff --git a/crates/polars-plan/src/plans/functions/dsl.rs b/crates/polars-plan/src/plans/functions/dsl.rs index f1e9e0405569..893f1a1243a8 100644 --- a/crates/polars-plan/src/plans/functions/dsl.rs +++ b/crates/polars-plan/src/plans/functions/dsl.rs @@ -33,6 +33,7 @@ pub enum DslFunction { OpaquePython(OpaquePythonUdf), Explode { columns: Selector, + options: ExplodeOptions, allow_empty: bool, }, #[cfg(feature = "pivot")] diff --git a/crates/polars-plan/src/plans/functions/mod.rs b/crates/polars-plan/src/plans/functions/mod.rs index e615bffb8f81..8274e4fb7cff 100644 --- a/crates/polars-plan/src/plans/functions/mod.rs +++ b/crates/polars-plan/src/plans/functions/mod.rs @@ -54,6 +54,7 @@ pub enum FunctionIR { Rechunk, Explode { columns: Arc<[PlSmallStr]>, + options: ExplodeOptions, #[cfg_attr(feature = "ir_serde", serde(skip))] schema: CachedSchema, }, @@ -93,7 +94,18 @@ impl PartialEq for FunctionIR { sources: srcs_r, .. }, ) => srcs_l == srcs_r, - (Explode { columns: l, .. }, Explode { columns: r, .. }) => l == r, + ( + Explode { + columns: l, + options: l_options, + .. + }, + Explode { + columns: r, + options: r_options, + .. + }, + ) => l == r && l_options == r_options, #[cfg(feature = "pivot")] (Unpivot { args: l, .. }, Unpivot { args: r, .. }) => l == r, (RowIndex { name: l, .. }, RowIndex { name: r, .. }) => l == r, @@ -125,7 +137,14 @@ impl Hash for FunctionIR { separator.hash(state); }, FunctionIR::Rechunk => {}, - FunctionIR::Explode { columns, schema: _ } => columns.hash(state), + FunctionIR::Explode { + columns, + options, + schema: _, + } => { + columns.hash(state); + options.hash(state); + }, #[cfg(feature = "pivot")] FunctionIR::Unpivot { args, schema: _ } => args.hash(state), FunctionIR::RowIndex { @@ -231,7 +250,9 @@ impl FunctionIR { df.unnest(columns.iter().cloned(), separator.as_deref()) ) }, - Explode { columns, .. } => df.explode(columns.iter().cloned()), + Explode { + columns, options, .. + } => df.explode(columns.iter().cloned(), *options), #[cfg(feature = "pivot")] Unpivot { args, .. } => { use polars_ops::unpivot::UnpivotDF; @@ -361,9 +382,20 @@ impl Display for FunctionIR { Ok(()) }, - Explode { columns, schema: _ } => { + Explode { + columns, + options, + schema: _, + } => { f.write_str("EXPLODE ")?; - fmt_column_delimited(f, columns, "[", "]") + fmt_column_delimited(f, columns, "[", "]")?; + if !options.empty_as_null { + f.write_str(", empty_as_null: false")?; + } + if !options.keep_nulls { + f.write_str(", keep_nulls: false")?; + } + Ok(()) }, #[cfg(feature = "pivot")] Unpivot { args, schema: _ } => { diff --git a/crates/polars-plan/src/plans/functions/schema.rs b/crates/polars-plan/src/plans/functions/schema.rs index 3ad742942204..89486219554e 100644 --- a/crates/polars-plan/src/plans/functions/schema.rs +++ b/crates/polars-plan/src/plans/functions/schema.rs @@ -95,7 +95,11 @@ impl FunctionIR { input_schema, name.clone(), ))), - Explode { schema, columns } => explode_schema(schema, input_schema, columns), + Explode { + schema, + options: _, + columns, + } => explode_schema(schema, input_schema, columns), #[cfg(feature = "pivot")] Unpivot { schema, args } => unpivot_schema(args, schema, input_schema), Hint(_) => Ok(Cow::Borrowed(input_schema)), diff --git a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs index e1771740af07..69e92f7c9080 100644 --- a/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs +++ b/crates/polars-plan/src/plans/optimizer/projection_pushdown/functions/mod.rs @@ -17,13 +17,15 @@ pub(super) fn process_functions( ) -> PolarsResult { use FunctionIR::*; match function { - Explode { columns, .. } => { + Explode { + columns, options, .. + } => { columns .iter() .for_each(|name| add_str_to_accumulated(name.clone(), &mut ctx, expr_arena)); proj_pd.pushdown_and_assign(input, ctx, lp_arena, expr_arena)?; Ok(IRBuilder::new(input, expr_arena, lp_arena) - .explode(columns) + .explode(columns, options) .build()) }, #[cfg(feature = "pivot")] diff --git a/crates/polars-python/src/lazyframe/general.rs b/crates/polars-python/src/lazyframe/general.rs index 40f1747f121f..e96f19e1027f 100644 --- a/crates/polars-python/src/lazyframe/general.rs +++ b/crates/polars-python/src/lazyframe/general.rs @@ -1274,8 +1274,18 @@ impl PyLazyFrame { out.into() } - fn explode(&self, subset: PySelector) -> Self { - self.ldf.read().clone().explode(subset.inner).into() + fn explode(&self, subset: PySelector, empty_as_null: bool, keep_nulls: bool) -> Self { + self.ldf + .read() + .clone() + .explode( + subset.inner, + ExplodeOptions { + empty_as_null, + keep_nulls, + }, + ) + .into() } fn null_count(&self) -> Self { diff --git a/crates/polars-python/src/lazyframe/visitor/nodes.rs b/crates/polars-python/src/lazyframe/visitor/nodes.rs index dd7328d4c7c8..aa4341e6d1fe 100644 --- a/crates/polars-python/src/lazyframe/visitor/nodes.rs +++ b/crates/polars-python/src/lazyframe/visitor/nodes.rs @@ -618,9 +618,15 @@ pub(crate) fn into_py(py: Python<'_>, plan: &IR) -> PyResult> { ) .into_py_any(py)?, FunctionIR::Rechunk => ("rechunk",).into_py_any(py)?, - FunctionIR::Explode { columns, schema: _ } => ( + FunctionIR::Explode { + columns, + options, + schema: _, + } => ( "explode", columns.iter().map(|s| s.to_string()).collect::>(), + options.empty_as_null, + options.keep_nulls, ) .into_py_any(py)?, #[cfg(feature = "pivot")] diff --git a/crates/polars/tests/it/lazy/explodes.rs b/crates/polars/tests/it/lazy/explodes.rs index 4c0bfcbd103d..9ebdbd3cf40d 100644 --- a/crates/polars/tests/it/lazy/explodes.rs +++ b/crates/polars/tests/it/lazy/explodes.rs @@ -11,7 +11,13 @@ fn test_explode_row_numbers() -> PolarsResult<()> { .lazy() .select([col("text").str().split(lit(" ")).alias("tokens")]) .with_row_index("index", None) - .explode(cols(["tokens"])) + .explode( + cols(["tokens"]), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .select([col("index"), col("tokens")]) .collect()?; diff --git a/crates/polars/tests/it/lazy/expressions/apply.rs b/crates/polars/tests/it/lazy/expressions/apply.rs index 76fe3a860e14..2d5204382168 100644 --- a/crates/polars/tests/it/lazy/expressions/apply.rs +++ b/crates/polars/tests/it/lazy/expressions/apply.rs @@ -30,7 +30,13 @@ fn test_groups_update() -> PolarsResult<()> { .lazy() .group_by_stable([col("group")]) .agg([col("id").unique_counts().log(lit(2.0))]) - .explode(cols(["id"])) + .explode( + cols(["id"]), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .collect()?; assert_eq!( out.column("id")? @@ -53,7 +59,13 @@ fn test_groups_update_binary_shift_log() -> PolarsResult<()> { .group_by([col("b")]) .agg([col("a") - col("a").shift(lit(1)).log(lit(2.0))]) .sort(["b"], Default::default()) - .explode(cols(["a"])) + .explode( + cols(["a"]), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .collect()?; assert_eq!( Vec::from(out.column("a")?.f64()?), diff --git a/crates/polars/tests/it/lazy/group_by.rs b/crates/polars/tests/it/lazy/group_by.rs index 1605b5edb95d..eeb3ed4057bf 100644 --- a/crates/polars/tests/it/lazy/group_by.rs +++ b/crates/polars/tests/it/lazy/group_by.rs @@ -73,7 +73,13 @@ fn test_filter_diff_arithmetic() -> PolarsResult<()> { * lit(2)) .alias("diff")]) .sort(["user"], Default::default()) - .explode(cols(["diff"])) + .explode( + cols(["diff"]), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .collect()?; let out = out.column("diff")?; diff --git a/crates/polars/tests/it/lazy/projection_queries.rs b/crates/polars/tests/it/lazy/projection_queries.rs index ff69a744e9a2..e540536ce453 100644 --- a/crates/polars/tests/it/lazy/projection_queries.rs +++ b/crates/polars/tests/it/lazy/projection_queries.rs @@ -153,7 +153,13 @@ fn test_unnest_pushdown() -> PolarsResult<()> { let out = df .lazy() - .explode(by_name(["users"], true)) + .explode( + by_name(["users"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .unnest(by_name(["users"], true), None) .select([col("email")]) .collect()?; diff --git a/crates/polars/tests/it/lazy/queries.rs b/crates/polars/tests/it/lazy/queries.rs index c76da18363fb..f7509955f956 100644 --- a/crates/polars/tests/it/lazy/queries.rs +++ b/crates/polars/tests/it/lazy/queries.rs @@ -143,7 +143,13 @@ fn test_sorted_path() -> PolarsResult<()> { let out = df .lazy() .with_row_index("index", None) - .explode(by_name(["a"], true)) + .explode( + by_name(["a"], true), + ExplodeOptions { + empty_as_null: true, + keep_nulls: true, + }, + ) .group_by(["index"]) .agg([col("a").count().alias("count")]) .collect()?; diff --git a/py-polars/src/polars/_plr.pyi b/py-polars/src/polars/_plr.pyi index a6c2f17920fb..b8526c5882d3 100644 --- a/py-polars/src/polars/_plr.pyi +++ b/py-polars/src/polars/_plr.pyi @@ -1049,7 +1049,9 @@ class PyLazyFrame: def quantile( self, quantile: PyExpr, interpolation: QuantileMethod ) -> PyLazyFrame: ... - def explode(self, subset: PySelector) -> PyLazyFrame: ... + def explode( + self, subset: PySelector, *, empty_as_null: bool, keep_nulls: bool + ) -> PyLazyFrame: ... def null_count(self) -> PyLazyFrame: ... def unique( self, diff --git a/py-polars/src/polars/dataframe/frame.py b/py-polars/src/polars/dataframe/frame.py index f5d78dfbf08c..093c96c309cd 100644 --- a/py-polars/src/polars/dataframe/frame.py +++ b/py-polars/src/polars/dataframe/frame.py @@ -9183,6 +9183,8 @@ def explode( self, columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector, + empty_as_null: bool = True, + keep_nulls: bool = True, ) -> DataFrame: """ Explode the dataframe to long format by exploding the given columns. @@ -9194,6 +9196,10 @@ def explode( columns being exploded must be of the `List` or `Array` data type. *more_columns Additional names of columns to explode, specified as positional arguments. + empty_as_null + Explode an empty list/array into a `null`. + keep_nulls + Explode a `null` list/array into a `null`. Returns ------- @@ -9240,7 +9246,12 @@ def explode( return ( self.lazy() - .explode(columns, *more_columns) + .explode( + columns, + *more_columns, + empty_as_null=empty_as_null, + keep_nulls=keep_nulls, + ) .collect(optimizations=QueryOptFlags._eager()) ) diff --git a/py-polars/src/polars/lazyframe/frame.py b/py-polars/src/polars/lazyframe/frame.py index 36536801027d..c8aa0d67cbc8 100644 --- a/py-polars/src/polars/lazyframe/frame.py +++ b/py-polars/src/polars/lazyframe/frame.py @@ -7360,6 +7360,8 @@ def explode( self, columns: ColumnNameOrSelector | Iterable[ColumnNameOrSelector], *more_columns: ColumnNameOrSelector, + empty_as_null: bool = True, + keep_nulls: bool = True, ) -> LazyFrame: """ Explode the DataFrame to long format by exploding the given columns. @@ -7371,6 +7373,10 @@ def explode( columns being exploded must be of the `List` or `Array` data type. *more_columns Additional names of columns to explode, specified as positional arguments. + empty_as_null + Explode an empty list/array into a `null`. + keep_nulls + Explode a `null` list/array into a `null`. Examples -------- @@ -7400,7 +7406,13 @@ def explode( subset = parse_list_into_selector(columns) | parse_list_into_selector( # type: ignore[arg-type] more_columns ) - return self._from_pyldf(self._ldf.explode(subset=subset._pyselector)) + return self._from_pyldf( + self._ldf.explode( + subset=subset._pyselector, + empty_as_null=empty_as_null, + keep_nulls=keep_nulls, + ) + ) def unique( self, diff --git a/py-polars/tests/unit/operations/test_explode.py b/py-polars/tests/unit/operations/test_explode.py index c6011a0710a2..dada047ccd42 100644 --- a/py-polars/tests/unit/operations/test_explode.py +++ b/py-polars/tests/unit/operations/test_explode.py @@ -591,3 +591,26 @@ def test_explode_array_parameters() -> None: assert_series_equal( s.explode(empty_as_null=False, keep_nulls=False), pl.Series("a", [], pl.Int64) ) + + +def test_explode_params() -> None: + df = pl.DataFrame({"a": [[1, 2, 3], None, [4, 5, 6], []], "b": [1, 2, 3, 4]}) + + assert_frame_equal( + df.explode("a"), + pl.DataFrame( + {"a": [1, 2, 3, None, 4, 5, 6, None], "b": [1, 1, 1, 2, 3, 3, 3, 4]} + ), + ) + assert_frame_equal( + df.explode("a", empty_as_null=False), + pl.DataFrame({"a": [1, 2, 3, None, 4, 5, 6], "b": [1, 1, 1, 2, 3, 3, 3]}), + ) + assert_frame_equal( + df.explode("a", keep_nulls=False), + pl.DataFrame({"a": [1, 2, 3, 4, 5, 6, None], "b": [1, 1, 1, 3, 3, 3, 4]}), + ) + assert_frame_equal( + df.explode("a", empty_as_null=False, keep_nulls=False), + pl.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 1, 1, 3, 3, 3]}), + )