Skip to content

Commit 0b0af39

Browse files
authored
perf: Use split_at instead of double slice in chunk splits. (pola-rs#16856)
1 parent 3fe4cfe commit 0b0af39

File tree

22 files changed

+239
-6
lines changed

22 files changed

+239
-6
lines changed

Diff for: crates/polars-arrow/src/array/mod.rs

+3
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,9 @@ pub trait Array: Send + Sync + dyn_clone::DynClone + 'static {
161161
/// This function panics iff `offset + length > self.len()`.
162162
#[must_use]
163163
fn sliced(&self, offset: usize, length: usize) -> Box<dyn Array> {
164+
if length == 0 {
165+
return new_empty_array(self.data_type().clone());
166+
}
164167
let mut new = self.to_boxed();
165168
new.slice(offset, length);
166169
new

Diff for: crates/polars-core/src/chunked_array/ops/chunkops.rs

+96-2
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,44 @@ use crate::chunked_array::metadata::MetadataProperties;
77
use crate::chunked_array::object::builder::ObjectChunkedBuilder;
88
use crate::utils::slice_offsets;
99

10-
#[inline]
10+
/// Splits a list of chunks into a left and a right list at a logical `offset`.
///
/// `offset` is resolved against `own_length` via `slice_offsets`. Chunks that lie
/// entirely before the split point are cloned into the left list, the chunk that
/// contains the split point is divided with `split_at_boxed`, and every chunk after
/// it is cloned into the right list.
///
/// Neither returned vector is ever empty: a side that received no chunk gets a
/// zero-length slice of the first chunk instead.
///
/// # Panics
///
/// Panics if `chunks` is empty, because the zero-length placeholder is taken from
/// `chunks[0]`.
pub(crate) fn split_at(
    chunks: &[ArrayRef],
    offset: i64,
    own_length: usize,
) -> (Vec<ArrayRef>, Vec<ArrayRef>) {
    let mut left = Vec::with_capacity(1);
    let mut right = Vec::with_capacity(1);
    let (split_point, _) = slice_offsets(offset, 0, own_length);

    // Number of rows that still have to go to the left side.
    let mut remaining = split_point;
    let mut rest = chunks.iter();

    for chunk in rest.by_ref() {
        let len = chunk.len();
        if remaining == 0 || remaining < len {
            // The split point falls inside (or at the start of) this chunk.
            let (head, tail) = chunk.split_at_boxed(remaining);
            left.push(head);
            right.push(tail);
            break;
        }
        // The whole chunk lies before the split point.
        remaining -= len;
        left.push(chunk.clone());
    }

    // Whatever was not visited above belongs to the right side.
    right.extend(rest.cloned());

    if left.is_empty() {
        left.push(chunks[0].sliced(0, 0));
    }
    if right.is_empty() {
        right.push(chunks[0].sliced(0, 0));
    }
    (left, right)
}
47+
1148
pub(crate) fn slice(
1249
chunks: &[ArrayRef],
1350
offset: i64,
@@ -136,12 +173,69 @@ impl<T: PolarsDataType> ChunkedArray<T> {
136173
}
137174
}
138175

176+
/// Split the array. The chunks are reallocated the underlying data slices are zero copy.
177+
///
178+
/// When offset is negative it will be counted from the end of the array.
179+
/// This method will never error,
180+
/// and will slice the best match when offset, or length is out of bounds
181+
pub fn split_at(&self, offset: i64) -> (Self, Self) {
182+
// A normal slice, slice the buffers and thus keep the whole memory allocated.
183+
let (l, r) = split_at(&self.chunks, offset, self.len());
184+
let mut out_l = unsafe { self.copy_with_chunks(l) };
185+
let mut out_r = unsafe { self.copy_with_chunks(r) };
186+
187+
use MetadataProperties as P;
188+
let mut properties_l = P::SORTED | P::FAST_EXPLODE_LIST;
189+
let mut properties_r = P::SORTED | P::FAST_EXPLODE_LIST;
190+
191+
let is_ascending = self.is_sorted_ascending_flag();
192+
let is_descending = self.is_sorted_descending_flag();
193+
194+
if is_ascending || is_descending {
195+
let has_nulls_at_start = self.null_count() != 0
196+
&& self
197+
.chunks()
198+
.first()
199+
.unwrap()
200+
.as_ref()
201+
.validity()
202+
.map_or(false, |bm| bm.get(0).unwrap());
203+
204+
if !has_nulls_at_start {
205+
let can_copy_min_value = !has_nulls_at_start && is_ascending;
206+
let can_copy_max_value = !has_nulls_at_start && is_descending;
207+
208+
properties_l.set(P::MIN_VALUE, can_copy_min_value);
209+
properties_l.set(P::MAX_VALUE, can_copy_max_value);
210+
}
211+
212+
let has_nulls_at_end = self.null_count() != 0
213+
&& self
214+
.chunks()
215+
.last()
216+
.unwrap()
217+
.as_ref()
218+
.validity()
219+
.map_or(false, |bm| bm.get(bm.len() - 1).unwrap());
220+
221+
if !has_nulls_at_end {
222+
let can_copy_min_value = !has_nulls_at_end && is_descending;
223+
let can_copy_max_value = !has_nulls_at_end && is_ascending;
224+
properties_r.set(P::MIN_VALUE, can_copy_min_value);
225+
properties_r.set(P::MAX_VALUE, can_copy_max_value);
226+
}
227+
}
228+
out_l.copy_metadata(self, properties_l);
229+
out_r.copy_metadata(self, properties_r);
230+
231+
(out_l, out_r)
232+
}
233+
139234
/// Slice the array. The chunks are reallocated the underlying data slices are zero copy.
140235
///
141236
/// When offset is negative it will be counted from the end of the array.
142237
/// This method will never error,
143238
/// and will slice the best match when offset, or length is out of bounds
144-
#[inline]
145239
pub fn slice(&self, offset: i64, length: usize) -> Self {
146240
// The len: 0 special cases ensure we release memory.
147241
// A normal slice, slice the buffers and thus keep the whole memory allocated.

Diff for: crates/polars-core/src/frame/mod.rs

+8
Original file line numberDiff line numberDiff line change
@@ -2233,6 +2233,14 @@ impl DataFrame {
22332233
unsafe { DataFrame::new_no_checks(col) }
22342234
}
22352235

2236+
/// Split [`DataFrame`] at the given `offset`.
2237+
pub fn split_at(&self, offset: i64) -> (Self, Self) {
2238+
let (a, b) = self.columns.iter().map(|s| s.split_at(offset)).unzip();
2239+
let a = unsafe { DataFrame::new_no_checks(a) };
2240+
let b = unsafe { DataFrame::new_no_checks(b) };
2241+
(a, b)
2242+
}
2243+
22362244
pub fn clear(&self) -> Self {
22372245
let col = self.columns.iter().map(|s| s.clear()).collect::<Vec<_>>();
22382246
unsafe { DataFrame::new_no_checks(col) }

Diff for: crates/polars-core/src/series/implementations/array.rs

+5
Original file line numberDiff line numberDiff line change
@@ -98,6 +98,11 @@ impl SeriesTrait for SeriesWrap<ArrayChunked> {
9898
self.0.slice(offset, length).into_series()
9999
}
100100

101+
/// Splits the wrapped array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
105+
101106
fn append(&mut self, other: &Series) -> PolarsResult<()> {
102107
polars_ensure!(self.0.dtype() == other.dtype(), append);
103108
let other = other.array()?;

Diff for: crates/polars-core/src/series/implementations/binary.rs

+4
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ impl SeriesTrait for SeriesWrap<BinaryChunked> {
119119
fn slice(&self, offset: i64, length: usize) -> Series {
120120
self.0.slice(offset, length).into_series()
121121
}
122+
/// Splits the wrapped binary array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
122126

123127
fn append(&mut self, other: &Series) -> PolarsResult<()> {
124128
polars_ensure!(self.0.dtype() == other.dtype(), append);

Diff for: crates/polars-core/src/series/implementations/binary_offset.rs

+4
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,10 @@ impl SeriesTrait for SeriesWrap<BinaryOffsetChunked> {
8282
fn slice(&self, offset: i64, length: usize) -> Series {
8383
self.0.slice(offset, length).into_series()
8484
}
85+
/// Splits the wrapped binary-offset array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
8589

8690
fn append(&mut self, other: &Series) -> PolarsResult<()> {
8791
polars_ensure!(self.0.dtype() == other.dtype(), append);

Diff for: crates/polars-core/src/series/implementations/boolean.rs

+4
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ impl SeriesTrait for SeriesWrap<BooleanChunked> {
140140
fn slice(&self, offset: i64, length: usize) -> Series {
141141
self.0.slice(offset, length).into_series()
142142
}
143+
/// Splits the wrapped boolean array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
143147

144148
fn append(&mut self, other: &Series) -> PolarsResult<()> {
145149
polars_ensure!(self.0.dtype() == other.dtype(), append);

Diff for: crates/polars-core/src/series/implementations/categorical.rs

+6
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,12 @@ impl SeriesTrait for SeriesWrap<CategoricalChunked> {
154154
self.with_state(false, |cats| cats.slice(offset, length))
155155
.into_series()
156156
}
157+
/// Splits the physical representation at `offset` and rebuilds a categorical
/// series from each half via `finish_with_state`.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left_phys, right_phys) = self.0.physical().split_at(offset);
    let left = self.finish_with_state(false, left_phys).into_series();
    let right = self.finish_with_state(false, right_phys).into_series();
    (left, right)
}
157163

158164
fn append(&mut self, other: &Series) -> PolarsResult<()> {
159165
polars_ensure!(self.0.dtype() == other.dtype(), append);

Diff for: crates/polars-core/src/series/implementations/date.rs

+4
Original file line numberDiff line numberDiff line change
@@ -165,6 +165,10 @@ impl SeriesTrait for SeriesWrap<DateChunked> {
165165
fn slice(&self, offset: i64, length: usize) -> Series {
166166
self.0.slice(offset, length).into_date().into_series()
167167
}
168+
/// Splits the underlying array at `offset` and converts both halves back to
/// date series.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    let left = left.into_date().into_series();
    let right = right.into_date().into_series();
    (left, right)
}
168172

169173
fn mean(&self) -> Option<f64> {
170174
self.0.mean()

Diff for: crates/polars-core/src/series/implementations/datetime.rs

+9
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,15 @@ impl SeriesTrait for SeriesWrap<DatetimeChunked> {
172172
.into_datetime(self.0.time_unit(), self.0.time_zone().clone())
173173
.into_series()
174174
}
175+
/// Splits the underlying array at `offset` and converts both halves back to
/// datetime series carrying this series' time unit and time zone.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    let left = left
        .into_datetime(self.0.time_unit(), self.0.time_zone().clone())
        .into_series();
    let right = right
        .into_datetime(self.0.time_unit(), self.0.time_zone().clone())
        .into_series();
    (left, right)
}
175184

176185
fn mean(&self) -> Option<f64> {
177186
self.0.mean()

Diff for: crates/polars-core/src/series/implementations/decimal.rs

+11
Original file line numberDiff line numberDiff line change
@@ -194,6 +194,17 @@ impl SeriesTrait for SeriesWrap<DecimalChunked> {
194194
self.apply_physical_to_s(|ca| ca.slice(offset, length))
195195
}
196196

197+
/// Splits the underlying array at `offset` and converts both halves back to
/// decimal series carrying this series' precision and scale.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (
        left.into_decimal_unchecked(self.0.precision(), self.0.scale())
            .into_series(),
        right
            .into_decimal_unchecked(self.0.precision(), self.0.scale())
            .into_series(),
    )
}
207+
197208
fn append(&mut self, other: &Series) -> PolarsResult<()> {
198209
polars_ensure!(self.0.dtype() == other.dtype(), append);
199210
let other = other.decimal()?;

Diff for: crates/polars-core/src/series/implementations/duration.rs

+7
Original file line numberDiff line numberDiff line change
@@ -234,6 +234,13 @@ impl SeriesTrait for SeriesWrap<DurationChunked> {
234234
.into_series()
235235
}
236236

237+
/// Splits the underlying array at `offset` and converts both halves back to
/// duration series carrying this series' time unit.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (
        left.into_duration(self.0.time_unit()).into_series(),
        right.into_duration(self.0.time_unit()).into_series(),
    )
}
243+
237244
fn mean(&self) -> Option<f64> {
238245
self.0.mean()
239246
}

Diff for: crates/polars-core/src/series/implementations/floats.rs

+5
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,11 @@ macro_rules! impl_dyn_series {
160160
return self.0.slice(offset, length).into_series();
161161
}
162162

163+
/// Splits the wrapped float array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
167+
163168
fn append(&mut self, other: &Series) -> PolarsResult<()> {
164169
polars_ensure!(self.0.dtype() == other.dtype(), append);
165170
self.0.append(other.as_ref().as_ref());

Diff for: crates/polars-core/src/series/implementations/list.rs

+5
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ impl SeriesTrait for SeriesWrap<ListChunked> {
8383
self.0.slice(offset, length).into_series()
8484
}
8585

86+
/// Splits the wrapped list array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
90+
8691
fn append(&mut self, other: &Series) -> PolarsResult<()> {
8792
polars_ensure!(self.0.dtype() == other.dtype(), append);
8893
self.0.append(other.as_ref().as_ref())

Diff for: crates/polars-core/src/series/implementations/mod.rs

+6-1
Original file line numberDiff line numberDiff line change
@@ -261,7 +261,12 @@ macro_rules! impl_dyn_series {
261261
}
262262

263263
fn slice(&self, offset: i64, length: usize) -> Series {
264-
return self.0.slice(offset, length).into_series();
264+
self.0.slice(offset, length).into_series()
265+
}
266+
267+
/// Splits the wrapped array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
266271

267272
fn append(&mut self, other: &Series) -> PolarsResult<()> {

Diff for: crates/polars-core/src/series/implementations/null.rs

+18
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,24 @@ impl SeriesTrait for NullChunked {
239239
.into_series()
240240
}
241241

242+
fn split_at(&self, offset: i64) -> (Series, Series) {
243+
let (l, r) = chunkops::split_at(self.chunks(), offset, self.len());
244+
(
245+
NullChunked {
246+
name: self.name.clone(),
247+
length: l.iter().map(|arr| arr.len() as IdxSize).sum(),
248+
chunks: l,
249+
}
250+
.into_series(),
251+
NullChunked {
252+
name: self.name.clone(),
253+
length: r.iter().map(|arr| arr.len() as IdxSize).sum(),
254+
chunks: r,
255+
}
256+
.into_series(),
257+
)
258+
}
259+
242260
fn sort_with(&self, _options: SortOptions) -> PolarsResult<Series> {
243261
Ok(self.clone().into_series())
244262
}

Diff for: crates/polars-core/src/series/implementations/object.rs

+5
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ where
104104
ObjectChunked::slice(&self.0, offset, length).into_series()
105105
}
106106

107+
/// Splits the wrapped object array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = ObjectChunked::split_at(&self.0, offset);
    (left.into_series(), right.into_series())
}
111+
107112
fn append(&mut self, other: &Series) -> PolarsResult<()> {
108113
if self.dtype() != other.dtype() {
109114
polars_bail!(append);

Diff for: crates/polars-core/src/series/implementations/string.rs

+4
Original file line numberDiff line numberDiff line change
@@ -119,6 +119,10 @@ impl SeriesTrait for SeriesWrap<StringChunked> {
119119
fn slice(&self, offset: i64, length: usize) -> Series {
120120
self.0.slice(offset, length).into_series()
121121
}
122+
/// Splits the wrapped string array at `offset` and returns both halves as [`Series`].
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left, right) = self.0.split_at(offset);
    (left.into_series(), right.into_series())
}
122126

123127
fn append(&mut self, other: &Series) -> PolarsResult<()> {
124128
polars_ensure!(

Diff for: crates/polars-core/src/series/implementations/struct_.rs

+8
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,14 @@ impl SeriesTrait for SeriesWrap<StructChunked> {
126126
out.into_series()
127127
}
128128

129+
/// Splits every field of the struct at `offset` and rebuilds one struct series
/// from the left halves and one from the right halves, both under this series' name.
///
/// Panics if `StructChunked::new` rejects either set of fields.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (left_fields, right_fields): (Vec<_>, Vec<_>) = self
        .0
        .fields()
        .iter()
        .map(|field| field.split_at(offset))
        .unzip();

    let left = StructChunked::new(self.name(), &left_fields).unwrap();
    let right = StructChunked::new(self.name(), &right_fields).unwrap();
    (left.into_series(), right.into_series())
}
136+
129137
fn append(&mut self, other: &Series) -> PolarsResult<()> {
130138
let other = other.struct_()?;
131139
if self.is_empty() {

Diff for: crates/polars-core/src/series/implementations/time.rs

+4
Original file line numberDiff line numberDiff line change
@@ -140,6 +140,10 @@ impl SeriesTrait for SeriesWrap<TimeChunked> {
140140
fn slice(&self, offset: i64, length: usize) -> Series {
141141
self.0.slice(offset, length).into_time().into_series()
142142
}
143+
/// Splits the underlying array at `offset` and converts both halves back to
/// time series.
///
/// The halves go through `into_time()` before `into_series()`, matching `slice`
/// in this impl, so the results keep the time logical type instead of being
/// returned as the physical array's series.
fn split_at(&self, offset: i64) -> (Series, Series) {
    let (a, b) = self.0.split_at(offset);
    (a.into_time().into_series(), b.into_time().into_series())
}
143147

144148
fn mean(&self) -> Option<f64> {
145149
self.0.mean()

Diff for: crates/polars-core/src/series/series_trait.rs

+6
Original file line numberDiff line numberDiff line change
@@ -239,6 +239,12 @@ pub trait SeriesTrait:
239239
/// end of the array
240240
fn slice(&self, _offset: i64, _length: usize) -> Series;
241241

242+
/// Split the series in two at `offset`; both halves are zero copy views of the data.
243+
///
244+
/// When offset is negative the offset is counted from the
245+
/// end of the array
246+
fn split_at(&self, _offset: i64) -> (Series, Series);
247+
242248
#[doc(hidden)]
243249
fn append(&mut self, _other: &Series) -> PolarsResult<()>;
244250

0 commit comments

Comments
 (0)