|
16 | 16 | // under the License. |
17 | 17 |
|
18 | 18 | use std::collections::HashSet; |
19 | | -use std::sync::Arc; |
20 | | - |
21 | | -use crate::stats::Precision; |
22 | | -use arrow::array::UInt64Array; |
23 | | -use arrow::datatypes::FieldRef; |
24 | 19 | use arrow::{ |
25 | 20 | array::{ArrayRef, BooleanArray}, |
26 | | - datatypes::{Schema, SchemaRef}, |
27 | 21 | }; |
28 | 22 |
|
29 | 23 | use crate::Column; |
30 | | -use crate::{ScalarValue, Statistics}; |
| 24 | +use crate::ScalarValue; |
31 | 25 |
|
32 | 26 | /// A source of runtime statistical information to [`PruningPredicate`]s. |
33 | 27 | /// |
@@ -131,324 +125,3 @@ pub trait PruningStatistics { |
131 | 125 | ) -> Option<BooleanArray>; |
132 | 126 | } |
133 | 127 |
|
134 | | -/// Prune files based on their partition values. |
135 | | -/// This is used both at planning time and execution time to prune |
136 | | -/// files based on their partition values. |
137 | | -/// This feeds into [`CompositePruningStatistics`] to allow pruning |
138 | | -/// with filters that depend both on partition columns and data columns |
139 | | -/// (e.g. `WHERE partition_col = data_col`). |
140 | | -pub struct PartitionPruningStatistics { |
141 | | - /// Values for each column for each container. |
142 | | - /// The outer vectors represent the columns while the inner |
143 | | - /// vectors represent the containers. |
144 | | - /// The order must match the order of the partition columns in |
145 | | - /// [`PartitionPruningStatistics::partition_schema`]. |
146 | | - partition_values: Vec<Vec<ScalarValue>>, |
147 | | - /// The number of containers. |
148 | | - /// Stored since the partition values are column-major and if |
149 | | - /// there are no columns we wouldn't know the number of containers. |
150 | | - num_containers: usize, |
151 | | - /// The schema of the partition columns. |
152 | | - /// This must **not** be the schema of the entire file or table: |
153 | | - /// it must only be the schema of the partition columns, |
154 | | - /// in the same order as the values in [`PartitionPruningStatistics::partition_values`]. |
155 | | - partition_schema: SchemaRef, |
156 | | -} |
157 | | - |
158 | | -impl PartitionPruningStatistics { |
159 | | - /// Create a new instance of [`PartitionPruningStatistics`]. |
160 | | - /// |
161 | | - /// Args: |
162 | | - /// * `partition_values`: A vector of vectors of [`ScalarValue`]s. |
163 | | - /// The outer vector represents the containers while the inner |
164 | | - /// vector represents the partition values for each column. |
165 | | - /// Note that this is the **opposite** of the order of the |
166 | | - /// partition columns in `PartitionPruningStatistics::partition_schema`. |
167 | | - /// * `partition_schema`: The schema of the partition columns. |
168 | | - /// This must **not** be the schema of the entire file or table: |
169 | | - /// instead it must only be the schema of the partition columns, |
170 | | - /// in the same order as the values in `partition_values`. |
171 | | - pub fn new( |
172 | | - partition_values: Vec<Vec<ScalarValue>>, |
173 | | - partition_fields: Vec<FieldRef>, |
174 | | - ) -> Self { |
175 | | - let num_containers = partition_values.len(); |
176 | | - let partition_schema = Arc::new(Schema::new(partition_fields)); |
177 | | - let mut partition_valeus_by_column = |
178 | | - vec![vec![]; partition_schema.fields().len()]; |
179 | | - for partition_value in partition_values.iter() { |
180 | | - for (i, value) in partition_value.iter().enumerate() { |
181 | | - partition_valeus_by_column[i].push(value.clone()); |
182 | | - } |
183 | | - } |
184 | | - Self { |
185 | | - partition_values: partition_valeus_by_column, |
186 | | - num_containers, |
187 | | - partition_schema, |
188 | | - } |
189 | | - } |
190 | | -} |
191 | | - |
192 | | -impl PruningStatistics for PartitionPruningStatistics { |
193 | | - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
194 | | - let index = self.partition_schema.index_of(column.name()).ok()?; |
195 | | - let partition_values = self.partition_values.get(index)?; |
196 | | - let mut values = Vec::with_capacity(self.partition_values.len()); |
197 | | - for partition_value in partition_values { |
198 | | - match partition_value { |
199 | | - ScalarValue::Null => values.push(ScalarValue::Null), |
200 | | - _ => values.push(partition_value.clone()), |
201 | | - } |
202 | | - } |
203 | | - match ScalarValue::iter_to_array(values) { |
204 | | - Ok(array) => Some(array), |
205 | | - Err(_) => { |
206 | | - log::warn!( |
207 | | - "Failed to convert min values to array for column {}", |
208 | | - column.name() |
209 | | - ); |
210 | | - None |
211 | | - } |
212 | | - } |
213 | | - } |
214 | | - |
215 | | - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
216 | | - self.min_values(column) |
217 | | - } |
218 | | - |
219 | | - fn num_containers(&self) -> usize { |
220 | | - self.num_containers |
221 | | - } |
222 | | - |
223 | | - fn null_counts(&self, _column: &Column) -> Option<ArrayRef> { |
224 | | - None |
225 | | - } |
226 | | - |
227 | | - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { |
228 | | - None |
229 | | - } |
230 | | - |
231 | | - fn contained( |
232 | | - &self, |
233 | | - column: &Column, |
234 | | - values: &HashSet<ScalarValue>, |
235 | | - ) -> Option<BooleanArray> { |
236 | | - let index = self.partition_schema.index_of(column.name()).ok()?; |
237 | | - let partition_values = self.partition_values.get(index)?; |
238 | | - let mut contained = Vec::with_capacity(self.partition_values.len()); |
239 | | - for partition_value in partition_values { |
240 | | - let contained_value = if values.contains(partition_value) { |
241 | | - Some(true) |
242 | | - } else { |
243 | | - Some(false) |
244 | | - }; |
245 | | - contained.push(contained_value); |
246 | | - } |
247 | | - let array = BooleanArray::from(contained); |
248 | | - Some(array) |
249 | | - } |
250 | | -} |
251 | | - |
252 | | -/// Prune a set of containers represented by their statistics. |
253 | | -/// Each [`Statistics`] represents a container (e.g. a file or a partition of files). |
254 | | -pub struct PrunableStatistics { |
255 | | - /// Statistics for each container. |
256 | | - statistics: Vec<Arc<Statistics>>, |
257 | | - /// The schema of the file these statistics are for. |
258 | | - schema: SchemaRef, |
259 | | -} |
260 | | - |
261 | | -impl PrunableStatistics { |
262 | | - /// Create a new instance of [`PrunableStatistics`]. |
263 | | - /// Each [`Statistics`] represents a container (e.g. a file or a partition of files). |
264 | | - /// The `schema` is the schema of the data in the containers and should apply to all files. |
265 | | - pub fn new(statistics: Vec<Arc<Statistics>>, schema: SchemaRef) -> Self { |
266 | | - Self { statistics, schema } |
267 | | - } |
268 | | -} |
269 | | - |
270 | | -impl PruningStatistics for PrunableStatistics { |
271 | | - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
272 | | - let index = self.schema.index_of(column.name()).ok()?; |
273 | | - let mut values = Vec::with_capacity(self.statistics.len()); |
274 | | - for stats in &self.statistics { |
275 | | - let stat = stats.column_statistics.get(index)?; |
276 | | - match &stat.min_value { |
277 | | - Precision::Exact(min) => { |
278 | | - values.push(min.clone()); |
279 | | - } |
280 | | - _ => values.push(ScalarValue::Null), |
281 | | - } |
282 | | - } |
283 | | - match ScalarValue::iter_to_array(values) { |
284 | | - Ok(array) => Some(array), |
285 | | - Err(_) => { |
286 | | - log::warn!( |
287 | | - "Failed to convert min values to array for column {}", |
288 | | - column.name() |
289 | | - ); |
290 | | - None |
291 | | - } |
292 | | - } |
293 | | - } |
294 | | - |
295 | | - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
296 | | - let index = self.schema.index_of(column.name()).ok()?; |
297 | | - let mut values = Vec::with_capacity(self.statistics.len()); |
298 | | - for stats in &self.statistics { |
299 | | - let stat = stats.column_statistics.get(index)?; |
300 | | - match &stat.max_value { |
301 | | - Precision::Exact(max) => { |
302 | | - values.push(max.clone()); |
303 | | - } |
304 | | - _ => values.push(ScalarValue::Null), |
305 | | - } |
306 | | - } |
307 | | - match ScalarValue::iter_to_array(values) { |
308 | | - Ok(array) => Some(array), |
309 | | - Err(_) => { |
310 | | - log::warn!( |
311 | | - "Failed to convert max values to array for column {}", |
312 | | - column.name() |
313 | | - ); |
314 | | - None |
315 | | - } |
316 | | - } |
317 | | - } |
318 | | - |
319 | | - fn num_containers(&self) -> usize { |
320 | | - self.statistics.len() |
321 | | - } |
322 | | - |
323 | | - fn null_counts(&self, column: &Column) -> Option<ArrayRef> { |
324 | | - let index = self.schema.index_of(column.name()).ok()?; |
325 | | - let mut values = Vec::with_capacity(self.statistics.len()); |
326 | | - let mut has_null_count = false; |
327 | | - for stats in &self.statistics { |
328 | | - let stat = stats.column_statistics.get(index)?; |
329 | | - match &stat.null_count { |
330 | | - Precision::Exact(null_count) => match u64::try_from(*null_count) { |
331 | | - Ok(null_count) => { |
332 | | - has_null_count = true; |
333 | | - values.push(Some(null_count)); |
334 | | - } |
335 | | - Err(_) => { |
336 | | - values.push(None); |
337 | | - } |
338 | | - }, |
339 | | - _ => values.push(None), |
340 | | - } |
341 | | - } |
342 | | - if has_null_count { |
343 | | - Some(Arc::new(UInt64Array::from(values))) |
344 | | - } else { |
345 | | - None |
346 | | - } |
347 | | - } |
348 | | - |
349 | | - fn row_counts(&self, _column: &Column) -> Option<ArrayRef> { |
350 | | - let mut values = Vec::with_capacity(self.statistics.len()); |
351 | | - let mut has_row_count = false; |
352 | | - for stats in &self.statistics { |
353 | | - match &stats.num_rows { |
354 | | - Precision::Exact(row_count) => match u64::try_from(*row_count) { |
355 | | - Ok(row_count) => { |
356 | | - has_row_count = true; |
357 | | - values.push(Some(row_count)); |
358 | | - } |
359 | | - Err(_) => { |
360 | | - values.push(None); |
361 | | - } |
362 | | - }, |
363 | | - _ => values.push(None), |
364 | | - } |
365 | | - } |
366 | | - if has_row_count { |
367 | | - Some(Arc::new(UInt64Array::from(values))) |
368 | | - } else { |
369 | | - None |
370 | | - } |
371 | | - } |
372 | | - |
373 | | - fn contained( |
374 | | - &self, |
375 | | - _column: &Column, |
376 | | - _values: &HashSet<ScalarValue>, |
377 | | - ) -> Option<BooleanArray> { |
378 | | - None |
379 | | - } |
380 | | -} |
381 | | - |
382 | | -/// Combine multiple [`PruningStatistics`] into a single |
383 | | -/// [`CompositePruningStatistics`]. |
384 | | -/// This can be used to combine statistics from different sources, |
385 | | -/// for example partition values and file statistics. |
386 | | -/// This allows pruning with filters that depend on multiple sources of statistics, |
387 | | -/// such as `WHERE partition_col = data_col`. |
388 | | -pub struct CompositePruningStatistics { |
389 | | - pub statistics: Vec<Box<dyn PruningStatistics>>, |
390 | | -} |
391 | | - |
392 | | -impl CompositePruningStatistics { |
393 | | - /// Create a new instance of [`CompositePruningStatistics`] from |
394 | | - /// a vector of [`PruningStatistics`]. |
395 | | - pub fn new(statistics: Vec<Box<dyn PruningStatistics>>) -> Self { |
396 | | - assert!(!statistics.is_empty()); |
397 | | - Self { statistics } |
398 | | - } |
399 | | -} |
400 | | - |
401 | | -impl PruningStatistics for CompositePruningStatistics { |
402 | | - fn min_values(&self, column: &Column) -> Option<ArrayRef> { |
403 | | - for stats in &self.statistics { |
404 | | - if let Some(array) = stats.min_values(column) { |
405 | | - return Some(array); |
406 | | - } |
407 | | - } |
408 | | - None |
409 | | - } |
410 | | - |
411 | | - fn max_values(&self, column: &Column) -> Option<ArrayRef> { |
412 | | - for stats in &self.statistics { |
413 | | - if let Some(array) = stats.max_values(column) { |
414 | | - return Some(array); |
415 | | - } |
416 | | - } |
417 | | - None |
418 | | - } |
419 | | - |
420 | | - fn num_containers(&self) -> usize { |
421 | | - self.statistics[0].num_containers() |
422 | | - } |
423 | | - |
424 | | - fn null_counts(&self, column: &Column) -> Option<ArrayRef> { |
425 | | - for stats in &self.statistics { |
426 | | - if let Some(array) = stats.null_counts(column) { |
427 | | - return Some(array); |
428 | | - } |
429 | | - } |
430 | | - None |
431 | | - } |
432 | | - |
433 | | - fn row_counts(&self, column: &Column) -> Option<ArrayRef> { |
434 | | - for stats in &self.statistics { |
435 | | - if let Some(array) = stats.row_counts(column) { |
436 | | - return Some(array); |
437 | | - } |
438 | | - } |
439 | | - None |
440 | | - } |
441 | | - |
442 | | - fn contained( |
443 | | - &self, |
444 | | - column: &Column, |
445 | | - values: &HashSet<ScalarValue>, |
446 | | - ) -> Option<BooleanArray> { |
447 | | - for stats in &self.statistics { |
448 | | - if let Some(array) = stats.contained(column, values) { |
449 | | - return Some(array); |
450 | | - } |
451 | | - } |
452 | | - None |
453 | | - } |
454 | | -} |
0 commit comments