@@ -98,6 +98,19 @@ Result<std::shared_ptr<SchemaManifest>> GetSchemaManifest(
98
98
return manifest;
99
99
}
100
100
101
+ bool IsNan (const Scalar& value) {
102
+ if (value.is_valid ) {
103
+ if (value.type ->id () == Type::FLOAT) {
104
+ const FloatScalar& float_scalar = checked_cast<const FloatScalar&>(value);
105
+ return std::isnan (float_scalar.value );
106
+ } else if (value.type ->id () == Type::DOUBLE) {
107
+ const DoubleScalar& double_scalar = checked_cast<const DoubleScalar&>(value);
108
+ return std::isnan (double_scalar.value );
109
+ }
110
+ }
111
+ return false ;
112
+ }
113
+
101
114
std::optional<compute::Expression> ColumnChunkStatisticsAsExpression (
102
115
const SchemaField& schema_field, const parquet::RowGroupMetaData& metadata) {
103
116
// For the remaining of this function, failure to extract/parse statistics
@@ -112,50 +125,13 @@ std::optional<compute::Expression> ColumnChunkStatisticsAsExpression(
112
125
113
126
auto column_metadata = metadata.ColumnChunk (schema_field.column_index );
114
127
auto statistics = column_metadata->statistics ();
115
- if (statistics == nullptr ) {
116
- return std::nullopt;
117
- }
118
-
119
128
const auto & field = schema_field.field ;
120
- auto field_expr = compute::field_ref (field->name ());
121
129
122
- // Optimize for corner case where all values are nulls
123
- if (statistics->num_values () == 0 && statistics->null_count () > 0 ) {
124
- return is_null (std::move (field_expr));
125
- }
126
-
127
- std::shared_ptr<Scalar> min, max;
128
- if (!StatisticsAsScalars (*statistics, &min, &max).ok ()) {
130
+ if (statistics == nullptr ) {
129
131
return std::nullopt;
130
132
}
131
133
132
- auto maybe_min = min->CastTo (field->type ());
133
- auto maybe_max = max->CastTo (field->type ());
134
- if (maybe_min.ok () && maybe_max.ok ()) {
135
- min = maybe_min.MoveValueUnsafe ();
136
- max = maybe_max.MoveValueUnsafe ();
137
-
138
- if (min->Equals (max)) {
139
- auto single_value = compute::equal (field_expr, compute::literal (std::move (min)));
140
-
141
- if (statistics->null_count () == 0 ) {
142
- return single_value;
143
- }
144
- return compute::or_ (std::move (single_value), is_null (std::move (field_expr)));
145
- }
146
-
147
- auto lower_bound =
148
- compute::greater_equal (field_expr, compute::literal (std::move (min)));
149
- auto upper_bound = compute::less_equal (field_expr, compute::literal (std::move (max)));
150
-
151
- auto in_range = compute::and_ (std::move (lower_bound), std::move (upper_bound));
152
- if (statistics->null_count () != 0 ) {
153
- return compute::or_ (std::move (in_range), compute::is_null (field_expr));
154
- }
155
- return in_range;
156
- }
157
-
158
- return std::nullopt;
134
+ return ParquetFileFragment::EvaluateStatisticsAsExpression (*field, *statistics);
159
135
}
160
136
161
137
void AddColumnIndices (const SchemaField& schema_field,
@@ -306,6 +282,65 @@ Result<bool> IsSupportedParquetFile(const ParquetFileFormat& format,
306
282
307
283
} // namespace
308
284
285
+ std::optional<compute::Expression> ParquetFileFragment::EvaluateStatisticsAsExpression (
286
+ const Field& field, const parquet::Statistics& statistics) {
287
+ auto field_expr = compute::field_ref (field.name ());
288
+
289
+ // Optimize for corner case where all values are nulls
290
+ if (statistics.num_values () == 0 && statistics.null_count () > 0 ) {
291
+ return is_null (std::move (field_expr));
292
+ }
293
+
294
+ std::shared_ptr<Scalar> min, max;
295
+ if (!StatisticsAsScalars (statistics, &min, &max).ok ()) {
296
+ return std::nullopt;
297
+ }
298
+
299
+ auto maybe_min = min->CastTo (field.type ());
300
+ auto maybe_max = max->CastTo (field.type ());
301
+
302
+ if (maybe_min.ok () && maybe_max.ok ()) {
303
+ min = maybe_min.MoveValueUnsafe ();
304
+ max = maybe_max.MoveValueUnsafe ();
305
+
306
+ if (min->Equals (max)) {
307
+ auto single_value = compute::equal (field_expr, compute::literal (std::move (min)));
308
+
309
+ if (statistics.null_count () == 0 ) {
310
+ return single_value;
311
+ }
312
+ return compute::or_ (std::move (single_value), is_null (std::move (field_expr)));
313
+ }
314
+
315
+ auto lower_bound = compute::greater_equal (field_expr, compute::literal (min));
316
+ auto upper_bound = compute::less_equal (field_expr, compute::literal (max));
317
+ compute::Expression in_range;
318
+
319
+ // Since the minimum & maximum values are NaN, useful statistics
320
+ // cannot be extracted for checking the presence of a value within
321
+ // range
322
+ if (IsNan (*min) && IsNan (*max)) {
323
+ return std::nullopt;
324
+ }
325
+
326
+ // If either minimum or maximum is NaN, it should be ignored for the
327
+ // range computation
328
+ if (IsNan (*min)) {
329
+ in_range = std::move (upper_bound);
330
+ } else if (IsNan (*max)) {
331
+ in_range = std::move (lower_bound);
332
+ } else {
333
+ in_range = compute::and_ (std::move (lower_bound), std::move (upper_bound));
334
+ }
335
+
336
+ if (statistics.null_count () != 0 ) {
337
+ return compute::or_ (std::move (in_range), compute::is_null (field_expr));
338
+ }
339
+ return in_range;
340
+ }
341
+ return std::nullopt;
342
+ }
343
+
309
344
// Default constructor: forwards a freshly allocated, default-constructed
// ParquetFragmentScanOptions (held by shared_ptr) to the FileFormat base
// class constructor. The constructor body itself is empty.
ParquetFileFormat::ParquetFileFormat ()
310
345
: FileFormat(std::make_shared<ParquetFragmentScanOptions>()) {}
311
346
0 commit comments