From 4327ac5ce50d1d4e4ffb325fc7a4b9e0648262bd Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Tue, 20 Mar 2018 15:57:09 +0100 Subject: [PATCH 1/2] PARQUET-1251: Describe handling of the ambiguous min/max statistics for FLOAT/DOUBLE --- src/main/thrift/parquet.thrift | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index 195ff9086..dd8027ebf 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -751,10 +751,19 @@ union ColumnOrder { * INT32 - signed comparison * INT64 - signed comparison * INT96 (only used for legacy timestamps) - undefined - * FLOAT - signed comparison of the represented value - * DOUBLE - signed comparison of the represented value + * FLOAT - signed comparison of the represented value (*) + * DOUBLE - signed comparison of the represented value (*) * BYTE_ARRAY - unsigned byte-wise comparison * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison + * + * (*) Because of the sorting order is not specified properly for floating + * point values (relations vs. total ordering) the following + * compatibility rules should be applied: + * - When looking for NaN values, min and max should be ignored. + * - If the min is a NaN, it should be ignored. + * - If the max is a NaN, it should be ignored. + * - If the min is +0, the row group may contain -0 values as well. + * - If the max is -0, the row group may contain +0 values as well. 
*/ 1: TypeDefinedOrder TYPE_ORDER; } From 65493958e0d6f427b7815a71695d43ca1f2c25b4 Mon Sep 17 00:00:00 2001 From: Gabor Szadovszky Date: Mon, 26 Mar 2018 12:58:27 +0200 Subject: [PATCH 2/2] PARQUET-1251: Updates for zi's comments --- src/main/thrift/parquet.thrift | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift index dd8027ebf..bee82b30c 100644 --- a/src/main/thrift/parquet.thrift +++ b/src/main/thrift/parquet.thrift @@ -756,14 +756,14 @@ union ColumnOrder { * BYTE_ARRAY - unsigned byte-wise comparison * FIXED_LEN_BYTE_ARRAY - unsigned byte-wise comparison * - * (*) Because of the sorting order is not specified properly for floating + * (*) Because the sorting order is not specified properly for floating * point values (relations vs. total ordering) the following - * compatibility rules should be applied: - * - When looking for NaN values, min and max should be ignored. + * compatibility rules should be applied when reading statistics: * - If the min is a NaN, it should be ignored. * - If the max is a NaN, it should be ignored. * - If the min is +0, the row group may contain -0 values as well. * - If the max is -0, the row group may contain +0 values as well. + * - When looking for NaN values, min and max should be ignored. */ 1: TypeDefinedOrder TYPE_ORDER; }