From 69df56f29250d38779936273a50d16d20460feba Mon Sep 17 00:00:00 2001 From: Simhadri Govindappa Date: Tue, 1 Aug 2023 15:28:59 +0530 Subject: [PATCH 1/6] Core: Add KLL Datasketch and Hive ColumnStatisticsObj as standard blob types to puffin file --- .../apache/iceberg/puffin/StandardBlobTypes.java | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java index ce78375c4b1a..8d1f4258a5ce 100644 --- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java +++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java @@ -27,6 +27,19 @@ private StandardBlobTypes() {} */ public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1"; + /** + * A serialized form of + * HIVE_COLUMN_STATS_OBJ + */ + public static final String HIVE_COLUMN_STATS_OBJ = "column-statistics-obj"; + + /** + * A serialized form of KLL sketch produced by the Apache DataSketches library + */ + public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache_datasketches_kll_sketch"; + /** A serialized deletion vector according to the Iceberg spec */ public static final String DV_V1 = "deletion-vector-v1"; } From abf45098eaa4906daa390682a5a42fe48fca0d2d Mon Sep 17 00:00:00 2001 From: Simhadri Govindappa Date: Wed, 2 Aug 2023 21:53:49 +0530 Subject: [PATCH 2/6] Addressed review comments --- .../java/org/apache/iceberg/puffin/StandardBlobTypes.java | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java index 8d1f4258a5ce..7223305d8c3f 100644 --- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java +++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java @@ -28,9 +28,9 @@ private StandardBlobTypes() {} public static 
final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1"; /** - * A serialized form of - * HIVE_COLUMN_STATS_OBJ + * A serialized form of HIVE_COLUMN_STATS_OBJ. The full list of available stats are provided in the + * + * Hive columns stats wiki */ public static final String HIVE_COLUMN_STATS_OBJ = "column-statistics-obj"; @@ -38,7 +38,7 @@ private StandardBlobTypes() {} * A serialized form of KLL sketch produced by the Apache DataSketches library */ - public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache_datasketches_kll_sketch"; + public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache-datasketches-kll-sketch"; /** A serialized deletion vector according to the Iceberg spec */ public static final String DV_V1 = "deletion-vector-v1"; From e8f311f415cd1402db633d27fbb48edf59342c79 Mon Sep 17 00:00:00 2001 From: Simhadri Govindappa Date: Thu, 17 Aug 2023 15:02:32 +0530 Subject: [PATCH 3/6] Addressed review comments and moved the doc update from iceberg-doc to the main iceberg/format --- .../iceberg/puffin/StandardBlobTypes.java | 6 +++--- format/puffin-spec.md | 17 +++++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java index 7223305d8c3f..ebaf93c07e7d 100644 --- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java +++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java @@ -28,14 +28,14 @@ private StandardBlobTypes() {} public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1"; /** - * A serialized form of HIVE_COLUMN_STATS_OBJ. The full list of available stats are provided in the + * A serialized form of Hive column stats object. 
The full list of available stats are provided in the * * Hive columns stats wiki */ - public static final String HIVE_COLUMN_STATS_OBJ = "column-statistics-obj"; + public static final String HIVE_COLUMN_STATS_OBJ = "hive-column-statistics-obj"; /** - * A serialized form of KLL sketch produced by the Apache DataSketches library */ public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache-datasketches-kll-sketch"; diff --git a/format/puffin-spec.md b/format/puffin-spec.md index 62e8ae085398..195492334155 100644 --- a/format/puffin-spec.md +++ b/format/puffin-spec.md @@ -181,6 +181,23 @@ for Puffin v1. [roaring-bitmap-portable-serialization]: https://github.com/RoaringBitmap/RoaringFormatSpec?tab=readme-ov-file#extension-for-64-bit-implementations [roaring-bitmap-general-layout]: https://github.com/RoaringBitmap/RoaringFormatSpec?tab=readme-ov-file#general-layout +#### `hive-column-statistics-obj` blob type + +A serialized form of Hive ColumnStatsObject. + +The columnStatsObject supports Histograms, NDV, Min and Max values, Number of nulls, Number of trues, column name, type. +A full list of supported statistics is listed in the table here: +[ColumnStatistics](https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ColumnStatistics) + +#### `apache-datasketches-KLL-sketch` blob type + +A serialized form of a "compact" KLL-sketch produced by the [Apache +DataSketches](https://datasketches.apache.org/) library. +Apache-Datasketches-KLL-sketch is an implementation of a very compact quantiles +sketch with lazy compaction scheme and nearly optimal accuracy per bit. + +Histograms are derived from this sketch. + ### Compression codecs The data can also be uncompressed. 
If it is compressed the codec should be one of From 7f0c727b79a7aa7d72bb74d8ec76d995e7909875 Mon Sep 17 00:00:00 2001 From: Simhadri Govindappa Date: Thu, 17 Aug 2023 15:36:40 +0530 Subject: [PATCH 4/6] Update a typo in format/puffin-spec.md Co-authored-by: Eduard Tudenhoefner --- format/puffin-spec.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/format/puffin-spec.md b/format/puffin-spec.md index 195492334155..bfcc66a03a09 100644 --- a/format/puffin-spec.md +++ b/format/puffin-spec.md @@ -185,7 +185,7 @@ for Puffin v1. A serialized form of Hive ColumnStatsObject. -The columnStatsObject supports Histograms, NDV, Min and Max values, Number of nulls, Number of trues, column name, type. +The ColumnStatsObject supports Histograms, NDV, Min and Max values, Number of nulls, Number of trues, column name, type. A full list of supported statistics is listed in the table here: [ColumnStatistics](https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ColumnStatistics) From a4fdf6c81dd4656ca51e7f9ceb235f5d0e687c11 Mon Sep 17 00:00:00 2001 From: Simhadri Govindappa Date: Thu, 17 Aug 2023 16:06:15 +0530 Subject: [PATCH 5/6] Fix checkstyle error flagged by :iceberg-core:spotlessJavaCheck --- .../java/org/apache/iceberg/puffin/StandardBlobTypes.java | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java index ebaf93c07e7d..0c879790e59b 100644 --- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java +++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java @@ -28,9 +28,10 @@ private StandardBlobTypes() {} public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1"; /** - * A serialized form of Hive column stats object. 
The full list of available stats are provided in the - * - * Hive columns stats wiki + * A serialized form of Hive column stats object. The full list of available stats are provided in + * the Hive + * columns stats wiki */ public static final String HIVE_COLUMN_STATS_OBJ = "hive-column-statistics-obj"; From c0bba5d1b037c402a5be66206a9252b0da7a00ba Mon Sep 17 00:00:00 2001 From: Denys Kuzmenko Date: Fri, 7 Feb 2025 17:38:49 +0100 Subject: [PATCH 6/6] removed Hive ColumnStatsObject --- .../iceberg/puffin/StandardBlobTypes.java | 12 ++-------- format/puffin-spec.md | 24 +++++++------------ 2 files changed, 10 insertions(+), 26 deletions(-) diff --git a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java index 0c879790e59b..d916fc46e94b 100644 --- a/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java +++ b/core/src/main/java/org/apache/iceberg/puffin/StandardBlobTypes.java @@ -28,18 +28,10 @@ private StandardBlobTypes() {} public static final String APACHE_DATASKETCHES_THETA_V1 = "apache-datasketches-theta-v1"; /** - * A serialized form of Hive column stats object. The full list of available stats are provided in - * the Hive - * columns stats wiki - */ - public static final String HIVE_COLUMN_STATS_OBJ = "hive-column-statistics-obj"; - - /** - * A serialized form of a KLL sketch produced by the Apache DataSketches library */ - public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache-datasketches-kll-sketch"; + public static final String APACHE_DATASKETCHES_KLL_SKETCH = "apache-datasketches-kll-v1"; /** A serialized deletion vector according to the Iceberg spec */ public static final String DV_V1 = "deletion-vector-v1"; diff --git a/format/puffin-spec.md b/format/puffin-spec.md index bfcc66a03a09..da28c4c497bd 100644 --- a/format/puffin-spec.md +++ b/format/puffin-spec.md @@ -181,22 +181,14 @@ for Puffin v1. 
[roaring-bitmap-portable-serialization]: https://github.com/RoaringBitmap/RoaringFormatSpec?tab=readme-ov-file#extension-for-64-bit-implementations [roaring-bitmap-general-layout]: https://github.com/RoaringBitmap/RoaringFormatSpec?tab=readme-ov-file#general-layout -#### `hive-column-statistics-obj` blob type - -A serialized form of Hive ColumnStatsObject. - -The ColumnStatsObject supports Histograms, NDV, Min and Max values, Number of nulls, Number of trues, column name, type. -A full list of supported statistics is listed in the table here: -[ColumnStatistics](https://cwiki.apache.org/confluence/display/Hive/StatsDev#StatsDev-ColumnStatistics) - -#### `apache-datasketches-KLL-sketch` blob type - -A serialized form of a "compact" KLL-sketch produced by the [Apache -DataSketches](https://datasketches.apache.org/) library. -Apache-Datasketches-KLL-sketch is an implementation of a very compact quantiles -sketch with lazy compaction scheme and nearly optimal accuracy per bit. - -Histograms are derived from this sketch. +#### `apache-datasketches-kll-v1` blob type + +A serialized form of a KLL sketch, a very compact quantiles sketch, produced by the +[Apache DataSketches](https://datasketches.apache.org/) library. +The KLL quantiles sketch is a mergeable streaming algorithm to estimate +the distribution of values, and approximately answer queries about the rank of a value, +probability mass function of the distribution (PMF) or histogram, +cumulative distribution function (CDF), and quantiles (median, min, max, 95th percentile and such). ### Compression codecs