apache · chenjunjiedada · Aug 12, 2017 · Aug 28, 2017 · Sep 14, 2017 · Sep 14, 2017
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
@@ -511,6 +511,57 @@ struct ColumnMetaData {
    * This information can be used to determine if all data pages are
    * dictionary encoded for example **/
   13: optional list<PageEncodingStats> encoding_stats;
+
+  /** Byte offset from the beginning of the file to the bloom filter data. The bloom filter
+   * data of all columns is stored together before the start of the row group it describes. **/
+  14: optional i64 bloom_filter_offset;
+}
+
+/**
+  * Selects the algorithm used to set and test bits in a bloom filter bitset.
+  */
+union BloomFilterAlgorithm {
+  /** The default value 0 represents the block-based bloom filter.
+   * The bitset is divided into small buckets, each acting as a tiny bloom
+   * filter: the high 32 bits of the hash value select the bucket, and the
+   * low 32 bits of the hash value are used to set bits within that bucket.
+   * See "Cache-, Hash- and Space-Efficient Bloom Filters". Specifically,
+   * each tiny bloom filter contains eight 32-bit words (4 bytes each, stored
+   * in little endian), and the algorithm sets one bit in each 32-bit word.
+   *
+   * To set the bits in a bucket, the algorithm needs 8 SALT values
+   * (0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, 0x705495c7U,
+   * 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U) to calculate each bit index with the formula:
+   *                  index[i] = (hash * SALT[i]) >> 27
+   **/
+   1: i32 algorithm = 0;
+}
+
+/**
+ * Selects the hash function used to compute the hash of a column value.
+ * Note that the hash function takes the plain encoding of the column value as input.
+ */
+union BloomFilterHash {
+  /** The default value 0 represents Murmur3.
+   * Murmur3 has 32-bit and 128-bit hash variants; the least significant
+   * 64 bits of its x64 128-bit function (murmur3hash_x64_128) are used.
+   **/
+  1: i32 hash_strategy = 0;
+}
+
+/**
+  * The bloom filter header is stored at the beginning of each column's bloom
+  * filter data and is followed by that filter's bitset.
+  */
+struct BloomFilterHeader {
+  /** The size of the bitset in bytes; must be a power of 2. NOTE(review): sibling fields use snake_case (e.g. bloom_filter_offset) - consider num_bytes. **/
+  1: required i32 numBytes;
+
+  /** The algorithm used to set bits in the bitset. **/
+  2: required BloomFilterAlgorithm algorithm;
+
+  /** The hash function used to hash column values for the bloom filter. **/
+  3: required BloomFilterHash hash;
 }
 
 struct ColumnChunk {