Skip to content

Commit a417b2b

Browse files
feat(iceberg): Support Iceberg hash (facebookincubator#14025)
Summary: The Iceberg hash uses the Murmur3 hash, which aligns with https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp: first, every 4 bytes are processed as a chunk, then the remaining bytes are processed by XOR. Spark SQL also uses this hash algorithm, but it processes the remaining bytes differently: it combines the remaining bytes. Extract the common function hashInt64 to functions/lib. This class will be used for the Iceberg bucket transform and the bucket function. The Iceberg Murmur3 hash must strictly match the Java implementation, so that data written through Iceberg can be read by Iceberg Java and the function call also returns the correct result. The Iceberg utility library `velox_functions_iceberg_hash` will be linked by the Iceberg connector writer to perform partition transforms. facebookincubator#13874 Pull Request resolved: facebookincubator#14025 Reviewed By: pedroerp Differential Revision: D79732785 Pulled By: kgpai fbshipit-source-id: 6122b94673f015dca5c8484722926709a30fe65e
1 parent 2a01b24 commit a417b2b

File tree

11 files changed

+308
-40
lines changed

11 files changed

+308
-40
lines changed

CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,7 @@ option(VELOX_ENABLE_HIVE_CONNECTOR "Build Hive connector." ON)
131131
option(VELOX_ENABLE_TPCH_CONNECTOR "Build TPC-H connector." ON)
132132
option(VELOX_ENABLE_PRESTO_FUNCTIONS "Build Presto SQL functions." ON)
133133
option(VELOX_ENABLE_SPARK_FUNCTIONS "Build Spark SQL functions." ON)
134+
option(VELOX_ENABLE_ICEBERG_FUNCTIONS "Build Iceberg functions." ON)
134135
option(VELOX_ENABLE_EXPRESSION "Build expression." ON)
135136
option(
136137
VELOX_ENABLE_EXAMPLES

velox/functions/CMakeLists.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,3 +37,7 @@ endif()
3737
if(${VELOX_BUILD_TESTING})
3838
add_subdirectory(tests)
3939
endif()
40+
41+
# Build the iceberg function sources only when the
# VELOX_ENABLE_ICEBERG_FUNCTIONS option is enabled.
if(${VELOX_ENABLE_ICEBERG_FUNCTIONS})
42+
add_subdirectory(iceberg)
43+
endif()
Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
# Library built from Murmur3Hash32.cpp, the Iceberg Murmur3 32-bit hash.
velox_add_library(velox_functions_iceberg_hash Murmur3Hash32.cpp)
16+
17+
# velox_functions_util provides Murmur3Hash32Base; velox_common_base is
# presumably needed for velox/common/base/BitUtil.h (verify).
velox_link_libraries(velox_functions_iceberg_hash velox_common_base
18+
velox_functions_util)
19+
20+
# The unit tests are added only when VELOX_BUILD_TESTING is set.
if(${VELOX_BUILD_TESTING})
21+
add_subdirectory(tests)
22+
endif()
Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "velox/functions/iceberg/Murmur3Hash32.h"
18+
#include "velox/common/base/BitUtil.h"
19+
20+
namespace facebook::velox::functions::iceberg {
21+
22+
namespace {

// Seed applied to every hash computed in this translation unit.
constexpr int32_t kSeed{0};

} // namespace
26+
27+
int32_t Murmur3Hash32::hashInt64(uint64_t input) {
28+
return Murmur3Hash32Base::hashInt64(input, kSeed);
29+
}
30+
31+
int32_t Murmur3Hash32::hashBytes(const char* input, uint32_t len) {
32+
const uint8_t* data = reinterpret_cast<const uint8_t*>(input);
33+
const int32_t nblocks = len / 4;
34+
35+
uint32_t h1 = kSeed;
36+
37+
// Process 4-byte chunks.
38+
for (auto i = 0; i < nblocks; i++) {
39+
uint32_t k1 = *reinterpret_cast<const uint32_t*>(data + i * 4);
40+
k1 = mixK1(k1);
41+
h1 = mixH1(h1, k1);
42+
}
43+
44+
// Process remaining bytes.
45+
const uint8_t* tail = data + nblocks * 4;
46+
uint32_t k1 = 0;
47+
switch (len & 3) {
48+
case 3:
49+
k1 ^= static_cast<uint32_t>(tail[2]) << 16;
50+
[[fallthrough]];
51+
case 2:
52+
k1 ^= static_cast<uint32_t>(tail[1]) << 8;
53+
[[fallthrough]];
54+
case 1:
55+
k1 ^= static_cast<uint32_t>(tail[0]);
56+
h1 ^= mixK1(k1);
57+
}
58+
59+
return fmix(h1, len);
60+
}
61+
62+
} // namespace facebook::velox::functions::iceberg
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cstdint>
19+
#include "velox/functions/lib/Murmur3Hash32Base.h"
20+
21+
namespace facebook::velox::functions::iceberg {
22+
23+
/// This implementation should align with Iceberg bucket transform hash
24+
/// algorithm which uses Guava hash
25+
/// https://github.com/google/guava/blob/master/guava/src/com/google/common/hash/Murmur3_32HashFunction.java,
26+
/// Bucket partition transforms use a 32-bit hash of the source value. The
27+
/// 32-bit hash implementation is the 32-bit Murmur3 hash, x86 variant, seeded
28+
/// with 0. If not same with Iceberg java version, the partition will be
29+
/// different and can not read with Iceberg java.
30+
class Murmur3Hash32 : Murmur3Hash32Base {
31+
public:
32+
/// Value of type INTEGER and BIGINT is treated as unsigned type.
33+
/// For the schema evolution, promote int to int64, treat int32 as uint64.
34+
static int32_t hashInt64(uint64_t input);
35+
36+
/// Hash the bytes every 4 bytes, XOR on remaining bytes. Processing for the
37+
/// remaining bytes is different with Spark murmur3 which combine with the
38+
/// remaining bytes.
39+
static int32_t hashBytes(const char* input, uint32_t len);
40+
};
41+
42+
} // namespace facebook::velox::functions::iceberg
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# Copyright (c) Facebook, Inc. and its affiliates.
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
# Unit-test executable built from Murmur3Hash32Test.cpp.
add_executable(velox_functions_iceberg_test Murmur3Hash32Test.cpp)
15+
16+
# Register the executable as a test, run from this source directory.
add_test(
17+
NAME velox_functions_iceberg_test
18+
COMMAND velox_functions_iceberg_test
19+
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
20+
21+
# Link the library under test plus GoogleTest; gtest_main supplies main().
target_link_libraries(
22+
velox_functions_iceberg_test
23+
velox_functions_iceberg_hash
24+
GTest::gtest
25+
GTest::gtest_main
26+
GTest::gmock)
Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#include "velox/functions/iceberg/Murmur3Hash32.h"
17+
18+
#include <gtest/gtest.h>
19+
20+
namespace facebook::velox::functions::iceberg {
21+
namespace {
22+
23+
// Pins hashInt64 to fixed expected values for a positive, a zero and a
// negative input. The negative literal is implicitly converted to uint64_t.
// NOTE(review): the expected values presumably come from the Iceberg Java
// implementation - confirm.
TEST(Murmur3Hash32Test, bigint) {
  EXPECT_EQ(Murmur3Hash32::hashInt64(10), -289985220);
  EXPECT_EQ(Murmur3Hash32::hashInt64(0), 1669671676);
  EXPECT_EQ(Murmur3Hash32::hashInt64(-5), 1222806974);
}
29+
30+
// Pins hashBytes to fixed expected values for strings whose byte lengths
// exercise whole 4-byte chunks, 1-3 trailing bytes, the empty input and
// multi-byte characters.
// NOTE(review): the expected values presumably come from the Iceberg Java
// implementation and assume this file is compiled as UTF-8 - confirm.
TEST(Murmur3Hash32Test, string) {
  const auto hashOf = [](const std::string& text) {
    return Murmur3Hash32::hashBytes(text.c_str(), text.size());
  };

  EXPECT_EQ(hashOf("abcdefg"), -2009294074);
  EXPECT_EQ(hashOf("abc"), -1277324294);
  EXPECT_EQ(hashOf("abcd"), 1139631978);
  EXPECT_EQ(hashOf("abcde"), -392455434);
  EXPECT_EQ(hashOf("测试"), -25843656);
  EXPECT_EQ(hashOf("测试raul试测"), -912788207);
  EXPECT_EQ(hashOf(""), 0);
  EXPECT_EQ(hashOf("Товары"), 1817480714);
  EXPECT_EQ(hashOf("😀"), -1095487750);
}
47+
} // namespace
48+
} // namespace facebook::velox::functions::iceberg

velox/functions/lib/CMakeLists.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ velox_add_library(velox_is_null_functions IsNull.cpp)
1616

1717
velox_link_libraries(velox_is_null_functions velox_expression)
1818

19-
velox_add_library(velox_functions_util LambdaFunctionUtil.cpp
20-
RowsTranslationUtil.cpp)
19+
velox_add_library(velox_functions_util Murmur3Hash32Base.cpp
20+
LambdaFunctionUtil.cpp RowsTranslationUtil.cpp)
2121

2222
velox_link_libraries(velox_functions_util velox_vector velox_common_base)
2323

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include "velox/functions/lib/Murmur3Hash32Base.h"
18+
#include "velox/common/base/BitUtil.h"
19+
20+
namespace facebook::velox::functions {
21+
22+
uint32_t Murmur3Hash32Base::hashInt64(uint64_t input, uint32_t seed) {
23+
uint32_t low = input;
24+
uint32_t high = input >> 32;
25+
26+
uint32_t k1 = mixK1(low);
27+
uint32_t h1 = mixH1(seed, k1);
28+
29+
k1 = mixK1(high);
30+
h1 = mixH1(h1, k1);
31+
32+
return fmix(h1, 8);
33+
}
34+
35+
// Scrambles one 32-bit input word: multiply, rotate left by 15, multiply.
// All arithmetic is unsigned and wraps modulo 2^32.
uint32_t Murmur3Hash32Base::mixK1(uint32_t k1) {
  constexpr uint32_t kC1 = 0xcc9e2d51;
  constexpr uint32_t kC2 = 0x1b873593;

  const uint32_t scaled = k1 * kC1;
  const uint32_t rotated = bits::rotateLeft(scaled, 15);
  return rotated * kC2;
}
41+
42+
// Folds an already scrambled word `k1` into the running hash `h1`: XOR,
// rotate left by 13, then multiply by 5 and add a constant (unsigned,
// wrapping modulo 2^32).
uint32_t Murmur3Hash32Base::mixH1(uint32_t h1, uint32_t k1) {
  const uint32_t combined = h1 ^ k1;
  const uint32_t rotated = bits::rotateLeft(combined, 13);
  return rotated * 5 + 0xe6546b64;
}
48+
49+
// Finalization step: XORs the input length into the hash, then alternates
// xor-shifts (16, 13, 16) with two multiplications so that all bits of the
// hash avalanche.
uint32_t Murmur3Hash32Base::fmix(uint32_t h1, uint32_t length) {
  constexpr uint32_t kMul1 = 0x85ebca6b;
  constexpr uint32_t kMul2 = 0xc2b2ae35;

  uint32_t hash = h1 ^ length;
  hash = (hash ^ (hash >> 16)) * kMul1;
  hash = (hash ^ (hash >> 13)) * kMul2;
  return hash ^ (hash >> 16);
}
58+
} // namespace facebook::velox::functions
Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,39 @@
1+
/*
2+
* Copyright (c) Facebook, Inc. and its affiliates.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
#pragma once
17+
18+
#include <cstdint>
19+
20+
namespace facebook::velox::functions {
21+
/// Murmur3 aligns with Austin Appleby
22+
/// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
23+
///
24+
/// Signed integer types have been remapped to unsigned types (as in the
25+
/// original) to avoid undefined signed integer overflow and sign extension.
26+
class Murmur3Hash32Base {
27+
protected:
28+
/// Hash the lower int, then combine with higher int, is a fast path of
29+
/// hashBytes.
30+
static uint32_t hashInt64(uint64_t input, uint32_t seed);
31+
32+
static uint32_t mixK1(uint32_t k1);
33+
34+
static uint32_t mixH1(uint32_t h1, uint32_t k1);
35+
36+
// Finalization mix - force all bits of a hash block to avalanche.
37+
static uint32_t fmix(uint32_t h1, uint32_t length);
38+
};
39+
} // namespace facebook::velox::functions

0 commit comments

Comments
 (0)