Skip to content

Commit 284c1e4

Browse files
Krishna Paifacebook-github-bot
authored andcommitted
misc: Backout Support decimal as partition key type in Hive write and read
Summary: This change revert's #12910 , as per [issue 15309](#15309) . Differential Revision: D85714403
1 parent b5d502b commit 284c1e4

12 files changed

+53
-311
lines changed

velox/connectors/hive/HivePartitionFunction.cpp

Lines changed: 0 additions & 78 deletions
Original file line numberDiff line numberDiff line change
@@ -28,46 +28,6 @@ int32_t hashInt64(int64_t value) {
2828
return ((*reinterpret_cast<uint64_t*>(&value)) >> 32) ^ value;
2929
}
3030

31-
template <typename T>
32-
inline int32_t hashDecimal(T value, uint8_t scale) {
33-
bool isNegative = value < 0;
34-
uint64_t absValue =
35-
isNegative ? -static_cast<uint64_t>(value) : static_cast<uint64_t>(value);
36-
37-
uint32_t high = absValue >> 32;
38-
uint32_t low = absValue;
39-
40-
uint32_t hash = 31 * high + low;
41-
if (isNegative) {
42-
hash = -hash;
43-
}
44-
45-
return 31 * hash + scale;
46-
}
47-
48-
// Simulates Hive's hashing function from Hive v1.2.1
49-
// org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils#hashcode()
50-
// Returns java BigDecimal#hashCode()
51-
template <>
52-
inline int32_t hashDecimal<int128_t>(int128_t value, uint8_t scale) {
53-
uint32_t words[4];
54-
bool isNegative = value < 0;
55-
uint128_t absValue = isNegative ? -value : value;
56-
words[0] = absValue >> 96;
57-
words[1] = absValue >> 64;
58-
words[2] = absValue >> 32;
59-
words[3] = absValue;
60-
61-
uint32_t hash = 0;
62-
for (auto i = 0; i < 4; i++) {
63-
hash = 31 * hash + words[i];
64-
}
65-
if (isNegative) {
66-
hash = -hash;
67-
}
68-
return hash * 31 + scale;
69-
}
70-
7131
#if defined(__has_feature)
7232
#if __has_feature(__address_sanitizer__)
7333
__attribute__((no_sanitize("integer")))
@@ -155,34 +115,6 @@ void hashPrimitive(
155115
const SelectivityVector& rows,
156116
bool mix,
157117
std::vector<uint32_t>& hashes) {
158-
const auto& type = values.base()->type();
159-
if constexpr (kind == TypeKind::BIGINT || kind == TypeKind::HUGEINT) {
160-
if (type->isDecimal()) {
161-
const auto scale = getDecimalPrecisionScale(*type).second;
162-
if (rows.isAllSelected()) {
163-
vector_size_t numRows = rows.size();
164-
for (auto i = 0; i < numRows; ++i) {
165-
const uint32_t hash = values.isNullAt(i)
166-
? 0
167-
: hashDecimal(
168-
values.valueAt<typename TypeTraits<kind>::NativeType>(i),
169-
scale);
170-
mergeHash(mix, hash, hashes[i]);
171-
}
172-
} else {
173-
rows.applyToSelected([&](auto row) INLINE_LAMBDA {
174-
const uint32_t hash = values.isNullAt(row)
175-
? 0
176-
: hashDecimal(
177-
values.valueAt<typename TypeTraits<kind>::NativeType>(row),
178-
scale);
179-
mergeHash(mix, hash, hashes[row]);
180-
});
181-
}
182-
return;
183-
}
184-
}
185-
186118
if (rows.isAllSelected()) {
187119
// The compiler seems to be a little fickle with optimizations.
188120
// Although rows.applyToSelected should do roughly the same thing, doing
@@ -279,16 +211,6 @@ void HivePartitionFunction::hashTyped<TypeKind::BIGINT>(
279211
hashPrimitive<TypeKind::BIGINT>(values, rows, mix, hashes);
280212
}
281213

282-
template <>
283-
void HivePartitionFunction::hashTyped<TypeKind::HUGEINT>(
284-
const DecodedVector& values,
285-
const SelectivityVector& rows,
286-
bool mix,
287-
std::vector<uint32_t>& hashes,
288-
size_t /* poolIndex */) {
289-
hashPrimitive<TypeKind::HUGEINT>(values, rows, mix, hashes);
290-
}
291-
292214
template <>
293215
void HivePartitionFunction::hashTyped<TypeKind::DOUBLE>(
294216
const DecodedVector& values,

velox/connectors/hive/HivePartitionUtil.cpp

Lines changed: 17 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -19,24 +19,23 @@
1919

2020
namespace facebook::velox::connector::hive {
2121

22-
#define PARTITION_TYPE_DISPATCH(TEMPLATE_FUNC, typeKind, ...) \
23-
[&]() { \
24-
switch (typeKind) { \
25-
case TypeKind::BOOLEAN: \
26-
case TypeKind::TINYINT: \
27-
case TypeKind::SMALLINT: \
28-
case TypeKind::INTEGER: \
29-
case TypeKind::BIGINT: \
30-
case TypeKind::HUGEINT: \
31-
case TypeKind::VARCHAR: \
32-
case TypeKind::VARBINARY: \
33-
case TypeKind::TIMESTAMP: \
34-
return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( \
35-
TEMPLATE_FUNC, typeKind, __VA_ARGS__); \
36-
default: \
37-
VELOX_UNSUPPORTED( \
38-
"Unsupported partition type: {}", TypeKindName::toName(typeKind)); \
39-
} \
22+
#define PARTITION_TYPE_DISPATCH(TEMPLATE_FUNC, typeKind, ...) \
23+
[&]() { \
24+
switch (typeKind) { \
25+
case TypeKind::BOOLEAN: \
26+
case TypeKind::TINYINT: \
27+
case TypeKind::SMALLINT: \
28+
case TypeKind::INTEGER: \
29+
case TypeKind::BIGINT: \
30+
case TypeKind::VARCHAR: \
31+
case TypeKind::VARBINARY: \
32+
case TypeKind::TIMESTAMP: \
33+
return VELOX_DYNAMIC_SCALAR_TYPE_DISPATCH( \
34+
TEMPLATE_FUNC, typeKind, __VA_ARGS__); \
35+
default: \
36+
VELOX_UNSUPPORTED( \
37+
"Unsupported partition type: {}", mapTypeKindToName(typeKind)); \
38+
} \
4039
}()
4140

4241
namespace {
@@ -90,22 +89,6 @@ std::pair<std::string, std::string> makePartitionKeyValueString(
9089
DATE()->toString(
9190
partitionVector->as<SimpleVector<int32_t>>()->valueAt(row)));
9291
}
93-
if constexpr (Kind == TypeKind::BIGINT || Kind == TypeKind::HUGEINT) {
94-
if (partitionVector->type()->isDecimal()) {
95-
auto [precision, scale] =
96-
getDecimalPrecisionScale(*partitionVector->type());
97-
const auto maxStringSize =
98-
DecimalUtil::maxStringViewSize(precision, scale);
99-
std::vector<char> maxString(maxStringSize);
100-
const auto size = DecimalUtil::castToString(
101-
partitionVector->as<SimpleVector<T>>()->valueAt(row),
102-
scale,
103-
maxStringSize,
104-
maxString.data());
105-
return std::make_pair(name, std::string(maxString.data(), size));
106-
}
107-
}
108-
10992
return std::make_pair(
11093
name,
11194
makePartitionValueString(

velox/connectors/hive/SplitReader.cpp

Lines changed: 0 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -52,18 +52,6 @@ VectorPtr newConstantFromStringImpl(
5252
pool, 1, false, type, std::move(days));
5353
}
5454

55-
if constexpr (std::is_same_v<T, int64_t> || std::is_same_v<T, int128_t>) {
56-
if (type->isDecimal()) {
57-
auto [precision, scale] = getDecimalPrecisionScale(*type);
58-
T result;
59-
const auto status = DecimalUtil::castFromString<T>(
60-
StringView(value.value()), precision, scale, result);
61-
VELOX_USER_CHECK(status.ok(), status.message());
62-
return std::make_shared<ConstantVector<T>>(
63-
pool, 1, false, type, std::move(result));
64-
}
65-
}
66-
6755
if constexpr (std::is_same_v<T, StringView>) {
6856
return std::make_shared<ConstantVector<StringView>>(
6957
pool, 1, false, type, StringView(value.value()));

velox/connectors/hive/tests/HiveDataSinkTest.cpp

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -598,67 +598,6 @@ TEST_F(HiveDataSinkTest, basicBucket) {
598598
verifyWrittenData(outputDirectory->getPath(), numBuckets);
599599
}
600600

601-
TEST_F(HiveDataSinkTest, decimalPartition) {
602-
const auto outputDirectory = TempDirectoryPath::create();
603-
604-
connectorSessionProperties_->set(
605-
HiveConfig::kSortWriterFinishTimeSliceLimitMsSession, "1");
606-
const auto rowType =
607-
ROW({"c0", "c1", "c2"}, {BIGINT(), DECIMAL(14, 3), DECIMAL(20, 4)});
608-
auto dataSink = createDataSink(
609-
rowType,
610-
outputDirectory->getPath(),
611-
dwio::common::FileFormat::DWRF,
612-
{"c2"});
613-
auto stats = dataSink->stats();
614-
ASSERT_TRUE(stats.empty()) << stats.toString();
615-
616-
const auto vector = makeRowVector(
617-
{makeNullableFlatVector<int64_t>({1, 2, std::nullopt, 345}),
618-
makeNullableFlatVector<int64_t>(
619-
{1, 2, std::nullopt, 345}, DECIMAL(14, 3)),
620-
makeFlatVector<int128_t>({1, 340, 234567, -345}, DECIMAL(20, 4))});
621-
622-
dataSink->appendData(vector);
623-
while (!dataSink->finish()) {
624-
}
625-
const auto partitions = dataSink->close();
626-
stats = dataSink->stats();
627-
ASSERT_FALSE(stats.empty());
628-
ASSERT_EQ(partitions.size(), vector->size());
629-
630-
createDuckDbTable({vector});
631-
632-
const auto rootPath = outputDirectory->getPath();
633-
std::vector<std::shared_ptr<ConnectorSplit>> splits;
634-
std::unordered_map<std::string, std::optional<std::string>> partitionKeys;
635-
auto partitionPath = [&](std::string value) {
636-
partitionKeys["c2"] = value;
637-
auto path = listFiles(rootPath + "/c2=" + value)[0];
638-
splits.push_back(makeHiveConnectorSplits(
639-
path, 1, dwio::common::FileFormat::DWRF, partitionKeys)
640-
.back());
641-
};
642-
partitionPath("0.0001");
643-
partitionPath("0.0340");
644-
partitionPath("23.4567");
645-
partitionPath("-0.0345");
646-
647-
ColumnHandleMap assignments = {
648-
{"c0", regularColumn("c0", BIGINT())},
649-
{"c1", regularColumn("c1", DECIMAL(14, 3))},
650-
{"c2", partitionKey("c2", DECIMAL(20, 4))}};
651-
652-
auto op = PlanBuilder()
653-
.startTableScan()
654-
.outputType(rowType)
655-
.assignments(assignments)
656-
.endTableScan()
657-
.planNode();
658-
659-
assertQuery(op, splits, fmt::format("SELECT * FROM tmp"));
660-
}
661-
662601
TEST_F(HiveDataSinkTest, close) {
663602
for (bool empty : {true, false}) {
664603
SCOPED_TRACE(fmt::format("Data sink is empty: {}", empty));

velox/connectors/hive/tests/HivePartitionFunctionTest.cpp

Lines changed: 0 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -124,58 +124,6 @@ TEST_F(HivePartitionFunctionTest, bigint) {
124124
assertPartitionsWithConstChannel(values, 997);
125125
}
126126

127-
TEST_F(HivePartitionFunctionTest, shortDecimal) {
128-
auto values = makeNullableFlatVector<int64_t>(
129-
{std::nullopt,
130-
300'000'000'000,
131-
123456789,
132-
DecimalUtil::kShortDecimalMin / 100,
133-
DecimalUtil::kShortDecimalMax / 100},
134-
DECIMAL(18, 2));
135-
136-
assertPartitions(values, 1, {0, 0, 0, 0, 0});
137-
assertPartitions(values, 2, {0, 1, 1, 1, 1});
138-
assertPartitions(values, 500, {0, 471, 313, 115, 37});
139-
assertPartitions(values, 997, {0, 681, 6, 982, 502});
140-
141-
assertPartitionsWithConstChannel(values, 1);
142-
assertPartitionsWithConstChannel(values, 2);
143-
assertPartitionsWithConstChannel(values, 500);
144-
assertPartitionsWithConstChannel(values, 997);
145-
146-
values = makeFlatVector<int64_t>(
147-
{123456789, DecimalUtil::kShortDecimalMin, DecimalUtil::kShortDecimalMax},
148-
DECIMAL(18, 0));
149-
assertPartitions(values, 500, {311, 236, 412});
150-
}
151-
152-
TEST_F(HivePartitionFunctionTest, longDecimal) {
153-
auto values = makeNullableFlatVector<int128_t>(
154-
{std::nullopt,
155-
300'000'000'000,
156-
HugeInt::parse("12345678901234567891"),
157-
DecimalUtil::kLongDecimalMin / 100,
158-
DecimalUtil::kLongDecimalMax / 100},
159-
DECIMAL(38, 2));
160-
161-
assertPartitions(values, 1, {0, 0, 0, 0, 0});
162-
assertPartitions(values, 2, {0, 1, 1, 1, 1});
163-
assertPartitions(values, 500, {0, 471, 99, 49, 103});
164-
assertPartitions(values, 997, {0, 681, 982, 481, 6});
165-
166-
assertPartitionsWithConstChannel(values, 1);
167-
assertPartitionsWithConstChannel(values, 2);
168-
assertPartitionsWithConstChannel(values, 500);
169-
assertPartitionsWithConstChannel(values, 997);
170-
171-
values = makeNullableFlatVector<int128_t>(
172-
{HugeInt::parse("1234567890123456789112345678"),
173-
DecimalUtil::kLongDecimalMin,
174-
DecimalUtil::kLongDecimalMax},
175-
DECIMAL(38, 0));
176-
assertPartitions(values, 997, {51, 835, 645});
177-
}
178-
179127
TEST_F(HivePartitionFunctionTest, varchar) {
180128
auto values = makeNullableFlatVector<std::string>(
181129
{std::nullopt,

velox/connectors/hive/tests/HivePartitionUtilTest.cpp

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -74,9 +74,7 @@ TEST_F(HivePartitionUtilTest, partitionName) {
7474
"flat_bigint_col",
7575
"dict_string_col",
7676
"const_date_col",
77-
"flat_timestamp_col",
78-
"short_decimal_col",
79-
"long_decimal_col"},
77+
"flat_timestamp_col"},
8078
{makeFlatVector<bool>(std::vector<bool>{false}),
8179
makeFlatVector<int8_t>(std::vector<int8_t>{10}),
8280
makeFlatVector<int16_t>(std::vector<int16_t>{100}),
@@ -85,10 +83,7 @@ TEST_F(HivePartitionUtilTest, partitionName) {
8583
makeDictionary<StringView>(std::vector<StringView>{"str1000"}),
8684
makeConstant<int32_t>(10000, 1, DATE()),
8785
makeFlatVector<Timestamp>(
88-
std::vector<Timestamp>{Timestamp::fromMillis(1577836800000)}),
89-
makeConstant<int64_t>(10000, 1, DECIMAL(12, 2)),
90-
makeConstant<int128_t>(
91-
DecimalUtil::kLongDecimalMin / 100, 1, DECIMAL(38, 2))});
86+
std::vector<Timestamp>{Timestamp::fromMillis(1577836800000)})});
9287

9388
std::vector<std::string> expectedPartitionKeyValues{
9489
"flat_bool_col=false",
@@ -98,9 +93,7 @@ TEST_F(HivePartitionUtilTest, partitionName) {
9893
"flat_bigint_col=10000",
9994
"dict_string_col=str1000",
10095
"const_date_col=1997-05-19",
101-
"flat_timestamp_col=2019-12-31 16%3A00%3A00.0",
102-
"short_decimal_col=100.00",
103-
"long_decimal_col=-" + std::string(34, '9') + ".99"};
96+
"flat_timestamp_col=2019-12-31 16%3A00%3A00.0"};
10497

10598
std::vector<column_index_t> partitionChannels;
10699
for (auto i = 1; i <= expectedPartitionKeyValues.size(); i++) {
@@ -147,9 +140,7 @@ TEST_F(HivePartitionUtilTest, partitionNameForNull) {
147140
"flat_bigint_col",
148141
"flat_string_col",
149142
"const_date_col",
150-
"flat_timestamp_col",
151-
"short_decimal_col",
152-
"long_decimal_col"};
143+
"flat_timestamp_col"};
153144

154145
RowVectorPtr input = makeRowVector(
155146
partitionColumnNames,
@@ -160,9 +151,7 @@ TEST_F(HivePartitionUtilTest, partitionNameForNull) {
160151
makeNullableFlatVector<int64_t>({std::nullopt}),
161152
makeNullableFlatVector<StringView>({std::nullopt}),
162153
makeConstant<int32_t>(std::nullopt, 1, DATE()),
163-
makeNullableFlatVector<Timestamp>({std::nullopt}),
164-
makeConstant<int64_t>(std::nullopt, 1, DECIMAL(12, 2)),
165-
makeConstant<int128_t>(std::nullopt, 1, DECIMAL(38, 2))});
154+
makeNullableFlatVector<Timestamp>({std::nullopt})});
166155

167156
for (auto i = 0; i < partitionColumnNames.size(); i++) {
168157
std::vector<column_index_t> partitionChannels = {(column_index_t)i};

velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -322,9 +322,8 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) {
322322
INTEGER(),
323323
BIGINT(),
324324
TIMESTAMP(),
325-
DECIMAL(20, 2),
326325
}),
327-
{0, 1, 2, 3, 4, 5, 6, 7, 8},
326+
{0, 1, 2, 3, 4, 5, 6, 7},
328327
100,
329328
pool(),
330329
true);
@@ -342,9 +341,7 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) {
342341
makeNullableFlatVector<Timestamp>(
343342
{std::nullopt,
344343
Timestamp::fromMillis(1639426440001),
345-
Timestamp::fromMillis(1639426440002)}),
346-
makeNullableFlatVector<int128_t>(
347-
{std::nullopt, 1, DecimalUtil::kLongDecimalMin})});
344+
Timestamp::fromMillis(1639426440002)})});
348345

349346
raw_vector<uint64_t> ids;
350347
idGenerator.run(input, ids);

0 commit comments

Comments
 (0)