diff --git a/velox/connectors/hive/HivePartitionFunction.cpp b/velox/connectors/hive/HivePartitionFunction.cpp index b2a4a17a7619..548aff99e15d 100644 --- a/velox/connectors/hive/HivePartitionFunction.cpp +++ b/velox/connectors/hive/HivePartitionFunction.cpp @@ -28,46 +28,6 @@ int32_t hashInt64(int64_t value) { return ((*reinterpret_cast(&value)) >> 32) ^ value; } -template -inline int32_t hashDecimal(T value, uint8_t scale) { - bool isNegative = value < 0; - uint64_t absValue = - isNegative ? -static_cast(value) : static_cast(value); - - uint32_t high = absValue >> 32; - uint32_t low = absValue; - - uint32_t hash = 31 * high + low; - if (isNegative) { - hash = -hash; - } - - return 31 * hash + scale; -} - -// Simulates Hive's hashing function from Hive v1.2.1 -// org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorUtils#hashcode() -// Returns java BigDecimal#hashCode() -template <> -inline int32_t hashDecimal(int128_t value, uint8_t scale) { - uint32_t words[4]; - bool isNegative = value < 0; - uint128_t absValue = isNegative ? -value : value; - words[0] = absValue >> 96; - words[1] = absValue >> 64; - words[2] = absValue >> 32; - words[3] = absValue; - - uint32_t hash = 0; - for (auto i = 0; i < 4; i++) { - hash = 31 * hash + words[i]; - } - if (isNegative) { - hash = -hash; - } - return hash * 31 + scale; -} - #if defined(__has_feature) #if __has_feature(__address_sanitizer__) __attribute__((no_sanitize("integer"))) @@ -155,34 +115,6 @@ void hashPrimitive( const SelectivityVector& rows, bool mix, std::vector& hashes) { - const auto& type = values.base()->type(); - if constexpr (kind == TypeKind::BIGINT || kind == TypeKind::HUGEINT) { - if (type->isDecimal()) { - const auto scale = getDecimalPrecisionScale(*type).second; - if (rows.isAllSelected()) { - vector_size_t numRows = rows.size(); - for (auto i = 0; i < numRows; ++i) { - const uint32_t hash = values.isNullAt(i) - ? 
0 - : hashDecimal( - values.valueAt::NativeType>(i), - scale); - mergeHash(mix, hash, hashes[i]); - } - } else { - rows.applyToSelected([&](auto row) INLINE_LAMBDA { - const uint32_t hash = values.isNullAt(row) - ? 0 - : hashDecimal( - values.valueAt::NativeType>(row), - scale); - mergeHash(mix, hash, hashes[row]); - }); - } - return; - } - } - if (rows.isAllSelected()) { // The compiler seems to be a little fickle with optimizations. // Although rows.applyToSelected should do roughly the same thing, doing @@ -279,16 +211,6 @@ void HivePartitionFunction::hashTyped( hashPrimitive(values, rows, mix, hashes); } -template <> -void HivePartitionFunction::hashTyped( - const DecodedVector& values, - const SelectivityVector& rows, - bool mix, - std::vector& hashes, - size_t /* poolIndex */) { - hashPrimitive(values, rows, mix, hashes); -} - template <> void HivePartitionFunction::hashTyped( const DecodedVector& values, diff --git a/velox/connectors/hive/HivePartitionUtil.cpp b/velox/connectors/hive/HivePartitionUtil.cpp index f09dbf8a11c9..07b115e79b74 100644 --- a/velox/connectors/hive/HivePartitionUtil.cpp +++ b/velox/connectors/hive/HivePartitionUtil.cpp @@ -27,7 +27,6 @@ namespace facebook::velox::connector::hive { case TypeKind::SMALLINT: \ case TypeKind::INTEGER: \ case TypeKind::BIGINT: \ - case TypeKind::HUGEINT: \ case TypeKind::VARCHAR: \ case TypeKind::VARBINARY: \ case TypeKind::TIMESTAMP: \ @@ -90,22 +89,6 @@ std::pair makePartitionKeyValueString( DATE()->toString( partitionVector->as>()->valueAt(row))); } - if constexpr (Kind == TypeKind::BIGINT || Kind == TypeKind::HUGEINT) { - if (partitionVector->type()->isDecimal()) { - auto [precision, scale] = - getDecimalPrecisionScale(*partitionVector->type()); - const auto maxStringSize = - DecimalUtil::maxStringViewSize(precision, scale); - std::vector maxString(maxStringSize); - const auto size = DecimalUtil::castToString( - partitionVector->as>()->valueAt(row), - scale, - maxStringSize, - maxString.data()); - 
return std::make_pair(name, std::string(maxString.data(), size)); - } - } - return std::make_pair( name, makePartitionValueString( diff --git a/velox/connectors/hive/SplitReader.cpp b/velox/connectors/hive/SplitReader.cpp index 609186d3285d..47ba102904ef 100644 --- a/velox/connectors/hive/SplitReader.cpp +++ b/velox/connectors/hive/SplitReader.cpp @@ -52,18 +52,6 @@ VectorPtr newConstantFromStringImpl( pool, 1, false, type, std::move(days)); } - if constexpr (std::is_same_v || std::is_same_v) { - if (type->isDecimal()) { - auto [precision, scale] = getDecimalPrecisionScale(*type); - T result; - const auto status = DecimalUtil::castFromString( - StringView(value.value()), precision, scale, result); - VELOX_USER_CHECK(status.ok(), status.message()); - return std::make_shared>( - pool, 1, false, type, std::move(result)); - } - } - if constexpr (std::is_same_v) { return std::make_shared>( pool, 1, false, type, StringView(value.value())); diff --git a/velox/connectors/hive/SplitReader.h b/velox/connectors/hive/SplitReader.h index fd81732852ef..46ecccb4eca7 100644 --- a/velox/connectors/hive/SplitReader.h +++ b/velox/connectors/hive/SplitReader.h @@ -55,7 +55,7 @@ namespace facebook::velox::connector::hive { /// converted to their appropriate types. /// /// @param type The target Velox type for the constant vector. Supports all -/// scalar types including primitives, dates, timestamps, and decimals. +/// scalar types including primitives, dates, and timestamps. /// @param value The string representation of the value to convert, formatted /// the same way as CAST(x as VARCHAR). Date values must be formatted using ISO /// 8601 as YYYY-MM-DD. If nullopt, creates a null constant vector. 
diff --git a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp index ab4119556718..34e87fc25257 100644 --- a/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp +++ b/velox/connectors/hive/iceberg/tests/IcebergReadTest.cpp @@ -921,51 +921,6 @@ TEST_F(HiveIcebergTest, partitionColumnsFromHive) { AssertQueryBuilder(plan).splits(icebergSplits).assertResults(expectedVectors); } -TEST_F(HiveIcebergTest, mixedScenario) { - auto fileRowType = ROW({"c0", "c1"}, {BIGINT(), VARCHAR()}); - auto tableRowType = - ROW({"c0", "c1", "c2", "region"}, - {BIGINT(), VARCHAR(), INTEGER(), DECIMAL(38, 5)}); - - // Write data file with c0 and c1. - std::vector dataVectors; - dataVectors.push_back(makeRowVector({ - makeFlatVector({100, 200}), - makeFlatVector({"a", "b"}), - })); - auto dataFilePath = TempFilePath::create(); - writeToFile(dataFilePath->getPath(), dataVectors); - - std::unordered_map> partitionKeys; - partitionKeys["region"] = "12345.67890"; - - auto icebergSplits = - makeIcebergSplits(dataFilePath->getPath(), {}, partitionKeys); - auto assignments = makeColumnHandles(tableRowType, {3}); - - // Expected result: - // - c0, c1: from file. - // - c2: NULL (schema evolution). - // - region: from partition keys (DECIMAL(38,5) = 12345.67890). - std::vector expectedVectors; - expectedVectors.push_back(makeRowVector( - tableRowType->names(), - { - dataVectors[0]->childAt(0), - dataVectors[0]->childAt(1), - makeNullConstant(TypeKind::INTEGER, 2), - makeFlatVector( - {1'234'567'890, 1'234'567'890}, DECIMAL(38, 5)), - })); - - // Read with table schema: c0, c1 (from file), c2 (new column), region - // (partition). 
- auto plan = PlanBuilder() - .tableScan(tableRowType, {}, "", tableRowType, assignments) - .planNode(); - AssertQueryBuilder(plan).splits(icebergSplits).assertResults(expectedVectors); -} - #ifdef VELOX_ENABLE_PARQUET TEST_F(HiveIcebergTest, positionalDeleteFileWithRowGroupFilter) { // This file contains three row groups, each with about 100 rows. diff --git a/velox/connectors/hive/tests/HiveDataSinkTest.cpp b/velox/connectors/hive/tests/HiveDataSinkTest.cpp index cee5b9de971f..936e90b25fe5 100644 --- a/velox/connectors/hive/tests/HiveDataSinkTest.cpp +++ b/velox/connectors/hive/tests/HiveDataSinkTest.cpp @@ -598,67 +598,6 @@ TEST_F(HiveDataSinkTest, basicBucket) { verifyWrittenData(outputDirectory->getPath(), numBuckets); } -TEST_F(HiveDataSinkTest, decimalPartition) { - const auto outputDirectory = TempDirectoryPath::create(); - - connectorSessionProperties_->set( - HiveConfig::kSortWriterFinishTimeSliceLimitMsSession, "1"); - const auto rowType = - ROW({"c0", "c1", "c2"}, {BIGINT(), DECIMAL(14, 3), DECIMAL(20, 4)}); - auto dataSink = createDataSink( - rowType, - outputDirectory->getPath(), - dwio::common::FileFormat::DWRF, - {"c2"}); - auto stats = dataSink->stats(); - ASSERT_TRUE(stats.empty()) << stats.toString(); - - const auto vector = makeRowVector( - {makeNullableFlatVector({1, 2, std::nullopt, 345}), - makeNullableFlatVector( - {1, 2, std::nullopt, 345}, DECIMAL(14, 3)), - makeFlatVector({1, 340, 234567, -345}, DECIMAL(20, 4))}); - - dataSink->appendData(vector); - while (!dataSink->finish()) { - } - const auto partitions = dataSink->close(); - stats = dataSink->stats(); - ASSERT_FALSE(stats.empty()); - ASSERT_EQ(partitions.size(), vector->size()); - - createDuckDbTable({vector}); - - const auto rootPath = outputDirectory->getPath(); - std::vector> splits; - std::unordered_map> partitionKeys; - auto partitionPath = [&](std::string value) { - partitionKeys["c2"] = value; - auto path = listFiles(rootPath + "/c2=" + value)[0]; - 
splits.push_back(makeHiveConnectorSplits( - path, 1, dwio::common::FileFormat::DWRF, partitionKeys) - .back()); - }; - partitionPath("0.0001"); - partitionPath("0.0340"); - partitionPath("23.4567"); - partitionPath("-0.0345"); - - ColumnHandleMap assignments = { - {"c0", regularColumn("c0", BIGINT())}, - {"c1", regularColumn("c1", DECIMAL(14, 3))}, - {"c2", partitionKey("c2", DECIMAL(20, 4))}}; - - auto op = PlanBuilder() - .startTableScan() - .outputType(rowType) - .assignments(assignments) - .endTableScan() - .planNode(); - - assertQuery(op, splits, fmt::format("SELECT * FROM tmp")); -} - TEST_F(HiveDataSinkTest, close) { for (bool empty : {true, false}) { SCOPED_TRACE(fmt::format("Data sink is empty: {}", empty)); diff --git a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp index 22e8962be147..1adfb3874458 100644 --- a/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionFunctionTest.cpp @@ -124,58 +124,6 @@ TEST_F(HivePartitionFunctionTest, bigint) { assertPartitionsWithConstChannel(values, 997); } -TEST_F(HivePartitionFunctionTest, shortDecimal) { - auto values = makeNullableFlatVector( - {std::nullopt, - 300'000'000'000, - 123456789, - DecimalUtil::kShortDecimalMin / 100, - DecimalUtil::kShortDecimalMax / 100}, - DECIMAL(18, 2)); - - assertPartitions(values, 1, {0, 0, 0, 0, 0}); - assertPartitions(values, 2, {0, 1, 1, 1, 1}); - assertPartitions(values, 500, {0, 471, 313, 115, 37}); - assertPartitions(values, 997, {0, 681, 6, 982, 502}); - - assertPartitionsWithConstChannel(values, 1); - assertPartitionsWithConstChannel(values, 2); - assertPartitionsWithConstChannel(values, 500); - assertPartitionsWithConstChannel(values, 997); - - values = makeFlatVector( - {123456789, DecimalUtil::kShortDecimalMin, DecimalUtil::kShortDecimalMax}, - DECIMAL(18, 0)); - assertPartitions(values, 500, {311, 236, 412}); -} - -TEST_F(HivePartitionFunctionTest, 
longDecimal) { - auto values = makeNullableFlatVector( - {std::nullopt, - 300'000'000'000, - HugeInt::parse("12345678901234567891"), - DecimalUtil::kLongDecimalMin / 100, - DecimalUtil::kLongDecimalMax / 100}, - DECIMAL(38, 2)); - - assertPartitions(values, 1, {0, 0, 0, 0, 0}); - assertPartitions(values, 2, {0, 1, 1, 1, 1}); - assertPartitions(values, 500, {0, 471, 99, 49, 103}); - assertPartitions(values, 997, {0, 681, 982, 481, 6}); - - assertPartitionsWithConstChannel(values, 1); - assertPartitionsWithConstChannel(values, 2); - assertPartitionsWithConstChannel(values, 500); - assertPartitionsWithConstChannel(values, 997); - - values = makeNullableFlatVector( - {HugeInt::parse("1234567890123456789112345678"), - DecimalUtil::kLongDecimalMin, - DecimalUtil::kLongDecimalMax}, - DECIMAL(38, 0)); - assertPartitions(values, 997, {51, 835, 645}); -} - TEST_F(HivePartitionFunctionTest, varchar) { auto values = makeNullableFlatVector( {std::nullopt, diff --git a/velox/connectors/hive/tests/HivePartitionUtilTest.cpp b/velox/connectors/hive/tests/HivePartitionUtilTest.cpp index 3c1575697872..8598f46742fe 100644 --- a/velox/connectors/hive/tests/HivePartitionUtilTest.cpp +++ b/velox/connectors/hive/tests/HivePartitionUtilTest.cpp @@ -74,9 +74,7 @@ TEST_F(HivePartitionUtilTest, partitionName) { "flat_bigint_col", "dict_string_col", "const_date_col", - "flat_timestamp_col", - "short_decimal_col", - "long_decimal_col"}, + "flat_timestamp_col"}, {makeFlatVector(std::vector{false}), makeFlatVector(std::vector{10}), makeFlatVector(std::vector{100}), @@ -85,10 +83,7 @@ TEST_F(HivePartitionUtilTest, partitionName) { makeDictionary(std::vector{"str1000"}), makeConstant(10000, 1, DATE()), makeFlatVector( - std::vector{Timestamp::fromMillis(1577836800000)}), - makeConstant(10000, 1, DECIMAL(12, 2)), - makeConstant( - DecimalUtil::kLongDecimalMin / 100, 1, DECIMAL(38, 2))}); + std::vector{Timestamp::fromMillis(1577836800000)})}); std::vector expectedPartitionKeyValues{ 
"flat_bool_col=false", @@ -98,9 +93,7 @@ TEST_F(HivePartitionUtilTest, partitionName) { "flat_bigint_col=10000", "dict_string_col=str1000", "const_date_col=1997-05-19", - "flat_timestamp_col=2019-12-31 16%3A00%3A00.0", - "short_decimal_col=100.00", - "long_decimal_col=-" + std::string(34, '9') + ".99"}; + "flat_timestamp_col=2019-12-31 16%3A00%3A00.0"}; std::vector partitionChannels; for (auto i = 1; i <= expectedPartitionKeyValues.size(); i++) { @@ -147,9 +140,7 @@ TEST_F(HivePartitionUtilTest, partitionNameForNull) { "flat_bigint_col", "flat_string_col", "const_date_col", - "flat_timestamp_col", - "short_decimal_col", - "long_decimal_col"}; + "flat_timestamp_col"}; RowVectorPtr input = makeRowVector( partitionColumnNames, @@ -160,9 +151,7 @@ TEST_F(HivePartitionUtilTest, partitionNameForNull) { makeNullableFlatVector({std::nullopt}), makeNullableFlatVector({std::nullopt}), makeConstant(std::nullopt, 1, DATE()), - makeNullableFlatVector({std::nullopt}), - makeConstant(std::nullopt, 1, DECIMAL(12, 2)), - makeConstant(std::nullopt, 1, DECIMAL(38, 2))}); + makeNullableFlatVector({std::nullopt})}); for (auto i = 0; i < partitionColumnNames.size(); i++) { std::vector partitionChannels = {(column_index_t)i}; diff --git a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp index 565d0325ad54..271e4599d3f0 100644 --- a/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp +++ b/velox/connectors/hive/tests/PartitionIdGeneratorTest.cpp @@ -322,9 +322,8 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) { INTEGER(), BIGINT(), TIMESTAMP(), - DECIMAL(20, 2), }), - {0, 1, 2, 3, 4, 5, 6, 7, 8}, + {0, 1, 2, 3, 4, 5, 6, 7}, 100, pool(), true); @@ -342,9 +341,7 @@ TEST_F(PartitionIdGeneratorTest, supportedPartitionKeyTypes) { makeNullableFlatVector( {std::nullopt, Timestamp::fromMillis(1639426440001), - Timestamp::fromMillis(1639426440002)}), - makeNullableFlatVector( - {std::nullopt, 1, 
DecimalUtil::kLongDecimalMin})}); + Timestamp::fromMillis(1639426440002)})}); raw_vector ids; idGenerator.run(input, ids); diff --git a/velox/exec/VectorHasher.cpp b/velox/exec/VectorHasher.cpp index 1bacff0927a6..66427ac5c886 100644 --- a/velox/exec/VectorHasher.cpp +++ b/velox/exec/VectorHasher.cpp @@ -41,9 +41,6 @@ namespace facebook::velox::exec { case TypeKind::BIGINT: { \ return TEMPLATE_FUNC(__VA_ARGS__); \ } \ - case TypeKind::HUGEINT: { \ - return TEMPLATE_FUNC(__VA_ARGS__); \ - } \ case TypeKind::VARCHAR: \ case TypeKind::VARBINARY: { \ return TEMPLATE_FUNC(__VA_ARGS__); \ @@ -739,7 +736,6 @@ void extendRange( extendRange(reserve, min, max); break; case TypeKind::BIGINT: - case TypeKind::HUGEINT: case TypeKind::VARCHAR: case TypeKind::VARBINARY: case TypeKind::TIMESTAMP: diff --git a/velox/exec/VectorHasher.h b/velox/exec/VectorHasher.h index 211c1e34e538..c0e88a0e54c0 100644 --- a/velox/exec/VectorHasher.h +++ b/velox/exec/VectorHasher.h @@ -291,7 +291,6 @@ class VectorHasher { case TypeKind::SMALLINT: case TypeKind::INTEGER: case TypeKind::BIGINT: - case TypeKind::HUGEINT: case TypeKind::VARCHAR: case TypeKind::VARBINARY: case TypeKind::TIMESTAMP: diff --git a/velox/exec/tests/TableScanTest.cpp b/velox/exec/tests/TableScanTest.cpp index c53fdc16c7b3..774bbb2cde6d 100644 --- a/velox/exec/tests/TableScanTest.cpp +++ b/velox/exec/tests/TableScanTest.cpp @@ -1870,15 +1870,6 @@ TEST_F(TableScanTest, partitionedTableDoubleKey) { testPartitionedTable(filePath->getPath(), DOUBLE(), "3.5"); } -TEST_F(TableScanTest, partitionedTableDecimalKey) { - auto rowType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()}); - auto vectors = makeVectors(10, 1'000, rowType); - auto filePath = TempFilePath::create(); - writeToFile(filePath->getPath(), vectors); - createDuckDbTable(vectors); - testPartitionedTable(filePath->getPath(), DECIMAL(20, 4), "3.5123"); -} - TEST_F(TableScanTest, partitionedTableDateKey) { auto rowType = ROW({"c0", "c1"}, {BIGINT(), DOUBLE()}); auto vectors = 
makeVectors(10, 1'000, rowType); diff --git a/velox/exec/tests/VectorHasherTest.cpp b/velox/exec/tests/VectorHasherTest.cpp index bedafa941b4e..8c23c69fe28c 100644 --- a/velox/exec/tests/VectorHasherTest.cpp +++ b/velox/exec/tests/VectorHasherTest.cpp @@ -721,11 +721,6 @@ TEST_F(VectorHasherTest, merge) { EXPECT_EQ(numDistinct - 1, ids.size()); } -TEST_F(VectorHasherTest, computeValueIdsHugeInt) { - testComputeValueIds(false); - testComputeValueIds(true); -} - TEST_F(VectorHasherTest, computeValueIdsBigint) { testComputeValueIds(false); testComputeValueIds(true); diff --git a/velox/exec/tests/utils/TableScanTestBase.cpp b/velox/exec/tests/utils/TableScanTestBase.cpp index 71392147e4a5..394020fe1b5a 100644 --- a/velox/exec/tests/utils/TableScanTestBase.cpp +++ b/velox/exec/tests/utils/TableScanTestBase.cpp @@ -173,11 +173,6 @@ void TableScanTestBase::testPartitionedTableImpl( std::string partitionValueStr; partitionValueStr = partitionValue.has_value() ? "'" + *partitionValue + "'" : "null"; - if (partitionValue.has_value() && partitionType->isDecimal()) { - auto [p, s] = getDecimalPrecisionScale(*partitionType); - partitionValueStr = - fmt::format("CAST({} AS DECIMAL({}, {}))", partitionValueStr, p, s); - } assertQuery( op, split, fmt::format("SELECT {}, * FROM tmp", partitionValueStr));