diff --git a/velox/dwio/parquet/reader/ParquetReader.cpp b/velox/dwio/parquet/reader/ParquetReader.cpp index eec910c66d6c..5ee77e783d1c 100644 --- a/velox/dwio/parquet/reader/ParquetReader.cpp +++ b/velox/dwio/parquet/reader/ParquetReader.cpp @@ -759,6 +759,11 @@ TypePtr ReaderBase::convertType( "Converted type {} is not allowed for requested type {}"; const bool isRepeated = schemaElement.__isset.repetition_type && schemaElement.repetition_type == thrift::FieldRepetitionType::REPEATED; + const auto isInteger = [](const TypePtr& type) { + return type->kind() == TypeKind::TINYINT || + type->kind() == TypeKind::SMALLINT || + type->kind() == TypeKind::INTEGER || type->kind() == TypeKind::BIGINT; + }; if (schemaElement.__isset.converted_type) { switch (schemaElement.converted_type) { case thrift::ConvertedType::INT_8: @@ -770,15 +775,7 @@ TypePtr ReaderBase::convertType( schemaElement.converted_type); VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::TINYINT || - type->kind() == TypeKind::SMALLINT || - type->kind() == TypeKind::INTEGER || - type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "TINYINT", requestedType->toString()); @@ -793,14 +790,7 @@ TypePtr ReaderBase::convertType( schemaElement.converted_type); VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::SMALLINT || - type->kind() == TypeKind::INTEGER || - type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "SMALLINT", requestedType->toString()); @@ -815,13 +805,7 @@ TypePtr ReaderBase::convertType( schemaElement.converted_type); VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::INTEGER || - type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "INTEGER", requestedType->toString()); @@ -836,12 +820,7 @@ TypePtr ReaderBase::convertType( schemaElement.converted_type); VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "BIGINT", requestedType->toString()); @@ -1007,13 +986,7 @@ TypePtr ReaderBase::convertType( case thrift::Type::type::INT32: VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::INTEGER || - type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "INTEGER", requestedType->toString()); @@ -1037,12 +1010,7 @@ TypePtr ReaderBase::convertType( } VELOX_CHECK( !requestedType || - isCompatible( - requestedType, - isRepeated, - [](const TypePtr& type) { - return type->kind() == TypeKind::BIGINT; - }), + isCompatible(requestedType, isRepeated, isInteger), kTypeMappingErrorFmtStr, "BIGINT", requestedType->toString()); diff --git a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp index 5df7408327e1..5a292d0314e1 100644 --- a/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp +++ b/velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp @@ -1400,6 +1400,77 @@ TEST_F(ParquetTableScanTest, intToBigintRead) { assertEqualVectors(bigintDataFileVectors->childAt(0), rows->childAt(0)); } +TEST_F(ParquetTableScanTest, intReadWithNarrowerType) { + // Reading a wider integer as a narrower one causes unchecked truncation and + // two’s complement reinterpretation, resulting in values INT_MAX becoming -1. + RowVectorPtr intVectors = makeRowVector( + {"c1", "c2", "c3"}, + { + makeFlatVector( + {123, + std::numeric_limits::max(), + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::min()}), + makeFlatVector( + {123, + std::numeric_limits::max(), + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::min()}), + makeFlatVector( + {123, + std::numeric_limits::max(), + std::numeric_limits::min(), + std::numeric_limits::max(), + std::numeric_limits::min()}), + }); + + RowVectorPtr smallerIntVectors = makeRowVector( + {"c1", "c2", "c3"}, + { + makeFlatVector({ + 123, + std::numeric_limits::max(), + std::numeric_limits::min(), + -1, + 0, + }), + makeFlatVector({{ + 123, + std::numeric_limits::max(), + std::numeric_limits::min(), + -1, + 0, + }}), + makeFlatVector({ + 123, + std::numeric_limits::max(), + std::numeric_limits::min(), + -1, + 0, + }), + }); + + auto dataFile = TempFilePath::create(); + WriterOptions options; + writeToParquetFile(dataFile->getPath(), {intVectors}, options); + + auto rowType = ROW({"c1", "c2", "c3"}, {TINYINT(), SMALLINT(), INTEGER()}); + auto op = PlanBuilder() + .startTableScan() + .outputType(rowType) + .dataColumns(rowType) + .endTableScan() + .planNode(); + + auto split = makeSplit(dataFile->getPath()); + auto result = AssertQueryBuilder(op).split(split).copyResults(pool()); + auto rows = result->as(); + + assertEqualVectors(smallerIntVectors->childAt(0), rows->childAt(0)); +} + TEST_F(ParquetTableScanTest, shortAndLongDecimalReadWithLargerPrecision) { // decimal.parquet holds two columns (a: DECIMAL(5, 2), b: DECIMAL(20, 5)) and // 20 rows (10 rows per group). Data is in plain uncompressed format: