Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 11 additions & 43 deletions velox/dwio/parquet/reader/ParquetReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,11 @@ TypePtr ReaderBase::convertType(
"Converted type {} is not allowed for requested type {}";
const bool isRepeated = schemaElement.__isset.repetition_type &&
schemaElement.repetition_type == thrift::FieldRepetitionType::REPEATED;
// Predicate handed to isCompatible(): true when 'type' has one of the integer
// kinds TINYINT, SMALLINT, INTEGER or BIGINT, false for every other kind.
const auto isInteger = [](const TypePtr& type) {
  switch (type->kind()) {
    case TypeKind::TINYINT:
    case TypeKind::SMALLINT:
    case TypeKind::INTEGER:
    case TypeKind::BIGINT:
      return true;
    default:
      return false;
  }
};
if (schemaElement.__isset.converted_type) {
switch (schemaElement.converted_type) {
case thrift::ConvertedType::INT_8:
Expand All @@ -770,15 +775,7 @@ TypePtr ReaderBase::convertType(
schemaElement.converted_type);
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::TINYINT ||
type->kind() == TypeKind::SMALLINT ||
type->kind() == TypeKind::INTEGER ||
type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"TINYINT",
requestedType->toString());
Expand All @@ -793,14 +790,7 @@ TypePtr ReaderBase::convertType(
schemaElement.converted_type);
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::SMALLINT ||
type->kind() == TypeKind::INTEGER ||
type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"SMALLINT",
requestedType->toString());
Expand All @@ -815,13 +805,7 @@ TypePtr ReaderBase::convertType(
schemaElement.converted_type);
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::INTEGER ||
type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"INTEGER",
requestedType->toString());
Expand All @@ -836,12 +820,7 @@ TypePtr ReaderBase::convertType(
schemaElement.converted_type);
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"BIGINT",
requestedType->toString());
Expand Down Expand Up @@ -1007,13 +986,7 @@ TypePtr ReaderBase::convertType(
case thrift::Type::type::INT32:
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::INTEGER ||
type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"INTEGER",
requestedType->toString());
Expand All @@ -1037,12 +1010,7 @@ TypePtr ReaderBase::convertType(
}
VELOX_CHECK(
!requestedType ||
isCompatible(
requestedType,
isRepeated,
[](const TypePtr& type) {
return type->kind() == TypeKind::BIGINT;
}),
isCompatible(requestedType, isRepeated, isInteger),
kTypeMappingErrorFmtStr,
"BIGINT",
requestedType->toString());
Expand Down
71 changes: 71 additions & 0 deletions velox/dwio/parquet/tests/reader/ParquetTableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1400,6 +1400,77 @@ TEST_F(ParquetTableScanTest, intToBigintRead) {
assertEqualVectors(bigintDataFileVectors->childAt(0), rows->childAt(0));
}

TEST_F(ParquetTableScanTest, intReadWithNarrowerType) {
  // Reading a wider integer column as a narrower integer type performs an
  // unchecked truncation followed by two's complement reinterpretation of the
  // remaining bits. Values that fit in the narrower type round-trip unchanged,
  // while the maximum of the wider type is read back as -1 and its minimum as
  // 0.
  //
  // Data written to the file: c1 as int16, c2 as int32, c3 as int64. Each
  // column holds, in order: a small value, the narrower type's max and min,
  // and the wider type's max and min.
  RowVectorPtr intVectors = makeRowVector(
      {"c1", "c2", "c3"},
      {
          makeFlatVector<int16_t>(
              {123,
               std::numeric_limits<int8_t>::max(),
               std::numeric_limits<int8_t>::min(),
               std::numeric_limits<int16_t>::max(),
               std::numeric_limits<int16_t>::min()}),
          makeFlatVector<int32_t>(
              {123,
               std::numeric_limits<int16_t>::max(),
               std::numeric_limits<int16_t>::min(),
               std::numeric_limits<int32_t>::max(),
               std::numeric_limits<int32_t>::min()}),
          makeFlatVector<int64_t>(
              {123,
               std::numeric_limits<int32_t>::max(),
               std::numeric_limits<int32_t>::min(),
               std::numeric_limits<int64_t>::max(),
               std::numeric_limits<int64_t>::min()}),
      });

  // Expected results when the columns are read back as int8, int16 and int32
  // respectively.
  RowVectorPtr smallerIntVectors = makeRowVector(
      {"c1", "c2", "c3"},
      {
          makeFlatVector<int8_t>({
              123,
              std::numeric_limits<int8_t>::max(),
              std::numeric_limits<int8_t>::min(),
              -1,
              0,
          }),
          makeFlatVector<int16_t>({
              123,
              std::numeric_limits<int16_t>::max(),
              std::numeric_limits<int16_t>::min(),
              -1,
              0,
          }),
          makeFlatVector<int32_t>({
              123,
              std::numeric_limits<int32_t>::max(),
              std::numeric_limits<int32_t>::min(),
              -1,
              0,
          }),
      });

  auto dataFile = TempFilePath::create();
  WriterOptions options;
  writeToParquetFile(dataFile->getPath(), {intVectors}, options);

  // Request every column with a type one step narrower than the one written.
  auto rowType = ROW({"c1", "c2", "c3"}, {TINYINT(), SMALLINT(), INTEGER()});
  auto op = PlanBuilder()
                .startTableScan()
                .outputType(rowType)
                .dataColumns(rowType)
                .endTableScan()
                .planNode();

  auto split = makeSplit(dataFile->getPath());
  auto result = AssertQueryBuilder(op).split(split).copyResults(pool());
  auto rows = result->as<RowVector>();

  // Verify every narrowed column, not only the first one, so that the
  // INTEGER -> SMALLINT and BIGINT -> INTEGER reads are covered as well.
  assertEqualVectors(smallerIntVectors->childAt(0), rows->childAt(0));
  assertEqualVectors(smallerIntVectors->childAt(1), rows->childAt(1));
  assertEqualVectors(smallerIntVectors->childAt(2), rows->childAt(2));
}

TEST_F(ParquetTableScanTest, shortAndLongDecimalReadWithLargerPrecision) {
// decimal.parquet holds two columns (a: DECIMAL(5, 2), b: DECIMAL(20, 5)) and
// 20 rows (10 rows per group). Data is in plain uncompressed format:
Expand Down
Loading