Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 8 additions & 4 deletions cpp/src/parquet/printer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -87,11 +87,15 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
const ColumnDescriptor* descr = file_metadata->schema()->Column(i);
stream << "Column " << i << ": " << descr->path()->ToDotString() << " ("
<< TypeToString(descr->physical_type());
if (descr->converted_type() != ConvertedType::NONE) {
stream << "/" << ConvertedTypeToString(descr->converted_type());
const auto& logical_type = descr->logical_type();
if (!logical_type->is_none()) {
stream << " / " << logical_type->ToString();
}
if (descr->converted_type() == ConvertedType::DECIMAL) {
stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
if (descr->converted_type() != ConvertedType::NONE) {
stream << " / " << ConvertedTypeToString(descr->converted_type());
if (descr->converted_type() == ConvertedType::DECIMAL) {
stream << "(" << descr->type_precision() << "," << descr->type_scale() << ")";
}
}
stream << ")" << std::endl;
}
Expand Down
2 changes: 1 addition & 1 deletion cpp/src/parquet/reader_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -317,7 +317,7 @@ Number of RowGroups: 1
Number of Real Columns: 2
Number of Columns: 2
Number of Selected Columns: 2
Column 0: a.list.element.list.element.list.element (BYTE_ARRAY/UTF8)
Column 0: a.list.element.list.element.list.element (BYTE_ARRAY / String / UTF8)
Column 1: b (INT32)
--- Row Group: 0 ---
--- Total Bytes: 155 ---
Expand Down
13 changes: 11 additions & 2 deletions cpp/src/parquet/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -448,7 +448,7 @@ std::unique_ptr<Node> PrimitiveNode::FromParquet(const void* opaque_element,
LogicalType::FromThrift(element->logicalType),
LoadEnumSafe(&element->type), element->type_length, field_id));
} else if (element->__isset.converted_type) {
// legacy writer with logical type present
// legacy writer with converted type present
primitive_node = std::unique_ptr<PrimitiveNode>(new PrimitiveNode(
element->name, LoadEnumSafe(&element->repetition_type),
LoadEnumSafe(&element->type), LoadEnumSafe(&element->converted_type),
Expand Down Expand Up @@ -500,7 +500,16 @@ void PrimitiveNode::ToParquet(void* opaque_element) const {
element->__set_name(name_);
element->__set_repetition_type(ToThrift(repetition_));
if (converted_type_ != ConvertedType::NONE) {
element->__set_converted_type(ToThrift(converted_type_));
if (converted_type_ != ConvertedType::NA) {
element->__set_converted_type(ToThrift(converted_type_));
} else {
// ConvertedType::NA is an unreleased, obsolete synonym for LogicalType::Null.
// Never emit it (see PARQUET-1990 for discussion).
if (!logical_type_ || !logical_type_->is_null()) {
throw ParquetException(
"ConvertedType::NA is obsolete, please use LogicalType::Null instead");
}
}
}
if (field_id_ >= 0) {
element->__set_field_id(field_id_);
Expand Down
14 changes: 5 additions & 9 deletions cpp/src/parquet/schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1101,12 +1101,12 @@ TEST(TestLogicalTypeConstruction, ConvertedTypeCompatibility) {
ASSERT_TRUE(reconstructed->is_valid());
ASSERT_TRUE(reconstructed->Equals(*original));

// Unknown
original = LogicalType::Unknown();
// Undefined
original = UndefinedLogicalType::Make();
ASSERT_TRUE(original->is_invalid());
ASSERT_FALSE(original->is_valid());
converted_type = original->ToConvertedType(&converted_decimal_metadata);
ASSERT_EQ(converted_type, ConvertedType::NA);
ASSERT_EQ(converted_type, ConvertedType::UNDEFINED);
ASSERT_FALSE(converted_decimal_metadata.isset);
ASSERT_TRUE(original->is_compatible(converted_type, converted_decimal_metadata));
ASSERT_TRUE(original->is_compatible(converted_type));
Expand Down Expand Up @@ -1243,7 +1243,6 @@ TEST(TestLogicalTypeOperation, LogicalTypeProperties) {
{BSONLogicalType::Make(), false, true, true},
{UUIDLogicalType::Make(), false, true, true},
{NoLogicalType::Make(), false, false, true},
{UnknownLogicalType::Make(), false, false, false},
};

for (const ExpectedProperties& c : cases) {
Expand Down Expand Up @@ -1339,7 +1338,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeApplicability) {
}

std::vector<std::shared_ptr<const LogicalType>> any_type_cases = {
LogicalType::Null(), LogicalType::None(), LogicalType::Unknown()};
LogicalType::Null(), LogicalType::None(), UndefinedLogicalType::Make()};

for (auto c : any_type_cases) {
ConfirmAnyPrimitiveTypeApplicability(c);
Expand Down Expand Up @@ -1453,7 +1452,7 @@ TEST(TestLogicalTypeOperation, LogicalTypeRepresentation) {
};

std::vector<ExpectedRepresentation> cases = {
{LogicalType::Unknown(), "Unknown", R"({"Type": "Unknown"})"},
{UndefinedLogicalType::Make(), "Undefined", R"({"Type": "Undefined"})"},
{LogicalType::String(), "String", R"({"Type": "String"})"},
{LogicalType::Map(), "Map", R"({"Type": "Map"})"},
{LogicalType::List(), "List", R"({"Type": "List"})"},
Expand Down Expand Up @@ -1550,7 +1549,6 @@ TEST(TestLogicalTypeOperation, LogicalTypeSortOrder) {
};

std::vector<ExpectedSortOrder> cases = {
{LogicalType::Unknown(), SortOrder::UNKNOWN},
{LogicalType::String(), SortOrder::UNSIGNED},
{LogicalType::Map(), SortOrder::UNKNOWN},
{LogicalType::List(), SortOrder::UNKNOWN},
Expand Down Expand Up @@ -1888,8 +1886,6 @@ TEST_F(TestSchemaElementConstruction, SimpleCases) {
{"uuid", LogicalType::UUID(), Type::FIXED_LEN_BYTE_ARRAY, 16, false,
ConvertedType::NA, true, [this]() { return element_->logicalType.__isset.UUID; }},
{"none", LogicalType::None(), Type::INT64, -1, false, ConvertedType::NA, false,
check_nothing},
{"unknown", LogicalType::Unknown(), Type::INT64, -1, true, ConvertedType::NA, false,
check_nothing}};

for (const SchemaElementConstructionArguments& c : cases) {
Expand Down
3 changes: 3 additions & 0 deletions cpp/src/parquet/thrift_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,9 @@ static inline format::Type::type ToThrift(Type::type type) {
static inline format::ConvertedType::type ToThrift(ConvertedType::type type) {
// item 0 is NONE
DCHECK_NE(type, ConvertedType::NONE);
// it is forbidden to emit "NA" (PARQUET-1990)
DCHECK_NE(type, ConvertedType::NA);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

DCHECK_Lt might be better here.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think it would change anything. Or we would have to do a full-blown check for valid values, which is a bit out of scope.

Copy link
Contributor

@emkornfield emkornfield Mar 31, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. I think it prevents accidental writes of UNDEFINED as well.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, I can add that as well :-)

DCHECK_NE(type, ConvertedType::UNDEFINED);
return static_cast<format::ConvertedType::type>(static_cast<int>(type) - 1);
}

Expand Down
37 changes: 18 additions & 19 deletions cpp/src/parquet/types.cc
Original file line number Diff line number Diff line change
Expand Up @@ -366,13 +366,14 @@ std::shared_ptr<const LogicalType> LogicalType::FromConvertedType(
return JSONLogicalType::Make();
case ConvertedType::BSON:
return BSONLogicalType::Make();
case ConvertedType::NA:
return NullLogicalType::Make();
case ConvertedType::NONE:
return NoLogicalType::Make();
case ConvertedType::NA:
case ConvertedType::UNDEFINED:
return UnknownLogicalType::Make();
return UndefinedLogicalType::Make();
}
return UnknownLogicalType::Make();
return UndefinedLogicalType::Make();
}

std::shared_ptr<const LogicalType> LogicalType::FromThrift(
Expand Down Expand Up @@ -483,10 +484,6 @@ std::shared_ptr<const LogicalType> LogicalType::UUID() { return UUIDLogicalType:

std::shared_ptr<const LogicalType> LogicalType::None() { return NoLogicalType::Make(); }

std::shared_ptr<const LogicalType> LogicalType::Unknown() {
return UnknownLogicalType::Make();
}

/*
* The logical type implementation classes are built in four layers: (1) the base
* layer, which establishes the interface and provides generally reusable implementations
Expand Down Expand Up @@ -516,7 +513,7 @@ class LogicalType::Impl {
virtual std::string ToString() const = 0;

virtual bool is_serialized() const {
return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNKNOWN);
return !(type_ == LogicalType::Type::NONE || type_ == LogicalType::Type::UNDEFINED);
}

virtual std::string ToJSON() const {
Expand Down Expand Up @@ -567,14 +564,14 @@ class LogicalType::Impl {
class BSON;
class UUID;
class No;
class Unknown;
class Undefined;

protected:
Impl(LogicalType::Type::type t, SortOrder::type o) : type_(t), order_(o) {}
Impl() = default;

private:
LogicalType::Type::type type_ = LogicalType::Type::UNKNOWN;
LogicalType::Type::type type_ = LogicalType::Type::UNDEFINED;
SortOrder::type order_ = SortOrder::UNKNOWN;
};

Expand Down Expand Up @@ -636,7 +633,9 @@ bool LogicalType::is_JSON() const { return impl_->type() == LogicalType::Type::J
bool LogicalType::is_BSON() const { return impl_->type() == LogicalType::Type::BSON; }
bool LogicalType::is_UUID() const { return impl_->type() == LogicalType::Type::UUID; }
bool LogicalType::is_none() const { return impl_->type() == LogicalType::Type::NONE; }
bool LogicalType::is_valid() const { return impl_->type() != LogicalType::Type::UNKNOWN; }
bool LogicalType::is_valid() const {
return impl_->type() != LogicalType::Type::UNDEFINED;
}
bool LogicalType::is_invalid() const { return !is_valid(); }
bool LogicalType::is_nested() const {
return (impl_->type() == LogicalType::Type::LIST) ||
Expand Down Expand Up @@ -1555,19 +1554,19 @@ class LogicalType::Impl::No final : public LogicalType::Impl::SimpleCompatible,

GENERATE_MAKE(No)

class LogicalType::Impl::Unknown final : public LogicalType::Impl::SimpleCompatible,
public LogicalType::Impl::UniversalApplicable {
class LogicalType::Impl::Undefined final : public LogicalType::Impl::SimpleCompatible,
public LogicalType::Impl::UniversalApplicable {
public:
friend class UnknownLogicalType;
friend class UndefinedLogicalType;

OVERRIDE_TOSTRING(Unknown)
OVERRIDE_TOSTRING(Undefined)

private:
Unknown()
: LogicalType::Impl(LogicalType::Type::UNKNOWN, SortOrder::UNKNOWN),
LogicalType::Impl::SimpleCompatible(ConvertedType::NA) {}
Undefined()
: LogicalType::Impl(LogicalType::Type::UNDEFINED, SortOrder::UNKNOWN),
LogicalType::Impl::SimpleCompatible(ConvertedType::UNDEFINED) {}
};

GENERATE_MAKE(Unknown)
GENERATE_MAKE(Undefined)

} // namespace parquet
29 changes: 19 additions & 10 deletions cpp/src/parquet/types.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ struct Type {
// Mirrors parquet::ConvertedType
struct ConvertedType {
enum type {
NONE,
NONE, // Not a real converted type, but means no converted type is specified
UTF8,
MAP,
MAP_KEY_VALUE,
Expand All @@ -94,9 +94,12 @@ struct ConvertedType {
JSON,
BSON,
INTERVAL,
// DEPRECATED INVALID ConvertedType for all-null data.
// Only useful for reading legacy files written out by interim Parquet C++ releases.
// For writing, always emit LogicalType::Null instead.
// See PARQUET-1990.
NA = 25,
// Should always be last element.
UNDEFINED = 26
UNDEFINED = 26 // Not a real converted type; should always be last element
};
};

Expand Down Expand Up @@ -140,7 +143,7 @@ class PARQUET_EXPORT LogicalType {
public:
struct Type {
enum type {
UNKNOWN = 0,
UNDEFINED = 0, // Not a real logical type
STRING = 1,
MAP,
LIST,
Expand All @@ -151,11 +154,11 @@ class PARQUET_EXPORT LogicalType {
TIMESTAMP,
INTERVAL,
INT,
NIL, // Thrift NullType
NIL, // Thrift NullType: annotates data that is always null
JSON,
BSON,
UUID,
NONE
NONE // Not a real logical type; should always be last element
};
};

Expand Down Expand Up @@ -199,12 +202,18 @@ class PARQUET_EXPORT LogicalType {

static std::shared_ptr<const LogicalType> Interval();
static std::shared_ptr<const LogicalType> Int(int bit_width, bool is_signed);

/// \brief Create a logical type for data that's always null
///
/// Any physical type can be annotated with this logical type.
static std::shared_ptr<const LogicalType> Null();

static std::shared_ptr<const LogicalType> JSON();
static std::shared_ptr<const LogicalType> BSON();
static std::shared_ptr<const LogicalType> UUID();

/// \brief Create a placeholder for when no logical type is specified
static std::shared_ptr<const LogicalType> None();
static std::shared_ptr<const LogicalType> Unknown();

/// \brief Return true if this logical type is consistent with the given underlying
/// physical type.
Expand Down Expand Up @@ -434,13 +443,13 @@ class PARQUET_EXPORT NoLogicalType : public LogicalType {
NoLogicalType() = default;
};

/// \brief Allowed for any type.
class PARQUET_EXPORT UnknownLogicalType : public LogicalType {
// Internal API, for unrecognized logical types
class PARQUET_EXPORT UndefinedLogicalType : public LogicalType {
public:
static std::shared_ptr<const LogicalType> Make();

private:
UnknownLogicalType() = default;
UndefinedLogicalType() = default;
};

// Data encodings. Mirrors parquet::Encoding
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_parquet.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ cdef extern from "parquet/api/schema.h" namespace "parquet" nogil:
ParquetType_FIXED_LEN_BYTE_ARRAY" parquet::Type::FIXED_LEN_BYTE_ARRAY"

enum ParquetLogicalTypeId" parquet::LogicalType::Type::type":
ParquetLogicalType_UNKNOWN" parquet::LogicalType::Type::UNKNOWN"
ParquetLogicalType_UNDEFINED" parquet::LogicalType::Type::UNDEFINED"
ParquetLogicalType_STRING" parquet::LogicalType::Type::STRING"
ParquetLogicalType_MAP" parquet::LogicalType::Type::MAP"
ParquetLogicalType_LIST" parquet::LogicalType::Type::LIST"
Expand Down
2 changes: 1 addition & 1 deletion python/pyarrow/_parquet.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -809,7 +809,7 @@ cdef physical_type_name_from_enum(ParquetType type_):

cdef logical_type_name_from_enum(ParquetLogicalTypeId type_):
return {
ParquetLogicalType_UNKNOWN: 'UNKNOWN',
ParquetLogicalType_UNDEFINED: 'UNDEFINED',
ParquetLogicalType_STRING: 'STRING',
ParquetLogicalType_MAP: 'MAP',
ParquetLogicalType_LIST: 'LIST',
Expand Down