From 24c1d5d7e9b8edad849efe5b2c3184ead8f5ca2b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 8 Nov 2016 18:18:16 -0500 Subject: [PATCH 01/27] Some Types refactoring, add TypeVisitor abstract class. Add RapidJSON as external project Change-Id: I65db68e6972e12368da3ded0b70b8578689d45f3 --- cpp/CMakeLists.txt | 19 ++ cpp/src/arrow/type.cc | 36 +++- cpp/src/arrow/type.h | 238 ++++++++++++++++++++------ cpp/src/arrow/types/collection.h | 41 ----- cpp/src/arrow/types/datetime.h | 33 ---- cpp/src/arrow/types/decimal.h | 10 -- cpp/src/arrow/types/list-test.cc | 1 - cpp/src/arrow/types/primitive-test.cc | 18 +- cpp/src/arrow/types/string-test.cc | 2 +- cpp/src/arrow/types/union.h | 21 --- 10 files changed, 244 insertions(+), 175 deletions(-) delete mode 100644 cpp/src/arrow/types/collection.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 6f954830b63..0bff7528578 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -545,6 +545,25 @@ if(ARROW_BUILD_BENCHMARKS) endif() endif() +# RapidJSON, header only dependency +if("$ENV{RAPIDJSON_HOME}" STREQUAL "") + ExternalProject_Add(rapidjson_ep + PREFIX "${CMAKE_BINARY_DIR}" + URL "https://github.com/miloyip/rapidjson/archive/v1.1.0.tar.gz" + URL_MD5 "badd12c511e081fec6c89c43a7027bce" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + BUILD_IN_SOURCE 1 + INSTALL_COMMAND "") + + ExternalProject_Get_Property(rapidjson_ep SOURCE_DIR) + set(RAPIDJSON_INCLUDE_DIR "${SOURCE_DIR}/include") +else() + set(RAPIDJSON_INCLUDE_DIR "$ENV{RAPIDJSON_HOME}/include") +endif() +message(STATUS "RapidJSON include dir: ${RAPIDJSON_INCLUDE_DIR}") +include_directories(SYSTEM ${RAPIDJSON_INCLUDE_DIR}) + ## Google PerfTools ## ## Disabled with TSAN/ASAN as well as with gold+dynamic linking (see comment diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4fd50b7c193..927143fa4ff 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -20,6 +20,8 @@ #include #include +#include "arrow/util/status.h" + namespace arrow { 
std::string Field::ToString() const { @@ -45,8 +47,7 @@ bool DataType::Equals(const DataType* other) const { } std::string StringType::ToString() const { - std::string result(name()); - return result; + return std::string("string"); } std::string ListType::ToString() const { @@ -56,7 +57,7 @@ std::string ListType::ToString() const { } std::string BinaryType::ToString() const { - return std::string(name()); + return std::string("binary"); } std::string StructType::ToString() const { @@ -71,4 +72,33 @@ std::string StructType::ToString() const { return s.str(); } +// Visitors and template instantiation + +#define ACCEPT_VISITOR(TYPE) \ + Status TYPE::Accept(TypeVisitor* visitor) const { \ + return visitor->Visit(*this); \ + } + +ACCEPT_VISITOR(NullType); +ACCEPT_VISITOR(BinaryType); +ACCEPT_VISITOR(StringType); +ACCEPT_VISITOR(ListType); +ACCEPT_VISITOR(StructType); +ACCEPT_VISITOR(DecimalType); +ACCEPT_VISITOR(SparseUnionType); +ACCEPT_VISITOR(DenseUnionType); +ACCEPT_VISITOR(DateType); +ACCEPT_VISITOR(TimestampType); + +constexpr const char* Int8Type::NAME; +constexpr const char* UInt16Type::NAME; +constexpr const char* Int16Type::NAME; +constexpr const char* UInt32Type::NAME; +constexpr const char* Int32Type::NAME; +constexpr const char* UInt64Type::NAME; +constexpr const char* Int64Type::NAME; +constexpr const char* FloatType::NAME; +constexpr const char* DoubleType::NAME; +constexpr const char* BooleanType::NAME; + } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index ea8516fc347..2062f0d3608 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -24,10 +24,62 @@ #include #include "arrow/util/macros.h" +#include "arrow/util/status.h" #include "arrow/util/visibility.h" namespace arrow { +struct Field; + +// Type forward declarations for the TypeVisitor +struct DataType; +struct NullType; +struct BooleanType; +struct Int8Type; +struct Int16Type; +struct Int32Type; +struct Int64Type; +struct UInt8Type; +struct UInt16Type; 
+struct UInt32Type; +struct UInt64Type; +struct FloatType; +struct DoubleType; +struct StringType; +struct BinaryType; +struct DateType; +struct TimestampType; +struct DecimalType; +struct ListType; +struct StructType; +struct DenseUnionType; +struct SparseUnionType; + +class TypeVisitor { + public: + virtual Status Visit(const NullType& type) = 0; + virtual Status Visit(const BooleanType& type) = 0; + virtual Status Visit(const Int8Type& type) = 0; + virtual Status Visit(const Int16Type& type) = 0; + virtual Status Visit(const Int32Type& type) = 0; + virtual Status Visit(const Int64Type& type) = 0; + virtual Status Visit(const UInt8Type& type) = 0; + virtual Status Visit(const UInt16Type& type) = 0; + virtual Status Visit(const UInt32Type& type) = 0; + virtual Status Visit(const UInt64Type& type) = 0; + virtual Status Visit(const FloatType& type) = 0; + virtual Status Visit(const DoubleType& type) = 0; + virtual Status Visit(const StringType& type) = 0; + virtual Status Visit(const BinaryType& type) = 0; + virtual Status Visit(const DateType& type) = 0; + virtual Status Visit(const TimestampType& type) = 0; + virtual Status Visit(const DecimalType& type) = 0; + virtual Status Visit(const ListType& type) = 0; + virtual Status Visit(const StructType& type) = 0; + virtual Status Visit(const DenseUnionType& type) = 0; + virtual Status Visit(const SparseUnionType& type) = 0; +}; + // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. 
a @@ -91,17 +143,9 @@ struct Type { // Unions of logical types DENSE_UNION = 32, SPARSE_UNION = 33, - - // Union - JSON_SCALAR = 50, - - // User-defined type - USER = 60 }; }; -struct Field; - struct ARROW_EXPORT DataType { Type::type type; @@ -127,6 +171,8 @@ struct ARROW_EXPORT DataType { virtual int value_size() const { return -1; } + virtual Status Accept(TypeVisitor* visitor) const = 0; + virtual std::string ToString() const = 0; }; @@ -168,75 +214,81 @@ struct ARROW_EXPORT Field { }; typedef std::shared_ptr FieldPtr; -template +template struct ARROW_EXPORT PrimitiveType : public DataType { - PrimitiveType() : DataType(Derived::type_enum) {} + using c_type = C_TYPE; \ + static constexpr Type::type type_id = TYPE_ID; - std::string ToString() const override; + PrimitiveType() : DataType(TYPE_ID) {} + + int value_size() const override { return sizeof(C_TYPE); } + + Status Accept(TypeVisitor* visitor) const override { + return visitor->Visit(*static_cast(this)); + } + + std::string ToString() const override { + return std::string(DERIVED::NAME); + } }; -template -inline std::string PrimitiveType::ToString() const { - std::string result(static_cast(this)->name()); - return result; -} - -#define PRIMITIVE_DECL(TYPENAME, C_TYPE, ENUM, SIZE, NAME) \ - typedef C_TYPE c_type; \ - static constexpr Type::type type_enum = Type::ENUM; \ - \ - TYPENAME() : PrimitiveType() {} \ - \ - virtual int value_size() const { return SIZE; } \ - \ - static const char* name() { return NAME; } - -struct ARROW_EXPORT NullType : public PrimitiveType { - PRIMITIVE_DECL(NullType, void, NA, 0, "null"); +struct ARROW_EXPORT NullType : public DataType { + static constexpr Type::type type_enum = Type::NA; + + NullType() : DataType(Type::NA) {} + + int value_size() const override { return 0; } + + Status Accept(TypeVisitor* visitor) const override; + + std::string ToString() const override { + static const std::string name = "null"; + return name; + } }; -struct ARROW_EXPORT BooleanType : public 
PrimitiveType { - PRIMITIVE_DECL(BooleanType, uint8_t, BOOL, 1, "bool"); +struct ARROW_EXPORT BooleanType : public PrimitiveType { + constexpr static const char* NAME = "bool"; }; -struct ARROW_EXPORT UInt8Type : public PrimitiveType { - PRIMITIVE_DECL(UInt8Type, uint8_t, UINT8, 1, "uint8"); +struct ARROW_EXPORT UInt8Type : public PrimitiveType { + constexpr static const char* NAME = "uint8"; }; -struct ARROW_EXPORT Int8Type : public PrimitiveType { - PRIMITIVE_DECL(Int8Type, int8_t, INT8, 1, "int8"); +struct ARROW_EXPORT Int8Type : public PrimitiveType { + constexpr static const char* NAME = "int8"; }; -struct ARROW_EXPORT UInt16Type : public PrimitiveType { - PRIMITIVE_DECL(UInt16Type, uint16_t, UINT16, 2, "uint16"); +struct ARROW_EXPORT UInt16Type : public PrimitiveType { + constexpr static const char* NAME = "uint16"; }; -struct ARROW_EXPORT Int16Type : public PrimitiveType { - PRIMITIVE_DECL(Int16Type, int16_t, INT16, 2, "int16"); +struct ARROW_EXPORT Int16Type : public PrimitiveType { + constexpr static const char* NAME = "int16"; }; -struct ARROW_EXPORT UInt32Type : public PrimitiveType { - PRIMITIVE_DECL(UInt32Type, uint32_t, UINT32, 4, "uint32"); +struct ARROW_EXPORT UInt32Type : public PrimitiveType { + constexpr static const char* NAME = "uint32"; }; -struct ARROW_EXPORT Int32Type : public PrimitiveType { - PRIMITIVE_DECL(Int32Type, int32_t, INT32, 4, "int32"); +struct ARROW_EXPORT Int32Type : public PrimitiveType { + constexpr static const char* NAME = "int32"; }; -struct ARROW_EXPORT UInt64Type : public PrimitiveType { - PRIMITIVE_DECL(UInt64Type, uint64_t, UINT64, 8, "uint64"); +struct ARROW_EXPORT UInt64Type : public PrimitiveType { + constexpr static const char* NAME = "uint64"; }; -struct ARROW_EXPORT Int64Type : public PrimitiveType { - PRIMITIVE_DECL(Int64Type, int64_t, INT64, 8, "int64"); +struct ARROW_EXPORT Int64Type : public PrimitiveType { + constexpr static const char* NAME = "int64"; }; -struct ARROW_EXPORT FloatType : public PrimitiveType 
{ - PRIMITIVE_DECL(FloatType, float, FLOAT, 4, "float"); +struct ARROW_EXPORT FloatType : public PrimitiveType { + constexpr static const char* NAME = "float"; }; -struct ARROW_EXPORT DoubleType : public PrimitiveType { - PRIMITIVE_DECL(DoubleType, double, DOUBLE, 8, "double"); +struct ARROW_EXPORT DoubleType : public PrimitiveType { + constexpr static const char* NAME = "double"; }; struct ARROW_EXPORT ListType : public DataType { @@ -252,15 +304,15 @@ struct ARROW_EXPORT ListType : public DataType { const std::shared_ptr& value_type() const { return children_[0]->type; } - static char const* name() { return "list"; } - + Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; }; // BinaryType type is reprsents lists of 1-byte values. struct ARROW_EXPORT BinaryType : public DataType { BinaryType() : BinaryType(Type::BINARY) {} - static char const* name() { return "binary"; } + + Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; protected: @@ -272,8 +324,7 @@ struct ARROW_EXPORT BinaryType : public DataType { struct ARROW_EXPORT StringType : public BinaryType { StringType() : BinaryType(Type::STRING) {} - static char const* name() { return "string"; } - + Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; }; @@ -283,9 +334,84 @@ struct ARROW_EXPORT StructType : public DataType { children_ = fields; } + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; +}; + +struct ARROW_EXPORT DecimalType : public DataType { + explicit DecimalType(int precision_, int scale_) + : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} + int precision; + int scale; + + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; +}; + +template +struct UnionType : public DataType { + std::vector child_types_; + + UnionType() : DataType(T) {} + + const TypePtr& child(int i) const { 
return child_types_[i]; } + int num_children() const { return child_types_.size(); } +}; + +struct DenseUnionType : public UnionType { + typedef UnionType Base; + + explicit DenseUnionType(const std::vector& child_types) : Base() { + child_types_ = child_types; + } + + Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; }; +struct SparseUnionType : public UnionType { + typedef UnionType Base; + + explicit SparseUnionType(const std::vector& child_types) : Base() { + child_types_ = child_types; + } + + Status Accept(TypeVisitor* visitor) const override; + virtual std::string ToString() const; +}; + +struct DateType : public DataType { + enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; + + Unit unit; + + explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {} + + DateType(const DateType& other) : DateType(other.unit) {} + + Status Accept(TypeVisitor* visitor) const override; +}; + +struct ARROW_EXPORT TimestampType : public DataType { + enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; + + typedef int64_t c_type; + static constexpr Type::type type_enum = Type::TIMESTAMP; + + int value_size() const override { return sizeof(int64_t); } + + Unit unit; + + explicit TimestampType(Unit unit = Unit::MILLI) + : DataType(Type::TIMESTAMP), unit(unit) {} + + TimestampType(const TimestampType& other) : TimestampType(other.unit) {} + virtual ~TimestampType() {} + + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override { return "timestamp"; } +}; + // These will be defined elsewhere template struct TypeTraits {}; diff --git a/cpp/src/arrow/types/collection.h b/cpp/src/arrow/types/collection.h deleted file mode 100644 index 1712030203f..00000000000 --- a/cpp/src/arrow/types/collection.h +++ /dev/null @@ -1,41 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. 
See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. - -#ifndef ARROW_TYPES_COLLECTION_H -#define ARROW_TYPES_COLLECTION_H - -#include -#include - -#include "arrow/type.h" - -namespace arrow { - -template -struct CollectionType : public DataType { - std::vector child_types_; - - CollectionType() : DataType(T) {} - - const TypePtr& child(int i) const { return child_types_[i]; } - - int num_children() const { return child_types_.size(); } -}; - -} // namespace arrow - -#endif // ARROW_TYPES_COLLECTION_H diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 241a126d100..5676a946993 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -24,39 +24,6 @@ namespace arrow { -struct DateType : public DataType { - enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; - - Unit unit; - - explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {} - - DateType(const DateType& other) : DateType(other.unit) {} - - static char const* name() { return "date"; } -}; - -struct ARROW_EXPORT TimestampType : public DataType { - enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; - - typedef int64_t c_type; - static constexpr Type::type type_enum = Type::TIMESTAMP; - - int value_size() const override { return sizeof(int64_t); } - - Unit unit; - - 
explicit TimestampType(Unit unit = Unit::MILLI) - : DataType(Type::TIMESTAMP), unit(unit) {} - - TimestampType(const TimestampType& other) : TimestampType(other.unit) {} - virtual ~TimestampType() {} - - std::string ToString() const override { return "timestamp"; } - - static char const* name() { return "timestamp"; } -}; - } // namespace arrow #endif // ARROW_TYPES_DATETIME_H diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 6c497c597d9..1709702e3f1 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -25,16 +25,6 @@ namespace arrow { -struct ARROW_EXPORT DecimalType : public DataType { - explicit DecimalType(int precision_, int scale_) - : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} - int precision; - int scale; - static char const* name() { return "decimal"; } - - std::string ToString() const override; -}; - } // namespace arrow #endif // ARROW_TYPES_DECIMAL_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 12c539495a2..71b34c9850e 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -46,7 +46,6 @@ TEST(TypesTest, TestListType) { ListType list_type(vt); ASSERT_EQ(list_type.type, Type::LIST); - ASSERT_EQ(list_type.name(), string("list")); ASSERT_EQ(list_type.ToString(), string("list")); ASSERT_EQ(list_type.value_type()->type, vt->type); diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index e47f6dc74fb..eee08783c02 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -41,15 +41,15 @@ namespace arrow { class Array; -#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ - TEST(TypesTest, TestPrimitive_##ENUM) { \ - KLASS tp; \ - \ - ASSERT_EQ(tp.type, Type::ENUM); \ - ASSERT_EQ(tp.name(), string(NAME)); \ - \ - KLASS tp_copy = tp; \ - ASSERT_EQ(tp_copy.type, Type::ENUM); \ +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, 
TestPrimitive_##ENUM) { \ + KLASS tp; \ + \ + ASSERT_EQ(tp.type, Type::ENUM); \ + ASSERT_EQ(tp.ToString(), string(NAME)); \ + \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index af87a14a8b3..4e5db17dfcc 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -47,7 +47,7 @@ TEST(TypesTest, BinaryType) { TEST(TypesTest, TestStringType) { StringType str; ASSERT_EQ(str.type, Type::STRING); - ASSERT_EQ(str.name(), std::string("string")); + ASSERT_EQ(str.ToString(), std::string("string")); } // ---------------------------------------------------------------------- diff --git a/cpp/src/arrow/types/union.h b/cpp/src/arrow/types/union.h index d2ee9bde04d..44f39cc6994 100644 --- a/cpp/src/arrow/types/union.h +++ b/cpp/src/arrow/types/union.h @@ -24,32 +24,11 @@ #include "arrow/array.h" #include "arrow/type.h" -#include "arrow/types/collection.h" namespace arrow { class Buffer; -struct DenseUnionType : public CollectionType { - typedef CollectionType Base; - - explicit DenseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } - - virtual std::string ToString() const; -}; - -struct SparseUnionType : public CollectionType { - typedef CollectionType Base; - - explicit SparseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } - - virtual std::string ToString() const; -}; - class UnionArray : public Array { protected: // The data are types encoded as int16 From 1edf2a9a51de4c37fe4c21eb51fb2db294a6520e Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 9 Nov 2016 00:38:03 -0500 Subject: [PATCH 02/27] Prototyping out visitor pattern for json serialization Change-Id: I52f616d96a1abaa35e2620393f9c471ee7c152e5 --- cpp/src/arrow/io/hdfs.cc | 8 +- cpp/src/arrow/io/libhdfs_shim.cc | 26 +-- cpp/src/arrow/ipc/CMakeLists.txt | 2 + 
cpp/src/arrow/ipc/json.cc | 278 ++++++++++++++++++++++++++ cpp/src/arrow/ipc/json.h | 29 +++ cpp/src/arrow/type.cc | 52 +++-- cpp/src/arrow/type.h | 140 +++++++++---- cpp/src/arrow/types/datetime.h | 4 +- cpp/src/arrow/types/decimal.h | 4 +- cpp/src/arrow/types/primitive-test.cc | 21 +- cpp/src/arrow/types/primitive.cc | 10 +- cpp/src/arrow/util/logging.h | 4 +- 12 files changed, 478 insertions(+), 100 deletions(-) create mode 100644 cpp/src/arrow/ipc/json.cc create mode 100644 cpp/src/arrow/ipc/json.h diff --git a/cpp/src/arrow/io/hdfs.cc b/cpp/src/arrow/io/hdfs.cc index 6490a7574ee..13491e780e2 100644 --- a/cpp/src/arrow/io/hdfs.cc +++ b/cpp/src/arrow/io/hdfs.cc @@ -289,13 +289,9 @@ class HdfsClient::HdfsClientImpl { // connect to HDFS with the builder object hdfsBuilder* builder = hdfsNewBuilder(); - if (!config->host.empty()) { - hdfsBuilderSetNameNode(builder, config->host.c_str()); - } + if (!config->host.empty()) { hdfsBuilderSetNameNode(builder, config->host.c_str()); } hdfsBuilderSetNameNodePort(builder, config->port); - if (!config->user.empty()) { - hdfsBuilderSetUserName(builder, config->user.c_str()); - } + if (!config->user.empty()) { hdfsBuilderSetUserName(builder, config->user.c_str()); } if (!config->kerb_ticket.empty()) { hdfsBuilderSetKerbTicketCachePath(builder, config->kerb_ticket.c_str()); } diff --git a/cpp/src/arrow/io/libhdfs_shim.cc b/cpp/src/arrow/io/libhdfs_shim.cc index 1fee595d071..36b8a4ec980 100644 --- a/cpp/src/arrow/io/libhdfs_shim.cc +++ b/cpp/src/arrow/io/libhdfs_shim.cc @@ -74,12 +74,9 @@ static HINSTANCE libjvm_handle = NULL; // NOTE(wesm): cpplint does not like use of short and other imprecise C types static hdfsBuilder* (*ptr_hdfsNewBuilder)(void) = NULL; -static void (*ptr_hdfsBuilderSetNameNode)( - hdfsBuilder* bld, const char* nn) = NULL; -static void (*ptr_hdfsBuilderSetNameNodePort)( - hdfsBuilder* bld, tPort port) = NULL; -static void (*ptr_hdfsBuilderSetUserName)( - hdfsBuilder* bld, const char* userName) = NULL; 
+static void (*ptr_hdfsBuilderSetNameNode)(hdfsBuilder* bld, const char* nn) = NULL; +static void (*ptr_hdfsBuilderSetNameNodePort)(hdfsBuilder* bld, tPort port) = NULL; +static void (*ptr_hdfsBuilderSetUserName)(hdfsBuilder* bld, const char* userName) = NULL; static void (*ptr_hdfsBuilderSetKerbTicketCachePath)( hdfsBuilder* bld, const char* kerbTicketCachePath) = NULL; static hdfsFS (*ptr_hdfsBuilderConnect)(hdfsBuilder* bld) = NULL; @@ -173,9 +170,9 @@ void hdfsBuilderSetUserName(hdfsBuilder* bld, const char* userName) { ptr_hdfsBuilderSetUserName(bld, userName); } -void hdfsBuilderSetKerbTicketCachePath(hdfsBuilder* bld, - const char* kerbTicketCachePath) { - ptr_hdfsBuilderSetKerbTicketCachePath(bld , kerbTicketCachePath); +void hdfsBuilderSetKerbTicketCachePath( + hdfsBuilder* bld, const char* kerbTicketCachePath) { + ptr_hdfsBuilderSetKerbTicketCachePath(bld, kerbTicketCachePath); } hdfsFS hdfsBuilderConnect(hdfsBuilder* bld) { @@ -364,7 +361,7 @@ static std::vector get_potential_libhdfs_paths() { std::vector libhdfs_potential_paths; std::string file_name; - // OS-specific file name +// OS-specific file name #ifdef __WIN32 file_name = "hdfs.dll"; #elif __APPLE__ @@ -374,10 +371,7 @@ static std::vector get_potential_libhdfs_paths() { #endif // Common paths - std::vector search_paths = { - fs::path(""), - fs::path(".") - }; + std::vector search_paths = {fs::path(""), fs::path(".")}; // Path from environment variable const char* hadoop_home = std::getenv("HADOOP_HOME"); @@ -387,9 +381,7 @@ static std::vector get_potential_libhdfs_paths() { } const char* libhdfs_dir = std::getenv("ARROW_LIBHDFS_DIR"); - if (libhdfs_dir != nullptr) { - search_paths.push_back(fs::path(libhdfs_dir)); - } + if (libhdfs_dir != nullptr) { search_paths.push_back(fs::path(libhdfs_dir)); } // All paths with file name for (auto& path : search_paths) { diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index d2db339de7e..95b9d2c4049 100644 --- 
a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -34,6 +34,7 @@ set(ARROW_IPC_TEST_LINK_LIBS set(ARROW_IPC_SRCS adapter.cc file.cc + json.cc metadata.cc metadata-internal.cc ) @@ -114,6 +115,7 @@ add_dependencies(arrow_objlib metadata_fbs) install(FILES adapter.h file.h + json.h metadata.h DESTINATION include/arrow/ipc) diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc new file mode 100644 index 00000000000..18d9ce47412 --- /dev/null +++ b/cpp/src/arrow/ipc/json.cc @@ -0,0 +1,278 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "arrow/ipc/json.h" + +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +#include "arrow/type.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +namespace rj = rapidjson; + +enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; + +static std::string GetBufferTypeName(BufferType type) { + switch (type) { + case BufferType::DATA: + return "DATA"; + case BufferType::OFFSET: + return "OFFSET"; + case BufferType::TYPE: + return "TYPE"; + case BufferType::VALIDITY: + return "VALIDITY"; + default: + break; + }; + return "UNKNOWN"; +} + +class BufferLayout { + public: + BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} + + BufferType type() const { return type_; } + int bit_width() const { return bit_width_; } + + private: + BufferType type_; + int bit_width_; +}; + +static const BufferLayout kValidityBuffer(BufferType::VALIDITY, 1); +static const BufferLayout kOffsetBuffer(BufferType::OFFSET, 32); +static const BufferLayout kTypeBuffer(BufferType::TYPE, 32); +static const BufferLayout kBooleanBuffer(BufferType::DATA, 1); +static const BufferLayout kValues64(BufferType::DATA, 64); +static const BufferLayout kValues32(BufferType::DATA, 32); +static const BufferLayout kValues16(BufferType::DATA, 16); +static const BufferLayout kValues8(BufferType::DATA, 8); + +class JsonSchemaWriter : public TypeVisitor { + public: + explicit JsonSchemaWriter(rj::Writer* writer) : writer_(writer) {} + + void Start() { + writer_->Key("schema"); + writer_->StartArray(); + } + + void Finish() { writer_->EndArray(); } + + Status VisitField(const Field& field) { + writer_->StartObject(); + + writer_->Key("name"); + writer_->String(field.name.c_str()); + + writer_->Key("nullable"); + writer_->Bool(field.nullable); + + // Visit the type + RETURN_NOT_OK(field.type->Accept(this)); + 
+ writer_->EndObject(); + + return Status::OK(); + } + + template + void WritePrimitive(const T& type, const std::vector& buffer_layout) { + WriteName(type); + SetNoChildren(); + WriteBufferLayout(buffer_layout); + } + + template + void WriteVarBytes(const T& type) { + WriteName(type); + SetNoChildren(); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); + } + + void WriteBufferLayout(const std::vector& buffer_layout) { + writer_->Key("typeLayout"); + writer_->StartArray(); + + for (const BufferLayout& buffer : buffer_layout) { + writer_->StartObject(); + writer_->Key("type"); + writer_->String(GetBufferTypeName(buffer.type())); + + writer_->Key("typeBitWidth"); + writer_->Int(buffer.bit_width()); + + writer_->EndObject(); + } + writer_->EndArray(); + } + + void WriteChildren(const std::vector>& children) {} + + void SetNoChildren() { + writer_->Key("children"); + writer_->StartArray(); + writer_->EndArray(); + } + + template + void WriteName(const T& type) { + writer_->Key("type"); + writer_->String(T::NAME); + } + + Status Visit(const NullType& type) override { + WritePrimitive(type, {}); + return Status::OK(); + } + + Status Visit(const BooleanType& type) override { + WritePrimitive(type, {kValidityBuffer, kBooleanBuffer}); + return Status::OK(); + } + + Status Visit(const Int8Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues8}); + return Status::OK(); + } + + Status Visit(const Int16Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const Int32Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const Int64Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const UInt8Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues8}); + return Status::OK(); + } + + Status Visit(const UInt16Type& type) override { + 
WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const UInt32Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const UInt64Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const FloatType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const DoubleType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const StringType& type) override { + WriteVarBytes(type); + return Status::OK(); + } + + Status Visit(const BinaryType& type) override { + WriteVarBytes(type); + return Status::OK(); + } + + Status Visit(const DateType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const TimeType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const TimestampType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } + + Status Visit(const ListType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); + return Status::OK(); + } + + Status Visit(const StructType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + return Status::OK(); + } + + Status Visit(const DenseUnionType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); + return 
Status::NotImplemented("NYI"); + } + + Status Visit(const SparseUnionType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + return Status::NotImplemented("NYI"); + } + + static void test() { + rj::StringBuffer s; + rj::Writer writer(s); + auto schema_writer = std::make_shared(&writer); + } + + private: + rj::Writer* writer_; +}; + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h new file mode 100644 index 00000000000..2b725c7e445 --- /dev/null +++ b/cpp/src/arrow/ipc/json.h @@ -0,0 +1,29 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +// Implement Arrow JSON serialization format + +#ifndef ARROW_IPC_JSON_H +#define ARROW_IPC_JSON_H + +#include "arrow/type.h" + +namespace arrow { +namespace ipc {} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_FILE_H diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 927143fa4ff..6d2c7e02217 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -46,6 +46,18 @@ bool DataType::Equals(const DataType* other) const { return equals; } +FloatingPointMeta::Precision HalfFloatType::precision() const { + return FloatingPointMeta::HALF; +} + +FloatingPointMeta::Precision FloatType::precision() const { + return FloatingPointMeta::SINGLE; +} + +FloatingPointMeta::Precision DoubleType::precision() const { + return FloatingPointMeta::DOUBLE; +} + std::string StringType::ToString() const { return std::string("string"); } @@ -74,10 +86,8 @@ std::string StructType::ToString() const { // Visitors and template instantiation -#define ACCEPT_VISITOR(TYPE) \ - Status TYPE::Accept(TypeVisitor* visitor) const { \ - return visitor->Visit(*this); \ - } +#define ACCEPT_VISITOR(TYPE) \ + Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); } ACCEPT_VISITOR(NullType); ACCEPT_VISITOR(BinaryType); @@ -88,17 +98,31 @@ ACCEPT_VISITOR(DecimalType); ACCEPT_VISITOR(SparseUnionType); ACCEPT_VISITOR(DenseUnionType); ACCEPT_VISITOR(DateType); +ACCEPT_VISITOR(TimeType); ACCEPT_VISITOR(TimestampType); -constexpr const char* Int8Type::NAME; -constexpr const char* UInt16Type::NAME; -constexpr const char* Int16Type::NAME; -constexpr const char* UInt32Type::NAME; -constexpr const char* Int32Type::NAME; -constexpr const char* UInt64Type::NAME; -constexpr const char* Int64Type::NAME; -constexpr const char* FloatType::NAME; -constexpr const char* DoubleType::NAME; -constexpr const char* BooleanType::NAME; +const std::string NullType::NAME = "null"; +const std::string UInt8Type::NAME = "uint8"; +const std::string Int8Type::NAME = "int8"; 
+const std::string UInt16Type::NAME = "uint16"; +const std::string Int16Type::NAME = "int16"; +const std::string UInt32Type::NAME = "uint32"; +const std::string Int32Type::NAME = "int32"; +const std::string UInt64Type::NAME = "uint64"; +const std::string Int64Type::NAME = "int64"; +const std::string HalfFloatType::NAME = "halffloat"; +const std::string FloatType::NAME = "float"; +const std::string DoubleType::NAME = "double"; +const std::string BooleanType::NAME = "bool"; +const std::string BinaryType::NAME = "binary"; +const std::string StringType::NAME = "utf8"; +const std::string DecimalType::NAME = "decimal"; +const std::string DateType::NAME = "decimal"; +const std::string TimeType::NAME = "time"; +const std::string TimestampType::NAME = "timestamp"; +const std::string ListType::NAME = "list"; +const std::string StructType::NAME = "struct"; +const std::string DenseUnionType::NAME = "union"; +const std::string SparseUnionType::NAME = "union"; } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 2062f0d3608..4b02cb2d3ed 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -43,11 +43,13 @@ struct UInt8Type; struct UInt16Type; struct UInt32Type; struct UInt64Type; +struct HalfFloatType; struct FloatType; struct DoubleType; struct StringType; struct BinaryType; struct DateType; +struct TimeType; struct TimestampType; struct DecimalType; struct ListType; @@ -67,11 +69,13 @@ class TypeVisitor { virtual Status Visit(const UInt16Type& type) = 0; virtual Status Visit(const UInt32Type& type) = 0; virtual Status Visit(const UInt64Type& type) = 0; + virtual Status Visit(const HalfFloatType& type) = 0; virtual Status Visit(const FloatType& type) = 0; virtual Status Visit(const DoubleType& type) = 0; virtual Status Visit(const StringType& type) = 0; virtual Status Visit(const BinaryType& type) = 0; virtual Status Visit(const DateType& type) = 0; + virtual Status Visit(const TimeType& type) = 0; virtual Status Visit(const 
TimestampType& type) = 0; virtual Status Visit(const DecimalType& type) = 0; virtual Status Visit(const ListType& type) = 0; @@ -102,17 +106,20 @@ struct Type { UINT64 = 8, INT64 = 9, + // 2-byte floating point value + HALF_FLOAT = 10, + // 4-byte floating point value - FLOAT = 10, + FLOAT = 11, // 8-byte floating point value - DOUBLE = 11, + DOUBLE = 12, // UTF8 variable-length string as List STRING = 13, // Variable-length bytes (no guarantee of UTF8-ness) - BINARY = 15, + BINARY = 14, // By default, int32 days since the UNIX epoch DATE = 16, @@ -167,11 +174,13 @@ struct ARROW_EXPORT DataType { const std::shared_ptr& child(int i) const { return children_[i]; } + const std::vector>& children() const { return children_; } + int num_children() const { return children_.size(); } - virtual int value_size() const { return -1; } + virtual int bit_width() const { return -1; } - virtual Status Accept(TypeVisitor* visitor) const = 0; + virtual Status Accept(TypeVisitor* visitor) const = 0; virtual std::string ToString() const = 0; }; @@ -216,20 +225,18 @@ typedef std::shared_ptr FieldPtr; template struct ARROW_EXPORT PrimitiveType : public DataType { - using c_type = C_TYPE; \ + using c_type = C_TYPE; static constexpr Type::type type_id = TYPE_ID; PrimitiveType() : DataType(TYPE_ID) {} - int value_size() const override { return sizeof(C_TYPE); } + int bit_width() const override { return sizeof(C_TYPE) * 8; } Status Accept(TypeVisitor* visitor) const override { return visitor->Visit(*static_cast(this)); } - std::string ToString() const override { - return std::string(DERIVED::NAME); - } + std::string ToString() const override { return std::string(DERIVED::NAME); } }; struct ARROW_EXPORT NullType : public DataType { @@ -237,58 +244,87 @@ struct ARROW_EXPORT NullType : public DataType { NullType() : DataType(Type::NA) {} - int value_size() const override { return 0; } + int bit_width() const override { return 0; } Status Accept(TypeVisitor* visitor) const override; - 
std::string ToString() const override { - static const std::string name = "null"; - return name; - } + static const std::string NAME; + + std::string ToString() const override { return NAME; } +}; + +struct IntegerMeta { + virtual bool is_signed() const = 0; +}; + +struct FloatingPointMeta { + enum Precision { HALF, SINGLE, DOUBLE }; + virtual Precision precision() const = 0; +}; + +template +struct IntegerTypeImpl : public PrimitiveType, + public IntegerMeta { + bool is_signed() const override { return std::is_signed::value; } }; struct ARROW_EXPORT BooleanType : public PrimitiveType { - constexpr static const char* NAME = "bool"; + int bit_width() const override { return 1; } + static const std::string NAME; }; -struct ARROW_EXPORT UInt8Type : public PrimitiveType { - constexpr static const char* NAME = "uint8"; +struct ARROW_EXPORT UInt8Type : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT Int8Type : public PrimitiveType { - constexpr static const char* NAME = "int8"; +struct ARROW_EXPORT Int8Type : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT UInt16Type : public PrimitiveType { - constexpr static const char* NAME = "uint16"; +struct ARROW_EXPORT UInt16Type + : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT Int16Type : public PrimitiveType { - constexpr static const char* NAME = "int16"; +struct ARROW_EXPORT Int16Type : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT UInt32Type : public PrimitiveType { - constexpr static const char* NAME = "uint32"; +struct ARROW_EXPORT UInt32Type + : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT Int32Type : public PrimitiveType { - constexpr static const char* NAME = "int32"; +struct ARROW_EXPORT Int32Type : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT UInt64Type : public PrimitiveType { - constexpr static const char* 
NAME = "uint64"; +struct ARROW_EXPORT UInt64Type + : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT Int64Type : public PrimitiveType { - constexpr static const char* NAME = "int64"; +struct ARROW_EXPORT Int64Type : public IntegerTypeImpl { + static const std::string NAME; }; -struct ARROW_EXPORT FloatType : public PrimitiveType { - constexpr static const char* NAME = "float"; +struct ARROW_EXPORT HalfFloatType + : public PrimitiveType, + public FloatingPointMeta { + Precision precision() const override; + static const std::string NAME; }; -struct ARROW_EXPORT DoubleType : public PrimitiveType { - constexpr static const char* NAME = "double"; +struct ARROW_EXPORT FloatType : public PrimitiveType, + public FloatingPointMeta { + Precision precision() const override; + static const std::string NAME; +}; + +struct ARROW_EXPORT DoubleType : public PrimitiveType, + public FloatingPointMeta { + Precision precision() const override; + static const std::string NAME; }; struct ARROW_EXPORT ListType : public DataType { @@ -306,6 +342,7 @@ struct ARROW_EXPORT ListType : public DataType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; }; // BinaryType type is reprsents lists of 1-byte values. @@ -315,6 +352,8 @@ struct ARROW_EXPORT BinaryType : public DataType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; + protected: // Allow subclasses to change the logical type. 
explicit BinaryType(Type::type logical_type) : DataType(logical_type) {} @@ -326,6 +365,7 @@ struct ARROW_EXPORT StringType : public BinaryType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; }; struct ARROW_EXPORT StructType : public DataType { @@ -336,6 +376,7 @@ struct ARROW_EXPORT StructType : public DataType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; }; struct ARROW_EXPORT DecimalType : public DataType { @@ -346,6 +387,7 @@ struct ARROW_EXPORT DecimalType : public DataType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; }; template @@ -367,6 +409,7 @@ struct DenseUnionType : public UnionType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + static const std::string NAME; }; struct SparseUnionType : public UnionType { @@ -377,10 +420,11 @@ struct SparseUnionType : public UnionType { } Status Accept(TypeVisitor* visitor) const override; - virtual std::string ToString() const; + std::string ToString() const override; + static const std::string NAME; }; -struct DateType : public DataType { +struct ARROW_EXPORT DateType : public DataType { enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; Unit unit; @@ -390,6 +434,21 @@ struct DateType : public DataType { DateType(const DateType& other) : DateType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override { return NAME; } + static const std::string NAME; +}; + +struct ARROW_EXPORT TimeType : public DataType { + enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; + + Unit unit; + + explicit TimeType(Unit unit = Unit::MILLI) : DataType(Type::TIME), unit(unit) {} + TimeType(const TimeType& other) : TimeType(other.unit) {} + + Status Accept(TypeVisitor* visitor) const 
override; + std::string ToString() const override { return NAME; } + static const std::string NAME; }; struct ARROW_EXPORT TimestampType : public DataType { @@ -398,7 +457,7 @@ struct ARROW_EXPORT TimestampType : public DataType { typedef int64_t c_type; static constexpr Type::type type_enum = Type::TIMESTAMP; - int value_size() const override { return sizeof(int64_t); } + int bit_width() const override { return sizeof(int64_t) * 8; } Unit unit; @@ -409,7 +468,8 @@ struct ARROW_EXPORT TimestampType : public DataType { virtual ~TimestampType() {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return "timestamp"; } + std::string ToString() const override { return NAME; } + static const std::string NAME; }; // These will be defined elsewhere diff --git a/cpp/src/arrow/types/datetime.h b/cpp/src/arrow/types/datetime.h index 5676a946993..a8f86392312 100644 --- a/cpp/src/arrow/types/datetime.h +++ b/cpp/src/arrow/types/datetime.h @@ -22,8 +22,6 @@ #include "arrow/type.h" -namespace arrow { - -} // namespace arrow +namespace arrow {} // namespace arrow #endif // ARROW_TYPES_DATETIME_H diff --git a/cpp/src/arrow/types/decimal.h b/cpp/src/arrow/types/decimal.h index 1709702e3f1..b3ea3a56d80 100644 --- a/cpp/src/arrow/types/decimal.h +++ b/cpp/src/arrow/types/decimal.h @@ -23,8 +23,6 @@ #include "arrow/type.h" #include "arrow/util/visibility.h" -namespace arrow { - -} // namespace arrow +namespace arrow {} // namespace arrow #endif // ARROW_TYPES_DECIMAL_H diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index eee08783c02..69d200b958f 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -41,15 +41,15 @@ namespace arrow { class Array; -#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ - TEST(TypesTest, TestPrimitive_##ENUM) { \ - KLASS tp; \ - \ - ASSERT_EQ(tp.type, Type::ENUM); \ - ASSERT_EQ(tp.ToString(), string(NAME)); \ - \ - KLASS tp_copy = tp; 
\ - ASSERT_EQ(tp_copy.type, Type::ENUM); \ +#define PRIMITIVE_TEST(KLASS, ENUM, NAME) \ + TEST(TypesTest, TestPrimitive_##ENUM) { \ + KLASS tp; \ + \ + ASSERT_EQ(tp.type, Type::ENUM); \ + ASSERT_EQ(tp.ToString(), string(NAME)); \ + \ + KLASS tp_copy = tp; \ + ASSERT_EQ(tp_copy.type, Type::ENUM); \ } PRIMITIVE_TEST(Int8Type, INT8, "int8"); @@ -243,7 +243,8 @@ void TestPrimitiveBuilder::Check( } typedef ::testing::Types Primitives; + PInt32, PInt64, PFloat, PDouble> + Primitives; TYPED_TEST_CASE(TestPrimitiveBuilder, Primitives); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index d2288bafa71..e5c6f047746 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -48,13 +48,13 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; - int value_size = type_->value_size(); - DCHECK_GT(value_size, 0); + int value_byte_size = type_->bit_width() / 8; + DCHECK_GT(value_byte_size, 0); for (int i = 0; i < length_; ++i) { - if (!IsNull(i) && memcmp(this_data, other_data, value_size)) { return false; } - this_data += value_size; - other_data += value_size; + if (!IsNull(i) && memcmp(this_data, other_data, value_byte_size)) { return false; } + this_data += value_byte_size; + other_data += value_byte_size; } return true; } else { diff --git a/cpp/src/arrow/util/logging.h b/cpp/src/arrow/util/logging.h index 06ee8411e28..b22f07dd634 100644 --- a/cpp/src/arrow/util/logging.h +++ b/cpp/src/arrow/util/logging.h @@ -118,9 +118,9 @@ class CerrLog { class FatalLog : public CerrLog { public: explicit FatalLog(int /* severity */) // NOLINT - : CerrLog(ARROW_FATAL) {} // NOLINT + : CerrLog(ARROW_FATAL){} // NOLINT - [[noreturn]] ~FatalLog() { + [[noreturn]] ~FatalLog() { if (has_logged_) { std::cerr << std::endl; } std::exit(1); } From 68ee7ab308e69cf59f7243610d2efce20ad2d87a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: 
Wed, 9 Nov 2016 11:43:56 -0500 Subject: [PATCH 03/27] Move forward declarations into type_fwd.h Change-Id: I26ff705a285af0217fc1f9b71e646ebda1111016 --- cpp/src/arrow/ipc/CMakeLists.txt | 4 + cpp/src/arrow/ipc/json.cc | 9 +- cpp/src/arrow/ipc/json.h | 6 +- cpp/src/arrow/type.h | 56 +------------ cpp/src/arrow/type_fwd.h | 137 +++++++++++++++++++++++++++++++ cpp/src/arrow/types/primitive.h | 31 ++++--- 6 files changed, 168 insertions(+), 75 deletions(-) create mode 100644 cpp/src/arrow/type_fwd.h diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 95b9d2c4049..669fdbfb880 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -80,6 +80,10 @@ ADD_ARROW_TEST(ipc-metadata-test) ARROW_TEST_LINK_LIBRARIES(ipc-metadata-test ${ARROW_IPC_TEST_LINK_LIBS}) +ADD_ARROW_TEST(ipc-json-test) +ARROW_TEST_LINK_LIBRARIES(ipc-json-test + ${ARROW_IPC_TEST_LINK_LIBS}) + # make clean will delete the generated file set_source_files_properties(Metadata_generated.h PROPERTIES GENERATED TRUE) diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 18d9ce47412..957253afd2f 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -130,7 +130,12 @@ class JsonSchemaWriter : public TypeVisitor { writer_->EndArray(); } - void WriteChildren(const std::vector>& children) {} + Status WriteChildren(const std::vector>& children) { + for (const std::shared_ptr& field : children) { + RETURN_NOT_OK(VisitField(*field.get())); + } + return Status::OK(); + } void SetNoChildren() { writer_->Key("children"); @@ -238,7 +243,7 @@ class JsonSchemaWriter : public TypeVisitor { Status Visit(const ListType& type) override { WriteName(type); - WriteChildren(type.children()); + RETURN_NOT_OK(WriteChildren(type.children())); WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); return Status::OK(); } diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 2b725c7e445..fec2d0f7ac9 100644 --- 
a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -20,10 +20,12 @@ #ifndef ARROW_IPC_JSON_H #define ARROW_IPC_JSON_H -#include "arrow/type.h" +#include "arrow/type_fwd.h" namespace arrow { -namespace ipc {} // namespace ipc +namespace ipc { + +} // namespace ipc } // namespace arrow #endif // ARROW_IPC_FILE_H diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4b02cb2d3ed..4819751794c 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -23,67 +23,13 @@ #include #include +#include "arrow/type_fwd.h" #include "arrow/util/macros.h" #include "arrow/util/status.h" #include "arrow/util/visibility.h" namespace arrow { -struct Field; - -// Type forward declarations for the TypeVisitor -struct DataType; -struct NullType; -struct BooleanType; -struct Int8Type; -struct Int16Type; -struct Int32Type; -struct Int64Type; -struct UInt8Type; -struct UInt16Type; -struct UInt32Type; -struct UInt64Type; -struct HalfFloatType; -struct FloatType; -struct DoubleType; -struct StringType; -struct BinaryType; -struct DateType; -struct TimeType; -struct TimestampType; -struct DecimalType; -struct ListType; -struct StructType; -struct DenseUnionType; -struct SparseUnionType; - -class TypeVisitor { - public: - virtual Status Visit(const NullType& type) = 0; - virtual Status Visit(const BooleanType& type) = 0; - virtual Status Visit(const Int8Type& type) = 0; - virtual Status Visit(const Int16Type& type) = 0; - virtual Status Visit(const Int32Type& type) = 0; - virtual Status Visit(const Int64Type& type) = 0; - virtual Status Visit(const UInt8Type& type) = 0; - virtual Status Visit(const UInt16Type& type) = 0; - virtual Status Visit(const UInt32Type& type) = 0; - virtual Status Visit(const UInt64Type& type) = 0; - virtual Status Visit(const HalfFloatType& type) = 0; - virtual Status Visit(const FloatType& type) = 0; - virtual Status Visit(const DoubleType& type) = 0; - virtual Status Visit(const StringType& type) = 0; - virtual Status Visit(const BinaryType& 
type) = 0; - virtual Status Visit(const DateType& type) = 0; - virtual Status Visit(const TimeType& type) = 0; - virtual Status Visit(const TimestampType& type) = 0; - virtual Status Visit(const DecimalType& type) = 0; - virtual Status Visit(const ListType& type) = 0; - virtual Status Visit(const StructType& type) = 0; - virtual Status Visit(const DenseUnionType& type) = 0; - virtual Status Visit(const SparseUnionType& type) = 0; -}; - // Data types in this library are all *logical*. They can be expressed as // either a primitive physical type (bytes or bits of some fixed size), a // nested type consisting of other data types, or another data type (e.g. a diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h new file mode 100644 index 00000000000..ba488e79321 --- /dev/null +++ b/cpp/src/arrow/type_fwd.h @@ -0,0 +1,137 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TYPE_FWD_H +#define ARROW_TYPE_FWD_H + +namespace arrow { + +class Status; + +// Type forward declarations for the TypeVisitor +struct DataType; +struct Field; +struct NullType; +struct BooleanType; +struct Int8Type; +struct Int16Type; +struct Int32Type; +struct Int64Type; +struct UInt8Type; +struct UInt16Type; +struct UInt32Type; +struct UInt64Type; +struct HalfFloatType; +struct FloatType; +struct DoubleType; +struct StringType; +struct BinaryType; +struct DateType; +struct TimeType; +struct TimestampType; +struct DecimalType; +struct ListType; +struct StructType; +struct DenseUnionType; +struct SparseUnionType; + +class TypeVisitor { + public: + virtual Status Visit(const NullType& type) = 0; + virtual Status Visit(const BooleanType& type) = 0; + virtual Status Visit(const Int8Type& type) = 0; + virtual Status Visit(const Int16Type& type) = 0; + virtual Status Visit(const Int32Type& type) = 0; + virtual Status Visit(const Int64Type& type) = 0; + virtual Status Visit(const UInt8Type& type) = 0; + virtual Status Visit(const UInt16Type& type) = 0; + virtual Status Visit(const UInt32Type& type) = 0; + virtual Status Visit(const UInt64Type& type) = 0; + virtual Status Visit(const HalfFloatType& type) = 0; + virtual Status Visit(const FloatType& type) = 0; + virtual Status Visit(const DoubleType& type) = 0; + virtual Status Visit(const StringType& type) = 0; + virtual Status Visit(const BinaryType& type) = 0; + virtual Status Visit(const DateType& type) = 0; + virtual Status Visit(const TimeType& type) = 0; + virtual Status Visit(const TimestampType& type) = 0; + virtual Status Visit(const DecimalType& type) = 0; + virtual Status Visit(const ListType& type) = 0; + virtual Status Visit(const StructType& type) = 0; + virtual Status Visit(const DenseUnionType& type) = 0; + virtual Status Visit(const SparseUnionType& type) = 0; +}; + +class NullArray; +class BooleanArray; +class StringArray; +class BinaryArray; +class DecimalArray; +class ListArray; 
+class StructArray; +class DenseUnionArray; +class SparseUnionArray; + +template +class NumericArray; + +class DateArray; +class TimeArray; + +using HalfFloatArray = NumericArray; +using FloatArray = NumericArray; +using DoubleArray = NumericArray; +using Int8Array = NumericArray; +using UInt8Array = NumericArray; +using Int16Array = NumericArray; +using UInt16Array = NumericArray; +using Int32Array = NumericArray; +using UInt32Array = NumericArray; +using Int64Array = NumericArray; +using UInt64Array = NumericArray; +using TimestampArray = NumericArray; + +class ArrayVisitor { + public: + virtual Status Visit(const NullArray& array) = 0; + virtual Status Visit(const BooleanArray& array) = 0; + virtual Status Visit(const Int8Array& array) = 0; + virtual Status Visit(const Int16Array& array) = 0; + virtual Status Visit(const Int32Array& array) = 0; + virtual Status Visit(const Int64Array& array) = 0; + virtual Status Visit(const UInt8Array& array) = 0; + virtual Status Visit(const UInt16Array& array) = 0; + virtual Status Visit(const UInt32Array& array) = 0; + virtual Status Visit(const UInt64Array& array) = 0; + virtual Status Visit(const HalfFloatArray& array) = 0; + virtual Status Visit(const FloatArray& array) = 0; + virtual Status Visit(const DoubleArray& array) = 0; + virtual Status Visit(const StringArray& array) = 0; + virtual Status Visit(const BinaryArray& array) = 0; + virtual Status Visit(const DateArray& array) = 0; + virtual Status Visit(const TimeArray& array) = 0; + virtual Status Visit(const TimestampArray& array) = 0; + virtual Status Visit(const DecimalArray& array) = 0; + virtual Status Visit(const ListArray& array) = 0; + virtual Status Visit(const StructArray& array) = 0; + virtual Status Visit(const DenseUnionArray& array) = 0; + virtual Status Visit(const SparseUnionArray& array) = 0; +}; + +} // namespace arrow + +#endif // ARROW_TYPE_FWD_H diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 
c71df584ffe..3fe0b275765 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -26,6 +26,7 @@ #include "arrow/array.h" #include "arrow/builder.h" #include "arrow/type.h" +#include "arrow/type_fwd.h" #include "arrow/types/datetime.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -91,22 +92,6 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { value_type Value(int i) const { return raw_data()[i]; } }; -#define NUMERIC_ARRAY_DECL(NAME, TypeClass) \ - using NAME = NumericArray; \ - extern template class ARROW_EXPORT NumericArray; - -NUMERIC_ARRAY_DECL(UInt8Array, UInt8Type); -NUMERIC_ARRAY_DECL(Int8Array, Int8Type); -NUMERIC_ARRAY_DECL(UInt16Array, UInt16Type); -NUMERIC_ARRAY_DECL(Int16Array, Int16Type); -NUMERIC_ARRAY_DECL(UInt32Array, UInt32Type); -NUMERIC_ARRAY_DECL(Int32Array, Int32Type); -NUMERIC_ARRAY_DECL(UInt64Array, UInt64Type); -NUMERIC_ARRAY_DECL(Int64Array, Int64Type); -NUMERIC_ARRAY_DECL(TimestampArray, TimestampType); -NUMERIC_ARRAY_DECL(FloatArray, FloatType); -NUMERIC_ARRAY_DECL(DoubleArray, DoubleType); - template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: @@ -327,6 +312,20 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { Status Append(uint8_t val) { return Append(static_cast(val)); } }; +// Only instantiate these templates once +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; +extern template class ARROW_EXPORT NumericArray; + } // namespace arrow #endif // 
ARROW_TYPES_PRIMITIVE_H From 820b0f2eba81cad96281dc2b8d5a2ec36e75e8ea Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 9 Nov 2016 19:04:27 -0500 Subject: [PATCH 04/27] Drafting JSON schema read/write Change-Id: I8dce236d9c9d5ee6badbe384249b3e2b0fbfc5a8 --- cpp/src/arrow/ipc/CMakeLists.txt | 1 + cpp/src/arrow/ipc/ipc-json-test.cc | 54 +++ cpp/src/arrow/ipc/json-internal.cc | 510 +++++++++++++++++++++++++++++ cpp/src/arrow/ipc/json-internal.h | 54 +++ cpp/src/arrow/ipc/json.cc | 249 +------------- cpp/src/arrow/ipc/json.h | 64 ++++ cpp/src/arrow/schema.h | 2 + cpp/src/arrow/type.h | 6 +- 8 files changed, 689 insertions(+), 251 deletions(-) create mode 100644 cpp/src/arrow/ipc/ipc-json-test.cc create mode 100644 cpp/src/arrow/ipc/json-internal.cc create mode 100644 cpp/src/arrow/ipc/json-internal.h diff --git a/cpp/src/arrow/ipc/CMakeLists.txt b/cpp/src/arrow/ipc/CMakeLists.txt index 669fdbfb880..6955bcb6c23 100644 --- a/cpp/src/arrow/ipc/CMakeLists.txt +++ b/cpp/src/arrow/ipc/CMakeLists.txt @@ -35,6 +35,7 @@ set(ARROW_IPC_SRCS adapter.cc file.cc json.cc + json-internal.cc metadata.cc metadata-internal.cc ) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc new file mode 100644 index 00000000000..11da3567b1c --- /dev/null +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#include +#include + +#include "gtest/gtest.h" + +#include "arrow/io/memory.h" +#include "arrow/io/test-common.h" +#include "arrow/ipc/adapter.h" +#include "arrow/ipc/file.h" +#include "arrow/ipc/test-common.h" +#include "arrow/ipc/util.h" + +#include "arrow/test-util.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" +#include "arrow/util/buffer.h" +#include "arrow/util/memory-pool.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +class TestJsonSchemaWriter : public ::testing::Test { + public: + void SetUp() {} + void TearDown() {} +}; + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc new file mode 100644 index 00000000000..50989054c2d --- /dev/null +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -0,0 +1,510 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/ipc/json-internal.h" + +#include +#include +#include + +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +#include "arrow/schema.h" +#include "arrow/type.h" +#include "arrow/util/status.h" + +namespace arrow { +namespace ipc { + +enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; + +static std::string GetBufferTypeName(BufferType type) { + switch (type) { + case BufferType::DATA: + return "DATA"; + case BufferType::OFFSET: + return "OFFSET"; + case BufferType::TYPE: + return "TYPE"; + case BufferType::VALIDITY: + return "VALIDITY"; + default: + break; + }; + return "UNKNOWN"; +} + +class BufferLayout { + public: + BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} + + BufferType type() const { return type_; } + int bit_width() const { return bit_width_; } + + private: + BufferType type_; + int bit_width_; +}; + +static const BufferLayout kValidityBuffer(BufferType::VALIDITY, 1); +static const BufferLayout kOffsetBuffer(BufferType::OFFSET, 32); +static const BufferLayout kTypeBuffer(BufferType::TYPE, 32); +static const BufferLayout kBooleanBuffer(BufferType::DATA, 1); +static const BufferLayout kValues64(BufferType::DATA, 64); +static const BufferLayout kValues32(BufferType::DATA, 32); +static const BufferLayout kValues16(BufferType::DATA, 16); +static const BufferLayout kValues8(BufferType::DATA, 8); + +class JsonSchemaWriter : public TypeVisitor { + public: + explicit JsonSchemaWriter(RjWriter* writer) : writer_(writer) {} + + void Start() { + 
writer_->Key("schema"); + writer_->StartArray(); + } + + void Finish() { writer_->EndArray(); } + + Status VisitField(const Field& field) { + writer_->StartObject(); + + writer_->Key("name"); + writer_->String(field.name.c_str()); + + writer_->Key("nullable"); + writer_->Bool(field.nullable); + + // Visit the type + RETURN_NOT_OK(field.type->Accept(this)); + + writer_->EndObject(); + + return Status::OK(); + } + + void SetNoChildren() { + writer_->Key("children"); + writer_->StartArray(); + writer_->EndArray(); + } + + template + void WriteTypeMetadata(const T& type) {} + + // TODO(wesm): Other Type metadata + + template + void WriteName(const T& type) { + writer_->Key("type"); + writer_->StartObject(); + writer_->Key("name"); + writer_->String(T::NAME); + + WriteTypeMetadata(type); + + writer_->EndObject(); + } + + template + void WritePrimitive(const T& type, const std::vector& buffer_layout) { + WriteName(type); + SetNoChildren(); + WriteBufferLayout(buffer_layout); + } + + template + void WriteVarBytes(const T& type) { + WriteName(type); + SetNoChildren(); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); + } + + void WriteBufferLayout(const std::vector& buffer_layout) { + writer_->Key("typeLayout"); + writer_->StartArray(); + + for (const BufferLayout& buffer : buffer_layout) { + writer_->StartObject(); + writer_->Key("type"); + writer_->String(GetBufferTypeName(buffer.type())); + + writer_->Key("typeBitWidth"); + writer_->Int(buffer.bit_width()); + + writer_->EndObject(); + } + writer_->EndArray(); + } + + Status WriteChildren(const std::vector>& children) { + for (const std::shared_ptr& field : children) { + RETURN_NOT_OK(VisitField(*field.get())); + } + return Status::OK(); + } + + Status Visit(const NullType& type) override { + WritePrimitive(type, {}); + return Status::OK(); + } + + Status Visit(const BooleanType& type) override { + WritePrimitive(type, {kValidityBuffer, kBooleanBuffer}); + return Status::OK(); + } + + Status Visit(const 
Int8Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues8}); + return Status::OK(); + } + + Status Visit(const Int16Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const Int32Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const Int64Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const UInt8Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues8}); + return Status::OK(); + } + + Status Visit(const UInt16Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const UInt32Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const UInt64Type& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const HalfFloatType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues16}); + return Status::OK(); + } + + Status Visit(const FloatType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues32}); + return Status::OK(); + } + + Status Visit(const DoubleType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const StringType& type) override { + WriteVarBytes(type); + return Status::OK(); + } + + Status Visit(const BinaryType& type) override { + WriteVarBytes(type); + return Status::OK(); + } + + Status Visit(const DateType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const TimeType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + + Status Visit(const TimestampType& type) override { + WritePrimitive(type, {kValidityBuffer, 
kValues64}); + return Status::OK(); + } + + Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } + + Status Visit(const ListType& type) override { + WriteName(type); + RETURN_NOT_OK(WriteChildren(type.children())); + WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); + return Status::OK(); + } + + Status Visit(const StructType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + return Status::OK(); + } + + Status Visit(const DenseUnionType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); + return Status::NotImplemented("NYI"); + } + + Status Visit(const SparseUnionType& type) override { + WriteName(type); + WriteChildren(type.children()); + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + return Status::NotImplemented("NYI"); + } + + private: + RjWriter* writer_; +}; + +#define RETURN_NOT_STRING(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsString()) { \ + return Status::Invalid("invalid field"); \ + } + +#define RETURN_NOT_BOOL(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsBool()) { \ + return Status::Invalid("invalid field"); \ + } + +#define RETURN_NOT_INT(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsInt()) { \ + return Status::Invalid("invalid field"); \ + } + +#define RETURN_NOT_ARRAY(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsArray()) { \ + return Status::Invalid("invalid field"); \ + } + +#define RETURN_NOT_OBJECT(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsObject()) { \ + return Status::Invalid("invalid field"); \ + } + +class JsonSchemaReader { + public: + explicit JsonSchemaReader(const rj::Value& json_schema) + : json_schema_(json_schema) {} + + Status GetSchema(std::shared_ptr* schema) { + const auto& obj_schema = json_schema_.GetObject(); + 
+ const auto& json_fields = obj_schema.FindMember("fields"); + RETURN_NOT_ARRAY(json_fields, obj_schema); + + std::vector> fields; + RETURN_NOT_OK(GetFieldsFromArray(json_fields->value, &fields)); + + *schema = std::make_shared(fields); + return Status::OK(); + } + + Status GetFieldsFromArray(const rj::Value& obj, std::vector>* fields) { + const auto& values = obj.GetArray(); + + fields->resize(values.Size()); + for (size_t i = 0; i < fields->size(); ++i) { + RETURN_NOT_OK(GetField(values[i], &(*fields)[i])); + } + return Status::OK(); + } + + Status GetField(const rj::Value& obj, std::shared_ptr* field) { + if (!obj.IsObject()) { + return Status::Invalid("Field was not a JSON object"); + } + const auto& json_field = obj.GetObject(); + + const auto& json_name = json_field.FindMember("name"); + RETURN_NOT_STRING(json_name, json_field); + + const auto& json_nullable = json_field.FindMember("nullable"); + RETURN_NOT_BOOL(json_nullable, json_field); + + const auto& json_type = json_field.FindMember("type"); + RETURN_NOT_OBJECT(json_type, json_field); + + const auto& json_children = json_field.FindMember("children"); + RETURN_NOT_ARRAY(json_children, json_field); + + std::vector> children; + RETURN_NOT_OK(GetFieldsFromArray(json_children->value, &children)); + + std::shared_ptr type; + RETURN_NOT_OK(GetType(json_type->value, children, &type)); + + *field = std::make_shared(json_name->value.GetString(), type, + json_nullable->value.GetBool()); + return Status::OK(); + } + + Status GetInteger(const rj::Value& obj, std::shared_ptr* type) { + const auto& json_type = obj.GetObject(); + + const auto& json_bit_width = json_type.FindMember("bidWidth"); + RETURN_NOT_INT(json_bit_width, json_type); + + const auto& json_is_signed = json_type.FindMember("isSigned"); + RETURN_NOT_BOOL(json_is_signed, json_type); + + bool is_signed = json_is_signed->value.GetBool(); + int bit_width = json_bit_width->value.GetInt(); + + switch (bit_width) { + case 8: + if (is_signed) { + *type = 
std::make_shared(); + } else { + *type = std::make_shared(); + } + break; + case 16: + if (is_signed) { + *type = std::make_shared(); + } else { + *type = std::make_shared(); + } + break; + case 32: + if (is_signed) { + *type = std::make_shared(); + } else { + *type = std::make_shared(); + } + break; + case 64: + if (is_signed) { + *type = std::make_shared(); + } else { + *type = std::make_shared(); + } + break; + default: + std::stringstream ss; + ss << "Invalid bit width: " << bit_width; + return Status::Invalid(ss.str()); + } + return Status::OK(); + } + + Status GetFloatingPoint(const rj::Value& obj, std::shared_ptr* type) { + const auto& json_type = obj.GetObject(); + + const auto& json_precision = json_type.FindMember("precision"); + RETURN_NOT_STRING(json_precision, json_type); + + std::string precision = json_precision->value.GetString(); + + if (precision == "DOUBLE") { + *type = std::make_shared(); + } else if (precision == "FLOAT") { + *type = std::make_shared(); + } else if (precision == "HALF") { + *type = std::make_shared(); + } else { + std::stringstream ss; + ss << "Invalid precision: " << precision; + return Status::Invalid(ss.str()); + } + return Status::OK(); + } + + Status GetType(const rj::Value& obj, + const std::vector>& children, std::shared_ptr* type) { + const auto& json_type = obj.GetObject(); + + const auto& json_type_name = json_type.FindMember("name"); + RETURN_NOT_STRING(json_type_name, json_type); + + std::string type_name = json_type_name->value.GetString(); + + if (type_name == "int") { + return GetInteger(obj, type); + } else if (type_name == "floatingpoint") { + return GetFloatingPoint(obj, type); + } else if (type_name == "bool") { + *type = std::make_shared(); + } else if (type_name == "utf8") { + *type = std::make_shared(); + } else if (type_name == "binary") { + *type = std::make_shared(); + } else if (type_name == "null") { + *type = std::make_shared(); + } else if (type_name == "list") { + *type = 
std::make_shared(children[0]); + } else if (type_name == "struct") { + *type = std::make_shared(children); + } else { + return Status::NotImplemented(type_name); + } + return Status::OK(); + } + + private: + const rj::Value& json_schema_; +}; + +class JsonArrayReader { + public: + explicit JsonArrayReader(const rj::Value& json_array) + : json_array_(json_array) {} + + Status GetArray(std::shared_ptr* array) { + if (!json_array_.IsObject()) { + return Status::Invalid("Array was not a JSON object"); + } + + return Status::OK(); + } + + private: + const rj::Value& json_array_; +}; + +Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { + JsonSchemaWriter converter(json_writer); + for (const std::shared_ptr& field : schema.fields()) { + RETURN_NOT_OK(converter.VisitField(*field.get())); + } + return Status::OK(); +} + +Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr* schema) { + JsonSchemaReader converter(json_schema); + return converter.GetSchema(schema); +} + +Status ReadJsonArray(const rj::Value& json_array, std::shared_ptr* array) { + JsonArrayReader converter(json_array); + return converter.GetArray(array); +} + +} // namespace ipc +} // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h new file mode 100644 index 00000000000..576d98c2080 --- /dev/null +++ b/cpp/src/arrow/ipc/json-internal.h @@ -0,0 +1,54 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +// Implement Arrow JSON serialization format + +#ifndef ARROW_IPC_JSON_H +#define ARROW_IPC_JSON_H + +#define RAPIDJSON_HAS_STDSTRING 1 +#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 +#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 + +#include + +#include "rapidjson/document.h" +#include "rapidjson/stringbuffer.h" +#include "rapidjson/writer.h" + +#include "arrow/type_fwd.h" + +namespace rj = rapidjson; +using RjWriter = rj::Writer; + +namespace arrow { + +class Array; +class Schema; + +namespace ipc { + +Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer); +Status WriteJsonArray(const Array& array, RjWriter* json_writer); + +Status ReadJsonSchema(const rj::Value& json_arr, std::shared_ptr* schema); +Status ReadJsonArray(const rj::Value& json_obj, std::shared_ptr* schema); + +} // namespace ipc +} // namespace arrow + +#endif // ARROW_IPC_FILE_H diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 957253afd2f..afe1e75d827 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/ipc/json.h" +#include "arrow/ipc/json-internal.h" #define RAPIDJSON_HAS_STDSTRING 1 #define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 @@ -32,252 +32,5 @@ namespace ipc { namespace rj = rapidjson; -enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; - -static std::string GetBufferTypeName(BufferType type) { - switch (type) { - case BufferType::DATA: - return "DATA"; - case BufferType::OFFSET: - return "OFFSET"; - case BufferType::TYPE: - return "TYPE"; - case BufferType::VALIDITY: - return "VALIDITY"; - default: - break; - }; - return "UNKNOWN"; -} - -class BufferLayout { - public: - BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} - - BufferType type() const { return type_; } - int bit_width() const { return bit_width_; } - - private: - BufferType type_; - int bit_width_; -}; - -static const BufferLayout kValidityBuffer(BufferType::VALIDITY, 1); -static const BufferLayout kOffsetBuffer(BufferType::OFFSET, 32); -static const BufferLayout kTypeBuffer(BufferType::TYPE, 32); -static const BufferLayout kBooleanBuffer(BufferType::DATA, 1); -static const BufferLayout kValues64(BufferType::DATA, 64); -static const BufferLayout kValues32(BufferType::DATA, 32); -static const BufferLayout kValues16(BufferType::DATA, 16); -static const BufferLayout kValues8(BufferType::DATA, 8); - -class JsonSchemaWriter : public TypeVisitor { - public: - explicit JsonSchemaWriter(rj::Writer* writer) : writer_(writer) {} - - void Start() { - writer_->Key("schema"); - writer_->StartArray(); - } - - void Finish() { writer_->EndArray(); } - - Status VisitField(const Field& field) { - writer_->StartObject(); - - writer_->Key("name"); - writer_->String(field.name.c_str()); - - writer_->Key("nullable"); - writer_->Bool(field.nullable); - - // Visit the type - RETURN_NOT_OK(field.type->Accept(this)); - - writer_->EndObject(); - - return Status::OK(); - } - - template - void WritePrimitive(const T& type, const std::vector& buffer_layout) { - 
WriteName(type); - SetNoChildren(); - WriteBufferLayout(buffer_layout); - } - - template - void WriteVarBytes(const T& type) { - WriteName(type); - SetNoChildren(); - WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); - } - - void WriteBufferLayout(const std::vector& buffer_layout) { - writer_->Key("typeLayout"); - writer_->StartArray(); - - for (const BufferLayout& buffer : buffer_layout) { - writer_->StartObject(); - writer_->Key("type"); - writer_->String(GetBufferTypeName(buffer.type())); - - writer_->Key("typeBitWidth"); - writer_->Int(buffer.bit_width()); - - writer_->EndObject(); - } - writer_->EndArray(); - } - - Status WriteChildren(const std::vector>& children) { - for (const std::shared_ptr& field : children) { - RETURN_NOT_OK(VisitField(*field.get())); - } - return Status::OK(); - } - - void SetNoChildren() { - writer_->Key("children"); - writer_->StartArray(); - writer_->EndArray(); - } - - template - void WriteName(const T& type) { - writer_->Key("type"); - writer_->String(T::NAME); - } - - Status Visit(const NullType& type) override { - WritePrimitive(type, {}); - return Status::OK(); - } - - Status Visit(const BooleanType& type) override { - WritePrimitive(type, {kValidityBuffer, kBooleanBuffer}); - return Status::OK(); - } - - Status Visit(const Int8Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues8}); - return Status::OK(); - } - - Status Visit(const Int16Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues16}); - return Status::OK(); - } - - Status Visit(const Int32Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); - return Status::OK(); - } - - Status Visit(const Int64Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const UInt8Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues8}); - return Status::OK(); - } - - Status Visit(const UInt16Type& type) override { - 
WritePrimitive(type, {kValidityBuffer, kValues16}); - return Status::OK(); - } - - Status Visit(const UInt32Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); - return Status::OK(); - } - - Status Visit(const UInt64Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const HalfFloatType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues16}); - return Status::OK(); - } - - Status Visit(const FloatType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); - return Status::OK(); - } - - Status Visit(const DoubleType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const StringType& type) override { - WriteVarBytes(type); - return Status::OK(); - } - - Status Visit(const BinaryType& type) override { - WriteVarBytes(type); - return Status::OK(); - } - - Status Visit(const DateType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const TimeType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const TimestampType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); - return Status::OK(); - } - - Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } - - Status Visit(const ListType& type) override { - WriteName(type); - RETURN_NOT_OK(WriteChildren(type.children())); - WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); - return Status::OK(); - } - - Status Visit(const StructType& type) override { - WriteName(type); - WriteChildren(type.children()); - WriteBufferLayout({kValidityBuffer, kTypeBuffer}); - return Status::OK(); - } - - Status Visit(const DenseUnionType& type) override { - WriteName(type); - WriteChildren(type.children()); - WriteBufferLayout({kValidityBuffer, kTypeBuffer, 
kOffsetBuffer}); - return Status::NotImplemented("NYI"); - } - - Status Visit(const SparseUnionType& type) override { - WriteName(type); - WriteChildren(type.children()); - WriteBufferLayout({kValidityBuffer, kTypeBuffer}); - return Status::NotImplemented("NYI"); - } - - static void test() { - rj::StringBuffer s; - rj::Writer writer(s); - auto schema_writer = std::make_shared(&writer); - } - - private: - rj::Writer* writer_; -}; - } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index fec2d0f7ac9..e9adf15506b 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -20,11 +20,75 @@ #ifndef ARROW_IPC_JSON_H #define ARROW_IPC_JSON_H +#include + #include "arrow/type_fwd.h" +#include "arrow/visibility.h" namespace arrow { + +class MemoryPool; + +namespace io { + +class OutputStream; +class ReadableFileInterface; + +} // namespace io + namespace ipc { +class ARROW_EXPORT JsonWriter { + public: + static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, + std::shared_ptr* out); + + // TODO(wesm): Write dictionaries + + Status WriteRecordBatch( + const std::vector>& columns, int32_t num_rows); + + Status Close(); + + private: + JsonWriter(io::OutputStream* sink, const std::shared_ptr& schema); + + io::OutputStream* sink_; + std::shared_ptr schema_; + + // Hide RapidJSON details from public API + class JsonWriterImpl; + std::unique_ptr impl_; +}; + +class ARROW_EXPORT JsonReader { + public: + static Status Open(MemoryPool* pool, + const std::shared_ptr& file, + std::shared_ptr* reader); + + // Use the default memory pool + static Status Open(const std::shared_ptr& file, + std::shared_ptr* reader); + + std::shared_ptr schema() const; + + int num_record_batches() const; + + // Read a record batch from the file + Status GetRecordBatch(int i, std::shared_ptr* batch); + + private: + JsonReader(const std::shared_ptr& file); + + std::shared_ptr file_; + std::shared_ptr schema_; + + // Hide 
RapidJSON details from public API + class JsonReaderImpl; + std::unique_ptr impl_; +}; + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h index 4301968e015..8e9a95f1e81 100644 --- a/cpp/src/arrow/schema.h +++ b/cpp/src/arrow/schema.h @@ -39,6 +39,8 @@ class ARROW_EXPORT Schema { // Return the ith schema element. Does not boundscheck const std::shared_ptr& field(int i) const { return fields_[i]; } + const std::vector>& fields() const { return fields_; } + // Render a string representation of the schema suitable for debugging std::string ToString() const; diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 4819751794c..17648f4b6a7 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -337,7 +337,7 @@ struct ARROW_EXPORT DecimalType : public DataType { }; template -struct UnionType : public DataType { +struct ARROW_EXPORT UnionType : public DataType { std::vector child_types_; UnionType() : DataType(T) {} @@ -346,7 +346,7 @@ struct UnionType : public DataType { int num_children() const { return child_types_.size(); } }; -struct DenseUnionType : public UnionType { +struct ARROW_EXPORT DenseUnionType : public UnionType { typedef UnionType Base; explicit DenseUnionType(const std::vector& child_types) : Base() { @@ -358,7 +358,7 @@ struct DenseUnionType : public UnionType { static const std::string NAME; }; -struct SparseUnionType : public UnionType { +struct ARROW_EXPORT SparseUnionType : public UnionType { typedef UnionType Base; explicit SparseUnionType(const std::vector& child_types) : Base() { From 3b9d14e8bc3d5e63e91ccc1fd9c0d2cfcc42ca43 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 12:45:59 -0500 Subject: [PATCH 05/27] Add type-specific JSON metadata to schema writer Change-Id: I1ea61fd3ff1d480eefdc663696e784e90ac0b7b6 --- cpp/src/arrow/ipc/json-internal.cc | 167 +++++++++++++++++++++++------ cpp/src/arrow/type.cc | 25 ++++- cpp/src/arrow/type.h | 120 +++++++++++---------- 
cpp/src/arrow/type_fwd.h | 16 +-- cpp/src/arrow/types/primitive.cc | 3 +- cpp/src/arrow/types/union.cc | 23 +--- format/Metadata.md | 5 + 7 files changed, 232 insertions(+), 127 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 50989054c2d..90866e76f96 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -20,6 +20,7 @@ #include #include #include +#include #include "rapidjson/stringbuffer.h" #include "rapidjson/writer.h" @@ -105,7 +106,106 @@ class JsonSchemaWriter : public TypeVisitor { } template - void WriteTypeMetadata(const T& type) {} + typename std::enable_if::value || + std::is_base_of::value || + std::is_base_of::value, + void>::type + WriteTypeMetadata(const T& type) {} + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("bitWidth"); + writer_->Int(type.bit_width()); + writer_->Key("isSigned"); + writer_->Bool(type.is_signed()); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("precision"); + switch (type.precision()) { + case FloatingPointMeta::HALF: + writer_->String("HALF"); + break; + case FloatingPointMeta::SINGLE: + writer_->String("SINGLE"); + break; + case FloatingPointMeta::DOUBLE: + writer_->String("DOUBLE"); + break; + default: + break; + }; + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("unit"); + switch (type.unit) { + case IntervalType::Unit::YEAR_MONTH: + writer_->String("YEAR_MONTH"); + break; + case IntervalType::Unit::DAY_TIME: + writer_->String("DAY_TIME"); + break; + }; + } + + template + typename std::enable_if::value || + std::is_base_of::value, + void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("unit"); + switch (type.unit) { + case TimeUnit::SECOND: + writer_->String("SECOND"); + break; + case TimeUnit::MILLI: + 
writer_->String("MILLISECOND"); + break; + case TimeUnit::MICRO: + writer_->String("MICROSECOND"); + break; + case TimeUnit::NANO: + writer_->String("NANOSECOND"); + break; + }; + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("precision"); + writer_->Int(type.precision); + writer_->Key("scale"); + writer_->Int(type.scale); + } + + template + typename std::enable_if::value, void>::type + WriteTypeMetadata(const T& type) { + writer_->Key("mode"); + switch (type.mode) { + case UnionType::SPARSE: + writer_->String("SPARSE"); + break; + case UnionType::DENSE: + writer_->String("DENSE"); + break; + }; + + // Write type ids + writer_->Key("typeIds"); + writer_->StartArray(); + for (size_t i = 0; i < type.type_ids.size(); ++i) { + writer_->Uint(type.type_ids[i]); + } + writer_->EndArray(); + } // TODO(wesm): Other Type metadata @@ -135,7 +235,7 @@ class JsonSchemaWriter : public TypeVisitor { WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); } - void WriteBufferLayout(const std::vector& buffer_layout) { + void WriteBufferLayout(const std::vector& buffer_layout) { writer_->Key("typeLayout"); writer_->StartArray(); @@ -249,6 +349,11 @@ class JsonSchemaWriter : public TypeVisitor { return Status::OK(); } + Status Visit(const IntervalType& type) override { + WritePrimitive(type, {kValidityBuffer, kValues64}); + return Status::OK(); + } + Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } Status Visit(const ListType& type) override { @@ -265,17 +370,15 @@ class JsonSchemaWriter : public TypeVisitor { return Status::OK(); } - Status Visit(const DenseUnionType& type) override { + Status Visit(const UnionType& type) override { WriteName(type); WriteChildren(type.children()); - WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); - return Status::NotImplemented("NYI"); - } - Status Visit(const SparseUnionType& type) override { - WriteName(type); - 
WriteChildren(type.children()); - WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + if (type.mode == UnionType::SPARSE) { + WriteBufferLayout({kValidityBuffer, kTypeBuffer}); + } else { + WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); + } return Status::NotImplemented("NYI"); } @@ -283,14 +386,14 @@ class JsonSchemaWriter : public TypeVisitor { RjWriter* writer_; }; -#define RETURN_NOT_STRING(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsString()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_STRING(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsString()) { \ + return Status::Invalid("invalid field"); \ } -#define RETURN_NOT_BOOL(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsBool()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_BOOL(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsBool()) { \ + return Status::Invalid("invalid field"); \ } #define RETURN_NOT_INT(NAME, PARENT) \ @@ -298,20 +401,19 @@ class JsonSchemaWriter : public TypeVisitor { return Status::Invalid("invalid field"); \ } -#define RETURN_NOT_ARRAY(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsArray()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_ARRAY(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsArray()) { \ + return Status::Invalid("invalid field"); \ } -#define RETURN_NOT_OBJECT(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsObject()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_OBJECT(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd() || !NAME->value.IsObject()) { \ + return Status::Invalid("invalid field"); \ } class JsonSchemaReader { public: - explicit JsonSchemaReader(const rj::Value& json_schema) - : json_schema_(json_schema) {} + explicit JsonSchemaReader(const rj::Value& json_schema) : json_schema_(json_schema) {} Status 
GetSchema(std::shared_ptr* schema) { const auto& obj_schema = json_schema_.GetObject(); @@ -326,7 +428,8 @@ class JsonSchemaReader { return Status::OK(); } - Status GetFieldsFromArray(const rj::Value& obj, std::vector>* fields) { + Status GetFieldsFromArray( + const rj::Value& obj, std::vector>* fields) { const auto& values = obj.GetArray(); fields->resize(values.Size()); @@ -337,9 +440,7 @@ class JsonSchemaReader { } Status GetField(const rj::Value& obj, std::shared_ptr* field) { - if (!obj.IsObject()) { - return Status::Invalid("Field was not a JSON object"); - } + if (!obj.IsObject()) { return Status::Invalid("Field was not a JSON object"); } const auto& json_field = obj.GetObject(); const auto& json_name = json_field.FindMember("name"); @@ -360,8 +461,8 @@ class JsonSchemaReader { std::shared_ptr type; RETURN_NOT_OK(GetType(json_type->value, children, &type)); - *field = std::make_shared(json_name->value.GetString(), type, - json_nullable->value.GetBool()); + *field = std::make_shared( + json_name->value.GetString(), type, json_nullable->value.GetBool()); return Status::OK(); } @@ -437,7 +538,8 @@ class JsonSchemaReader { } Status GetType(const rj::Value& obj, - const std::vector>& children, std::shared_ptr* type) { + const std::vector>& children, + std::shared_ptr* type) { const auto& json_type = obj.GetObject(); const auto& json_type_name = json_type.FindMember("name"); @@ -473,8 +575,7 @@ class JsonSchemaReader { class JsonArrayReader { public: - explicit JsonArrayReader(const rj::Value& json_array) - : json_array_(json_array) {} + explicit JsonArrayReader(const rj::Value& json_array) : json_array_(json_array) {} Status GetArray(std::shared_ptr* array) { if (!json_array_.IsObject()) { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 6d2c7e02217..f6b4dbbee7e 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -84,6 +84,23 @@ std::string StructType::ToString() const { return s.str(); } +std::string UnionType::ToString() const { + 
std::stringstream s; + + if (mode == UnionType::SPARSE) { + s << "union[sparse]<"; + } else { + s << "union[dense]<"; + } + + for (size_t i = 0; i < child_types.size(); ++i) { + if (i) { s << ", "; } + s << child_types[i]->ToString(); + } + s << ">"; + return s.str(); +} + // Visitors and template instantiation #define ACCEPT_VISITOR(TYPE) \ @@ -95,11 +112,11 @@ ACCEPT_VISITOR(StringType); ACCEPT_VISITOR(ListType); ACCEPT_VISITOR(StructType); ACCEPT_VISITOR(DecimalType); -ACCEPT_VISITOR(SparseUnionType); -ACCEPT_VISITOR(DenseUnionType); +ACCEPT_VISITOR(UnionType); ACCEPT_VISITOR(DateType); ACCEPT_VISITOR(TimeType); ACCEPT_VISITOR(TimestampType); +ACCEPT_VISITOR(IntervalType); const std::string NullType::NAME = "null"; const std::string UInt8Type::NAME = "uint8"; @@ -120,9 +137,9 @@ const std::string DecimalType::NAME = "decimal"; const std::string DateType::NAME = "decimal"; const std::string TimeType::NAME = "time"; const std::string TimestampType::NAME = "timestamp"; +const std::string IntervalType::NAME = "interval"; const std::string ListType::NAME = "list"; const std::string StructType::NAME = "struct"; -const std::string DenseUnionType::NAME = "union"; -const std::string SparseUnionType::NAME = "union"; +const std::string UnionType::NAME = "union"; } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 17648f4b6a7..9b6ba12936b 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -80,12 +80,15 @@ struct Type { // Exact time encoded with int64, default unit millisecond TIME = 19, + // YEAR_MONTH or DAY_TIME interval in SQL style + INTERVAL = 20, + // Precision- and scale-based decimal type. Storage type depends on the // parameters. 
- DECIMAL = 20, + DECIMAL = 21, // Decimal value encoded as a text string - DECIMAL_TEXT = 21, + DECIMAL_TEXT = 22, // A list of some logical data type LIST = 30, @@ -94,8 +97,7 @@ struct Type { STRUCT = 31, // Unions of logical types - DENSE_UNION = 32, - SPARSE_UNION = 33, + UNION = 32, }; }; @@ -124,8 +126,6 @@ struct ARROW_EXPORT DataType { int num_children() const { return children_.size(); } - virtual int bit_width() const { return -1; } - virtual Status Accept(TypeVisitor* visitor) const = 0; virtual std::string ToString() const = 0; @@ -133,6 +133,21 @@ struct ARROW_EXPORT DataType { typedef std::shared_ptr TypePtr; +struct PrimitiveMeta { + virtual int bit_width() const = 0; +}; + +struct IntegerMeta { + virtual bool is_signed() const = 0; +}; + +struct FloatingPointMeta { + enum Precision { HALF, SINGLE, DOUBLE }; + virtual Precision precision() const = 0; +}; + +struct NoExtraMeta {}; + // A field is a piece of metadata that includes (for now) a name and a data // type struct ARROW_EXPORT Field { @@ -170,7 +185,7 @@ struct ARROW_EXPORT Field { typedef std::shared_ptr FieldPtr; template -struct ARROW_EXPORT PrimitiveType : public DataType { +struct ARROW_EXPORT PrimitiveType : public DataType, public PrimitiveMeta { using c_type = C_TYPE; static constexpr Type::type type_id = TYPE_ID; @@ -185,7 +200,7 @@ struct ARROW_EXPORT PrimitiveType : public DataType { std::string ToString() const override { return std::string(DERIVED::NAME); } }; -struct ARROW_EXPORT NullType : public DataType { +struct ARROW_EXPORT NullType : public DataType, public PrimitiveMeta { static constexpr Type::type type_enum = Type::NA; NullType() : DataType(Type::NA) {} @@ -199,15 +214,6 @@ struct ARROW_EXPORT NullType : public DataType { std::string ToString() const override { return NAME; } }; -struct IntegerMeta { - virtual bool is_signed() const = 0; -}; - -struct FloatingPointMeta { - enum Precision { HALF, SINGLE, DOUBLE }; - virtual Precision precision() const = 0; -}; - template 
struct IntegerTypeImpl : public PrimitiveType, public IntegerMeta { @@ -273,7 +279,7 @@ struct ARROW_EXPORT DoubleType : public PrimitiveType& value_type) : ListType(std::make_shared("item", value_type)) {} @@ -292,7 +298,7 @@ struct ARROW_EXPORT ListType : public DataType { }; // BinaryType type is reprsents lists of 1-byte values. -struct ARROW_EXPORT BinaryType : public DataType { +struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { BinaryType() : BinaryType(Type::BINARY) {} Status Accept(TypeVisitor* visitor) const override; @@ -314,7 +320,7 @@ struct ARROW_EXPORT StringType : public BinaryType { static const std::string NAME; }; -struct ARROW_EXPORT StructType : public DataType { +struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta { explicit StructType(const std::vector>& fields) : DataType(Type::STRUCT) { children_ = fields; @@ -336,82 +342,78 @@ struct ARROW_EXPORT DecimalType : public DataType { static const std::string NAME; }; -template struct ARROW_EXPORT UnionType : public DataType { - std::vector child_types_; + enum UnionMode { SPARSE, DENSE }; - UnionType() : DataType(T) {} + UnionType(const std::vector>& child_types, + const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE) + : DataType(Type::UNION), mode(mode), child_types(child_types), type_ids(type_ids) {} - const TypePtr& child(int i) const { return child_types_[i]; } - int num_children() const { return child_types_.size(); } -}; - -struct ARROW_EXPORT DenseUnionType : public UnionType { - typedef UnionType Base; + const TypePtr& child(int i) const { return child_types[i]; } + int num_children() const { return child_types.size(); } - explicit DenseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } - - Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; + Status Accept(TypeVisitor* visitor) const override; + + UnionMode mode; + std::vector child_types; + std::vector type_ids; 
static const std::string NAME; }; -struct ARROW_EXPORT SparseUnionType : public UnionType { - typedef UnionType Base; - - explicit SparseUnionType(const std::vector& child_types) : Base() { - child_types_ = child_types; - } +struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta { + DateType() : DataType(Type::DATE) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override; + std::string ToString() const override { return NAME; } static const std::string NAME; }; -struct ARROW_EXPORT DateType : public DataType { - enum class Unit : char { DAY = 0, MONTH = 1, YEAR = 2 }; - - Unit unit; +enum class TimeUnit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; - explicit DateType(Unit unit = Unit::DAY) : DataType(Type::DATE), unit(unit) {} +struct ARROW_EXPORT TimeType : public DataType { + TimeUnit unit; - DateType(const DateType& other) : DateType(other.unit) {} + explicit TimeType(TimeUnit unit = TimeUnit::MILLI) : DataType(Type::TIME), unit(unit) {} + TimeType(const TimeType& other) : TimeType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override { return NAME; } static const std::string NAME; }; -struct ARROW_EXPORT TimeType : public DataType { - enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; +struct ARROW_EXPORT TimestampType : public DataType, public PrimitiveMeta { + typedef int64_t c_type; + static constexpr Type::type type_enum = Type::TIMESTAMP; - Unit unit; + int bit_width() const override { return sizeof(int64_t) * 8; } - explicit TimeType(Unit unit = Unit::MILLI) : DataType(Type::TIME), unit(unit) {} - TimeType(const TimeType& other) : TimeType(other.unit) {} + TimeUnit unit; + + explicit TimestampType(TimeUnit unit = TimeUnit::MILLI) + : DataType(Type::TIMESTAMP), unit(unit) {} + + TimestampType(const TimestampType& other) : TimestampType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; std::string ToString() 
const override { return NAME; } static const std::string NAME; }; -struct ARROW_EXPORT TimestampType : public DataType { - enum class Unit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; +struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { + enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 }; typedef int64_t c_type; - static constexpr Type::type type_enum = Type::TIMESTAMP; + static constexpr Type::type type_enum = Type::INTERVAL; int bit_width() const override { return sizeof(int64_t) * 8; } Unit unit; - explicit TimestampType(Unit unit = Unit::MILLI) - : DataType(Type::TIMESTAMP), unit(unit) {} + explicit IntervalType(Unit unit = Unit::YEAR_MONTH) + : DataType(Type::INTERVAL), unit(unit) {} - TimestampType(const TimestampType& other) : TimestampType(other.unit) {} - virtual ~TimestampType() {} + IntervalType(const IntervalType& other) : IntervalType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override { return NAME; } diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index ba488e79321..0ac42c6f7de 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -43,11 +43,11 @@ struct BinaryType; struct DateType; struct TimeType; struct TimestampType; +struct IntervalType; struct DecimalType; struct ListType; struct StructType; -struct DenseUnionType; -struct SparseUnionType; +struct UnionType; class TypeVisitor { public: @@ -69,11 +69,11 @@ class TypeVisitor { virtual Status Visit(const DateType& type) = 0; virtual Status Visit(const TimeType& type) = 0; virtual Status Visit(const TimestampType& type) = 0; + virtual Status Visit(const IntervalType& type) = 0; virtual Status Visit(const DecimalType& type) = 0; virtual Status Visit(const ListType& type) = 0; virtual Status Visit(const StructType& type) = 0; - virtual Status Visit(const DenseUnionType& type) = 0; - virtual Status Visit(const SparseUnionType& type) = 0; + virtual Status Visit(const UnionType& 
type) = 0; }; class NullArray; @@ -83,8 +83,7 @@ class BinaryArray; class DecimalArray; class ListArray; class StructArray; -class DenseUnionArray; -class SparseUnionArray; +class UnionArray; template class NumericArray; @@ -104,6 +103,7 @@ using UInt32Array = NumericArray; using Int64Array = NumericArray; using UInt64Array = NumericArray; using TimestampArray = NumericArray; +using IntervalArray = NumericArray; class ArrayVisitor { public: @@ -125,11 +125,11 @@ class ArrayVisitor { virtual Status Visit(const DateArray& array) = 0; virtual Status Visit(const TimeArray& array) = 0; virtual Status Visit(const TimestampArray& array) = 0; + virtual Status Visit(const IntervalArray& array) = 0; virtual Status Visit(const DecimalArray& array) = 0; virtual Status Visit(const ListArray& array) = 0; virtual Status Visit(const StructArray& array) = 0; - virtual Status Visit(const DenseUnionArray& array) = 0; - virtual Status Visit(const SparseUnionArray& array) = 0; + virtual Status Visit(const UnionArray& array) = 0; }; } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index e5c6f047746..11ad85159a4 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -48,7 +48,8 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; - int value_byte_size = type_->bit_width() / 8; + auto primitive_meta = dynamic_cast(type_.get()); + int value_byte_size = primitive_meta->bit_width() / 8; DCHECK_GT(value_byte_size, 0); for (int i = 0; i < length_; ++i) { diff --git a/cpp/src/arrow/types/union.cc b/cpp/src/arrow/types/union.cc index c891b4a5357..cc2934b2e4a 100644 --- a/cpp/src/arrow/types/union.cc +++ b/cpp/src/arrow/types/union.cc @@ -24,25 +24,4 @@ #include "arrow/type.h" -namespace arrow { - -static inline std::string format_union(const std::vector& child_types) { - std::stringstream s; - s << "union<"; - 
for (size_t i = 0; i < child_types.size(); ++i) { - if (i) { s << ", "; } - s << child_types[i]->ToString(); - } - s << ">"; - return s.str(); -} - -std::string DenseUnionType::ToString() const { - return format_union(child_types_); -} - -std::string SparseUnionType::ToString() const { - return format_union(child_types_); -} - -} // namespace arrow +namespace arrow {} // namespace arrow diff --git a/format/Metadata.md b/format/Metadata.md index 653a4c73e83..a4878f34707 100644 --- a/format/Metadata.md +++ b/format/Metadata.md @@ -98,6 +98,11 @@ Union: "typeIds" : [ /* integer */ ] } ``` + +The `typeIds` field in the Union lists the codes used to denote each type, which +may be different from the index of the child array. This is so that the union +type ids do not have to be enumerated from 0. + Int: ``` { From 86c95590d0236ae8ffa7985c2195c0b884125cbd Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 15:42:56 -0500 Subject: [PATCH 06/27] Add convenience factory functions for common types Change-Id: Ie2107cee9b85c79122506ca81701865a7063b691 --- cpp/CMakeLists.txt | 1 + cpp/src/arrow/factory.cc | 45 +++++++++++++++++++++++++++++ cpp/src/arrow/factory.h | 46 ++++++++++++++++++++++++++++++ cpp/src/arrow/ipc/ipc-json-test.cc | 5 ++++ cpp/src/arrow/ipc/json-internal.cc | 22 +++++++------- cpp/src/arrow/schema-test.cc | 32 ++++++++++----------- 6 files changed, 124 insertions(+), 27 deletions(-) create mode 100644 cpp/src/arrow/factory.cc create mode 100644 cpp/src/arrow/factory.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0bff7528578..949cc551626 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -694,6 +694,7 @@ set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc src/arrow/column.cc + src/arrow/factory.cc src/arrow/schema.cc src/arrow/table.cc src/arrow/type.cc diff --git a/cpp/src/arrow/factory.cc b/cpp/src/arrow/factory.cc new file mode 100644 index 00000000000 --- /dev/null +++ b/cpp/src/arrow/factory.cc @@ -0,0
+1,45 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "arrow/factory.h" + +#include + +#include "arrow/type.h" + +namespace arrow { + +#define TYPE_FACTORY(NAME, KLASS) \ + std::shared_ptr NAME() { \ + static std::shared_ptr result = std::make_shared(); \ + return result; \ + } + +TYPE_FACTORY(int8, Int8Type); +TYPE_FACTORY(uint8, UInt8Type); +TYPE_FACTORY(int16, Int16Type); +TYPE_FACTORY(uint16, UInt16Type); +TYPE_FACTORY(int32, Int32Type); +TYPE_FACTORY(uint32, UInt32Type); +TYPE_FACTORY(int64, Int64Type); +TYPE_FACTORY(uint64, UInt64Type); +TYPE_FACTORY(float32, FloatType); +TYPE_FACTORY(float64, DoubleType); +TYPE_FACTORY(utf8, StringType); +TYPE_FACTORY(binary, BinaryType); + +} // namespace arrow diff --git a/cpp/src/arrow/factory.h b/cpp/src/arrow/factory.h new file mode 100644 index 00000000000..8a18b8bc2c4 --- /dev/null +++ b/cpp/src/arrow/factory.h @@ -0,0 +1,46 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#ifndef ARROW_FACTORY_H +#define ARROW_FACTORY_H + +#include +#include +#include +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" + +namespace arrow { + +std::shared_ptr int8(); +std::shared_ptr int16(); +std::shared_ptr int32(); +std::shared_ptr int64(); +std::shared_ptr uint8(); +std::shared_ptr uint16(); +std::shared_ptr uint32(); +std::shared_ptr uint64(); +std::shared_ptr float32(); +std::shared_ptr float64(); +std::shared_ptr utf8(); +std::shared_ptr binary(); + +} // namespace arrow + +#endif // ARROW_FACTORY_H diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 11da3567b1c..768025c2821 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -48,6 +48,11 @@ class TestJsonSchemaWriter : public ::testing::Test { public: void SetUp() {} void TearDown() {} + + void TestRoundTrip(const Schema& schema) { + + + } }; } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 90866e76f96..518e5eb412e 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -73,15 +73,19 @@ static const BufferLayout kValues8(BufferType::DATA, 8); class JsonSchemaWriter : public TypeVisitor { public: - explicit JsonSchemaWriter(RjWriter* writer) : writer_(writer) {} + explicit JsonSchemaWriter(const Schema& schema, 
RjWriter* writer) + : schema_(schema), writer_(writer) {} - void Start() { - writer_->Key("schema"); + Status Write() { + writer_->Key("fields"); writer_->StartArray(); + for (const std::shared_ptr& field : schema_.fields()) { + RETURN_NOT_OK(VisitField(*field.get())); + } + writer_->EndArray(); + return Status::OK(); } - void Finish() { writer_->EndArray(); } - Status VisitField(const Field& field) { writer_->StartObject(); @@ -383,6 +387,7 @@ class JsonSchemaWriter : public TypeVisitor { } private: + const Schema& schema_; RjWriter* writer_; }; @@ -590,11 +595,8 @@ class JsonArrayReader { }; Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { - JsonSchemaWriter converter(json_writer); - for (const std::shared_ptr& field : schema.fields()) { - RETURN_NOT_OK(converter.VisitField(*field.get())); - } - return Status::OK(); + JsonSchemaWriter converter(schema, json_writer); + return converter.Write(); } Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr* schema) { diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc index 8cc80be120a..2398c838e5b 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" #include "arrow/schema.h" +#include "arrow/factory.h" #include "arrow/type.h" using std::shared_ptr; @@ -29,23 +30,21 @@ using std::vector; namespace arrow { -const auto INT32 = std::make_shared(); - TEST(TestField, Basics) { - Field f0("f0", INT32); - Field f0_nn("f0", INT32, false); + Field f0("f0", int32()); + Field f0_nn("f0", int32(), false); ASSERT_EQ(f0.name, "f0"); - ASSERT_EQ(f0.type->ToString(), INT32->ToString()); + ASSERT_EQ(f0.type->ToString(), int32()->ToString()); ASSERT_TRUE(f0.nullable); ASSERT_FALSE(f0_nn.nullable); } TEST(TestField, Equals) { - Field f0("f0", INT32); - Field f0_nn("f0", INT32, false); - Field f0_other("f0", INT32); + Field f0("f0", int32()); + Field f0_nn("f0", int32(), false); + Field f0_other("f0", int32()); 
ASSERT_EQ(f0, f0_other); ASSERT_NE(f0, f0_nn); @@ -57,11 +56,11 @@ class TestSchema : public ::testing::Test { }; TEST_F(TestSchema, Basics) { - auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(), false); - auto f1_optional = std::make_shared("f1", std::make_shared()); + auto f0 = std::make_shared("f0", int32()); + auto f1 = std::make_shared("f1", uint8(), false); + auto f1_optional = std::make_shared("f1", uint8()); - auto f2 = std::make_shared("f2", std::make_shared()); + auto f2 = std::make_shared("f2", utf8()); vector> fields = {f0, f1, f2}; auto schema = std::make_shared(fields); @@ -83,11 +82,10 @@ TEST_F(TestSchema, Basics) { } TEST_F(TestSchema, ToString) { - auto f0 = std::make_shared("f0", INT32); - auto f1 = std::make_shared("f1", std::make_shared(), false); - auto f2 = std::make_shared("f2", std::make_shared()); - auto f3 = std::make_shared( - "f3", std::make_shared(std::make_shared())); + auto f0 = std::make_shared("f0", int32()); + auto f1 = std::make_shared("f1", uint8(), false); + auto f2 = std::make_shared("f2", utf8()); + auto f3 = std::make_shared("f3", std::make_shared(int16())); vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); From 1c082332cdd85cb0d171ea68b765b325ea4edf02 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 17:49:03 -0500 Subject: [PATCH 07/27] JSON schema roundtrip passing for many types Change-Id: I70d02dcd2958217deb37296f280c0298d4f943a0 --- cpp/CMakeLists.txt | 1 - cpp/src/arrow/factory.cc | 45 ---------- cpp/src/arrow/factory.h | 46 ---------- cpp/src/arrow/ipc/ipc-json-test.cc | 41 ++++++--- cpp/src/arrow/ipc/json-internal.cc | 131 +++++++++++++++++------------ cpp/src/arrow/ipc/json-internal.h | 11 ++- cpp/src/arrow/ipc/test-common.h | 14 +-- cpp/src/arrow/schema-test.cc | 17 ++-- cpp/src/arrow/type.cc | 32 +++++++ cpp/src/arrow/type.h | 22 +++++ 10 files changed, 180 insertions(+), 180 deletions(-) delete mode 100644 
cpp/src/arrow/factory.cc delete mode 100644 cpp/src/arrow/factory.h diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 949cc551626..0bff7528578 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -694,7 +694,6 @@ set(ARROW_SRCS src/arrow/array.cc src/arrow/builder.cc src/arrow/column.cc - src/arrow/factory.cc src/arrow/schema.cc src/arrow/table.cc src/arrow/type.cc diff --git a/cpp/src/arrow/factory.cc b/cpp/src/arrow/factory.cc deleted file mode 100644 index f1c0a1b2923..00000000000 --- a/cpp/src/arrow/factory.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#include "arrow/factory.h" - -#include - -#include "arrow/type.h" - -namespace arrow { - -#define TYPE_FACTORY(NAME, KLASS) \ - std::shared_ptr NAME() { \ - static std::shared_ptr result = std::make_shared(); \ - return result; \ - } - -TYPE_FACTORY(int8, Int8Type); -TYPE_FACTORY(uint8, UInt8Type); -TYPE_FACTORY(int16, Int16Type); -TYPE_FACTORY(uint16, UInt16Type); -TYPE_FACTORY(int32, Int32Type); -TYPE_FACTORY(uint32, UInt32Type); -TYPE_FACTORY(int64, Int64Type); -TYPE_FACTORY(uint64, UInt64Type); -TYPE_FACTORY(float32, FloatType); -TYPE_FACTORY(float64, DoubleType); -TYPE_FACTORY(utf8, StringType); -TYPE_FACTORY(binary, BinaryType); - -} // namespace arrow diff --git a/cpp/src/arrow/factory.h b/cpp/src/arrow/factory.h deleted file mode 100644 index 8a18b8bc2c4..00000000000 --- a/cpp/src/arrow/factory.h +++ /dev/null @@ -1,46 +0,0 @@ -// Licensed to the Apache Software Foundation (ASF) under one -// or more contributor license agreements. See the NOTICE file -// distributed with this work for additional information -// regarding copyright ownership. The ASF licenses this file -// to you under the Apache License, Version 2.0 (the -// "License"); you may not use this file except in compliance -// with the License. You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, -// software distributed under the License is distributed on an -// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -// KIND, either express or implied. See the License for the -// specific language governing permissions and limitations -// under the License. 
- -#ifndef ARROW_FACTORY_H -#define ARROW_FACTORY_H - -#include -#include -#include -#include - -#include "arrow/type_fwd.h" -#include "arrow/util/visibility.h" - -namespace arrow { - -std::shared_ptr int8(); -std::shared_ptr int16(); -std::shared_ptr int32(); -std::shared_ptr int64(); -std::shared_ptr uint8(); -std::shared_ptr uint16(); -std::shared_ptr uint32(); -std::shared_ptr uint64(); -std::shared_ptr float32(); -std::shared_ptr float64(); -std::shared_ptr utf8(); -std::shared_ptr binary(); - -} // namespace arrow - -#endif // ARROW_FACTORY_H diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 768025c2821..8812e2ed091 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -15,29 +15,23 @@ // specific language governing permissions and limitations // under the License. +#include "arrow/ipc/json-internal.h" + #include #include #include +#include #include #include #include +#include "rapidjson/document.h" +#include "rapidjson/filewritestream.h" +#include "rapidjson/prettywriter.h" #include "gtest/gtest.h" -#include "arrow/io/memory.h" -#include "arrow/io/test-common.h" #include "arrow/ipc/adapter.h" -#include "arrow/ipc/file.h" -#include "arrow/ipc/test-common.h" -#include "arrow/ipc/util.h" - #include "arrow/test-util.h" -#include "arrow/types/list.h" -#include "arrow/types/primitive.h" -#include "arrow/types/string.h" -#include "arrow/types/struct.h" -#include "arrow/util/bit-util.h" -#include "arrow/util/buffer.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -50,10 +44,33 @@ class TestJsonSchemaWriter : public ::testing::Test { void TearDown() {} void TestRoundTrip(const Schema& schema) { + rj::StringBuffer sb; + rj::Writer writer(sb); + + ASSERT_OK(WriteJsonSchema(schema, &writer)); + rj::Document d; + d.Parse(sb.GetString()); + std::shared_ptr out; + ASSERT_OK(ReadJsonSchema(d, &out)); + + ASSERT_TRUE(schema.Equals(out)); } }; +TEST_F(TestJsonSchemaWriter, 
FlatTypes) { + std::vector> fields = { + field("f0", int8()), field("f1", int16(), false), field("f2", int32()), + field("f3", int64(), false), field("f4", uint8()), field("f5", uint16()), + field("f6", uint32()), field("f7", uint64()), field("f8", float32()), + field("f9", float64()), field("f10", utf8()), field("f11", binary()), + field("f12", list(int32())), field("f13", struct_({field("s1", int32()), + field("s2", utf8())}))}; + + Schema schema(fields); + TestRoundTrip(schema); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 518e5eb412e..4bae3e4b118 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -77,12 +77,14 @@ class JsonSchemaWriter : public TypeVisitor { : schema_(schema), writer_(writer) {} Status Write() { + writer_->StartObject(); writer_->Key("fields"); writer_->StartArray(); for (const std::shared_ptr& field : schema_.fields()) { RETURN_NOT_OK(VisitField(*field.get())); } writer_->EndArray(); + writer_->EndObject(); return Status::OK(); } @@ -97,7 +99,6 @@ class JsonSchemaWriter : public TypeVisitor { // Visit the type RETURN_NOT_OK(field.type->Accept(this)); - writer_->EndObject(); return Status::OK(); @@ -214,11 +215,11 @@ class JsonSchemaWriter : public TypeVisitor { // TODO(wesm): Other Type metadata template - void WriteName(const T& type) { + void WriteName(const std::string& typeclass, const T& type) { writer_->Key("type"); writer_->StartObject(); writer_->Key("name"); - writer_->String(T::NAME); + writer_->String(typeclass); WriteTypeMetadata(type); @@ -226,15 +227,16 @@ class JsonSchemaWriter : public TypeVisitor { } template - void WritePrimitive(const T& type, const std::vector& buffer_layout) { - WriteName(type); + void WritePrimitive(const std::string& typeclass, const T& type, + const std::vector& buffer_layout) { + WriteName(typeclass, type); SetNoChildren(); WriteBufferLayout(buffer_layout); } template - 
void WriteVarBytes(const T& type) { - WriteName(type); + void WriteVarBytes(const std::string& typeclass, const T& type) { + WriteName(typeclass, type); SetNoChildren(); WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); } @@ -257,125 +259,128 @@ class JsonSchemaWriter : public TypeVisitor { } Status WriteChildren(const std::vector>& children) { + writer_->Key("children"); + writer_->StartArray(); for (const std::shared_ptr& field : children) { RETURN_NOT_OK(VisitField(*field.get())); } + writer_->EndArray(); return Status::OK(); } Status Visit(const NullType& type) override { - WritePrimitive(type, {}); + WritePrimitive("null", type, {}); return Status::OK(); } Status Visit(const BooleanType& type) override { - WritePrimitive(type, {kValidityBuffer, kBooleanBuffer}); + WritePrimitive("bool", type, {kValidityBuffer, kBooleanBuffer}); return Status::OK(); } Status Visit(const Int8Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues8}); + WritePrimitive("int", type, {kValidityBuffer, kValues8}); return Status::OK(); } Status Visit(const Int16Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues16}); + WritePrimitive("int", type, {kValidityBuffer, kValues16}); return Status::OK(); } Status Visit(const Int32Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); + WritePrimitive("int", type, {kValidityBuffer, kValues32}); return Status::OK(); } Status Visit(const Int64Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("int", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const UInt8Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues8}); + WritePrimitive("int", type, {kValidityBuffer, kValues8}); return Status::OK(); } Status Visit(const UInt16Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues16}); + WritePrimitive("int", type, {kValidityBuffer, kValues16}); return Status::OK(); } Status 
Visit(const UInt32Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); + WritePrimitive("int", type, {kValidityBuffer, kValues32}); return Status::OK(); } Status Visit(const UInt64Type& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("int", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const HalfFloatType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues16}); + WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues16}); return Status::OK(); } Status Visit(const FloatType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues32}); + WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues32}); return Status::OK(); } Status Visit(const DoubleType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const StringType& type) override { - WriteVarBytes(type); + WriteVarBytes("utf8", type); return Status::OK(); } Status Visit(const BinaryType& type) override { - WriteVarBytes(type); + WriteVarBytes("binary", type); return Status::OK(); } Status Visit(const DateType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("date", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const TimeType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("time", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const TimestampType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("timestamp", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const IntervalType& type) override { - WritePrimitive(type, {kValidityBuffer, kValues64}); + WritePrimitive("interval", type, {kValidityBuffer, kValues64}); return Status::OK(); } Status Visit(const DecimalType& 
type) override { return Status::NotImplemented("NYI"); } Status Visit(const ListType& type) override { - WriteName(type); + WriteName("list", type); RETURN_NOT_OK(WriteChildren(type.children())); WriteBufferLayout({kValidityBuffer, kOffsetBuffer}); return Status::OK(); } Status Visit(const StructType& type) override { - WriteName(type); + WriteName("struct", type); WriteChildren(type.children()); WriteBufferLayout({kValidityBuffer, kTypeBuffer}); return Status::OK(); } Status Visit(const UnionType& type) override { - WriteName(type); + WriteName("union", type); WriteChildren(type.children()); if (type.mode == UnionType::SPARSE) { @@ -391,29 +396,51 @@ class JsonSchemaWriter : public TypeVisitor { RjWriter* writer_; }; -#define RETURN_NOT_STRING(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsString()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_FOUND(NAME, PARENT) \ + if (NAME == PARENT.MemberEnd()) { \ + std::stringstream ss; \ + ss << "field not found"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_STRING(NAME, PARENT) \ + RETURN_NOT_FOUND(NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_BOOL(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsBool()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_BOOL(NAME, PARENT) \ + RETURN_NOT_FOUND(NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_INT(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsInt()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_INT(NAME, PARENT) \ + RETURN_NOT_FOUND(NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int"; \ + return Status::Invalid(ss.str()); \ } 
-#define RETURN_NOT_ARRAY(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsArray()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_ARRAY(NAME, PARENT) \ + RETURN_NOT_FOUND(NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an array"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_OBJECT(NAME, PARENT) \ - if (NAME == PARENT.MemberEnd() || !NAME->value.IsObject()) { \ - return Status::Invalid("invalid field"); \ +#define RETURN_NOT_OBJECT(NAME, PARENT) \ + RETURN_NOT_FOUND(NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object"; \ + return Status::Invalid(ss.str()); \ } class JsonSchemaReader { @@ -474,7 +501,7 @@ class JsonSchemaReader { Status GetInteger(const rj::Value& obj, std::shared_ptr* type) { const auto& json_type = obj.GetObject(); - const auto& json_bit_width = json_type.FindMember("bidWidth"); + const auto& json_bit_width = json_type.FindMember("bitWidth"); RETURN_NOT_INT(json_bit_width, json_type); const auto& json_is_signed = json_type.FindMember("isSigned"); @@ -485,18 +512,10 @@ class JsonSchemaReader { switch (bit_width) { case 8: - if (is_signed) { - *type = std::make_shared(); - } else { - *type = std::make_shared(); - } + *type = is_signed ? int8() : uint8(); break; case 16: - if (is_signed) { - *type = std::make_shared(); - } else { - *type = std::make_shared(); - } + *type = is_signed ? 
int16() : uint16(); break; case 32: if (is_signed) { @@ -530,7 +549,7 @@ class JsonSchemaReader { if (precision == "DOUBLE") { *type = std::make_shared(); - } else if (precision == "FLOAT") { + } else if (precision == "SINGLE") { *type = std::make_shared(); } else if (precision == "HALF") { *type = std::make_shared(); diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 576d98c2080..7fd50113161 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -31,6 +31,7 @@ #include "rapidjson/writer.h" #include "arrow/type_fwd.h" +#include "arrow/util/visibility.h" namespace rj = rapidjson; using RjWriter = rj::Writer; @@ -42,11 +43,13 @@ class Schema; namespace ipc { -Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer); -Status WriteJsonArray(const Array& array, RjWriter* json_writer); +Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer); +Status ARROW_EXPORT WriteJsonArray(const Array& array, RjWriter* json_writer); -Status ReadJsonSchema(const rj::Value& json_arr, std::shared_ptr* schema); -Status ReadJsonArray(const rj::Value& json_obj, std::shared_ptr* schema); +Status ARROW_EXPORT ReadJsonSchema( + const rj::Value& json_arr, std::shared_ptr* schema); +Status ARROW_EXPORT ReadJsonArray( + const rj::Value& json_obj, std::shared_ptr* schema); } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/test-common.h b/cpp/src/arrow/ipc/test-common.h index 784e238e977..9abc20d876d 100644 --- a/cpp/src/arrow/ipc/test-common.h +++ b/cpp/src/arrow/ipc/test-common.h @@ -27,6 +27,7 @@ #include "arrow/array.h" #include "arrow/table.h" #include "arrow/test-util.h" +#include "arrow/type.h" #include "arrow/types/list.h" #include "arrow/types/primitive.h" #include "arrow/types/string.h" @@ -39,15 +40,14 @@ namespace arrow { namespace ipc { const auto kInt32 = std::make_shared(); -const auto kListInt32 = std::make_shared(kInt32); -const auto kListListInt32 = 
std::make_shared(kListInt32); +const auto kListInt32 = list(kInt32); +const auto kListListInt32 = list(kListInt32); Status MakeRandomInt32Array( int32_t length, bool include_nulls, MemoryPool* pool, std::shared_ptr* out) { std::shared_ptr data; test::MakeRandomInt32PoolBuffer(length, pool, &data); - const auto kInt32 = std::make_shared(); - Int32Builder builder(pool, kInt32); + Int32Builder builder(pool, int32()); if (include_nulls) { std::shared_ptr valid_bytes; test::MakeRandomBytePoolBuffer(length, pool, &valid_bytes); @@ -134,8 +134,8 @@ Status MakeRandomBinaryArray( Status MakeStringTypesRecordBatch(std::shared_ptr* out) { const int32_t length = 500; - auto string_type = std::make_shared(); - auto binary_type = std::make_shared(); + auto string_type = utf8(); + auto binary_type = binary(); auto f0 = std::make_shared("f0", string_type); auto f1 = std::make_shared("f1", binary_type); std::shared_ptr schema(new Schema({f0, f1})); @@ -233,7 +233,7 @@ Status MakeDeeplyNestedList(std::shared_ptr* out) { const bool include_nulls = true; RETURN_NOT_OK(MakeRandomInt32Array(1000, include_nulls, pool, &array)); for (int i = 0; i < 63; ++i) { - type = std::static_pointer_cast(std::make_shared(type)); + type = std::static_pointer_cast(list(type)); RETURN_NOT_OK(MakeRandomListArray(array, batch_length, include_nulls, pool, &array)); } diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc index 2398c838e5b..dcfc9fd5ff7 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -22,7 +22,6 @@ #include "gtest/gtest.h" #include "arrow/schema.h" -#include "arrow/factory.h" #include "arrow/type.h" using std::shared_ptr; @@ -56,11 +55,11 @@ class TestSchema : public ::testing::Test { }; TEST_F(TestSchema, Basics) { - auto f0 = std::make_shared("f0", int32()); - auto f1 = std::make_shared("f1", uint8(), false); - auto f1_optional = std::make_shared("f1", uint8()); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + 
auto f1_optional = field("f1", uint8()); - auto f2 = std::make_shared("f2", utf8()); + auto f2 = field("f2", utf8()); vector> fields = {f0, f1, f2}; auto schema = std::make_shared(fields); @@ -82,10 +81,10 @@ TEST_F(TestSchema, Basics) { } TEST_F(TestSchema, ToString) { - auto f0 = std::make_shared("f0", int32()); - auto f1 = std::make_shared("f1", uint8(), false); - auto f2 = std::make_shared("f2", utf8()); - auto f3 = std::make_shared("f3", std::make_shared(int16())); + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", list(int16())); vector> fields = {f0, f1, f2, f3}; auto schema = std::make_shared(fields); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index f6b4dbbee7e..d89d69aabdc 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -142,4 +142,36 @@ const std::string ListType::NAME = "list"; const std::string StructType::NAME = "struct"; const std::string UnionType::NAME = "union"; +#define TYPE_FACTORY(NAME, KLASS) \ + std::shared_ptr NAME() { \ + static std::shared_ptr result = std::make_shared(); \ + return result; \ + } + +TYPE_FACTORY(int8, Int8Type); +TYPE_FACTORY(uint8, UInt8Type); +TYPE_FACTORY(int16, Int16Type); +TYPE_FACTORY(uint16, UInt16Type); +TYPE_FACTORY(int32, Int32Type); +TYPE_FACTORY(uint32, UInt32Type); +TYPE_FACTORY(int64, Int64Type); +TYPE_FACTORY(uint64, UInt64Type); +TYPE_FACTORY(float32, FloatType); +TYPE_FACTORY(float64, DoubleType); +TYPE_FACTORY(utf8, StringType); +TYPE_FACTORY(binary, BinaryType); + +std::shared_ptr list(const std::shared_ptr& value_type) { + return std::make_shared(value_type); +} + +std::shared_ptr struct_(const std::vector>& fields) { + return std::make_shared(fields); +} + +std::shared_ptr field( + const std::string& name, const TypePtr& type, bool nullable, int64_t dictionary) { + return std::make_shared(name, type, nullable, dictionary); +} + } // namespace arrow diff --git a/cpp/src/arrow/type.h 
b/cpp/src/arrow/type.h index 9b6ba12936b..bf1e6980cb9 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -424,6 +424,28 @@ struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { template struct TypeTraits {}; +// Factory functions + +std::shared_ptr ARROW_EXPORT int8(); +std::shared_ptr ARROW_EXPORT int16(); +std::shared_ptr ARROW_EXPORT int32(); +std::shared_ptr ARROW_EXPORT int64(); +std::shared_ptr ARROW_EXPORT uint8(); +std::shared_ptr ARROW_EXPORT uint16(); +std::shared_ptr ARROW_EXPORT uint32(); +std::shared_ptr ARROW_EXPORT uint64(); +std::shared_ptr ARROW_EXPORT float32(); +std::shared_ptr ARROW_EXPORT float64(); +std::shared_ptr ARROW_EXPORT utf8(); +std::shared_ptr ARROW_EXPORT binary(); +std::shared_ptr ARROW_EXPORT list(const std::shared_ptr& value_type); + +std::shared_ptr ARROW_EXPORT struct_( + const std::vector>& fields); + +std::shared_ptr ARROW_EXPORT field(const std::string& name, const TypePtr& type, + bool nullable = true, int64_t dictionary = 0); + } // namespace arrow #endif // ARROW_TYPE_H From 5fbea412ef675f6b7e3e8c4d536821827bf0fdb6 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 18:06:45 -0500 Subject: [PATCH 08/27] Implement some more json types and add convenience factory functions Change-Id: I1cf8aae078e76c03dcc6e6f7000ccfb6587cdc78 --- cpp/src/arrow/ipc/ipc-json-test.cc | 7 +++- cpp/src/arrow/ipc/json-internal.cc | 66 +++++++++++++++++++++--------- cpp/src/arrow/type.cc | 18 ++++++++ cpp/src/arrow/type.h | 9 ++++ 4 files changed, 79 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 8812e2ed091..bc22cb3e388 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -30,8 +30,8 @@ #include "rapidjson/prettywriter.h" #include "gtest/gtest.h" -#include "arrow/ipc/adapter.h" #include "arrow/test-util.h" +#include "arrow/type.h" #include "arrow/util/memory-pool.h" #include 
"arrow/util/status.h" @@ -66,7 +66,10 @@ TEST_F(TestJsonSchemaWriter, FlatTypes) { field("f6", uint32()), field("f7", uint64()), field("f8", float32()), field("f9", float64()), field("f10", utf8()), field("f11", binary()), field("f12", list(int32())), field("f13", struct_({field("s1", int32()), - field("s2", utf8())}))}; + field("s2", utf8())})), + field("f14", date()), field("f15", timestamp(TimeUnit::NANO)), + field("f16", timestamp(TimeUnit::MICRO)), + }; Schema schema(fields); TestRoundTrip(schema); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 4bae3e4b118..fa3c0bc3a44 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -518,18 +518,10 @@ class JsonSchemaReader { *type = is_signed ? int16() : uint16(); break; case 32: - if (is_signed) { - *type = std::make_shared(); - } else { - *type = std::make_shared(); - } + *type = is_signed ? int32() : uint32(); break; case 64: - if (is_signed) { - *type = std::make_shared(); - } else { - *type = std::make_shared(); - } + *type = is_signed ? 
int64() : uint64(); break; default: std::stringstream ss; @@ -548,11 +540,11 @@ class JsonSchemaReader { std::string precision = json_precision->value.GetString(); if (precision == "DOUBLE") { - *type = std::make_shared(); + *type = float64(); } else if (precision == "SINGLE") { - *type = std::make_shared(); + *type = float32(); } else if (precision == "HALF") { - *type = std::make_shared(); + *type = float16(); } else { std::stringstream ss; ss << "Invalid precision: " << precision; @@ -561,6 +553,36 @@ class JsonSchemaReader { return Status::OK(); } + template + Status GetTimeLike(const rj::Value& obj, std::shared_ptr* type) { + const auto& json_type = obj.GetObject(); + + const auto& json_unit = json_type.FindMember("unit"); + RETURN_NOT_STRING(json_unit, json_type); + + std::string unit_str = json_unit->value.GetString(); + + TimeUnit unit; + + if (unit_str == "SECOND") { + unit = TimeUnit::SECOND; + } else if (unit_str == "MILLISECOND") { + unit = TimeUnit::MILLI; + } else if (unit_str == "MICROSECOND") { + unit = TimeUnit::MICRO; + } else if (unit_str == "NANOSECOND") { + unit = TimeUnit::NANO; + } else { + std::stringstream ss; + ss << "Invalid time unit: " << unit_str; + return Status::Invalid(ss.str()); + } + + *type = std::make_shared(unit); + + return Status::OK(); + } + Status GetType(const rj::Value& obj, const std::vector>& children, std::shared_ptr* type) { @@ -576,17 +598,23 @@ class JsonSchemaReader { } else if (type_name == "floatingpoint") { return GetFloatingPoint(obj, type); } else if (type_name == "bool") { - *type = std::make_shared(); + *type = boolean(); } else if (type_name == "utf8") { - *type = std::make_shared(); + *type = utf8(); } else if (type_name == "binary") { - *type = std::make_shared(); + *type = binary(); } else if (type_name == "null") { - *type = std::make_shared(); + *type = null(); + } else if (type_name == "date") { + *type = date(); + } else if (type_name == "time") { + return GetTimeLike(obj, type); + } else if 
(type_name == "timestamp") { + return GetTimeLike(obj, type); } else if (type_name == "list") { - *type = std::make_shared(children[0]); + *type = list(children[0]); } else if (type_name == "struct") { - *type = std::make_shared(children); + *type = struct_(children); } else { return Status::NotImplemented(type_name); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index d89d69aabdc..7261e67111c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -148,6 +148,8 @@ const std::string UnionType::NAME = "union"; return result; \ } +TYPE_FACTORY(null, NullType); +TYPE_FACTORY(boolean, BooleanType); TYPE_FACTORY(int8, Int8Type); TYPE_FACTORY(uint8, UInt8Type); TYPE_FACTORY(int16, Int16Type); @@ -156,15 +158,31 @@ TYPE_FACTORY(int32, Int32Type); TYPE_FACTORY(uint32, UInt32Type); TYPE_FACTORY(int64, Int64Type); TYPE_FACTORY(uint64, UInt64Type); +TYPE_FACTORY(float16, HalfFloatType); TYPE_FACTORY(float32, FloatType); TYPE_FACTORY(float64, DoubleType); TYPE_FACTORY(utf8, StringType); TYPE_FACTORY(binary, BinaryType); +TYPE_FACTORY(date, DateType); + +std::shared_ptr timestamp(TimeUnit unit) { + static std::shared_ptr result = std::make_shared(); + return result; +} + +std::shared_ptr time(TimeUnit unit) { + static std::shared_ptr result = std::make_shared(); + return result; +} std::shared_ptr list(const std::shared_ptr& value_type) { return std::make_shared(value_type); } +std::shared_ptr list(const std::shared_ptr& value_field) { + return std::make_shared(value_field); +} + std::shared_ptr struct_(const std::vector>& fields) { return std::make_shared(fields); } diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index bf1e6980cb9..bf3a18fc7db 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -426,6 +426,8 @@ struct TypeTraits {}; // Factory functions +std::shared_ptr ARROW_EXPORT null(); +std::shared_ptr ARROW_EXPORT boolean(); std::shared_ptr ARROW_EXPORT int8(); std::shared_ptr ARROW_EXPORT int16(); std::shared_ptr 
ARROW_EXPORT int32(); @@ -434,12 +436,19 @@ std::shared_ptr ARROW_EXPORT uint8(); std::shared_ptr ARROW_EXPORT uint16(); std::shared_ptr ARROW_EXPORT uint32(); std::shared_ptr ARROW_EXPORT uint64(); +std::shared_ptr ARROW_EXPORT float16(); std::shared_ptr ARROW_EXPORT float32(); std::shared_ptr ARROW_EXPORT float64(); std::shared_ptr ARROW_EXPORT utf8(); std::shared_ptr ARROW_EXPORT binary(); + +std::shared_ptr ARROW_EXPORT list(const std::shared_ptr& value_type); std::shared_ptr ARROW_EXPORT list(const std::shared_ptr& value_type); +std::shared_ptr ARROW_EXPORT date(); +std::shared_ptr ARROW_EXPORT timestamp(TimeUnit unit); +std::shared_ptr ARROW_EXPORT time(TimeUnit unit); + std::shared_ptr ARROW_EXPORT struct_( const std::vector>& fields); From 379da3c8e0c1e929c84b1d93c1098abd8dbc366d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 18:31:29 -0500 Subject: [PATCH 09/27] Implement union metadata JSON serialization Change-Id: I6d34fa8cbb25ea3fdaf53bada30690a2b8dd6c1f --- cpp/src/arrow/ipc/ipc-json-test.cc | 18 ++-- cpp/src/arrow/ipc/json-internal.cc | 138 ++++++++++++++++++----------- cpp/src/arrow/type.cc | 12 ++- cpp/src/arrow/type.h | 18 ++-- 4 files changed, 114 insertions(+), 72 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index bc22cb3e388..7690f5400b4 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -60,16 +60,16 @@ class TestJsonSchemaWriter : public ::testing::Test { }; TEST_F(TestJsonSchemaWriter, FlatTypes) { - std::vector> fields = { - field("f0", int8()), field("f1", int16(), false), field("f2", int32()), - field("f3", int64(), false), field("f4", uint8()), field("f5", uint16()), - field("f6", uint32()), field("f7", uint64()), field("f8", float32()), - field("f9", float64()), field("f10", utf8()), field("f11", binary()), - field("f12", list(int32())), field("f13", struct_({field("s1", int32()), - field("s2", utf8())})), + std::vector> 
fields = {field("f0", int8()), + field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false), + field("f4", uint8()), field("f5", uint16()), field("f6", uint32()), + field("f7", uint64()), field("f8", float32()), field("f9", float64()), + field("f10", utf8()), field("f11", binary()), field("f12", list(int32())), + field("f13", struct_({field("s1", int32()), field("s2", utf8())})), field("f14", date()), field("f15", timestamp(TimeUnit::NANO)), - field("f16", timestamp(TimeUnit::MICRO)), - }; + field("f16", time(TimeUnit::MICRO)), + field("f17", union_({field("u1", int8()), field("u2", time(TimeUnit::MILLI))}, + {0, 1}, UnionMode::DENSE))}; Schema schema(fields); TestRoundTrip(schema); diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index fa3c0bc3a44..aedda331437 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -195,10 +195,10 @@ class JsonSchemaWriter : public TypeVisitor { WriteTypeMetadata(const T& type) { writer_->Key("mode"); switch (type.mode) { - case UnionType::SPARSE: + case UnionMode::SPARSE: writer_->String("SPARSE"); break; - case UnionType::DENSE: + case UnionMode::DENSE: writer_->String("DENSE"); break; }; @@ -220,9 +220,7 @@ class JsonSchemaWriter : public TypeVisitor { writer_->StartObject(); writer_->Key("name"); writer_->String(typeclass); - WriteTypeMetadata(type); - writer_->EndObject(); } @@ -383,12 +381,12 @@ class JsonSchemaWriter : public TypeVisitor { WriteName("union", type); WriteChildren(type.children()); - if (type.mode == UnionType::SPARSE) { + if (type.mode == UnionMode::SPARSE) { WriteBufferLayout({kValidityBuffer, kTypeBuffer}); } else { WriteBufferLayout({kValidityBuffer, kTypeBuffer, kOffsetBuffer}); } - return Status::NotImplemented("NYI"); + return Status::OK(); } private: @@ -396,51 +394,51 @@ class JsonSchemaWriter : public TypeVisitor { RjWriter* writer_; }; -#define RETURN_NOT_FOUND(NAME, PARENT) \ - if (NAME == 
PARENT.MemberEnd()) { \ - std::stringstream ss; \ - ss << "field not found"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == PARENT.MemberEnd()) { \ + std::stringstream ss; \ + ss << "field " << TOK << " not found"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_STRING(NAME, PARENT) \ - RETURN_NOT_FOUND(NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_BOOL(NAME, PARENT) \ - RETURN_NOT_FOUND(NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_INT(NAME, PARENT) \ - RETURN_NOT_FOUND(NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_ARRAY(NAME, PARENT) \ - RETURN_NOT_FOUND(NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an 
array"; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_OBJECT(NAME, PARENT) \ - RETURN_NOT_FOUND(NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object"; \ + return Status::Invalid(ss.str()); \ } class JsonSchemaReader { @@ -451,7 +449,7 @@ class JsonSchemaReader { const auto& obj_schema = json_schema_.GetObject(); const auto& json_fields = obj_schema.FindMember("fields"); - RETURN_NOT_ARRAY(json_fields, obj_schema); + RETURN_NOT_ARRAY("fields", json_fields, obj_schema); std::vector> fields; RETURN_NOT_OK(GetFieldsFromArray(json_fields->value, &fields)); @@ -476,16 +474,16 @@ class JsonSchemaReader { const auto& json_field = obj.GetObject(); const auto& json_name = json_field.FindMember("name"); - RETURN_NOT_STRING(json_name, json_field); + RETURN_NOT_STRING("name", json_name, json_field); const auto& json_nullable = json_field.FindMember("nullable"); - RETURN_NOT_BOOL(json_nullable, json_field); + RETURN_NOT_BOOL("nullable", json_nullable, json_field); const auto& json_type = json_field.FindMember("type"); - RETURN_NOT_OBJECT(json_type, json_field); + RETURN_NOT_OBJECT("type", json_type, json_field); const auto& json_children = json_field.FindMember("children"); - RETURN_NOT_ARRAY(json_children, json_field); + RETURN_NOT_ARRAY("children", json_children, json_field); std::vector> children; RETURN_NOT_OK(GetFieldsFromArray(json_children->value, &children)); @@ -502,10 +500,10 @@ class JsonSchemaReader { const auto& json_type = obj.GetObject(); const auto& json_bit_width = json_type.FindMember("bitWidth"); - RETURN_NOT_INT(json_bit_width, json_type); + RETURN_NOT_INT("bitWidth", json_bit_width, json_type); const auto& json_is_signed = json_type.FindMember("isSigned"); - 
RETURN_NOT_BOOL(json_is_signed, json_type); + RETURN_NOT_BOOL("isSigned", json_is_signed, json_type); bool is_signed = json_is_signed->value.GetBool(); int bit_width = json_bit_width->value.GetInt(); @@ -535,7 +533,7 @@ class JsonSchemaReader { const auto& json_type = obj.GetObject(); const auto& json_precision = json_type.FindMember("precision"); - RETURN_NOT_STRING(json_precision, json_type); + RETURN_NOT_STRING("precision", json_precision, json_type); std::string precision = json_precision->value.GetString(); @@ -558,7 +556,7 @@ class JsonSchemaReader { const auto& json_type = obj.GetObject(); const auto& json_unit = json_type.FindMember("unit"); - RETURN_NOT_STRING(json_unit, json_type); + RETURN_NOT_STRING("unit", json_unit, json_type); std::string unit_str = json_unit->value.GetString(); @@ -583,13 +581,49 @@ class JsonSchemaReader { return Status::OK(); } + Status GetUnion(const rj::Value& obj, + const std::vector>& children, + std::shared_ptr* type) { + const auto& json_type = obj.GetObject(); + + const auto& json_mode = json_type.FindMember("mode"); + RETURN_NOT_STRING("mode", json_mode, json_type); + + std::string mode_str = json_mode->value.GetString(); + UnionMode mode; + + if (mode_str == "SPARSE") { + mode = UnionMode::SPARSE; + } else if (mode_str == "DENSE") { + mode = UnionMode::DENSE; + } else { + std::stringstream ss; + ss << "Invalid union mode: " << mode_str; + ; + return Status::Invalid(ss.str()); + } + + const auto& json_type_ids = json_type.FindMember("typeIds"); + RETURN_NOT_ARRAY("typeIds", json_type_ids, json_type); + + std::vector type_ids; + const auto& id_array = json_type_ids->value.GetArray(); + for (const rj::Value& val : id_array) { + type_ids.push_back(val.GetUint()); + } + + *type = union_(children, type_ids, mode); + + return Status::OK(); + } + Status GetType(const rj::Value& obj, const std::vector>& children, std::shared_ptr* type) { const auto& json_type = obj.GetObject(); const auto& json_type_name = 
json_type.FindMember("name"); - RETURN_NOT_STRING(json_type_name, json_type); + RETURN_NOT_STRING("name", json_type_name, json_type); std::string type_name = json_type_name->value.GetString(); @@ -616,7 +650,7 @@ class JsonSchemaReader { } else if (type_name == "struct") { *type = struct_(children); } else { - return Status::NotImplemented(type_name); + return GetUnion(obj, children, type); } return Status::OK(); } diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 7261e67111c..af0d6944c97 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -87,15 +87,15 @@ std::string StructType::ToString() const { std::string UnionType::ToString() const { std::stringstream s; - if (mode == UnionType::SPARSE) { + if (mode == UnionMode::SPARSE) { s << "union[sparse]<"; } else { s << "union[dense]<"; } - for (size_t i = 0; i < child_types.size(); ++i) { + for (size_t i = 0; i < children_.size(); ++i) { if (i) { s << ", "; } - s << child_types[i]->ToString(); + s << children_[i]->ToString(); } s << ">"; return s.str(); @@ -187,6 +187,12 @@ std::shared_ptr struct_(const std::vector>& fie return std::make_shared(fields); } +std::shared_ptr ARROW_EXPORT union_( + const std::vector>& child_fields, + const std::vector& type_ids, UnionMode mode) { + return std::make_shared(child_fields, type_ids, mode); +} + std::shared_ptr field( const std::string& name, const TypePtr& type, bool nullable, int64_t dictionary) { return std::make_shared(name, type, nullable, dictionary); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index bf3a18fc7db..3e6a2b9dc1e 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -342,21 +342,19 @@ struct ARROW_EXPORT DecimalType : public DataType { static const std::string NAME; }; -struct ARROW_EXPORT UnionType : public DataType { - enum UnionMode { SPARSE, DENSE }; +enum class UnionMode : char { SPARSE, DENSE }; - UnionType(const std::vector>& child_types, +struct ARROW_EXPORT UnionType : public DataType { + 
UnionType(const std::vector>& child_fields, const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE) - : DataType(Type::UNION), mode(mode), child_types(child_types), type_ids(type_ids) {} - - const TypePtr& child(int i) const { return child_types[i]; } - int num_children() const { return child_types.size(); } + : DataType(Type::UNION), mode(mode), type_ids(type_ids) { + children_ = child_fields; + } std::string ToString() const override; Status Accept(TypeVisitor* visitor) const override; UnionMode mode; - std::vector child_types; std::vector type_ids; static const std::string NAME; }; @@ -452,6 +450,10 @@ std::shared_ptr ARROW_EXPORT time(TimeUnit unit); std::shared_ptr ARROW_EXPORT struct_( const std::vector>& fields); +std::shared_ptr ARROW_EXPORT union_( + const std::vector>& child_fields, + const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE); + std::shared_ptr ARROW_EXPORT field(const std::string& name, const TypePtr& type, bool nullable = true, int64_t dictionary = 0); From 209ba4877dd4ac03f2cdc4108b4c9068108179cf Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 10 Nov 2016 19:13:28 -0500 Subject: [PATCH 10/27] More types refactoring. 
Strange linker error in pyarrow Change-Id: I112eaf0f591b80d94fad827cbf0d1c813d30d0bc --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/type.cc | 31 +++---------- cpp/src/arrow/type.h | 74 +++++++++++++++--------------- cpp/src/arrow/types/CMakeLists.txt | 1 - 4 files changed, 46 insertions(+), 61 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index a9b2feca28c..e3e16ef0817 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -24,6 +24,7 @@ install(FILES schema.h table.h type.h + type_fwd.h test-util.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index af0d6944c97..5bc1a8d5ce3 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -101,6 +101,13 @@ std::string UnionType::ToString() const { return s.str(); } +int NullType::bit_width() const { + return 0; +} +std::string NullType::ToString() const { + return name(); +} + // Visitors and template instantiation #define ACCEPT_VISITOR(TYPE) \ @@ -118,30 +125,6 @@ ACCEPT_VISITOR(TimeType); ACCEPT_VISITOR(TimestampType); ACCEPT_VISITOR(IntervalType); -const std::string NullType::NAME = "null"; -const std::string UInt8Type::NAME = "uint8"; -const std::string Int8Type::NAME = "int8"; -const std::string UInt16Type::NAME = "uint16"; -const std::string Int16Type::NAME = "int16"; -const std::string UInt32Type::NAME = "uint32"; -const std::string Int32Type::NAME = "int32"; -const std::string UInt64Type::NAME = "uint64"; -const std::string Int64Type::NAME = "int64"; -const std::string HalfFloatType::NAME = "halffloat"; -const std::string FloatType::NAME = "float"; -const std::string DoubleType::NAME = "double"; -const std::string BooleanType::NAME = "bool"; -const std::string BinaryType::NAME = "binary"; -const std::string StringType::NAME = "utf8"; -const std::string DecimalType::NAME = "decimal"; -const std::string DateType::NAME = "decimal"; -const std::string TimeType::NAME = "time"; -const std::string 
TimestampType::NAME = "timestamp"; -const std::string IntervalType::NAME = "interval"; -const std::string ListType::NAME = "list"; -const std::string StructType::NAME = "struct"; -const std::string UnionType::NAME = "union"; - #define TYPE_FACTORY(NAME, KLASS) \ std::shared_ptr NAME() { \ static std::shared_ptr result = std::make_shared(); \ diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 3e6a2b9dc1e..86bf4264ed3 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -133,15 +133,15 @@ struct ARROW_EXPORT DataType { typedef std::shared_ptr TypePtr; -struct PrimitiveMeta { +struct ARROW_EXPORT PrimitiveMeta { virtual int bit_width() const = 0; }; -struct IntegerMeta { +struct ARROW_EXPORT IntegerMeta { virtual bool is_signed() const = 0; }; -struct FloatingPointMeta { +struct ARROW_EXPORT FloatingPointMeta { enum Precision { HALF, SINGLE, DOUBLE }; virtual Precision precision() const = 0; }; @@ -197,7 +197,7 @@ struct ARROW_EXPORT PrimitiveType : public DataType, public PrimitiveMeta { return visitor->Visit(*static_cast(this)); } - std::string ToString() const override { return std::string(DERIVED::NAME); } + std::string ToString() const override { return std::string(DERIVED::name()); } }; struct ARROW_EXPORT NullType : public DataType, public PrimitiveMeta { @@ -205,13 +205,11 @@ struct ARROW_EXPORT NullType : public DataType, public PrimitiveMeta { NullType() : DataType(Type::NA) {} - int bit_width() const override { return 0; } - + int bit_width() const override; Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; - static const std::string NAME; - - std::string ToString() const override { return NAME; } + static std::string name() { return "null"; } }; template @@ -222,61 +220,61 @@ struct IntegerTypeImpl : public PrimitiveType, struct ARROW_EXPORT BooleanType : public PrimitiveType { int bit_width() const override { return 1; } - static const std::string NAME; + static std::string name() { return 
"bool"; } }; struct ARROW_EXPORT UInt8Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "uint8"; } }; struct ARROW_EXPORT Int8Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "int8"; } }; struct ARROW_EXPORT UInt16Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "uint16"; } }; struct ARROW_EXPORT Int16Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "int16"; } }; struct ARROW_EXPORT UInt32Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "uint32"; } }; struct ARROW_EXPORT Int32Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "int32"; } }; struct ARROW_EXPORT UInt64Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "uint64"; } }; struct ARROW_EXPORT Int64Type : public IntegerTypeImpl { - static const std::string NAME; + static std::string name() { return "int64"; } }; struct ARROW_EXPORT HalfFloatType : public PrimitiveType, public FloatingPointMeta { Precision precision() const override; - static const std::string NAME; + static std::string name() { return "halffloat"; } }; struct ARROW_EXPORT FloatType : public PrimitiveType, public FloatingPointMeta { Precision precision() const override; - static const std::string NAME; + static std::string name() { return "float"; } }; struct ARROW_EXPORT DoubleType : public PrimitiveType, public FloatingPointMeta { Precision precision() const override; - static const std::string NAME; + static std::string name() { return "double"; } }; struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta { @@ -294,7 +292,8 @@ struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta { Status Accept(TypeVisitor* visitor) const override; std::string ToString() 
const override; - static const std::string NAME; + + static std::string name() { return "list"; } }; // BinaryType type is reprsents lists of 1-byte values. @@ -303,8 +302,7 @@ struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; - - static const std::string NAME; + static std::string name() { return "binary"; } protected: // Allow subclasses to change the logical type. @@ -317,7 +315,7 @@ struct ARROW_EXPORT StringType : public BinaryType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; - static const std::string NAME; + static std::string name() { return "utf8"; } }; struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta { @@ -328,7 +326,7 @@ struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; - static const std::string NAME; + static std::string name() { return "struct"; } }; struct ARROW_EXPORT DecimalType : public DataType { @@ -339,7 +337,7 @@ struct ARROW_EXPORT DecimalType : public DataType { Status Accept(TypeVisitor* visitor) const override; std::string ToString() const override; - static const std::string NAME; + static std::string name() { return "decimal"; } }; enum class UnionMode : char { SPARSE, DENSE }; @@ -352,35 +350,39 @@ struct ARROW_EXPORT UnionType : public DataType { } std::string ToString() const override; + static std::string name() { return "union"; } Status Accept(TypeVisitor* visitor) const override; UnionMode mode; std::vector type_ids; - static const std::string NAME; }; struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta { DateType() : DataType(Type::DATE) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return NAME; } - static const std::string NAME; + std::string ToString() const override { return 
name(); } + static std::string name() { return "date"; } }; enum class TimeUnit : char { SECOND = 0, MILLI = 1, MICRO = 2, NANO = 3 }; struct ARROW_EXPORT TimeType : public DataType { + using Unit = TimeUnit; + TimeUnit unit; explicit TimeType(TimeUnit unit = TimeUnit::MILLI) : DataType(Type::TIME), unit(unit) {} TimeType(const TimeType& other) : TimeType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return NAME; } - static const std::string NAME; + std::string ToString() const override { return name(); } + static std::string name() { return "time"; } }; struct ARROW_EXPORT TimestampType : public DataType, public PrimitiveMeta { + using Unit = TimeUnit; + typedef int64_t c_type; static constexpr Type::type type_enum = Type::TIMESTAMP; @@ -394,8 +396,8 @@ struct ARROW_EXPORT TimestampType : public DataType, public PrimitiveMeta { TimestampType(const TimestampType& other) : TimestampType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return NAME; } - static const std::string NAME; + std::string ToString() const override { return name(); } + static std::string name() { return "timestamp"; } }; struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { @@ -414,8 +416,8 @@ struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { IntervalType(const IntervalType& other) : IntervalType(other.unit) {} Status Accept(TypeVisitor* visitor) const override; - std::string ToString() const override { return NAME; } - static const std::string NAME; + std::string ToString() const override { return name(); } + static std::string name() { return "date"; } }; // These will be defined elsewhere diff --git a/cpp/src/arrow/types/CMakeLists.txt b/cpp/src/arrow/types/CMakeLists.txt index 9f781698982..6d59acfdf2e 100644 --- a/cpp/src/arrow/types/CMakeLists.txt +++ b/cpp/src/arrow/types/CMakeLists.txt @@ -21,7 +21,6 @@ # Headers: top 
level install(FILES - collection.h construct.h datetime.h decimal.h From 15c10941946e2e92325ab9b021772d61119df725 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 11 Nov 2016 18:16:23 -0500 Subject: [PATCH 11/27] Add type traits, refactoring, drafting json array writing. not working yet Change-Id: Ic33aed8adb80cc79cf24f843ef508722f0ff384c --- cpp/src/arrow/array.h | 2 + cpp/src/arrow/ipc/adapter.cc | 2 +- cpp/src/arrow/ipc/json-internal.cc | 321 +++++++++++++++++++++----- cpp/src/arrow/ipc/json-internal.h | 5 +- cpp/src/arrow/type.h | 4 - cpp/src/arrow/type_traits.h | 142 ++++++++++++ cpp/src/arrow/types/list-test.cc | 2 +- cpp/src/arrow/types/list.h | 6 +- cpp/src/arrow/types/primitive-test.cc | 1 + cpp/src/arrow/types/primitive.cc | 1 + cpp/src/arrow/types/primitive.h | 91 +------- cpp/src/arrow/types/string.cc | 8 +- cpp/src/arrow/types/string.h | 6 + cpp/src/arrow/types/struct-test.cc | 2 +- cpp/src/arrow/types/struct.h | 2 + 15 files changed, 431 insertions(+), 164 deletions(-) create mode 100644 cpp/src/arrow/type_traits.h diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index ff37323f605..8647d29e074 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -86,6 +86,8 @@ class ARROW_EXPORT Array { // Degenerate null type Array class ARROW_EXPORT NullArray : public Array { public: + using TypeClass = NullType; + NullArray(const std::shared_ptr& type, int32_t length) : Array(type, length, length, nullptr) {} diff --git a/cpp/src/arrow/ipc/adapter.cc b/cpp/src/arrow/ipc/adapter.cc index 74786bf85ff..da718c08d54 100644 --- a/cpp/src/arrow/ipc/adapter.cc +++ b/cpp/src/arrow/ipc/adapter.cc @@ -106,7 +106,7 @@ Status VisitArray(const Array* arr, std::vector* field_nodes buffers->push_back(binary_arr->data()); } else if (arr->type_enum() == Type::LIST) { const auto list_arr = static_cast(arr); - buffers->push_back(list_arr->offset_buffer()); + buffers->push_back(list_arr->offsets()); RETURN_NOT_OK(VisitArray( list_arr->values().get(), 
field_nodes, buffers, max_recursion_depth - 1)); } else if (arr->type_enum() == Type::STRUCT) { diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index aedda331437..0ef749a13b7 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -25,8 +25,14 @@ #include "rapidjson/stringbuffer.h" #include "rapidjson/writer.h" +#include "arrow/array.h" #include "arrow/schema.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/types/list.h" +#include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" #include "arrow/util/status.h" namespace arrow { @@ -225,18 +231,20 @@ class JsonSchemaWriter : public TypeVisitor { } template - void WritePrimitive(const std::string& typeclass, const T& type, + Status WritePrimitive(const std::string& typeclass, const T& type, const std::vector& buffer_layout) { WriteName(typeclass, type); SetNoChildren(); WriteBufferLayout(buffer_layout); + return Status::OK(); } template - void WriteVarBytes(const std::string& typeclass, const T& type) { + Status WriteVarBytes(const std::string& typeclass, const T& type) { WriteName(typeclass, type); SetNoChildren(); WriteBufferLayout({kValidityBuffer, kOffsetBuffer, kValues8}); + return Status::OK(); } void WriteBufferLayout(const std::vector& buffer_layout) { @@ -266,99 +274,74 @@ class JsonSchemaWriter : public TypeVisitor { return Status::OK(); } - Status Visit(const NullType& type) override { - WritePrimitive("null", type, {}); - return Status::OK(); - } + Status Visit(const NullType& type) override { return WritePrimitive("null", type, {}); } Status Visit(const BooleanType& type) override { - WritePrimitive("bool", type, {kValidityBuffer, kBooleanBuffer}); - return Status::OK(); + return WritePrimitive("bool", type, {kValidityBuffer, kBooleanBuffer}); } Status Visit(const Int8Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues8}); - return 
Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues8}); } Status Visit(const Int16Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues16}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues16}); } Status Visit(const Int32Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues32}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues32}); } Status Visit(const Int64Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues64}); } Status Visit(const UInt8Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues8}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues8}); } Status Visit(const UInt16Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues16}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues16}); } Status Visit(const UInt32Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues32}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues32}); } Status Visit(const UInt64Type& type) override { - WritePrimitive("int", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("int", type, {kValidityBuffer, kValues64}); } Status Visit(const HalfFloatType& type) override { - WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues16}); - return Status::OK(); + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues16}); } Status Visit(const FloatType& type) override { - WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues32}); - return Status::OK(); + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues32}); } Status Visit(const DoubleType& type) override { - 
WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("floatingpoint", type, {kValidityBuffer, kValues64}); } - Status Visit(const StringType& type) override { - WriteVarBytes("utf8", type); - return Status::OK(); - } + Status Visit(const StringType& type) override { return WriteVarBytes("utf8", type); } - Status Visit(const BinaryType& type) override { - WriteVarBytes("binary", type); - return Status::OK(); - } + Status Visit(const BinaryType& type) override { return WriteVarBytes("binary", type); } Status Visit(const DateType& type) override { - WritePrimitive("date", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("date", type, {kValidityBuffer, kValues64}); } Status Visit(const TimeType& type) override { - WritePrimitive("time", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("time", type, {kValidityBuffer, kValues64}); } Status Visit(const TimestampType& type) override { - WritePrimitive("timestamp", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("timestamp", type, {kValidityBuffer, kValues64}); } Status Visit(const IntervalType& type) override { - WritePrimitive("interval", type, {kValidityBuffer, kValues64}); - return Status::OK(); + return WritePrimitive("interval", type, {kValidityBuffer, kValues64}); } Status Visit(const DecimalType& type) override { return Status::NotImplemented("NYI"); } @@ -394,6 +377,211 @@ class JsonSchemaWriter : public TypeVisitor { RjWriter* writer_; }; +class JsonArrayWriter : public ArrayVisitor { + public: + explicit JsonArrayWriter(const std::string& name, const Array& array, RjWriter* writer) + : name_(name), array_(array), writer_(writer) {} + + Status Write() { return VisitArray(name_, array_); } + + Status VisitArray(const std::string& name, const Array& arr) { + writer_->StartObject(); + writer_->Key("name"); + writer_->String(name); + + 
writer_->Key("count"); + writer_->String(arr.length()); + + RETURN_NOT_OK(array.Accept(this)); + + writer_->EndObject(); + return Status::OK(); + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& arr) { + const typename T::c_type* data = arr.raw_data(); + for (auto i = 0; i < arr.length(); ++i) { + writer_->Int64(data[i]); + } + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& arr) { + const typename T::c_type* data = arr.raw_data(); + for (auto i = 0; i < arr.length(); ++i) { + writer_->Uint64(data[i]); + } + } + + template + typename std::enable_if::value, void>::type WriteDataValues( + const T& arr) { + const typename T::c_type* data = arr.raw_data(); + for (auto i = 0; i < arr.length(); ++i) { + writer_->Double(data[i]); + } + } + + // String (Utf8), Binary + template + typename std::enable_if::value, void>::type + WriteDataValues(const T& arr) { + for (auto i = 0; i < arr.length(); ++i) { + int32_t length; + const char* buf = reinterpret_cast(arr.GetValue(i, &length)); + writer_->String(buf, length); + } + } + + template + typename std::enable_if::value, void>::type + WriteDataValues(const T& arr) { + for (auto i = 0; i < arr.length(); ++i) { + writer_->String(buf, length); + } + } + + template + void WriteDataField(const T& arr) { + writer_->StartArray(); + WriteDataValues(arr); + writer_->EndArray(); + } + + template + void WriteOffsetsField(const T* offsets, int32_t length) { + writer_->Key("OFFSETS"); + writer_->StartArray(); + for (auto i = 0; i < arr.length(); ++i) { + writer_->Int64(offsets[i]); + } + writer_->EndArray(); + } + + void WriteValidityField(const Array& arr) { + writer_->Key("VALIDITY"); + writer_->StartArray(); + if (arr.null_count() > 0) { + for (auto i = 0; i < arr.length(); ++i) { + writer_->Int(arr.IsNull(i) ? 
0 : 1); + } + } else { + for (auto i = 0; i < arr.length(); ++i) { + writer_->Int(1); + } + } + writer_->EndArray(); + } + + void SetNoChildren() { + writer_->Key("children"); + writer_->StartArray(); + writer_->EndArray(); + } + + template + Status WritePrimitive(const T& array) { + WriteValidityField(array); + WriteDataField(array); + SetNoChildren(); + return Status::OK(); + } + + template + Status WriteVarBytes(const T& array) { + WriteValidityField(array); + WriteOffsetsField(array.raw_offsets(), array.length() + 1); + WriteDataField(array); + SetNoChildren(); + return Status::OK(); + } + + Status WriteChildren(const std::vector>& fields, + const std::vector>& arrays) { + writer_->Key("children"); + writer_->StartArray(); + for (size_t i = 0; i < fields.size(); ++i) { + RETURN_NOT_OK(VisitArray(fields[i].name, *arrays[i].get())); + } + writer_->EndArray(); + return Status::OK(); + } + + Status Visit(const NullArray& array) override { + SetNoChildren(); + return Status::OK(); + } + + Status Visit(const BooleanArray& array) override { return WritePrimitive(array); } + + Status Visit(const Int8Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int16Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int32Array& array) override { return WritePrimitive(array); } + + Status Visit(const Int64Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt8Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt16Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt32Array& array) override { return WritePrimitive(array); } + + Status Visit(const UInt64Array& array) override { return WritePrimitive(array); } + + Status Visit(const HalfFloatArray& array) override { return WritePrimitive(array); } + + Status Visit(const FloatArray& array) override { return WritePrimitive(array); } + + Status Visit(const DoubleArray& array) override { 
return WritePrimitive(array); } + + Status Visit(const StringArray& array) override { return WriteVarBytes(array); } + + Status Visit(const BinaryArray& array) override { return WriteVarBytes(array); } + + Status Visit(const DateArray& array) override { return Status::NotImplemented("date"); } + + Status Visit(const TimeArray& array) override { return Status::NotImplemented("time"); } + + Status Visit(const TimestampArray& array) override { + return Status::NotImplemented("timestamp"); + } + + Status Visit(const IntervalArray& array) override { + return Status::NotImplemented("interval"); + } + + Status Visit(const DecimalArray& array) override { + return Status::NotImplemented("decimal"); + } + + Status Visit(const ListArray& array) override { + WriteValidityField(array); + WriteOffsetsField(array); + auto type = static_cast(array.type().get()); + return WriteChildren(type.children(), {array.values()}); + } + + Status Visit(const StructArray& array) override { + WriteValidityField(array); + auto type = static_cast(array.type().get()); + return WriteChildren(type.children(), array.fields()); + } + + Status Visit(const UnionArray& array) override { + return Status::NotImplemented("union"); + } + + private: + const std::string& name_; + const Array& array_; + RjWriter* writer_; +}; + #define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ if (NAME == PARENT.MemberEnd()) { \ std::stringstream ss; \ @@ -659,21 +847,23 @@ class JsonSchemaReader { const rj::Value& json_schema_; }; -class JsonArrayReader { - public: - explicit JsonArrayReader(const rj::Value& json_array) : json_array_(json_array) {} +// class JsonArrayReader { +// public: +// explicit JsonArrayReader(const rj::Value& json_array, const Schema& schema) +// : json_array_(json_array), schema_(schema) {} - Status GetArray(std::shared_ptr* array) { - if (!json_array_.IsObject()) { - return Status::Invalid("Array was not a JSON object"); - } +// Status GetArray(std::shared_ptr* array) { +// if (!json_array_.IsObject()) { 
+// return Status::Invalid("Array was not a JSON object"); +// } - return Status::OK(); - } +// return Status::OK(); +// } - private: - const rj::Value& json_array_; -}; +// private: +// const rj::Value& json_array_; +// const Schema& schema_; +// }; Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { JsonSchemaWriter converter(schema, json_writer); @@ -685,10 +875,17 @@ Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr* sch return converter.GetSchema(schema); } -Status ReadJsonArray(const rj::Value& json_array, std::shared_ptr* array) { - JsonArrayReader converter(json_array); - return converter.GetArray(array); -} +// Status WriteJsonArray( +// const std::string& name, const Array& array, RjWriter* json_writer) { +// JsonArrayWriter converter(name, array, json_writer); +// converter.Write(); +// } + +// Status ReadJsonArray( +// const rj::Value& json_array, const Schema& schema, std::shared_ptr* array) { +// JsonArrayReader converter(json_array, schema); +// return converter.GetArray(array); +// } } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 7fd50113161..a50b29547c7 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -44,12 +44,13 @@ class Schema; namespace ipc { Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer); -Status ARROW_EXPORT WriteJsonArray(const Array& array, RjWriter* json_writer); +Status ARROW_EXPORT WriteJsonArray( + const std::string& name, const Array& array, RjWriter* json_writer); Status ARROW_EXPORT ReadJsonSchema( const rj::Value& json_arr, std::shared_ptr* schema); Status ARROW_EXPORT ReadJsonArray( - const rj::Value& json_obj, std::shared_ptr* schema); + const rj::Value& json_obj, const Schema& schema, std::shared_ptr* array); } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 86bf4264ed3..608d06e00aa 
100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -420,10 +420,6 @@ struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { static std::string name() { return "date"; } }; -// These will be defined elsewhere -template -struct TypeTraits {}; - // Factory functions std::shared_ptr ARROW_EXPORT null(); diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h new file mode 100644 index 00000000000..153bbd31f9e --- /dev/null +++ b/cpp/src/arrow/type_traits.h @@ -0,0 +1,142 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#ifndef ARROW_TYPE_TRAITS_H +#define ARROW_TYPE_TRAITS_H + +#include + +#include "arrow/type_fwd.h" +#include "arrow/util/bit-util.h" + +namespace arrow { + +template +struct TypeTraits {}; + +template <> +struct TypeTraits { + using ArrayType = UInt8Array; + static inline int bytes_required(int elements) { return elements; } +}; + +template <> +struct TypeTraits { + using ArrayType = Int8Array; + static inline int bytes_required(int elements) { return elements; } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt16Array; + + static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int16Array; + + static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt32Array; + + static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int32Array; + + static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = UInt64Array; + + static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = Int64Array; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } +}; + +template <> +struct TypeTraits { + using ArrayType = TimestampArray; + + static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } +}; +template <> + +struct TypeTraits { + using ArrayType = FloatArray; + + static inline int bytes_required(int elements) { return elements * sizeof(float); } +}; + +template <> +struct TypeTraits { + using ArrayType = DoubleArray; + + static inline int bytes_required(int elements) { return elements * sizeof(double); } +}; + +template <> +struct TypeTraits { + typedef 
BooleanArray ArrayType; + + static inline int bytes_required(int elements) { + return BitUtil::BytesForBits(elements); + } +}; + +#define PRIMITIVE_TRAITS(T) \ + using TypeClass = typename std::conditional::value, T, \ + typename T::TypeClass>::type; \ + using c_type = typename TypeClass::c_type; + +template +struct IsUnsignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_unsigned::value; +}; + +template +struct IsSignedInt { + PRIMITIVE_TRAITS(T); + static constexpr bool value = + std::is_integral::value && std::is_signed::value; +}; + +template +struct IsFloatingPoint { + PRIMITIVE_TRAITS(T); + static constexpr bool value = std::is_floating_point::value; +}; + +} // namespace arrow + +#endif // ARROW_TYPE_TRAITS_H diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index 71b34c9850e..cf56d0d0e87 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -140,7 +140,7 @@ TEST_F(TestListBuilder, TestAppendNull) { ASSERT_TRUE(result_->IsNull(0)); ASSERT_TRUE(result_->IsNull(1)); - ASSERT_EQ(0, result_->offsets()[0]); + ASSERT_EQ(0, result_->raw_offsets()[0]); ASSERT_EQ(0, result_->offset(1)); ASSERT_EQ(0, result_->offset(2)); diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 9440ffed4bf..64dfd649b25 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -39,6 +39,8 @@ class MemoryPool; class ARROW_EXPORT ListArray : public Array { public: + using TypeClass = ListType; + ListArray(const TypePtr& type, int32_t length, std::shared_ptr offsets, const ArrayPtr& values, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) @@ -56,13 +58,13 @@ class ARROW_EXPORT ListArray : public Array { // Return a shared pointer in case the requestor desires to share ownership // with this array. 
const std::shared_ptr& values() const { return values_; } - const std::shared_ptr offset_buffer() const { + std::shared_ptr offsets() const { return std::static_pointer_cast(offset_buffer_); } const std::shared_ptr& value_type() const { return values_->type(); } - const int32_t* offsets() const { return offsets_; } + const int32_t* raw_offsets() const { return offsets_; } int32_t offset(int i) const { return offsets_[i]; } diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 69d200b958f..568a1fecd05 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -25,6 +25,7 @@ #include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" #include "arrow/types/construct.h" #include "arrow/types/primitive.h" #include "arrow/types/test-common.h" diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 11ad85159a4..12e66d5ab06 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -19,6 +19,7 @@ #include +#include "arrow/type_traits.h" #include "arrow/util/bit-util.h" #include "arrow/util/buffer.h" #include "arrow/util/logging.h" diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 3fe0b275765..d418e8d4614 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -55,9 +55,10 @@ class ARROW_EXPORT PrimitiveArray : public Array { const uint8_t* raw_data_; }; -template +template class ARROW_EXPORT NumericArray : public PrimitiveArray { public: + using TypeClass = TYPE; using value_type = typename TypeClass::c_type; NumericArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr) @@ -168,83 +169,6 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { using PrimitiveBuilder::raw_data_; }; -template <> -struct TypeTraits { - typedef UInt8Array ArrayType; 
- - static inline int bytes_required(int elements) { return elements; } -}; - -template <> -struct TypeTraits { - typedef Int8Array ArrayType; - - static inline int bytes_required(int elements) { return elements; } -}; - -template <> -struct TypeTraits { - typedef UInt16Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } -}; - -template <> -struct TypeTraits { - typedef Int16Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } -}; - -template <> -struct TypeTraits { - typedef UInt32Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } -}; - -template <> -struct TypeTraits { - typedef Int32Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } -}; - -template <> -struct TypeTraits { - typedef UInt64Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } -}; - -template <> -struct TypeTraits { - typedef Int64Array ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } -}; - -template <> -struct TypeTraits { - typedef TimestampArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } -}; -template <> - -struct TypeTraits { - typedef FloatArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(float); } -}; - -template <> -struct TypeTraits { - typedef DoubleArray ArrayType; - - static inline int bytes_required(int elements) { return elements * sizeof(double); } -}; - // Builders typedef NumericBuilder UInt8Builder; @@ -263,6 +187,8 @@ typedef NumericBuilder DoubleBuilder; class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: + using TypeClass = BooleanType; + BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& 
null_bitmap = nullptr); BooleanArray(const TypePtr& type, int32_t length, const std::shared_ptr& data, @@ -278,15 +204,6 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } }; -template <> -struct TypeTraits { - typedef BooleanArray ArrayType; - - static inline int bytes_required(int elements) { - return BitUtil::BytesForBits(elements); - } -}; - class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { public: explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index f6d26df3167..b0c5761e09c 100644 --- a/cpp/src/arrow/types/string.cc +++ b/cpp/src/arrow/types/string.cc @@ -122,8 +122,8 @@ Status BinaryBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offset_buffer(), - values->data(), list->null_count(), list->null_bitmap()); + *out = std::make_shared(list->length(), list->offsets(), values->data(), + list->null_count(), list->null_bitmap()); return Status::OK(); } @@ -134,8 +134,8 @@ Status StringBuilder::Finish(std::shared_ptr* out) { const auto list = std::dynamic_pointer_cast(result); auto values = std::dynamic_pointer_cast(list->values()); - *out = std::make_shared(list->length(), list->offset_buffer(), - values->data(), list->null_count(), list->null_bitmap()); + *out = std::make_shared(list->length(), list->offsets(), values->data(), + list->null_count(), list->null_bitmap()); return Status::OK(); } diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index aaba49c6023..74dee65fe69 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -37,6 +37,8 @@ class MemoryPool; class ARROW_EXPORT BinaryArray : public Array { public: + using TypeClass = BinaryType; + BinaryArray(int32_t length, const 
std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); @@ -60,6 +62,8 @@ class ARROW_EXPORT BinaryArray : public Array { std::shared_ptr data() const { return data_buffer_; } std::shared_ptr offsets() const { return offset_buffer_; } + const int32_t* raw_offsets() const { return offsets_; } + int32_t offset(int i) const { return offsets_[i]; } // Neither of these functions will perform boundschecking @@ -83,6 +87,8 @@ class ARROW_EXPORT BinaryArray : public Array { class ARROW_EXPORT StringArray : public BinaryArray { public: + using TypeClass = StringType; + StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count = 0, const std::shared_ptr& null_bitmap = nullptr); diff --git a/cpp/src/arrow/types/struct-test.cc b/cpp/src/arrow/types/struct-test.cc index 8e82c389a94..197d7d4ad1f 100644 --- a/cpp/src/arrow/types/struct-test.cc +++ b/cpp/src/arrow/types/struct-test.cc @@ -80,7 +80,7 @@ void ValidateBasicStructArray(const StructArray* result, ASSERT_EQ(4, list_char_arr->length()); ASSERT_EQ(10, list_char_arr->values()->length()); for (size_t i = 0; i < list_offsets.size(); ++i) { - ASSERT_EQ(list_offsets[i], list_char_arr->offsets()[i]); + ASSERT_EQ(list_offsets[i], list_char_arr->raw_offsets()[i]); } for (size_t i = 0; i < list_values.size(); ++i) { ASSERT_EQ(list_values[i], char_arr->Value(i)); diff --git a/cpp/src/arrow/types/struct.h b/cpp/src/arrow/types/struct.h index 65b8daf214a..239affee03a 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -31,6 +31,8 @@ namespace arrow { class ARROW_EXPORT StructArray : public Array { public: + using TypeClass = StructType; + StructArray(const TypePtr& type, int32_t length, std::vector& field_arrays, int32_t null_count = 0, std::shared_ptr null_bitmap = nullptr) : Array(type, length, null_count, null_bitmap) { From 932ba7a1ceb5c72ab5fa3112f1a8a4c6186d456e Mon Sep 17 00:00:00 
2001 From: Wes McKinney Date: Mon, 14 Nov 2016 11:19:34 -0500 Subject: [PATCH 12/27] Add ArrayVisitor methods, add enough metaprogramming to detect presence of c_type type member Change-Id: I7326738f8a235770ddebe9d5cf1ef90eb49b3e35 --- cpp/src/arrow/array.cc | 4 ++ cpp/src/arrow/array.h | 4 ++ cpp/src/arrow/ipc/json-internal.cc | 22 ++++---- cpp/src/arrow/type.cc | 6 +++ cpp/src/arrow/type.h | 29 ++++++---- cpp/src/arrow/type_traits.h | 19 ++++++- cpp/src/arrow/types/list.cc | 4 ++ cpp/src/arrow/types/list.h | 2 + cpp/src/arrow/types/primitive.cc | 87 +++++++++++++++++++++++------- cpp/src/arrow/types/primitive.h | 43 +++++++++++++-- cpp/src/arrow/types/string.cc | 8 +++ cpp/src/arrow/types/string.h | 4 ++ cpp/src/arrow/types/struct.cc | 4 ++ cpp/src/arrow/types/struct.h | 2 + 14 files changed, 193 insertions(+), 45 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index e432a53781f..10f7a2f5ebf 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -66,4 +66,8 @@ bool NullArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_st return true; } +Status NullArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index 8647d29e074..fa7ef615a8a 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -70,6 +70,8 @@ class ARROW_EXPORT Array { // returning Status::OK. This can be an expensive check. 
virtual Status Validate() const; + virtual Status Accept(ArrayVisitor* visitor) const = 0; + protected: std::shared_ptr type_; int32_t null_count_; @@ -96,6 +98,8 @@ class ARROW_EXPORT NullArray : public Array { bool Equals(const std::shared_ptr& arr) const override; bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_index, const std::shared_ptr& arr) const override; + + Status Accept(ArrayVisitor* visitor) const override; }; typedef std::shared_ptr ArrayPtr; diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 0ef749a13b7..ae67aff3ca8 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -390,9 +390,9 @@ class JsonArrayWriter : public ArrayVisitor { writer_->String(name); writer_->Key("count"); - writer_->String(arr.length()); + writer_->Int(arr.length()); - RETURN_NOT_OK(array.Accept(this)); + RETURN_NOT_OK(arr.Accept(this)); writer_->EndObject(); return Status::OK(); @@ -401,7 +401,7 @@ class JsonArrayWriter : public ArrayVisitor { template typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { - const typename T::c_type* data = arr.raw_data(); + const auto data = arr.raw_data(); for (auto i = 0; i < arr.length(); ++i) { writer_->Int64(data[i]); } @@ -410,7 +410,7 @@ class JsonArrayWriter : public ArrayVisitor { template typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { - const typename T::c_type* data = arr.raw_data(); + const auto data = arr.raw_data(); for (auto i = 0; i < arr.length(); ++i) { writer_->Uint64(data[i]); } @@ -419,7 +419,7 @@ class JsonArrayWriter : public ArrayVisitor { template typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { - const typename T::c_type* data = arr.raw_data(); + const auto data = arr.raw_data(); for (auto i = 0; i < arr.length(); ++i) { writer_->Double(data[i]); } @@ -440,7 +440,7 @@ class JsonArrayWriter : public ArrayVisitor { typename 
std::enable_if::value, void>::type WriteDataValues(const T& arr) { for (auto i = 0; i < arr.length(); ++i) { - writer_->String(buf, length); + writer_->Bool(arr.Value(i)); } } @@ -455,7 +455,7 @@ class JsonArrayWriter : public ArrayVisitor { void WriteOffsetsField(const T* offsets, int32_t length) { writer_->Key("OFFSETS"); writer_->StartArray(); - for (auto i = 0; i < arr.length(); ++i) { + for (auto i = 0; i < length; ++i) { writer_->Int64(offsets[i]); } writer_->EndArray(); @@ -504,7 +504,7 @@ class JsonArrayWriter : public ArrayVisitor { writer_->Key("children"); writer_->StartArray(); for (size_t i = 0; i < fields.size(); ++i) { - RETURN_NOT_OK(VisitArray(fields[i].name, *arrays[i].get())); + RETURN_NOT_OK(VisitArray(fields[i]->name, *arrays[i].get())); } writer_->EndArray(); return Status::OK(); @@ -561,15 +561,15 @@ class JsonArrayWriter : public ArrayVisitor { Status Visit(const ListArray& array) override { WriteValidityField(array); - WriteOffsetsField(array); + WriteOffsetsField(array.raw_offsets(), array.length() + 1); auto type = static_cast(array.type().get()); - return WriteChildren(type.children(), {array.values()}); + return WriteChildren(type->children(), {array.values()}); } Status Visit(const StructArray& array) override { WriteValidityField(array); auto type = static_cast(array.type().get()); - return WriteChildren(type.children(), array.fields()); + return WriteChildren(type->children(), array.fields()); } Status Visit(const UnionArray& array) override { diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 5bc1a8d5ce3..589bdadb77c 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -46,6 +46,10 @@ bool DataType::Equals(const DataType* other) const { return equals; } +std::string BooleanType::ToString() const { + return name(); +} + FloatingPointMeta::Precision HalfFloatType::precision() const { return FloatingPointMeta::HALF; } @@ -104,6 +108,7 @@ std::string UnionType::ToString() const { int NullType::bit_width() 
const { return 0; } + std::string NullType::ToString() const { return name(); } @@ -114,6 +119,7 @@ std::string NullType::ToString() const { Status TYPE::Accept(TypeVisitor* visitor) const { return visitor->Visit(*this); } ACCEPT_VISITOR(NullType); +ACCEPT_VISITOR(BooleanType); ACCEPT_VISITOR(BinaryType); ACCEPT_VISITOR(StringType); ACCEPT_VISITOR(ListType); diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 608d06e00aa..a9d4eea3c0d 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -133,7 +133,7 @@ struct ARROW_EXPORT DataType { typedef std::shared_ptr TypePtr; -struct ARROW_EXPORT PrimitiveMeta { +struct ARROW_EXPORT FixedWidthMeta { virtual int bit_width() const = 0; }; @@ -185,11 +185,11 @@ struct ARROW_EXPORT Field { typedef std::shared_ptr FieldPtr; template -struct ARROW_EXPORT PrimitiveType : public DataType, public PrimitiveMeta { +struct ARROW_EXPORT PrimitiveCType : public DataType, public FixedWidthMeta { using c_type = C_TYPE; static constexpr Type::type type_id = TYPE_ID; - PrimitiveType() : DataType(TYPE_ID) {} + PrimitiveCType() : DataType(TYPE_ID) {} int bit_width() const override { return sizeof(C_TYPE) * 8; } @@ -200,7 +200,7 @@ struct ARROW_EXPORT PrimitiveType : public DataType, public PrimitiveMeta { std::string ToString() const override { return std::string(DERIVED::name()); } }; -struct ARROW_EXPORT NullType : public DataType, public PrimitiveMeta { +struct ARROW_EXPORT NullType : public DataType, public FixedWidthMeta { static constexpr Type::type type_enum = Type::NA; NullType() : DataType(Type::NA) {} @@ -213,12 +213,19 @@ struct ARROW_EXPORT NullType : public DataType, public PrimitiveMeta { }; template -struct IntegerTypeImpl : public PrimitiveType, +struct IntegerTypeImpl : public PrimitiveCType, public IntegerMeta { bool is_signed() const override { return std::is_signed::value; } }; -struct ARROW_EXPORT BooleanType : public PrimitiveType { +struct ARROW_EXPORT BooleanType : public DataType, FixedWidthMeta { + 
static constexpr Type::type type_enum = Type::BOOL; + + BooleanType() : DataType(Type::BOOL) {} + + Status Accept(TypeVisitor* visitor) const override; + std::string ToString() const override; + int bit_width() const override { return 1; } static std::string name() { return "bool"; } }; @@ -259,19 +266,19 @@ struct ARROW_EXPORT Int64Type : public IntegerTypeImpl, + : public PrimitiveCType, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "halffloat"; } }; -struct ARROW_EXPORT FloatType : public PrimitiveType, +struct ARROW_EXPORT FloatType : public PrimitiveCType, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "float"; } }; -struct ARROW_EXPORT DoubleType : public PrimitiveType, +struct ARROW_EXPORT DoubleType : public PrimitiveCType, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "double"; } @@ -380,7 +387,7 @@ struct ARROW_EXPORT TimeType : public DataType { static std::string name() { return "time"; } }; -struct ARROW_EXPORT TimestampType : public DataType, public PrimitiveMeta { +struct ARROW_EXPORT TimestampType : public DataType, public FixedWidthMeta { using Unit = TimeUnit; typedef int64_t c_type; @@ -400,7 +407,7 @@ struct ARROW_EXPORT TimestampType : public DataType, public PrimitiveMeta { static std::string name() { return "timestamp"; } }; -struct ARROW_EXPORT IntervalType : public DataType, public PrimitiveMeta { +struct ARROW_EXPORT IntervalType : public DataType, public FixedWidthMeta { enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 }; typedef int64_t c_type; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 153bbd31f9e..e4c9824fa01 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -112,10 +112,27 @@ struct TypeTraits { } }; +// Not all type classes have a c_type +template +struct as_void { + using type = void; +}; + +template +struct 
GetCType { + using type = void; +}; + +// The partial specialization will match if T has the c_type member +template +struct GetCType::type> { + using type = typename T::c_type; +}; + #define PRIMITIVE_TRAITS(T) \ using TypeClass = typename std::conditional::value, T, \ typename T::TypeClass>::type; \ - using c_type = typename TypeClass::c_type; + using c_type = typename GetCType::type; template struct IsUnsignedInt { diff --git a/cpp/src/arrow/types/list.cc b/cpp/src/arrow/types/list.cc index 4b1e8214727..d86563253bd 100644 --- a/cpp/src/arrow/types/list.cc +++ b/cpp/src/arrow/types/list.cc @@ -155,4 +155,8 @@ void ListBuilder::Reset() { null_bitmap_ = nullptr; } +Status ListArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/list.h b/cpp/src/arrow/types/list.h index 64dfd649b25..bd93e8fdcfa 100644 --- a/cpp/src/arrow/types/list.h +++ b/cpp/src/arrow/types/list.h @@ -78,6 +78,8 @@ class ARROW_EXPORT ListArray : public Array { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const ArrayPtr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + protected: std::shared_ptr offset_buffer_; const int32_t* offsets_; diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 12e66d5ab06..e4dd18761a5 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -49,8 +49,8 @@ bool PrimitiveArray::EqualsExact(const PrimitiveArray& other) const { const uint8_t* this_data = raw_data_; const uint8_t* other_data = other.raw_data_; - auto primitive_meta = dynamic_cast(type_.get()); - int value_byte_size = primitive_meta->bit_width() / 8; + auto size_meta = dynamic_cast(type_.get()); + int value_byte_size = size_meta->bit_width() / 8; DCHECK_GT(value_byte_size, 0); for (int i = 0; i < length_; ++i) { @@ -72,6 +72,11 @@ bool PrimitiveArray::Equals(const std::shared_ptr& arr) const { return 
EqualsExact(*static_cast(arr.get())); } +template +Status NumericArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + template class NumericArray; template class NumericArray; template class NumericArray; @@ -83,7 +88,6 @@ template class NumericArray; template class NumericArray; template class NumericArray; template class NumericArray; -template class NumericArray; template Status PrimitiveBuilder::Init(int32_t capacity) { @@ -147,8 +151,64 @@ Status PrimitiveBuilder::Finish(std::shared_ptr* out) { return Status::OK(); } -template <> -Status PrimitiveBuilder::Append( +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; +template class PrimitiveBuilder; + +Status BooleanBuilder::Init(int32_t capacity) { + RETURN_NOT_OK(ArrayBuilder::Init(capacity)); + data_ = std::make_shared(pool_); + + int64_t nbytes = BitUtil::BytesForBits(capacity); + RETURN_NOT_OK(data_->Resize(nbytes)); + // TODO(emkornfield) valgrind complains without this + memset(data_->mutable_data(), 0, nbytes); + + raw_data_ = reinterpret_cast(data_->mutable_data()); + return Status::OK(); +} + +Status BooleanBuilder::Resize(int32_t capacity) { + // XXX: Set floor size for now + if (capacity < kMinBuilderCapacity) { capacity = kMinBuilderCapacity; } + + if (capacity_ == 0) { + RETURN_NOT_OK(Init(capacity)); + } else { + RETURN_NOT_OK(ArrayBuilder::Resize(capacity)); + const int64_t old_bytes = data_->size(); + const int64_t new_bytes = BitUtil::BytesForBits(capacity); + + RETURN_NOT_OK(data_->Resize(new_bytes)); + raw_data_ = reinterpret_cast(data_->mutable_data()); + memset(data_->mutable_data() + old_bytes, 0, new_bytes - old_bytes); + } + return Status::OK(); +} + +Status 
BooleanBuilder::Finish(std::shared_ptr* out) { + const int64_t bytes_required = BitUtil::BytesForBits(length_); + + if (bytes_required > 0 && bytes_required < data_->size()) { + // Trim buffers + RETURN_NOT_OK(data_->Resize(bytes_required)); + } + *out = std::make_shared(type_, length_, data_, null_count_, null_bitmap_); + + data_ = null_bitmap_ = nullptr; + capacity_ = length_ = null_count_ = 0; + return Status::OK(); +} + +Status BooleanBuilder::Append( const uint8_t* values, int32_t length, const uint8_t* valid_bytes) { RETURN_NOT_OK(Reserve(length)); @@ -170,19 +230,6 @@ Status PrimitiveBuilder::Append( return Status::OK(); } -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; -template class PrimitiveBuilder; - BooleanArray::BooleanArray(int32_t length, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) : PrimitiveArray( @@ -237,4 +284,8 @@ bool BooleanArray::RangeEquals(int32_t start_idx, int32_t end_idx, return true; } +Status BooleanArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + } // namespace arrow diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index d418e8d4614..826988a6076 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -90,6 +90,8 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { return reinterpret_cast(raw_data_); } + Status Accept(ArrayVisitor* visitor) const override; + value_type Value(int i) const { return raw_data()[i]; } }; @@ -199,19 +201,36 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, 
const ArrayPtr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + const uint8_t* raw_data() const { return reinterpret_cast(raw_data_); } bool Value(int i) const { return BitUtil::GetBit(raw_data(), i); } }; -class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { +class ARROW_EXPORT BooleanBuilder : public ArrayBuilder { public: explicit BooleanBuilder(MemoryPool* pool, const TypePtr& type) - : PrimitiveBuilder(pool, type) {} + : ArrayBuilder(pool, type), data_(nullptr) {} virtual ~BooleanBuilder() {} - using PrimitiveBuilder::Append; + using ArrayBuilder::Advance; + + // Write nulls as uint8_t* (0 value indicates null) into pre-allocated memory + Status AppendNulls(const uint8_t* valid_bytes, int32_t length) { + RETURN_NOT_OK(Reserve(length)); + UnsafeAppendToBitmap(valid_bytes, length); + return Status::OK(); + } + + Status AppendNull() { + RETURN_NOT_OK(Reserve(1)); + UnsafeAppendToBitmap(false); + return Status::OK(); + } + + std::shared_ptr data() const { return data_; } // Scalar append Status Append(bool val) { @@ -226,7 +245,23 @@ class ARROW_EXPORT BooleanBuilder : public PrimitiveBuilder { return Status::OK(); } - Status Append(uint8_t val) { return Append(static_cast(val)); } + // Vector append + // + // If passed, valid_bytes is of equal length to values, and any zero byte + // will be considered as a null for that slot + Status Append( + const uint8_t* values, int32_t length, const uint8_t* valid_bytes = nullptr); + + Status Finish(std::shared_ptr* out) override; + Status Init(int32_t capacity) override; + + // Increase the capacity of the builder to accommodate at least the indicated + // number of elements + Status Resize(int32_t capacity) override; + + protected: + std::shared_ptr data_; + uint8_t* raw_data_; }; // Only instantiate these templates once diff --git a/cpp/src/arrow/types/string.cc b/cpp/src/arrow/types/string.cc index b0c5761e09c..db963dfa0de 100644 --- a/cpp/src/arrow/types/string.cc +++ 
b/cpp/src/arrow/types/string.cc @@ -94,6 +94,10 @@ bool BinaryArray::RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_ return true; } +Status BinaryArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + StringArray::StringArray(int32_t length, const std::shared_ptr& offsets, const std::shared_ptr& data, int32_t null_count, const std::shared_ptr& null_bitmap) @@ -104,6 +108,10 @@ Status StringArray::Validate() const { return BinaryArray::Validate(); } +Status StringArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + // This used to be a static member variable of BinaryBuilder, but it can cause // valgrind to report a (spurious?) memory leak when needed in other shared // libraries. The problem came up while adding explicit visibility to libarrow diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 74dee65fe69..167b8ec8bf8 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -77,6 +77,8 @@ class ARROW_EXPORT BinaryArray : public Array { Status Validate() const override; + Status Accept(ArrayVisitor* visitor) const override; + private: std::shared_ptr offset_buffer_; const int32_t* offsets_; @@ -102,6 +104,8 @@ class ARROW_EXPORT StringArray : public BinaryArray { } Status Validate() const override; + + Status Accept(ArrayVisitor* visitor) const override; }; // BinaryBuilder : public ListBuilder diff --git a/cpp/src/arrow/types/struct.cc b/cpp/src/arrow/types/struct.cc index 369c29d15ef..0e0db23544b 100644 --- a/cpp/src/arrow/types/struct.cc +++ b/cpp/src/arrow/types/struct.cc @@ -87,6 +87,10 @@ Status StructArray::Validate() const { return Status::OK(); } +Status StructArray::Accept(ArrayVisitor* visitor) const { + return visitor->Visit(*this); +} + Status StructBuilder::Finish(std::shared_ptr* out) { std::vector> fields(field_builders_.size()); for (size_t i = 0; i < field_builders_.size(); ++i) { diff --git a/cpp/src/arrow/types/struct.h 
b/cpp/src/arrow/types/struct.h index 239affee03a..035af051325 100644 --- a/cpp/src/arrow/types/struct.h +++ b/cpp/src/arrow/types/struct.h @@ -57,6 +57,8 @@ class ARROW_EXPORT StructArray : public Array { bool RangeEquals(int32_t start_idx, int32_t end_idx, int32_t other_start_idx, const std::shared_ptr& arr) const override; + Status Accept(ArrayVisitor* visitor) const override; + protected: // The child arrays corresponding to each field of the struct data type. std::vector field_arrays_; From 2c93cce22c47b9f6beed957b8789af7abaf34825 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 14 Nov 2016 21:11:34 -0500 Subject: [PATCH 13/27] WIP JSON array reader code path Change-Id: I7c52a441edcef57a7a868c80f0caa2f4ac734f22 --- cpp/src/arrow/ipc/json-internal.cc | 191 +++++++++++++++++++++++++---- cpp/src/arrow/schema-test.cc | 21 ++++ cpp/src/arrow/schema.cc | 15 +++ cpp/src/arrow/schema.h | 10 +- cpp/src/arrow/type.h | 28 ++--- cpp/src/arrow/type_fwd.h | 107 +++++++++------- cpp/src/arrow/type_traits.h | 16 ++- cpp/src/arrow/types/primitive.cc | 1 + 8 files changed, 299 insertions(+), 90 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index ae67aff3ca8..8ebc8bc6c7c 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -33,6 +33,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { @@ -847,23 +848,163 @@ class JsonSchemaReader { const rj::Value& json_schema_; }; -// class JsonArrayReader { -// public: -// explicit JsonArrayReader(const rj::Value& json_array, const Schema& schema) -// : json_array_(json_array), schema_(schema) {} +class JsonArrayReader { + public: + explicit JsonArrayReader( + MemoryPool* pool, const rj::Value& json_array, const Schema& schema) + : pool_(pool), json_array_(json_array), schema_(schema) {} + + Status 
GetResult(std::shared_ptr* array) { + if (!json_array_.IsObject()) { + return Status::Invalid("Array was not a JSON object"); + } + const auto& json_array = json_array_.GetObject(); + + const auto& json_name = json_array.FindMember("name"); + RETURN_NOT_STRING("name", json_name, json_array); + + return GetArrayFromStruct( + json_array_, json_name.GetString(), schema_.fields(), array); + } + + Status GetArrayFromStruct(const rj::Value& obj, const std::string& name, + const std::vector>& fields, std::shared_ptr* array) { + std::shared_ptr result = nullptr; + + for (const std::shared_ptr& field : fields) { + if (field->name == name) { + result = field; + break; + } + } + + if (result == nullptr) { + std::stringstream ss; + ss << "Field named " << name << " not found in struct/schema"; + return Status::KeyError(ss.str()); + } + + return GetArray(obj, result->type, array); + } + + template + Status ReadArray(const rj::Value& obj, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + typename TypeTraits::BuilderType builder(pool_, type); + const auto& json_array = obj.GetObject(); -// Status GetArray(std::shared_ptr* array) { -// if (!json_array_.IsObject()) { -// return Status::Invalid("Array was not a JSON object"); -// } + const auto& json_data = json_array.FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, json_array); -// return Status::OK(); -// } + const auto& json_data_arr = json_type_ids->value.GetArray(); -// private: -// const rj::Value& json_array_; -// const Schema& schema_; -// }; + for (auto i = 0; i < json_data_arr.Size(); ++i) { + if (!is_valid[i]) { + builder.AppendNull(); + continue; + } + + const rj::Value& val = json_data_arr[i]; + if (IsSignedInt::value) { + builder.Append(val.GetInt64()); + } else if (IsUnsignedInt::value) { + builder.Append(val.GetUint64()); + } else if (IsFloatingPoint::value) { + builder.Append(val.GetFloat()); + } else if (std::is_base_of::value) { + builder.Append(val.GetBool()); + } 
else { + // We are in the wrong function + return Status::Invalid(type->ToString()); + } + } + + return builder.Finish(array); + } + + template + typename std::enable_if::value, Status>::type + ReadArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) {} + + template + typename std::enable_if::value, Status>::type + ReadArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) {} + + template + typename std::enable_if::value, Status>::type + ReadArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) {} + + template + typename std::enable_if::value, Status>::type + ReadArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) {} + + Status GetArray(const rj::Value& obj, const std::shared_ptr& type, + std::shared_ptr* array) { + if (!obj.IsObject()) { return Status::Invalid("Array was not a JSON object"); } + const auto& json_array = obj.GetObject(); + + const auto& json_length = json_array.FindMember("count"); + RETURN_NOT_INT("count", json_length, json_array); + + const auto& json_validity = json_array.FindMember("VALIDITY"); + RETURN_NOT_ARRAY("VALIDITY", json_validity, json_array); + + std::vector is_valid(count); + +#define TYPE_CASE(TYPE) \ + case TYPE::type_enum: \ + return ReadArray(obj, type, array); + +#define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ + case Type::TYPE_ENUM: \ + std::stringstream ss; \ + ss << type->ToString(); \ + return Status::NotImplemented(ss.str()); + + switch (type->type) { + TYPE_CASE(NullType); + TYPE_CASE(BooleanType); + TYPE_CASE(UInt8Type); + TYPE_CASE(Int8Type); + TYPE_CASE(UInt16Type); + TYPE_CASE(Int16Type); + TYPE_CASE(UInt32Type); + TYPE_CASE(Int32Type); + TYPE_CASE(UInt64Type); + TYPE_CASE(Int64Type); + TYPE_CASE(HalfFloatType); + TYPE_CASE(FloatType); + TYPE_CASE(DoubleType); + TYPE_CASE(StringType); + TYPE_CASE(BinaryType); + NOT_IMPLEMENTED_CASE(DATE); + NOT_IMPLEMENTED_CASE(TIMESTAMP); + NOT_IMPLEMENTED_CASE(TIME); + 
NOT_IMPLEMENTED_CASE(INTERVAL); + TYPE_CASE(ListType); + TYPE_CASE(StructType); + NOT_IMPLEMENTED_CASE(UNION); + default: + std::stringstream ss; + ss << type->ToString(); + return Status::NotImplemented(ss.str()); + }; + +#undef TYPE_CASE +#undef NOT_IMPLEMENTED_CASE + + return Status::OK(); + } + + private: + MemoryPool* pool; + const rj::Value& json_array_; + const Schema& schema_; +}; Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { JsonSchemaWriter converter(schema, json_writer); @@ -875,17 +1016,17 @@ Status ReadJsonSchema(const rj::Value& json_schema, std::shared_ptr* sch return converter.GetSchema(schema); } -// Status WriteJsonArray( -// const std::string& name, const Array& array, RjWriter* json_writer) { -// JsonArrayWriter converter(name, array, json_writer); -// converter.Write(); -// } - -// Status ReadJsonArray( -// const rj::Value& json_array, const Schema& schema, std::shared_ptr* array) { -// JsonArrayReader converter(json_array, schema); -// return converter.GetArray(array); -// } +Status WriteJsonArray( + const std::string& name, const Array& array, RjWriter* json_writer) { + JsonArrayWriter converter(name, array, json_writer); + return converter.Write(); +} + +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, + std::shared_ptr* array) { + JsonArrayReader converter(pool, json_array, schema); + return converter.GetArray(array); +} } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/schema-test.cc b/cpp/src/arrow/schema-test.cc index dcfc9fd5ff7..4826199f73d 100644 --- a/cpp/src/arrow/schema-test.cc +++ b/cpp/src/arrow/schema-test.cc @@ -98,4 +98,25 @@ f3: list)"; ASSERT_EQ(expected, result); } +TEST_F(TestSchema, GetFieldByName) { + auto f0 = field("f0", int32()); + auto f1 = field("f1", uint8(), false); + auto f2 = field("f2", utf8()); + auto f3 = field("f3", list(int16())); + + vector> fields = {f0, f1, f2, f3}; + auto schema = std::make_shared(fields); + + 
std::shared_ptr result; + + result = schema->GetFieldByName("f1"); + ASSERT_TRUE(f1->Equals(result)); + + result = schema->GetFieldByName("f3"); + ASSERT_TRUE(f3->Equals(result)); + + result = schema->GetFieldByName("not-found"); + ASSERT_TRUE(result == nullptr); +} + } // namespace arrow diff --git a/cpp/src/arrow/schema.cc b/cpp/src/arrow/schema.cc index ff3ea1990e5..cd8256e658e 100644 --- a/cpp/src/arrow/schema.cc +++ b/cpp/src/arrow/schema.cc @@ -42,6 +42,21 @@ bool Schema::Equals(const std::shared_ptr& other) const { return Equals(*other.get()); } +std::shared_ptr Schema::GetFieldByName(const std::string& name) { + if (fields_.size() > 0 && name_to_index_.size() == 0) { + for (size_t i = 0; i < fields_.size(); ++i) { + name_to_index_[fields_[i]->name] = i; + } + } + + auto it = name_to_index_.find(name); + if (it == name_to_index_.end()) { + return nullptr; + } else { + return fields_[it->second]; + } +} + std::string Schema::ToString() const { std::stringstream buffer; diff --git a/cpp/src/arrow/schema.h b/cpp/src/arrow/schema.h index 8e9a95f1e81..0e1ab5c368e 100644 --- a/cpp/src/arrow/schema.h +++ b/cpp/src/arrow/schema.h @@ -20,14 +20,14 @@ #include #include +#include #include +#include "arrow/type.h" #include "arrow/util/visibility.h" namespace arrow { -struct Field; - class ARROW_EXPORT Schema { public: explicit Schema(const std::vector>& fields); @@ -37,7 +37,10 @@ class ARROW_EXPORT Schema { bool Equals(const std::shared_ptr& other) const; // Return the ith schema element. 
Does not boundscheck - const std::shared_ptr& field(int i) const { return fields_[i]; } + std::shared_ptr field(int i) const { return fields_[i]; } + + // Returns nullptr if name not found + std::shared_ptr GetFieldByName(const std::string& name); const std::vector>& fields() const { return fields_; } @@ -48,6 +51,7 @@ class ARROW_EXPORT Schema { private: std::vector> fields_; + std::unordered_map name_to_index_; }; } // namespace arrow diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index a9d4eea3c0d..f27384077ec 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -74,21 +74,15 @@ struct Type { // Default unit millisecond TIMESTAMP = 17, - // Timestamp as double seconds since the UNIX epoch - TIMESTAMP_DOUBLE = 18, - // Exact time encoded with int64, default unit millisecond - TIME = 19, + TIME = 18, // YEAR_MONTH or DAY_TIME interval in SQL style - INTERVAL = 20, + INTERVAL = 19, // Precision- and scale-based decimal type. Storage type depends on the // parameters. 
- DECIMAL = 21, - - // Decimal value encoded as a text string - DECIMAL_TEXT = 22, + DECIMAL = 20, // A list of some logical data type LIST = 30, @@ -98,6 +92,12 @@ struct Type { // Unions of logical types UNION = 32, + + // Timestamp as double seconds since the UNIX epoch + TIMESTAMP_DOUBLE = 33, + + // Decimal value encoded as a text string + DECIMAL_TEXT = 34, }; }; @@ -155,7 +155,7 @@ struct ARROW_EXPORT Field { std::string name; // The field's data type - TypePtr type; + std::shared_ptr type; // Fields can be nullable bool nullable; @@ -164,8 +164,8 @@ struct ARROW_EXPORT Field { // 0 means it's not dictionary encoded int64_t dictionary; - Field(const std::string& name, const TypePtr& type, bool nullable = true, - int64_t dictionary = 0) + Field(const std::string& name, const std::shared_ptr& type, + bool nullable = true, int64_t dictionary = 0) : name(name), type(type), nullable(nullable), dictionary(dictionary) {} bool operator==(const Field& other) const { return this->Equals(other); } @@ -459,8 +459,8 @@ std::shared_ptr ARROW_EXPORT union_( const std::vector>& child_fields, const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE); -std::shared_ptr ARROW_EXPORT field(const std::string& name, const TypePtr& type, - bool nullable = true, int64_t dictionary = 0); +std::shared_ptr ARROW_EXPORT field(const std::string& name, + const std::shared_ptr& type, bool nullable = true, int64_t dictionary = 0); } // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 0ac42c6f7de..2bc8d277279 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -22,32 +22,76 @@ namespace arrow { class Status; -// Type forward declarations for the TypeVisitor struct DataType; +class Array; +class ArrayBuilder; struct Field; + struct NullType; +class NullArray; + struct BooleanType; -struct Int8Type; -struct Int16Type; -struct Int32Type; -struct Int64Type; -struct UInt8Type; -struct UInt16Type; -struct UInt32Type; -struct 
UInt64Type; -struct HalfFloatType; -struct FloatType; -struct DoubleType; -struct StringType; +class BooleanArray; +class BooleanBuilder; + struct BinaryType; +class BinaryArray; +class BinaryBuilder; + +struct StringType; +class StringArray; +class StringBuilder; + +struct ListType; +class ListArray; +class ListBuilder; + +struct StructType; +class StructArray; +class StructBuilder; + +struct DecimalType; +class DecimalArray; + +struct UnionType; +class UnionArray; + +template +class NumericArray; + +template +class NumericBuilder; + +#define _NUMERIC_TYPE_DECL(KLASS) \ + struct KLASS##TYPE; \ + using KLASS##Array = NumericArray; \ + using KLASS##Builder = NumericBuilder; + +_NUMERIC_TYPE_DECL(Int8); +_NUMERIC_TYPE_DECL(Int16); +_NUMERIC_TYPE_DECL(Int32); +_NUMERIC_TYPE_DECL(Int64); +_NUMERIC_TYPE_DECL(UInt8); +_NUMERIC_TYPE_DECL(UInt16); +_NUMERIC_TYPE_DECL(UInt32); +_NUMERIC_TYPE_DECL(UInt64); +_NUMERIC_TYPE_DECL(HalfFloat); +_NUMERIC_TYPE_DECL(Float); +_NUMERIC_TYPE_DECL(Double); + +#undef _NUMERIC_TYPE_DECL + struct DateType; +class DateArray; + struct TimeType; +class TimeArray; + struct TimestampType; +using TimestampArray = NumericArray; + struct IntervalType; -struct DecimalType; -struct ListType; -struct StructType; -struct UnionType; +using IntervalArray = NumericArray; class TypeVisitor { public: @@ -76,35 +120,6 @@ class TypeVisitor { virtual Status Visit(const UnionType& type) = 0; }; -class NullArray; -class BooleanArray; -class StringArray; -class BinaryArray; -class DecimalArray; -class ListArray; -class StructArray; -class UnionArray; - -template -class NumericArray; - -class DateArray; -class TimeArray; - -using HalfFloatArray = NumericArray; -using FloatArray = NumericArray; -using DoubleArray = NumericArray; -using Int8Array = NumericArray; -using UInt8Array = NumericArray; -using Int16Array = NumericArray; -using UInt16Array = NumericArray; -using Int32Array = NumericArray; -using UInt32Array = NumericArray; -using Int64Array = NumericArray; 
-using UInt64Array = NumericArray; -using TimestampArray = NumericArray; -using IntervalArray = NumericArray; - class ArrayVisitor { public: virtual Status Visit(const NullArray& array) = 0; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index e4c9824fa01..4449a37575a 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -31,18 +31,21 @@ struct TypeTraits {}; template <> struct TypeTraits { using ArrayType = UInt8Array; + using BuilderType = UInt8Builder; static inline int bytes_required(int elements) { return elements; } }; template <> struct TypeTraits { using ArrayType = Int8Array; + using BuilderType = Int8Builder; static inline int bytes_required(int elements) { return elements; } }; template <> struct TypeTraits { using ArrayType = UInt16Array; + using BuilderType = UInt16Builder; static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } }; @@ -50,6 +53,7 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = Int16Array; + using BuilderType = Int16Builder; static inline int bytes_required(int elements) { return elements * sizeof(int16_t); } }; @@ -57,6 +61,7 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = UInt32Array; + using BuilderType = UInt32Builder; static inline int bytes_required(int elements) { return elements * sizeof(uint32_t); } }; @@ -64,6 +69,7 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = Int32Array; + using BuilderType = Int32Builder; static inline int bytes_required(int elements) { return elements * sizeof(int32_t); } }; @@ -71,6 +77,7 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = UInt64Array; + using BuilderType = UInt64Builder; static inline int bytes_required(int elements) { return elements * sizeof(uint64_t); } }; @@ -78,6 +85,7 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = Int64Array; + using BuilderType = Int64Builder; static inline int 
bytes_required(int elements) { return elements * sizeof(int64_t); } }; @@ -85,13 +93,15 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = TimestampArray; + using BuilderType = TimestampBuilder; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; -template <> +template <> struct TypeTraits { using ArrayType = FloatArray; + using BuilderType = FloatBuilder; static inline int bytes_required(int elements) { return elements * sizeof(float); } }; @@ -99,13 +109,15 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = DoubleArray; + using BuilderType = DoubleBuilder; static inline int bytes_required(int elements) { return elements * sizeof(double); } }; template <> struct TypeTraits { - typedef BooleanArray ArrayType; + using ArrayType = BooleanArray; + using BuilderType = BooleanBuilder; static inline int bytes_required(int elements) { return BitUtil::BytesForBits(elements); diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index e4dd18761a5..449561d9cbc 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -86,6 +86,7 @@ template class NumericArray; template class NumericArray; template class NumericArray; template class NumericArray; +template class NumericArray; template class NumericArray; template class NumericArray; From 4fc729447f4a874bc72b751686638f98a8becb1c Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 14 Nov 2016 22:16:44 -0500 Subject: [PATCH 14/27] Refactoring, type attribute consistency. 
Array reader compiles Change-Id: I9cda0f769d8c942893cb0e33e772068b4c850ef8 --- cpp/src/arrow/ipc/json-internal.cc | 77 +++++++++++++++++++----------- cpp/src/arrow/type.h | 40 +++++++++++----- cpp/src/arrow/type_fwd.h | 2 +- cpp/src/arrow/type_traits.h | 38 ++++++++++----- cpp/src/arrow/types/primitive.cc | 1 + cpp/src/arrow/types/primitive.h | 25 +++++----- 6 files changed, 119 insertions(+), 64 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 8ebc8bc6c7c..62fd2c714d5 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -864,7 +864,7 @@ class JsonArrayReader { RETURN_NOT_STRING("name", json_name, json_array); return GetArrayFromStruct( - json_array_, json_name.GetString(), schema_.fields(), array); + json_array_, json_name->value.GetString(), schema_.fields(), array); } Status GetArrayFromStruct(const rj::Value& obj, const std::string& name, @@ -888,7 +888,10 @@ class JsonArrayReader { } template - Status ReadArray(const rj::Value& obj, const std::vector& is_valid, + typename std::enable_if::value || + std::is_base_of::value, + Status>::type + ReadArray(const rj::Value& obj, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { typename TypeTraits::BuilderType builder(pool_, type); const auto& json_array = obj.GetObject(); @@ -896,7 +899,7 @@ class JsonArrayReader { const auto& json_data = json_array.FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, json_array); - const auto& json_data_arr = json_type_ids->value.GetArray(); + const auto& json_data_arr = json_data->value.GetArray(); for (auto i = 0; i < json_data_arr.Size(); ++i) { if (!is_valid[i]) { @@ -923,24 +926,32 @@ class JsonArrayReader { } template - typename std::enable_if::value, Status>::type - ReadArray(const rj::Value& obj, const std::shared_ptr& type, - std::shared_ptr* array) {} + typename std::enable_if::value, Status>::type ReadArray( + const rj::Value& obj, const 
std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + return Status::OK(); + } template - typename std::enable_if::value, Status>::type - ReadArray(const rj::Value& obj, const std::shared_ptr& type, - std::shared_ptr* array) {} + typename std::enable_if::value, Status>::type ReadArray( + const rj::Value& obj, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + return Status::OK(); + } template - typename std::enable_if::value, Status>::type - ReadArray(const rj::Value& obj, const std::shared_ptr& type, - std::shared_ptr* array) {} + typename std::enable_if::value, Status>::type ReadArray( + const rj::Value& obj, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + return Status::OK(); + } template - typename std::enable_if::value, Status>::type - ReadArray(const rj::Value& obj, const std::shared_ptr& type, - std::shared_ptr* array) {} + typename std::enable_if::value, Status>::type ReadArray( + const rj::Value& obj, const std::vector& is_valid, + const std::shared_ptr& type, std::shared_ptr* array) { + return Status::NotImplemented("null"); + } Status GetArray(const rj::Value& obj, const std::shared_ptr& type, std::shared_ptr* array) { @@ -949,21 +960,31 @@ class JsonArrayReader { const auto& json_length = json_array.FindMember("count"); RETURN_NOT_INT("count", json_length, json_array); + int32_t length = json_length->value.GetInt(); + + const auto& json_valid_iter = json_array.FindMember("VALIDITY"); + RETURN_NOT_ARRAY("VALIDITY", json_valid_iter, json_array); - const auto& json_validity = json_array.FindMember("VALIDITY"); - RETURN_NOT_ARRAY("VALIDITY", json_validity, json_array); + const auto& json_validity = json_valid_iter->value.GetArray(); - std::vector is_valid(count); + DCHECK_EQ(static_cast(json_validity.Size()), length); -#define TYPE_CASE(TYPE) \ - case TYPE::type_enum: \ - return ReadArray(obj, type, array); + std::vector is_valid(length); + for (const 
rj::Value& val : json_validity) { + DCHECK(val.IsInt()); + is_valid.push_back(static_cast(val.GetInt())); + } + +#define TYPE_CASE(TYPE) \ + case TYPE::type_id: \ + return ReadArray(obj, is_valid, type, array); -#define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ - case Type::TYPE_ENUM: \ - std::stringstream ss; \ - ss << type->ToString(); \ - return Status::NotImplemented(ss.str()); +#define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ + case Type::TYPE_ENUM: { \ + std::stringstream ss; \ + ss << type->ToString(); \ + return Status::NotImplemented(ss.str()); \ + } switch (type->type) { TYPE_CASE(NullType); @@ -1001,7 +1022,7 @@ class JsonArrayReader { } private: - MemoryPool* pool; + MemoryPool* pool_; const rj::Value& json_array_; const Schema& schema_; }; @@ -1025,7 +1046,7 @@ Status WriteJsonArray( Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, std::shared_ptr* array) { JsonArrayReader converter(pool, json_array, schema); - return converter.GetArray(array); + return converter.GetResult(array); } } // namespace ipc diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index f27384077ec..5b4d7bc42bd 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -184,12 +184,16 @@ struct ARROW_EXPORT Field { }; typedef std::shared_ptr FieldPtr; +struct PrimitiveCType : public DataType { + using DataType::DataType; +}; + template -struct ARROW_EXPORT PrimitiveCType : public DataType, public FixedWidthMeta { +struct ARROW_EXPORT CTypeImpl : public PrimitiveCType, public FixedWidthMeta { using c_type = C_TYPE; static constexpr Type::type type_id = TYPE_ID; - PrimitiveCType() : DataType(TYPE_ID) {} + CTypeImpl() : PrimitiveCType(TYPE_ID) {} int bit_width() const override { return sizeof(C_TYPE) * 8; } @@ -201,7 +205,7 @@ struct ARROW_EXPORT PrimitiveCType : public DataType, public FixedWidthMeta { }; struct ARROW_EXPORT NullType : public DataType, public FixedWidthMeta { - static constexpr Type::type type_enum = Type::NA; + static constexpr 
Type::type type_id = Type::NA; NullType() : DataType(Type::NA) {} @@ -213,13 +217,12 @@ struct ARROW_EXPORT NullType : public DataType, public FixedWidthMeta { }; template -struct IntegerTypeImpl : public PrimitiveCType, - public IntegerMeta { +struct IntegerTypeImpl : public CTypeImpl, public IntegerMeta { bool is_signed() const override { return std::is_signed::value; } }; struct ARROW_EXPORT BooleanType : public DataType, FixedWidthMeta { - static constexpr Type::type type_enum = Type::BOOL; + static constexpr Type::type type_id = Type::BOOL; BooleanType() : DataType(Type::BOOL) {} @@ -266,25 +269,27 @@ struct ARROW_EXPORT Int64Type : public IntegerTypeImpl, + : public CTypeImpl, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "halffloat"; } }; -struct ARROW_EXPORT FloatType : public PrimitiveCType, +struct ARROW_EXPORT FloatType : public CTypeImpl, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "float"; } }; -struct ARROW_EXPORT DoubleType : public PrimitiveCType, +struct ARROW_EXPORT DoubleType : public CTypeImpl, public FloatingPointMeta { Precision precision() const override; static std::string name() { return "double"; } }; struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta { + static constexpr Type::type type_id = Type::LIST; + // List can contain any other logical value type explicit ListType(const std::shared_ptr& value_type) : ListType(std::make_shared("item", value_type)) {} @@ -305,6 +310,8 @@ struct ARROW_EXPORT ListType : public DataType, public NoExtraMeta { // BinaryType type is reprsents lists of 1-byte values. 
struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { + static constexpr Type::type type_id = Type::BINARY; + BinaryType() : BinaryType(Type::BINARY) {} Status Accept(TypeVisitor* visitor) const override; @@ -318,6 +325,8 @@ struct ARROW_EXPORT BinaryType : public DataType, public NoExtraMeta { // UTF encoded strings struct ARROW_EXPORT StringType : public BinaryType { + static constexpr Type::type type_id = Type::STRING; + StringType() : BinaryType(Type::STRING) {} Status Accept(TypeVisitor* visitor) const override; @@ -326,6 +335,8 @@ struct ARROW_EXPORT StringType : public BinaryType { }; struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta { + static constexpr Type::type type_id = Type::STRUCT; + explicit StructType(const std::vector>& fields) : DataType(Type::STRUCT) { children_ = fields; @@ -337,6 +348,8 @@ struct ARROW_EXPORT StructType : public DataType, public NoExtraMeta { }; struct ARROW_EXPORT DecimalType : public DataType { + static constexpr Type::type type_id = Type::DECIMAL; + explicit DecimalType(int precision_, int scale_) : DataType(Type::DECIMAL), precision(precision_), scale(scale_) {} int precision; @@ -350,6 +363,8 @@ struct ARROW_EXPORT DecimalType : public DataType { enum class UnionMode : char { SPARSE, DENSE }; struct ARROW_EXPORT UnionType : public DataType { + static constexpr Type::type type_id = Type::UNION; + UnionType(const std::vector>& child_fields, const std::vector& type_ids, UnionMode mode = UnionMode::SPARSE) : DataType(Type::UNION), mode(mode), type_ids(type_ids) { @@ -365,6 +380,8 @@ struct ARROW_EXPORT UnionType : public DataType { }; struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta { + static constexpr Type::type type_id = Type::DATE; + DateType() : DataType(Type::DATE) {} Status Accept(TypeVisitor* visitor) const override; @@ -375,6 +392,7 @@ struct ARROW_EXPORT DateType : public DataType, public NoExtraMeta { enum class TimeUnit : char { SECOND = 0, MILLI = 1, MICRO = 2, 
NANO = 3 }; struct ARROW_EXPORT TimeType : public DataType { + static constexpr Type::type type_id = Type::TIME; using Unit = TimeUnit; TimeUnit unit; @@ -391,7 +409,7 @@ struct ARROW_EXPORT TimestampType : public DataType, public FixedWidthMeta { using Unit = TimeUnit; typedef int64_t c_type; - static constexpr Type::type type_enum = Type::TIMESTAMP; + static constexpr Type::type type_id = Type::TIMESTAMP; int bit_width() const override { return sizeof(int64_t) * 8; } @@ -411,7 +429,7 @@ struct ARROW_EXPORT IntervalType : public DataType, public FixedWidthMeta { enum class Unit : char { YEAR_MONTH = 0, DAY_TIME = 1 }; typedef int64_t c_type; - static constexpr Type::type type_enum = Type::INTERVAL; + static constexpr Type::type type_id = Type::INTERVAL; int bit_width() const override { return sizeof(int64_t) * 8; } diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 2bc8d277279..e80f7b17a98 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -63,7 +63,7 @@ template class NumericBuilder; #define _NUMERIC_TYPE_DECL(KLASS) \ - struct KLASS##TYPE; \ + struct KLASS##Type; \ using KLASS##Array = NumericArray; \ using KLASS##Builder = NumericBuilder; diff --git a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 4449a37575a..1f90f4d65b0 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -93,11 +93,19 @@ struct TypeTraits { template <> struct TypeTraits { using ArrayType = TimestampArray; - using BuilderType = TimestampBuilder; + // using BuilderType = TimestampBuilder; static inline int bytes_required(int elements) { return elements * sizeof(int64_t); } }; +template <> +struct TypeTraits { + using ArrayType = HalfFloatArray; + using BuilderType = HalfFloatBuilder; + + static inline int bytes_required(int elements) { return elements * sizeof(uint16_t); } +}; + template <> struct TypeTraits { using ArrayType = FloatArray; @@ -130,21 +138,27 @@ struct as_void { using type = void; }; -template 
-struct GetCType { - using type = void; -}; +// The partial specialization will match if T has the ATTR_NAME member +#define GET_ATTR(ATTR_NAME, DEFAULT) \ + template \ + struct GetAttr_##ATTR_NAME { \ + using type = DEFAULT; \ + }; \ + \ + template \ + struct GetAttr_##ATTR_NAME::type> { \ + using type = typename T::ATTR_NAME; \ + }; -// The partial specialization will match if T has the c_type member -template -struct GetCType::type> { - using type = typename T::c_type; -}; +GET_ATTR(c_type, void); +GET_ATTR(TypeClass, void); + +#undef GET_ATTR #define PRIMITIVE_TRAITS(T) \ using TypeClass = typename std::conditional::value, T, \ - typename T::TypeClass>::type; \ - using c_type = typename GetCType::type; + typename GetAttr_TypeClass::type>::type; \ + using c_type = typename GetAttr_c_type::type; template struct IsUnsignedInt { diff --git a/cpp/src/arrow/types/primitive.cc b/cpp/src/arrow/types/primitive.cc index 449561d9cbc..14667ee5b6e 100644 --- a/cpp/src/arrow/types/primitive.cc +++ b/cpp/src/arrow/types/primitive.cc @@ -161,6 +161,7 @@ template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; +template class PrimitiveBuilder; template class PrimitiveBuilder; template class PrimitiveBuilder; diff --git a/cpp/src/arrow/types/primitive.h b/cpp/src/arrow/types/primitive.h index 826988a6076..a5a3704e2d2 100644 --- a/cpp/src/arrow/types/primitive.h +++ b/cpp/src/arrow/types/primitive.h @@ -98,7 +98,7 @@ class ARROW_EXPORT NumericArray : public PrimitiveArray { template class ARROW_EXPORT PrimitiveBuilder : public ArrayBuilder { public: - typedef typename Type::c_type value_type; + using value_type = typename Type::c_type; explicit PrimitiveBuilder(MemoryPool* pool, const TypePtr& type) : ArrayBuilder(pool, type), data_(nullptr) {} @@ -173,19 +173,20 @@ class ARROW_EXPORT NumericBuilder : public PrimitiveBuilder { // Builders -typedef NumericBuilder UInt8Builder; -typedef NumericBuilder 
UInt16Builder; -typedef NumericBuilder UInt32Builder; -typedef NumericBuilder UInt64Builder; +using UInt8Builder = NumericBuilder; +using UInt16Builder = NumericBuilder; +using UInt32Builder = NumericBuilder; +using UInt64Builder = NumericBuilder; -typedef NumericBuilder Int8Builder; -typedef NumericBuilder Int16Builder; -typedef NumericBuilder Int32Builder; -typedef NumericBuilder Int64Builder; -typedef NumericBuilder TimestampBuilder; +using Int8Builder = NumericBuilder; +using Int16Builder = NumericBuilder; +using Int32Builder = NumericBuilder; +using Int64Builder = NumericBuilder; +using TimestampBuilder = NumericBuilder; -typedef NumericBuilder FloatBuilder; -typedef NumericBuilder DoubleBuilder; +using HalfFloatBuilder = NumericBuilder; +using FloatBuilder = NumericBuilder; +using DoubleBuilder = NumericBuilder; class ARROW_EXPORT BooleanArray : public PrimitiveArray { public: From f26402a6eaca33db2d8f5a604c9aa720e6a9a738 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Mon, 14 Nov 2016 22:28:20 -0500 Subject: [PATCH 15/27] Install type_traits.h. 
cpplint Change-Id: I008bf0adc806034062d6684fd1615448db246c6b --- cpp/src/arrow/CMakeLists.txt | 1 + cpp/src/arrow/ipc/json-internal.cc | 14 +++++++------- cpp/src/arrow/ipc/json-internal.h | 1 + cpp/src/arrow/ipc/json.cc | 10 ++-------- cpp/src/arrow/ipc/json.h | 3 ++- 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index e3e16ef0817..81851bc5b3e 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -25,6 +25,7 @@ install(FILES table.h type.h type_fwd.h + type_traits.h test-util.h DESTINATION include/arrow) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 62fd2c714d5..f733a44546f 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -21,6 +21,7 @@ #include #include #include +#include #include "rapidjson/stringbuffer.h" #include "rapidjson/writer.h" @@ -53,7 +54,7 @@ static std::string GetBufferTypeName(BufferType type) { return "VALIDITY"; default: break; - }; + } return "UNKNOWN"; } @@ -149,7 +150,7 @@ class JsonSchemaWriter : public TypeVisitor { break; default: break; - }; + } } template @@ -163,7 +164,7 @@ class JsonSchemaWriter : public TypeVisitor { case IntervalType::Unit::DAY_TIME: writer_->String("DAY_TIME"); break; - }; + } } template @@ -185,7 +186,7 @@ class JsonSchemaWriter : public TypeVisitor { case TimeUnit::NANO: writer_->String("NANOSECOND"); break; - }; + } } template @@ -208,7 +209,7 @@ class JsonSchemaWriter : public TypeVisitor { case UnionMode::DENSE: writer_->String("DENSE"); break; - }; + } // Write type ids writer_->Key("typeIds"); @@ -788,7 +789,6 @@ class JsonSchemaReader { } else { std::stringstream ss; ss << "Invalid union mode: " << mode_str; - ; return Status::Invalid(ss.str()); } @@ -1013,7 +1013,7 @@ class JsonArrayReader { std::stringstream ss; ss << type->ToString(); return Status::NotImplemented(ss.str()); - }; + } #undef TYPE_CASE #undef 
NOT_IMPLEMENTED_CASE diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index a50b29547c7..e3ce0b8703a 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -25,6 +25,7 @@ #define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 #include +#include #include "rapidjson/document.h" #include "rapidjson/stringbuffer.h" diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index afe1e75d827..646e7c2b8bb 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -15,15 +15,9 @@ // specific language governing permissions and limitations // under the License. -#include "arrow/ipc/json-internal.h" - -#define RAPIDJSON_HAS_STDSTRING 1 -#define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 -#define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 - -#include "rapidjson/stringbuffer.h" -#include "rapidjson/writer.h" +#include "arrow/ipc/json.h" +#include "arrow/ipc/json-internal.h" #include "arrow/type.h" #include "arrow/util/status.h" diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index e9adf15506b..2c799f9c1e5 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -21,6 +21,7 @@ #define ARROW_IPC_JSON_H #include +#include #include "arrow/type_fwd.h" #include "arrow/visibility.h" @@ -79,7 +80,7 @@ class ARROW_EXPORT JsonReader { Status GetRecordBatch(int i, std::shared_ptr* batch); private: - JsonReader(const std::shared_ptr& file); + explicit JsonReader(const std::shared_ptr& file); std::shared_ptr file_; std::shared_ptr schema_; From 35c2f85c888f2f222220656e2c7faa6d7961f153 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 15 Nov 2016 17:46:05 -0500 Subject: [PATCH 16/27] Refactoring. 
Start drafting string/list reader Change-Id: I3e7508c435e4a9f5fe76c1df3951338a24d81839 --- cpp/src/arrow/ipc/json-internal.cc | 141 ++++++++++++++++++++--------- cpp/src/arrow/ipc/json.cc | 6 +- cpp/src/arrow/ipc/json.h | 6 +- cpp/src/arrow/type_traits.h | 30 ++++-- cpp/src/arrow/types/string.h | 14 +-- 5 files changed, 132 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index f733a44546f..7a0614e9445 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -34,6 +34,7 @@ #include "arrow/types/primitive.h" #include "arrow/types/string.h" #include "arrow/types/struct.h" +#include "arrow/util/bit-util.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -58,6 +59,22 @@ static std::string GetBufferTypeName(BufferType type) { return "UNKNOWN"; } +static std::string GetTimeUnitName(TimeUnit unit) { + switch (unit) { + case TimeUnit::SECOND: + return "SECOND"; + case TimeUnit::MILLI: + return "MILLISECOND"; + case TimeUnit::MICRO: + return "MICROSECOND"; + case TimeUnit::NANO: + return "NANOSECOND"; + default: + break; + } + return "UNKNOWN"; +} + class BufferLayout { public: BufferLayout(BufferType type, int bit_width) : type_(type), bit_width_(bit_width) {} @@ -173,20 +190,7 @@ class JsonSchemaWriter : public TypeVisitor { void>::type WriteTypeMetadata(const T& type) { writer_->Key("unit"); - switch (type.unit) { - case TimeUnit::SECOND: - writer_->String("SECOND"); - break; - case TimeUnit::MILLI: - writer_->String("MILLISECOND"); - break; - case TimeUnit::MICRO: - writer_->String("MICROSECOND"); - break; - case TimeUnit::NANO: - writer_->String("NANOSECOND"); - break; - } + writer_->String(GetTimeUnitName(type.unit)); } template @@ -679,16 +683,15 @@ class JsonSchemaReader { RETURN_NOT_OK(GetFieldsFromArray(json_children->value, &children)); std::shared_ptr type; - RETURN_NOT_OK(GetType(json_type->value, children, &type)); + 
RETURN_NOT_OK(GetType(json_type->value.GetObject(), children, &type)); *field = std::make_shared( json_name->value.GetString(), type, json_nullable->value.GetBool()); return Status::OK(); } - Status GetInteger(const rj::Value& obj, std::shared_ptr* type) { - const auto& json_type = obj.GetObject(); - + Status GetInteger( + const rj::Value::ConstObject& json_type, std::shared_ptr* type) { const auto& json_bit_width = json_type.FindMember("bitWidth"); RETURN_NOT_INT("bitWidth", json_bit_width, json_type); @@ -719,9 +722,8 @@ class JsonSchemaReader { return Status::OK(); } - Status GetFloatingPoint(const rj::Value& obj, std::shared_ptr* type) { - const auto& json_type = obj.GetObject(); - + Status GetFloatingPoint( + const rj::Value::ConstObject& json_type, std::shared_ptr* type) { const auto& json_precision = json_type.FindMember("precision"); RETURN_NOT_STRING("precision", json_precision, json_type); @@ -742,9 +744,8 @@ class JsonSchemaReader { } template - Status GetTimeLike(const rj::Value& obj, std::shared_ptr* type) { - const auto& json_type = obj.GetObject(); - + Status GetTimeLike( + const rj::Value::ConstObject& json_type, std::shared_ptr* type) { const auto& json_unit = json_type.FindMember("unit"); RETURN_NOT_STRING("unit", json_unit, json_type); @@ -771,11 +772,9 @@ class JsonSchemaReader { return Status::OK(); } - Status GetUnion(const rj::Value& obj, + Status GetUnion(const rj::Value::ConstObject& json_type, const std::vector>& children, std::shared_ptr* type) { - const auto& json_type = obj.GetObject(); - const auto& json_mode = json_type.FindMember("mode"); RETURN_NOT_STRING("mode", json_mode, json_type); @@ -806,20 +805,18 @@ class JsonSchemaReader { return Status::OK(); } - Status GetType(const rj::Value& obj, + Status GetType(const rj::Value::ConstObject& json_type, const std::vector>& children, std::shared_ptr* type) { - const auto& json_type = obj.GetObject(); - const auto& json_type_name = json_type.FindMember("name"); RETURN_NOT_STRING("name", 
json_type_name, json_type); std::string type_name = json_type_name->value.GetString(); if (type_name == "int") { - return GetInteger(obj, type); + return GetInteger(json_type, type); } else if (type_name == "floatingpoint") { - return GetFloatingPoint(obj, type); + return GetFloatingPoint(json_type, type); } else if (type_name == "bool") { *type = boolean(); } else if (type_name == "utf8") { @@ -831,15 +828,15 @@ class JsonSchemaReader { } else if (type_name == "date") { *type = date(); } else if (type_name == "time") { - return GetTimeLike(obj, type); + return GetTimeLike(json_type, type); } else if (type_name == "timestamp") { - return GetTimeLike(obj, type); + return GetTimeLike(json_type, type); } else if (type_name == "list") { *type = list(children[0]); } else if (type_name == "struct") { *type = struct_(children); } else { - return GetUnion(obj, children, type); + return GetUnion(json_type, children, type); } return Status::OK(); } @@ -891,10 +888,9 @@ class JsonArrayReader { typename std::enable_if::value || std::is_base_of::value, Status>::type - ReadArray(const rj::Value& obj, const std::vector& is_valid, + ReadArray(const rj::Value::ConstObject& json_array, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { typename TypeTraits::BuilderType builder(pool_, type); - const auto& json_array = obj.GetObject(); const auto& json_data = json_array.FindMember("DATA"); RETURN_NOT_ARRAY("DATA", json_data, json_array); @@ -909,12 +905,16 @@ class JsonArrayReader { const rj::Value& val = json_data_arr[i]; if (IsSignedInt::value) { + DCHECK(val.IsInt()); builder.Append(val.GetInt64()); } else if (IsUnsignedInt::value) { + DCHECK(val.IsUint()); builder.Append(val.GetUint64()); } else if (IsFloatingPoint::value) { + DCHECK(val.IsFloat()); builder.Append(val.GetFloat()); } else if (std::is_base_of::value) { + DCHECK(val.IsBool()); builder.Append(val.GetBool()); } else { // We are in the wrong function @@ -927,28 +927,83 @@ class 
JsonArrayReader { template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value& obj, const std::vector& is_valid, + const rj::Value::ConstObject& json_array, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { - return Status::OK(); + typename TypeTraits::BuilderType builder(pool_, type); + + const auto& json_data = json_array.FindMember("DATA"); + RETURN_NOT_ARRAY("DATA", json_data, json_array); + + const auto& json_data_arr = json_data->value.GetArray(); + + for (auto i = 0; i < json_data_arr.Size(); ++i) { + if (!is_valid[i]) { + builder.AppendNull(); + continue; + } + + const rj::Value& val = json_data_arr[i]; + DCHECK(val.IsString()); + builder.Append(val.GetString()); + } + + return builder.Finish(array); } template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value& obj, const std::vector& is_valid, + const rj::Value::ConstObject& json_array, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { + const auto& json_offsets = json_array.FindMember("OFFSETS"); + RETURN_NOT_ARRAY("OFFSETS", json_offsets, json_array); + const auto& json_offsets_arr = json_offsets->value.GetArray(); + + int length = static_cast(is_valid.size()); + + auto validity_buffer = std::make_shared(pool_); + RETURN_NOT_OK(validity_buffer->Resize(BitUtil::BytesForBits(length))); + + auto offsets_buffer = std::make_shared(pool_); + RETURN_NOT_OK(offsets_buffer->Resize((length + 1) * sizeof(int32_t))); + + int32_t null_count = 0; + uint8_t* bitmap = reinterpret_cast(validity_buffer->mutable_data()); + memset(bitmap, 0, validity_buffer->size()); + + int32_t* offsets = reinterpret_cast(offsets_buffer->mutable_data()); + + for (int i = 0; i < length; ++i) { + const rj::Value& val = json_offsets_arr[i]; + + DCHECK(val.IsInt()); + offsets[i] = val.GetInt(); + + if (!is_valid[i]) { + ++null_count; + continue; + } + BitUtil::SetBit(bitmap, i); + } + + // auto list_type = 
dynamic_cast(type.get()); + std::shared_ptr values; + + *array = std::make_shared( + type, length, offsets_buffer, values, null_count, validity_buffer); + return Status::OK(); } template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value& obj, const std::vector& is_valid, + const rj::Value::ConstObject& json_array, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { return Status::OK(); } template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value& obj, const std::vector& is_valid, + const rj::Value::ConstObject& json_array, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { return Status::NotImplemented("null"); } @@ -977,7 +1032,7 @@ class JsonArrayReader { #define TYPE_CASE(TYPE) \ case TYPE::type_id: \ - return ReadArray(obj, is_valid, type, array); + return ReadArray(json_array, is_valid, type, array); #define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ case Type::TYPE_ENUM: { \ diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 646e7c2b8bb..49992f85d70 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -22,9 +22,5 @@ #include "arrow/util/status.h" namespace arrow { -namespace ipc { - -namespace rj = rapidjson; - -} // namespace ipc +namespace ipc {} // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 2c799f9c1e5..0485f8e4163 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -24,11 +24,13 @@ #include #include "arrow/type_fwd.h" -#include "arrow/visibility.h" +#include "arrow/util/visibility.h" namespace arrow { class MemoryPool; +class RecordBatch; +class Schema; namespace io { @@ -42,7 +44,7 @@ namespace ipc { class ARROW_EXPORT JsonWriter { public: static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); + std::shared_ptr* out); // TODO(wesm): Write dictionaries diff --git 
a/cpp/src/arrow/type_traits.h b/cpp/src/arrow/type_traits.h index 1f90f4d65b0..bbb807488e3 100644 --- a/cpp/src/arrow/type_traits.h +++ b/cpp/src/arrow/type_traits.h @@ -132,6 +132,18 @@ struct TypeTraits { } }; +template <> +struct TypeTraits { + using ArrayType = StringArray; + using BuilderType = StringBuilder; +}; + +template <> +struct TypeTraits { + using ArrayType = BinaryArray; + using BuilderType = BinaryBuilder; +}; + // Not all type classes have a c_type template struct as_void { @@ -139,15 +151,15 @@ struct as_void { }; // The partial specialization will match if T has the ATTR_NAME member -#define GET_ATTR(ATTR_NAME, DEFAULT) \ - template \ - struct GetAttr_##ATTR_NAME { \ - using type = DEFAULT; \ - }; \ - \ - template \ +#define GET_ATTR(ATTR_NAME, DEFAULT) \ + template \ + struct GetAttr_##ATTR_NAME { \ + using type = DEFAULT; \ + }; \ + \ + template \ struct GetAttr_##ATTR_NAME::type> { \ - using type = typename T::ATTR_NAME; \ + using type = typename T::ATTR_NAME; \ }; GET_ATTR(c_type, void); @@ -157,7 +169,7 @@ GET_ATTR(TypeClass, void); #define PRIMITIVE_TRAITS(T) \ using TypeClass = typename std::conditional::value, T, \ - typename GetAttr_TypeClass::type>::type; \ + typename GetAttr_TypeClass::type>::type; \ using c_type = typename GetAttr_c_type::type; template diff --git a/cpp/src/arrow/types/string.h b/cpp/src/arrow/types/string.h index 167b8ec8bf8..c8752439f16 100644 --- a/cpp/src/arrow/types/string.h +++ b/cpp/src/arrow/types/string.h @@ -119,6 +119,12 @@ class ARROW_EXPORT BinaryBuilder : public ListBuilder { return byte_builder_->Append(value, length); } + Status Append(const char* value, int32_t length) { + return Append(reinterpret_cast(value), length); + } + + Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } + Status Finish(std::shared_ptr* out) override; protected: @@ -131,13 +137,9 @@ class ARROW_EXPORT StringBuilder : public BinaryBuilder { explicit StringBuilder(MemoryPool* pool, const 
TypePtr& type) : BinaryBuilder(pool, type) {} - Status Finish(std::shared_ptr* out) override; + using BinaryBuilder::Append; - Status Append(const std::string& value) { return Append(value.c_str(), value.size()); } - - Status Append(const char* value, int32_t length) { - return BinaryBuilder::Append(reinterpret_cast(value), length); - } + Status Finish(std::shared_ptr* out) override; Status Append(const std::vector& values, uint8_t* null_bytes); }; From 656634344df2e6bf6f4893e4310554ac7f26b1dc Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 16 Nov 2016 18:01:07 -0500 Subject: [PATCH 17/27] Recursively construct children for list/struct Change-Id: Iab4687ef38f889100a8e83fad59c1bec3772810a --- cpp/src/arrow/ipc/json-internal.cc | 155 +++++++++++++++++++---------- 1 file changed, 103 insertions(+), 52 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 7a0614e9445..a2b8914d84f 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -18,6 +18,7 @@ #include "arrow/ipc/json-internal.h" #include +#include #include #include #include @@ -41,6 +42,9 @@ namespace arrow { namespace ipc { +using RjArray = rj::Value::ConstArray; +using RjObject = rj::Value::ConstObject; + enum class BufferType : char { DATA, OFFSET, TYPE, VALIDITY }; static std::string GetBufferTypeName(BufferType type) { @@ -59,6 +63,20 @@ static std::string GetBufferTypeName(BufferType type) { return "UNKNOWN"; } +static std::string GetFloatingPrecisionName(FloatingPointMeta::Precision precision) { + switch (precision) { + case FloatingPointMeta::HALF: + return "HALF"; + case FloatingPointMeta::SINGLE: + return "SINGLE"; + case FloatingPointMeta::DOUBLE: + return "DOUBLE"; + default: + break; + } + return "UNKNOWN"; +} + static std::string GetTimeUnitName(TimeUnit unit) { switch (unit) { case TimeUnit::SECOND: @@ -155,19 +173,7 @@ class JsonSchemaWriter : public TypeVisitor { typename std::enable_if::value, 
void>::type WriteTypeMetadata(const T& type) { writer_->Key("precision"); - switch (type.precision()) { - case FloatingPointMeta::HALF: - writer_->String("HALF"); - break; - case FloatingPointMeta::SINGLE: - writer_->String("SINGLE"); - break; - case FloatingPointMeta::DOUBLE: - writer_->String("DOUBLE"); - break; - default: - break; - } + writer_->String(GetFloatingPrecisionName(type.precision())); } template @@ -722,8 +728,7 @@ class JsonSchemaReader { return Status::OK(); } - Status GetFloatingPoint( - const rj::Value::ConstObject& json_type, std::shared_ptr* type) { + Status GetFloatingPoint(const RjObject& json_type, std::shared_ptr* type) { const auto& json_precision = json_type.FindMember("precision"); RETURN_NOT_STRING("precision", json_precision, json_type); @@ -744,8 +749,7 @@ class JsonSchemaReader { } template - Status GetTimeLike( - const rj::Value::ConstObject& json_type, std::shared_ptr* type) { + Status GetTimeLike(const RjObject& json_type, std::shared_ptr* type) { const auto& json_unit = json_type.FindMember("unit"); RETURN_NOT_STRING("unit", json_unit, json_type); @@ -772,7 +776,7 @@ class JsonSchemaReader { return Status::OK(); } - Status GetUnion(const rj::Value::ConstObject& json_type, + Status GetUnion(const RjObject& json_type, const std::vector>& children, std::shared_ptr* type) { const auto& json_mode = json_type.FindMember("mode"); @@ -797,6 +801,7 @@ class JsonSchemaReader { std::vector type_ids; const auto& id_array = json_type_ids->value.GetArray(); for (const rj::Value& val : id_array) { + DCHECK(val.IsUint()); type_ids.push_back(val.GetUint()); } @@ -805,7 +810,7 @@ class JsonSchemaReader { return Status::OK(); } - Status GetType(const rj::Value::ConstObject& json_type, + Status GetType(const RjObject& json_type, const std::vector>& children, std::shared_ptr* type) { const auto& json_type_name = json_type.FindMember("name"); @@ -852,9 +857,6 @@ class JsonArrayReader { : pool_(pool), json_array_(json_array), schema_(schema) {} Status 
GetResult(std::shared_ptr* array) { - if (!json_array_.IsObject()) { - return Status::Invalid("Array was not a JSON object"); - } const auto& json_array = json_array_.GetObject(); const auto& json_name = json_array.FindMember("name"); @@ -884,11 +886,33 @@ class JsonArrayReader { return GetArray(obj, result->type, array); } + Status GetValidityBuffer(const std::vector& is_valid, int32_t* null_count, + std::shared_ptr* validity_buffer) { + int length = static_cast(is_valid.size()); + + auto out_buffer = std::make_shared(pool_); + RETURN_NOT_OK(out_buffer->Resize(BitUtil::BytesForBits(length))); + uint8_t* bitmap = reinterpret_cast(out_buffer->mutable_data()); + memset(bitmap, 0, out_buffer->size()); + + *null_count = 0; + for (int i = 0; i < length; ++i) { + if (!is_valid[i]) { + ++(*null_count); + continue; + } + BitUtil::SetBit(bitmap, i); + } + + *validity_buffer = out_buffer; + return Status::OK(); + } + template typename std::enable_if::value || std::is_base_of::value, Status>::type - ReadArray(const rj::Value::ConstObject& json_array, const std::vector& is_valid, + ReadArray(const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { typename TypeTraits::BuilderType builder(pool_, type); @@ -897,7 +921,8 @@ class JsonArrayReader { const auto& json_data_arr = json_data->value.GetArray(); - for (auto i = 0; i < json_data_arr.Size(); ++i) { + DCHECK_EQ(static_cast(json_data_arr.Size()), length); + for (auto i = 0; i < length; ++i) { if (!is_valid[i]) { builder.AppendNull(); continue; @@ -927,7 +952,7 @@ class JsonArrayReader { template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value::ConstObject& json_array, const std::vector& is_valid, + const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { typename TypeTraits::BuilderType builder(pool_, type); @@ -936,7 +961,8 @@ class JsonArrayReader { const 
auto& json_data_arr = json_data->value.GetArray(); - for (auto i = 0; i < json_data_arr.Size(); ++i) { + DCHECK_EQ(static_cast(json_data_arr.Size()), length); + for (auto i = 0; i < length; ++i) { if (!is_valid[i]) { builder.AppendNull(); continue; @@ -952,65 +978,89 @@ class JsonArrayReader { template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value::ConstObject& json_array, const std::vector& is_valid, + const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { const auto& json_offsets = json_array.FindMember("OFFSETS"); RETURN_NOT_ARRAY("OFFSETS", json_offsets, json_array); const auto& json_offsets_arr = json_offsets->value.GetArray(); - int length = static_cast(is_valid.size()); - - auto validity_buffer = std::make_shared(pool_); - RETURN_NOT_OK(validity_buffer->Resize(BitUtil::BytesForBits(length))); + int32_t null_count = 0; + std::shared_ptr validity_buffer; + RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer)); auto offsets_buffer = std::make_shared(pool_); RETURN_NOT_OK(offsets_buffer->Resize((length + 1) * sizeof(int32_t))); - - int32_t null_count = 0; - uint8_t* bitmap = reinterpret_cast(validity_buffer->mutable_data()); - memset(bitmap, 0, validity_buffer->size()); - int32_t* offsets = reinterpret_cast(offsets_buffer->mutable_data()); - for (int i = 0; i < length; ++i) { + for (int i = 0; i < length + 1; ++i) { const rj::Value& val = json_offsets_arr[i]; - DCHECK(val.IsInt()); offsets[i] = val.GetInt(); - - if (!is_valid[i]) { - ++null_count; - continue; - } - BitUtil::SetBit(bitmap, i); } - // auto list_type = dynamic_cast(type.get()); - std::shared_ptr values; + std::vector> children; + RETURN_NOT_OK(GetChildren(json_array, type, &children)); + DCHECK_EQ(children.size(), 1); *array = std::make_shared( - type, length, offsets_buffer, values, null_count, validity_buffer); + type, length, offsets_buffer, children[0], null_count, 
validity_buffer); return Status::OK(); } template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value::ConstObject& json_array, const std::vector& is_valid, + const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { + int32_t null_count = 0; + std::shared_ptr validity_buffer; + RETURN_NOT_OK(GetValidityBuffer(is_valid, &null_count, &validity_buffer)); + + std::vector> fields; + RETURN_NOT_OK(GetChildren(json_array, type, &fields)); + + *array = + std::make_shared(type, length, fields, null_count, validity_buffer); + return Status::OK(); } template typename std::enable_if::value, Status>::type ReadArray( - const rj::Value::ConstObject& json_array, const std::vector& is_valid, + const RjObject& json_array, int32_t length, const std::vector& is_valid, const std::shared_ptr& type, std::shared_ptr* array) { - return Status::NotImplemented("null"); + *array = std::make_shared(type, length); + return Status::OK(); + } + + Status GetChildren(const RjObject& json_array, const std::shared_ptr& type, + std::vector>* array) { + const auto& json_children = json_array.FindMember("children"); + RETURN_NOT_ARRAY("children", json_children, json_array); + const auto& json_children_arr = json_children->value.GetArray(); + + if (type->num_children() != static_cast(json_children_arr.Size())) { + std::stringstream ss; + ss << "Expected " << type->num_children() << " children, but got " + << json_children_arr.Size(); + return Status::Invalid(ss.str()); + } + + for (auto i = 0; i < json_children_arr.Size(); ++i) { + DCHECK(json_children_arr[i].IsObject()); + std::shared_ptr child; + RETURN_NOT_OK(GetArray(json_children_arr[i], type->child(i)->type, &child)); + array->emplace_back(child); + } + + return Status::OK(); } Status GetArray(const rj::Value& obj, const std::shared_ptr& type, std::shared_ptr* array) { - if (!obj.IsObject()) { return Status::Invalid("Array was not a JSON object"); } + if 
(!obj.IsObject()) { + return Status::Invalid("Array element was not a JSON object"); + } const auto& json_array = obj.GetObject(); const auto& json_length = json_array.FindMember("count"); @@ -1032,7 +1082,7 @@ class JsonArrayReader { #define TYPE_CASE(TYPE) \ case TYPE::type_id: \ - return ReadArray(json_array, is_valid, type, array); + return ReadArray(json_array, length, is_valid, type, array); #define NOT_IMPLEMENTED_CASE(TYPE_ENUM) \ case Type::TYPE_ENUM: { \ @@ -1100,6 +1150,7 @@ Status WriteJsonArray( Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, std::shared_ptr* array) { + if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); } JsonArrayReader converter(pool, json_array, schema); return converter.GetResult(array); } From 0891378a2b1b45f792a67038df57270fc810185b Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 16 Nov 2016 18:13:13 -0500 Subject: [PATCH 18/27] Declare loop variables Change-Id: If772fa9cf8b3ea4f04cdf0825d91572e96825f31 --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/ipc/json-internal.cc | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0bff7528578..9245615f10a 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -114,7 +114,7 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) # compiler flags that are common across debug/release builds # - Wall: Enable all warnings. 
-set(CXX_COMMON_FLAGS "-std=c++11 -Wall") +set(CXX_COMMON_FLAGS "-std=c++11 -Wall -Werror") # Only enable additional instruction sets if they are supported if (CXX_SUPPORTS_SSE3 AND ARROW_SSE3) diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index a2b8914d84f..49adeea101e 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -414,7 +414,7 @@ class JsonArrayWriter : public ArrayVisitor { typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { const auto data = arr.raw_data(); - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Int64(data[i]); } } @@ -423,7 +423,7 @@ class JsonArrayWriter : public ArrayVisitor { typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { const auto data = arr.raw_data(); - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Uint64(data[i]); } } @@ -432,7 +432,7 @@ class JsonArrayWriter : public ArrayVisitor { typename std::enable_if::value, void>::type WriteDataValues( const T& arr) { const auto data = arr.raw_data(); - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Double(data[i]); } } @@ -441,7 +441,7 @@ class JsonArrayWriter : public ArrayVisitor { template typename std::enable_if::value, void>::type WriteDataValues(const T& arr) { - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { int32_t length; const char* buf = reinterpret_cast(arr.GetValue(i, &length)); writer_->String(buf, length); @@ -451,7 +451,7 @@ class JsonArrayWriter : public ArrayVisitor { template typename std::enable_if::value, void>::type WriteDataValues(const T& arr) { - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Bool(arr.Value(i)); } } @@ -467,7 +467,7 @@ class JsonArrayWriter : public ArrayVisitor { void WriteOffsetsField(const T* 
offsets, int32_t length) { writer_->Key("OFFSETS"); writer_->StartArray(); - for (auto i = 0; i < length; ++i) { + for (int i = 0; i < length; ++i) { writer_->Int64(offsets[i]); } writer_->EndArray(); @@ -477,11 +477,11 @@ class JsonArrayWriter : public ArrayVisitor { writer_->Key("VALIDITY"); writer_->StartArray(); if (arr.null_count() > 0) { - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Int(arr.IsNull(i) ? 0 : 1); } } else { - for (auto i = 0; i < arr.length(); ++i) { + for (int i = 0; i < arr.length(); ++i) { writer_->Int(1); } } @@ -922,7 +922,7 @@ class JsonArrayReader { const auto& json_data_arr = json_data->value.GetArray(); DCHECK_EQ(static_cast(json_data_arr.Size()), length); - for (auto i = 0; i < length; ++i) { + for (int i = 0; i < length; ++i) { if (!is_valid[i]) { builder.AppendNull(); continue; @@ -962,7 +962,7 @@ class JsonArrayReader { const auto& json_data_arr = json_data->value.GetArray(); DCHECK_EQ(static_cast(json_data_arr.Size()), length); - for (auto i = 0; i < length; ++i) { + for (int i = 0; i < length; ++i) { if (!is_valid[i]) { builder.AppendNull(); continue; @@ -1046,7 +1046,7 @@ class JsonArrayReader { return Status::Invalid(ss.str()); } - for (auto i = 0; i < json_children_arr.Size(); ++i) { + for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { DCHECK(json_children_arr[i].IsObject()); std::shared_ptr child; RETURN_NOT_OK(GetArray(json_children_arr[i], type->child(i)->type, &child)); From 82f108bcc8fb61828fdb277c67c2cd3eeb057be7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Wed, 16 Nov 2016 23:50:49 -0500 Subject: [PATCH 19/27] Refactoring. 
Array test scaffold Change-Id: I92590f8ad17c761576af499584225d4fe24c7440 --- cpp/src/arrow/ipc/ipc-json-test.cc | 74 +++++++++++++++++++++------ cpp/src/arrow/ipc/json-internal.cc | 72 ++++++++++++-------------- cpp/src/arrow/ipc/json-internal.h | 11 ++-- cpp/src/arrow/type_fwd.h | 3 ++ cpp/src/arrow/types/primitive-test.cc | 14 ----- cpp/src/arrow/types/test-common.h | 16 ++++++ 6 files changed, 115 insertions(+), 75 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 7690f5400b4..79c52735dcd 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -30,36 +30,70 @@ #include "rapidjson/prettywriter.h" #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/test-util.h" #include "arrow/type.h" +#include "arrow/type_traits.h" +#include "arrow/types/primitive.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { namespace ipc { -class TestJsonSchemaWriter : public ::testing::Test { - public: - void SetUp() {} - void TearDown() {} +void TestSchemaRoundTrip(const Schema& schema) { + rj::StringBuffer sb; + rj::Writer writer(sb); - void TestRoundTrip(const Schema& schema) { - rj::StringBuffer sb; - rj::Writer writer(sb); + ASSERT_OK(WriteJsonSchema(schema, &writer)); - ASSERT_OK(WriteJsonSchema(schema, &writer)); + rj::Document d; + d.Parse(sb.GetString()); - rj::Document d; - d.Parse(sb.GetString()); + std::shared_ptr out; + ASSERT_OK(ReadJsonSchema(d, &out)); - std::shared_ptr out; - ASSERT_OK(ReadJsonSchema(d, &out)); + ASSERT_TRUE(schema.Equals(out)); +} + +void TestArrayRoundTrip(const Array& array) { + static std::string name = "dummy"; + + rj::StringBuffer sb; + rj::Writer writer(sb); + + ASSERT_OK(WriteJsonArray(name, array, &writer)); + + rj::Document d; + d.Parse(sb.GetString()); + + std::shared_ptr out; + ASSERT_OK(ReadJsonArray(default_memory_pool(), d, array.type(), &out)); + + ASSERT_TRUE(array.Equals(out)); +} - 
ASSERT_TRUE(schema.Equals(out)); + +template +void CheckPrimitive(const std::shared_ptr& type, + const std::vector& is_valid, const std::vector& values) { + MemoryPool* pool = default_memory_pool(); + typename TypeTraits::BuilderType builder(pool, type); + + for (size_t i = 0; i < values.size(); ++i) { + if (is_valid[i]) { + ASSERT_OK(builder.Append(values[i])); + } else { + ASSERT_OK(builder.AppendNull()); + } } -}; -TEST_F(TestJsonSchemaWriter, FlatTypes) { + std::shared_ptr array; + ASSERT_OK(builder.Finish(&array)); + TestArrayRoundTrip(*array.get()); +} + +TEST(TestJsonSchemaWriter, FlatTypes) { std::vector> fields = {field("f0", int8()), field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false), field("f4", uint8()), field("f5", uint16()), field("f6", uint32()), @@ -72,7 +106,15 @@ TEST_F(TestJsonSchemaWriter, FlatTypes) { {0, 1}, UnionMode::DENSE))}; Schema schema(fields); - TestRoundTrip(schema); + TestSchemaRoundTrip(schema); +} + + +TEST(TestJsonArrayWriter, PrimitiveTypes) { + std::vector is_valid = {true, false, true, true, true, false, true, true}; + + std::vector u1 = {0, 1, 2, 3, 4, 5, 6, 7}; + CheckPrimitive(uint8(), is_valid, u1); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 49adeea101e..3bc631dbca7 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -852,39 +852,7 @@ class JsonSchemaReader { class JsonArrayReader { public: - explicit JsonArrayReader( - MemoryPool* pool, const rj::Value& json_array, const Schema& schema) - : pool_(pool), json_array_(json_array), schema_(schema) {} - - Status GetResult(std::shared_ptr* array) { - const auto& json_array = json_array_.GetObject(); - - const auto& json_name = json_array.FindMember("name"); - RETURN_NOT_STRING("name", json_name, json_array); - - return GetArrayFromStruct( - json_array_, json_name->value.GetString(), schema_.fields(), array); - } - - Status 
GetArrayFromStruct(const rj::Value& obj, const std::string& name, - const std::vector>& fields, std::shared_ptr* array) { - std::shared_ptr result = nullptr; - - for (const std::shared_ptr& field : fields) { - if (field->name == name) { - result = field; - break; - } - } - - if (result == nullptr) { - std::stringstream ss; - ss << "Field named " << name << " not found in struct/schema"; - return Status::KeyError(ss.str()); - } - - return GetArray(obj, result->type, array); - } + explicit JsonArrayReader(MemoryPool* pool) : pool_(pool) {} Status GetValidityBuffer(const std::vector& is_valid, int32_t* null_count, std::shared_ptr* validity_buffer) { @@ -1128,8 +1096,6 @@ class JsonArrayReader { private: MemoryPool* pool_; - const rj::Value& json_array_; - const Schema& schema_; }; Status WriteJsonSchema(const Schema& schema, RjWriter* json_writer) { @@ -1148,11 +1114,39 @@ Status WriteJsonArray( return converter.Write(); } -Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, - std::shared_ptr* array) { +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, + const std::shared_ptr& type, std::shared_ptr* array) { + JsonArrayReader converter(pool); + return converter.GetArray(json_array, type, array); +} + + +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, + const Schema& schema, std::shared_ptr* array) { if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); } - JsonArrayReader converter(pool, json_array, schema); - return converter.GetResult(array); + + const auto& json_obj = json_array.GetObject(); + + const auto& json_name = json_obj.FindMember("name"); + RETURN_NOT_STRING("name", json_name, json_obj); + + std::string name = json_name->value.GetString(); + + std::shared_ptr result = nullptr; + for (const std::shared_ptr& field : schema.fields()) { + if (field->name == name) { + result = field; + break; + } + } + + if (result == nullptr) { + std::stringstream ss; + 
ss << "Field named " << name << " not found in schema"; + return Status::KeyError(ss.str()); + } + + return ReadJsonArray(pool, json_array, result->type, array); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index e3ce0b8703a..7878c31187d 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -38,10 +38,6 @@ namespace rj = rapidjson; using RjWriter = rj::Writer; namespace arrow { - -class Array; -class Schema; - namespace ipc { Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer); @@ -50,8 +46,11 @@ Status ARROW_EXPORT WriteJsonArray( Status ARROW_EXPORT ReadJsonSchema( const rj::Value& json_arr, std::shared_ptr* schema); -Status ARROW_EXPORT ReadJsonArray( - const rj::Value& json_obj, const Schema& schema, std::shared_ptr* array); +Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, + const std::shared_ptr& type, std::shared_ptr* array); + +Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, + const Schema& schema, std::shared_ptr* array); } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index e80f7b17a98..76987b7475d 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -27,6 +27,9 @@ class Array; class ArrayBuilder; struct Field; +class MemoryPool; +class Schema; + struct NullType; class NullArray; diff --git a/cpp/src/arrow/types/primitive-test.cc b/cpp/src/arrow/types/primitive-test.cc index 568a1fecd05..bdc8ec00be0 100644 --- a/cpp/src/arrow/types/primitive-test.cc +++ b/cpp/src/arrow/types/primitive-test.cc @@ -313,20 +313,6 @@ TYPED_TEST(TestPrimitiveBuilder, TestArrayDtorDealloc) { ASSERT_EQ(memory_before, this->pool_->bytes_allocated()); } -template -Status MakeArray(const vector& valid_bytes, const vector& draws, int size, - Builder* builder, ArrayPtr* out) { - // Append the first 1000 - for (int i = 0; i < 
size; ++i) { - if (valid_bytes[i] > 0) { - RETURN_NOT_OK(builder->Append(draws[i])); - } else { - RETURN_NOT_OK(builder->AppendNull()); - } - } - return builder->Finish(out); -} - TYPED_TEST(TestPrimitiveBuilder, Equality) { DECL_T(); diff --git a/cpp/src/arrow/types/test-common.h b/cpp/src/arrow/types/test-common.h index 1957636b141..6e6ab85ad4e 100644 --- a/cpp/src/arrow/types/test-common.h +++ b/cpp/src/arrow/types/test-common.h @@ -24,6 +24,8 @@ #include "gtest/gtest.h" +#include "arrow/array.h" +#include "arrow/builder.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/util/memory-pool.h" @@ -49,6 +51,20 @@ class TestBuilder : public ::testing::Test { unique_ptr builder_nn_; }; +template +Status MakeArray(const std::vector& valid_bytes, const std::vector& values, + int size, Builder* builder, ArrayPtr* out) { + // Append the first 1000 + for (int i = 0; i < size; ++i) { + if (valid_bytes[i] > 0) { + RETURN_NOT_OK(builder->Append(values[i])); + } else { + RETURN_NOT_OK(builder->AppendNull()); + } + } + return builder->Finish(out); +} + } // namespace arrow #endif // ARROW_TYPES_TEST_COMMON_H From e2e86b58a8796448bce582b356f70ab4302e08f2 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 17 Nov 2016 10:37:48 -0500 Subject: [PATCH 20/27] Test JSON array roundtrip for numeric types, strings, lists, structs Change-Id: I9403a253307d304d0dc5a71e5d8b7e623fbfa69f --- cpp/src/arrow/array.cc | 11 +++++ cpp/src/arrow/array.h | 6 +++ cpp/src/arrow/column-test.cc | 1 + cpp/src/arrow/ipc/ipc-json-test.cc | 75 ++++++++++++++++++++++++++++-- cpp/src/arrow/ipc/json-internal.cc | 27 +++++++---- cpp/src/arrow/test-util.h | 19 +++++++- cpp/src/arrow/types/string-test.cc | 10 ++-- 7 files changed, 128 insertions(+), 21 deletions(-) diff --git a/cpp/src/arrow/array.cc b/cpp/src/arrow/array.cc index 10f7a2f5ebf..3262425e99b 100644 --- a/cpp/src/arrow/array.cc +++ b/cpp/src/arrow/array.cc @@ -18,6 +18,7 @@ #include "arrow/array.h" #include +#include #include 
"arrow/util/bit-util.h" #include "arrow/util/buffer.h" @@ -25,6 +26,16 @@ namespace arrow { +Status GetEmptyBitmap( + MemoryPool* pool, int32_t length, std::shared_ptr* result) { + auto buffer = std::make_shared(pool); + RETURN_NOT_OK(buffer->Resize(BitUtil::BytesForBits(length))); + memset(buffer->mutable_data(), 0, buffer->size()); + + *result = buffer; + return Status::OK(); +} + // ---------------------------------------------------------------------- // Base array class diff --git a/cpp/src/arrow/array.h b/cpp/src/arrow/array.h index fa7ef615a8a..ff2b70e213b 100644 --- a/cpp/src/arrow/array.h +++ b/cpp/src/arrow/array.h @@ -29,6 +29,8 @@ namespace arrow { class Buffer; +class MemoryPool; +class MutableBuffer; class Status; // Immutable data array with some logical type and some length. Any memory is @@ -103,6 +105,10 @@ class ARROW_EXPORT NullArray : public Array { }; typedef std::shared_ptr ArrayPtr; + +Status ARROW_EXPORT GetEmptyBitmap( + MemoryPool* pool, int32_t length, std::shared_ptr* result); + } // namespace arrow #endif diff --git a/cpp/src/arrow/column-test.cc b/cpp/src/arrow/column-test.cc index 1edf313d49b..ac3636d1b6d 100644 --- a/cpp/src/arrow/column-test.cc +++ b/cpp/src/arrow/column-test.cc @@ -22,6 +22,7 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/test-util.h" diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 79c52735dcd..b0eb55fdea4 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -35,6 +35,8 @@ #include "arrow/type.h" #include "arrow/type_traits.h" #include "arrow/types/primitive.h" +#include "arrow/types/string.h" +#include "arrow/types/struct.h" #include "arrow/util/memory-pool.h" #include "arrow/util/status.h" @@ -64,15 +66,20 @@ void TestArrayRoundTrip(const Array& array) { ASSERT_OK(WriteJsonArray(name, array, &writer)); + std::string array_as_json = sb.GetString(); + 
rj::Document d; - d.Parse(sb.GetString()); + d.Parse(array_as_json); + + if (d.HasParseError()) { FAIL() << "JSON parsing failed"; } std::shared_ptr out; ASSERT_OK(ReadJsonArray(default_memory_pool(), d, array.type(), &out)); - ASSERT_TRUE(array.Equals(out)); -} + std::cout << array_as_json << std::endl; + ASSERT_TRUE(array.Equals(out)) << array_as_json; +} template void CheckPrimitive(const std::shared_ptr& type, @@ -109,12 +116,70 @@ TEST(TestJsonSchemaWriter, FlatTypes) { TestSchemaRoundTrip(schema); } +template +void PrimitiveTypesCheckOne() { + using c_type = typename T::c_type; + + std::vector is_valid = {true, false, true, true, true, false, true, true}; + std::vector values = {0, 1, 2, 3, 4, 5, 6, 7}; + CheckPrimitive(std::make_shared(), is_valid, values); +} TEST(TestJsonArrayWriter, PrimitiveTypes) { + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + PrimitiveTypesCheckOne(); + std::vector is_valid = {true, false, true, true, true, false, true, true}; + std::vector values = {"foo", "bar", "", "baz", "qux", "foo", "a", "1"}; + + CheckPrimitive(utf8(), is_valid, values); + CheckPrimitive(binary(), is_valid, values); +} + +TEST(TestJsonArrayWriter, NestedTypes) { + auto value_type = int32(); + + std::vector values_is_valid = {true, false, true, true, false, true, true}; + std::vector values = {0, 1, 2, 3, 4, 5, 6}; + + std::shared_ptr values_buffer = test::GetBufferFromVector(values); + std::shared_ptr values_bitmap; + ASSERT_OK(test::GetBitmapFromBoolVector(values_is_valid, &values_bitmap)); + auto values_array = std::make_shared( + value_type, static_cast(values.size()), values_buffer, 2, values_bitmap); + + // List + std::vector list_is_valid = {true, false, true, true, true}; + std::vector offsets = {0, 0, 0, 1, 4, 7}; + + std::shared_ptr list_bitmap; + 
ASSERT_OK(test::GetBitmapFromBoolVector(list_is_valid, &list_bitmap)); + std::shared_ptr offsets_buffer = test::GetBufferFromVector(offsets); + + ListArray list_array(list(value_type), 5, offsets_buffer, values_array, 1, list_bitmap); + + TestArrayRoundTrip(list_array); + + // Struct + std::vector struct_is_valid = {true, false, true, true, true, false, true}; + std::shared_ptr struct_bitmap; + ASSERT_OK(test::GetBitmapFromBoolVector(struct_is_valid, &struct_bitmap)); + + auto struct_type = + struct_({field("f1", int32()), field("f2", int32()), field("f3", int32())}); - std::vector u1 = {0, 1, 2, 3, 4, 5, 6, 7}; - CheckPrimitive(uint8(), is_valid, u1); + std::vector> fields = {values_array, values_array, values_array}; + StructArray struct_array( + struct_type, static_cast(struct_is_valid.size()), fields, 2, struct_bitmap); + TestArrayRoundTrip(struct_array); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 3bc631dbca7..4e2f31fc337 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -458,6 +458,7 @@ class JsonArrayWriter : public ArrayVisitor { template void WriteDataField(const T& arr) { + writer_->Key("DATA"); writer_->StartArray(); WriteDataValues(arr); writer_->EndArray(); @@ -858,10 +859,9 @@ class JsonArrayReader { std::shared_ptr* validity_buffer) { int length = static_cast(is_valid.size()); - auto out_buffer = std::make_shared(pool_); - RETURN_NOT_OK(out_buffer->Resize(BitUtil::BytesForBits(length))); - uint8_t* bitmap = reinterpret_cast(out_buffer->mutable_data()); - memset(bitmap, 0, out_buffer->size()); + std::shared_ptr out_buffer; + RETURN_NOT_OK(GetEmptyBitmap(pool_, length, &out_buffer)); + uint8_t* bitmap = out_buffer->mutable_data(); *null_count = 0; for (int i = 0; i < length; ++i) { @@ -1015,9 +1015,17 @@ class JsonArrayReader { } for (int i = 0; i < static_cast(json_children_arr.Size()); ++i) { - DCHECK(json_children_arr[i].IsObject()); + const 
rj::Value& json_child = json_children_arr[i]; + DCHECK(json_child.IsObject()); + + std::shared_ptr child_field = type->child(i); + + auto it = json_child.FindMember("name"); + RETURN_NOT_STRING("name", it, json_child); + + DCHECK_EQ(it->value.GetString(), child_field->name); std::shared_ptr child; - RETURN_NOT_OK(GetArray(json_children_arr[i], type->child(i)->type, &child)); + RETURN_NOT_OK(GetArray(json_children_arr[i], child_field->type, &child)); array->emplace_back(child); } @@ -1042,7 +1050,7 @@ class JsonArrayReader { DCHECK_EQ(static_cast(json_validity.Size()), length); - std::vector is_valid(length); + std::vector is_valid; for (const rj::Value& val : json_validity) { DCHECK(val.IsInt()); is_valid.push_back(static_cast(val.GetInt())); @@ -1120,9 +1128,8 @@ Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, return converter.GetArray(json_array, type, array); } - -Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, - const Schema& schema, std::shared_ptr* array) { +Status ReadJsonArray(MemoryPool* pool, const rj::Value& json_array, const Schema& schema, + std::shared_ptr* array) { if (!json_array.IsObject()) { return Status::Invalid("Element was not a JSON object"); } const auto& json_obj = json_array.GetObject(); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index ac56f5ed087..808510a38ee 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -27,6 +27,7 @@ #include "gtest/gtest.h" +#include "arrow/array.h" #include "arrow/column.h" #include "arrow/schema.h" #include "arrow/table.h" @@ -102,11 +103,27 @@ void random_real(int n, uint32_t seed, T min_value, T max_value, std::vector* } template -std::shared_ptr to_buffer(const std::vector& values) { +std::shared_ptr GetBufferFromVector(const std::vector& values) { return std::make_shared( reinterpret_cast(values.data()), values.size() * sizeof(T)); } +static inline Status GetBitmapFromBoolVector( + const std::vector& is_valid, 
std::shared_ptr* result) { + int length = static_cast(is_valid.size()); + + std::shared_ptr buffer; + RETURN_NOT_OK(GetEmptyBitmap(default_memory_pool(), length, &buffer)); + + uint8_t* bitmap = buffer->mutable_data(); + for (int i = 0; i < length; ++i) { + if (is_valid[i]) { BitUtil::SetBit(bitmap, i); } + } + + *result = buffer; + return Status::OK(); +} + // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { diff --git a/cpp/src/arrow/types/string-test.cc b/cpp/src/arrow/types/string-test.cc index 4e5db17dfcc..3c4b12b7bc7 100644 --- a/cpp/src/arrow/types/string-test.cc +++ b/cpp/src/arrow/types/string-test.cc @@ -66,8 +66,8 @@ class TestStringContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - value_buf_ = test::to_buffer(chars_); - offsets_buf_ = test::to_buffer(offsets_); + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); @@ -131,7 +131,7 @@ TEST_F(TestStringContainer, TestGetString) { TEST_F(TestStringContainer, TestEmptyStringComparison) { offsets_ = {0, 0, 0, 0, 0, 0}; - offsets_buf_ = test::to_buffer(offsets_); + offsets_buf_ = test::GetBufferFromVector(offsets_); length_ = offsets_.size() - 1; auto strings_a = std::make_shared( @@ -227,8 +227,8 @@ class TestBinaryContainer : public ::testing::Test { void MakeArray() { length_ = offsets_.size() - 1; - value_buf_ = test::to_buffer(chars_); - offsets_buf_ = test::to_buffer(offsets_); + value_buf_ = test::GetBufferFromVector(chars_); + offsets_buf_ = test::GetBufferFromVector(offsets_); null_bitmap_ = test::bytes_to_null_buffer(valid_bytes_); null_count_ = test::null_count(valid_bytes_); From 6bbd669024f388b983175770a205eb893f819d7a Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: 
Thu, 17 Nov 2016 10:45:04 -0500 Subject: [PATCH 21/27] Tweaks Change-Id: I04a99b218bb38e0472d71414898840e6f6d8cf7f --- cpp/CMakeLists.txt | 2 +- cpp/src/arrow/ipc/json-internal.h | 2 ++ cpp/src/arrow/types/list-test.cc | 1 + 3 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 9245615f10a..0bff7528578 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -114,7 +114,7 @@ CHECK_CXX_COMPILER_FLAG("-maltivec" CXX_SUPPORTS_ALTIVEC) # compiler flags that are common across debug/release builds # - Wall: Enable all warnings. -set(CXX_COMMON_FLAGS "-std=c++11 -Wall -Werror") +set(CXX_COMMON_FLAGS "-std=c++11 -Wall") # Only enable additional instruction sets if they are supported if (CXX_SUPPORTS_SSE3 AND ARROW_SSE3) diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 7878c31187d..79b1d195c37 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -40,6 +40,8 @@ using RjWriter = rj::Writer; namespace arrow { namespace ipc { +// TODO(wesm): Only exporting these because arrow_ipc does not have a static +// library at the moment. 
Better to not export Status ARROW_EXPORT WriteJsonSchema(const Schema& schema, RjWriter* json_writer); Status ARROW_EXPORT WriteJsonArray( const std::string& name, const Array& array, RjWriter* json_writer); diff --git a/cpp/src/arrow/types/list-test.cc b/cpp/src/arrow/types/list-test.cc index cf56d0d0e87..cb9a8c12d8a 100644 --- a/cpp/src/arrow/types/list-test.cc +++ b/cpp/src/arrow/types/list-test.cc @@ -46,6 +46,7 @@ TEST(TypesTest, TestListType) { ListType list_type(vt); ASSERT_EQ(list_type.type, Type::LIST); + ASSERT_EQ(list_type.name(), string("list")); ASSERT_EQ(list_type.ToString(), string("list")); ASSERT_EQ(list_type.value_type()->type, vt->type); From 3d6bbbd1ff95b7c63b139fff91b2c3bd0dc7e051 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 17 Nov 2016 12:14:02 -0500 Subject: [PATCH 22/27] Start high level writer scaffold Change-Id: I325f3c3a33c1ded53b083d19d72234794338b28b --- cpp/src/arrow/ipc/ipc-json-test.cc | 6 +-- cpp/src/arrow/ipc/json-internal.h | 8 ++-- cpp/src/arrow/ipc/json.cc | 59 +++++++++++++++++++++++++++++- cpp/src/arrow/ipc/json.h | 24 ++++-------- 4 files changed, 71 insertions(+), 26 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index b0eb55fdea4..536fd12ecfc 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
-#include "arrow/ipc/json-internal.h" #include #include @@ -25,12 +24,11 @@ #include #include -#include "rapidjson/document.h" -#include "rapidjson/filewritestream.h" -#include "rapidjson/prettywriter.h" #include "gtest/gtest.h" #include "arrow/array.h" +#include "arrow/ipc/json.h" +#include "arrow/ipc/json-internal.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 79b1d195c37..5748c8043f3 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -15,10 +15,8 @@ // specific language governing permissions and limitations // under the License. -// Implement Arrow JSON serialization format - -#ifndef ARROW_IPC_JSON_H -#define ARROW_IPC_JSON_H +#ifndef ARROW_IPC_JSON_INTERNAL_H +#define ARROW_IPC_JSON_INTERNAL_H #define RAPIDJSON_HAS_STDSTRING 1 #define RAPIDJSON_HAS_CXX11_RVALUE_REFS 1 @@ -57,4 +55,4 @@ Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, } // namespace ipc } // namespace arrow -#endif // ARROW_IPC_FILE_H +#endif // ARROW_IPC_JSON_INTERNAL_H diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 49992f85d70..72fb4f0b36f 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -22,5 +22,62 @@ #include "arrow/util/status.h" namespace arrow { -namespace ipc {} // namespace ipc +namespace ipc { + +class JsonWriter::JsonWriterImpl { + public: + JsonWriterImpl(const std::shared_ptr& schema) + : schema_(schema) { + writer_.reset(new RjWriter(string_buffer_)); + } + + Status Start() { + writer_->StartObject(); + + writer_->Key("schema"); + RETURN_NOT_OK(WriteJsonSchema(schema_, writer_.get())); + + // Record batches + writer_->Key("batches"); + writer_->StartArray(); + return Status::OK(); + } + + Status Finish() { + writer_->EndArray(); // Record batches + writer_->EndObject(); + return Status::OK(); + } + + Status WriteRecordBatch(const 
std::vector>& columns, int32_t num_rows) { + return Status::OK(); + } + + private: + std::shared_ptr schema_; + + rj::StringBuffer string_buffer_; + std::unique_ptr writer_; +}; + +JsonWriter::JsonWriter(const std::shared_ptr& schema) { + impl_.reset(new JsonWriteImpl(schema)); +} + +Status JsonWriter::Open( + const std::shared_ptr& schema, std::unique_ptr* writer) { + *writer = std::unique_ptr(new JsonWriter(schema)); + return (*writer)->impl_->Start(); +} + +Status JsonWriter::Close() { + return impl_->Close(); +} + +Status JsonWriter::WriteRecordBatch( + const std::vector>& columns, int32_t num_rows) { + +} + +} // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 0485f8e4163..01771846584 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -43,8 +43,7 @@ namespace ipc { class ARROW_EXPORT JsonWriter { public: - static Status Open(io::OutputStream* sink, const std::shared_ptr& schema, - std::shared_ptr* out); + static Status Open(const std::shared_ptr& schema, std::unique_ptr* out); // TODO(wesm): Write dictionaries @@ -54,25 +53,21 @@ class ARROW_EXPORT JsonWriter { Status Close(); private: - JsonWriter(io::OutputStream* sink, const std::shared_ptr& schema); - - io::OutputStream* sink_; - std::shared_ptr schema_; + explicit JsonWriter(const std::shared_ptr& schema); // Hide RapidJSON details from public API class JsonWriterImpl; std::unique_ptr impl_; }; +// TODO(wesm): Read from a file stream rather than an in-memory buffer class ARROW_EXPORT JsonReader { public: - static Status Open(MemoryPool* pool, - const std::shared_ptr& file, - std::shared_ptr* reader); + static Status Open(MemoryPool* pool, const std::shared_ptr& data, + std::unique_ptr* reader); // Use the default memory pool - static Status Open(const std::shared_ptr& file, - std::shared_ptr* reader); + static Status Open(const std::shared_ptr& data, std::unique_ptr* reader); std::shared_ptr schema() const; @@ -82,10 +77,7 @@ 
class ARROW_EXPORT JsonReader { Status GetRecordBatch(int i, std::shared_ptr* batch); private: - explicit JsonReader(const std::shared_ptr& file); - - std::shared_ptr file_; - std::shared_ptr schema_; + explicit JsonReader(const std::shared_ptr& data); // Hide RapidJSON details from public API class JsonReaderImpl; @@ -95,4 +87,4 @@ class ARROW_EXPORT JsonReader { } // namespace ipc } // namespace arrow -#endif // ARROW_IPC_FILE_H +#endif // ARROW_IPC_JSON_H From 2753449aec4b0d7d2e312352e8c0a6313c629e46 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Thu, 17 Nov 2016 17:49:26 -0500 Subject: [PATCH 23/27] Complete draft json roundtrip implementation. tests not complete yet Change-Id: Ic6efc59347c8234c8707492aa741eabaf82c0ffe --- cpp/src/arrow/ipc/ipc-json-test.cc | 69 +++++++++++++-- cpp/src/arrow/ipc/json-internal.cc | 47 ---------- cpp/src/arrow/ipc/json-internal.h | 50 ++++++++++- cpp/src/arrow/ipc/json.cc | 137 +++++++++++++++++++++++++++-- cpp/src/arrow/ipc/json.h | 15 ++-- cpp/src/arrow/type_fwd.h | 2 + 6 files changed, 247 insertions(+), 73 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index 536fd12ecfc..b5fea6fda78 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -15,7 +15,6 @@ // specific language governing permissions and limitations // under the License. 
- #include #include #include @@ -27,8 +26,8 @@ #include "gtest/gtest.h" #include "arrow/array.h" -#include "arrow/ipc/json.h" #include "arrow/ipc/json-internal.h" +#include "arrow/ipc/json.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -74,8 +73,6 @@ void TestArrayRoundTrip(const Array& array) { std::shared_ptr out; ASSERT_OK(ReadJsonArray(default_memory_pool(), d, array.type(), &out)); - std::cout << array_as_json << std::endl; - ASSERT_TRUE(array.Equals(out)) << array_as_json; } @@ -98,6 +95,25 @@ void CheckPrimitive(const std::shared_ptr& type, TestArrayRoundTrip(*array.get()); } +template +void MakeArray(const std::shared_ptr& type, + const std::vector& is_valid, const std::vector& values, + std::shared_ptr* out) { + std::shared_ptr values_buffer = test::GetBufferFromVector(values); + std::shared_ptr values_bitmap; + ASSERT_OK(test::GetBitmapFromBoolVector(is_valid, &values_bitmap)); + + using ArrayType = typename TypeTraits::ArrayType; + + int32_t null_count = 0; + for (bool val : is_valid) { + if (!val) { ++null_count; } + } + + *out = std::make_shared(type, static_cast(values.size()), + values_buffer, null_count, values_bitmap); +} + TEST(TestJsonSchemaWriter, FlatTypes) { std::vector> fields = {field("f0", int8()), field("f1", int16(), false), field("f2", int32()), field("f3", int64(), false), @@ -148,11 +164,8 @@ TEST(TestJsonArrayWriter, NestedTypes) { std::vector values_is_valid = {true, false, true, true, false, true, true}; std::vector values = {0, 1, 2, 3, 4, 5, 6}; - std::shared_ptr values_buffer = test::GetBufferFromVector(values); - std::shared_ptr values_bitmap; - ASSERT_OK(test::GetBitmapFromBoolVector(values_is_valid, &values_bitmap)); - auto values_array = std::make_shared( - value_type, static_cast(values.size()), values_buffer, 2, values_bitmap); + std::shared_ptr values_array; + MakeArray(int32(), values_is_valid, values, &values_array); // List std::vector list_is_valid = {true, false, true, true, 
true}; @@ -180,5 +193,43 @@ TEST(TestJsonArrayWriter, NestedTypes) { TestArrayRoundTrip(struct_array); } +TEST(TestJsonFileReadWrite, BasicRoundTrip) { + auto v1_type = int8(); + auto v2_type = int32(); + auto v3_type = utf8(); + + std::vector is_valid = {true, false, true, true, false, true, true}; + + std::vector v1_values = {0, 1, 2, 3, 4, 5, 6}; + std::shared_ptr v1; + MakeArray(v1_type, is_valid, v1_values, &v1); + + std::vector v2_values = {0, 1, 2, 3, 4, 5, 6}; + std::shared_ptr v2; + MakeArray(v2_type, is_valid, v2_values, &v2); + + std::vector v3_values = {"foo", "bar", "", "", "", "baz", "qux"}; + std::shared_ptr v3; + MakeArray(v3_type, is_valid, v3_values, &v3); + + std::shared_ptr schema({field("f1", v1_type), field("f2", v2_type), + field("f3", v3_type)}); + + std::vector> arrays = {v1, v2, v3} + + std::unique_ptr writer; + ASSERT_OK(JsonWriter::Open(schema, &writer)); + + const int nbatches = 3; + const int32_t num_rows = static_cast(v1_values.size()); + + for (int i = 0; i < nbatches; ++i) { + ASSERT_OK(writer_->WriteRecordBatch(arrays, num_rows)); + } + + std::shared_ptr data; + ASSERT_OK(writer->Finish(&data)); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.cc b/cpp/src/arrow/ipc/json-internal.cc index 4e2f31fc337..31fe35b44ce 100644 --- a/cpp/src/arrow/ipc/json-internal.cc +++ b/cpp/src/arrow/ipc/json-internal.cc @@ -595,53 +595,6 @@ class JsonArrayWriter : public ArrayVisitor { RjWriter* writer_; }; -#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ - if (NAME == PARENT.MemberEnd()) { \ - std::stringstream ss; \ - ss << "field " << TOK << " not found"; \ - return Status::Invalid(ss.str()); \ - } - -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string"; \ - return Status::Invalid(ss.str()); \ - } - -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); 
\ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean"; \ - return Status::Invalid(ss.str()); \ - } - -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int"; \ - return Status::Invalid(ss.str()); \ - } - -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array"; \ - return Status::Invalid(ss.str()); \ - } - -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object"; \ - return Status::Invalid(ss.str()); \ - } - class JsonSchemaReader { public: explicit JsonSchemaReader(const rj::Value& json_schema) : json_schema_(json_schema) {} diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 5748c8043f3..c41ba26c480 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -23,6 +23,7 @@ #define RAPIDJSON_HAS_CXX11_RANGE_FOR 1 #include +#include #include #include "rapidjson/document.h" @@ -35,6 +36,53 @@ namespace rj = rapidjson; using RjWriter = rj::Writer; +#define RETURN_NOT_FOUND(TOK, NAME, PARENT) \ + if (NAME == PARENT.MemberEnd()) { \ + std::stringstream ss; \ + ss << "field " << TOK << " not found"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean"; \ + return Status::Invalid(ss.str()); \ + } + +#define 
RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an array"; \ + return Status::Invalid(ss.str()); \ + } + +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object"; \ + return Status::Invalid(ss.str()); \ + } + namespace arrow { namespace ipc { @@ -45,7 +93,7 @@ Status ARROW_EXPORT WriteJsonArray( const std::string& name, const Array& array, RjWriter* json_writer); Status ARROW_EXPORT ReadJsonSchema( - const rj::Value& json_arr, std::shared_ptr* schema); + const rj::Value& json_obj, std::shared_ptr* schema); Status ARROW_EXPORT ReadJsonArray(MemoryPool* pool, const rj::Value& json_obj, const std::shared_ptr& type, std::shared_ptr* array); diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 72fb4f0b36f..1148fd69f77 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -17,17 +17,28 @@ #include "arrow/ipc/json.h" +#include +#include + +#include "arrow/array.h" #include "arrow/ipc/json-internal.h" +#include "arrow/schema.h" +#include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/buffer.h" +#include "arrow/util/logging.h" +#include "arrow/util/memory-pool.h" #include "arrow/util/status.h" namespace arrow { namespace ipc { +// ---------------------------------------------------------------------- +// Writer implementation + class JsonWriter::JsonWriterImpl { public: - JsonWriterImpl(const std::shared_ptr& schema) - : schema_(schema) { + JsonWriterImpl(const std::shared_ptr& schema) : schema_(schema) { writer_.reset(new RjWriter(string_buffer_)); } @@ -35,7 +46,7 
@@ class JsonWriter::JsonWriterImpl { writer_->StartObject(); writer_->Key("schema"); - RETURN_NOT_OK(WriteJsonSchema(schema_, writer_.get())); + RETURN_NOT_OK(WriteJsonSchema(*schema_.get(), writer_.get())); // Record batches writer_->Key("batches"); @@ -49,7 +60,29 @@ class JsonWriter::JsonWriterImpl { return Status::OK(); } - Status WriteRecordBatch(const std::vector>& columns, int32_t num_rows) { + Status WriteRecordBatch( + const std::vector>& columns, int32_t num_rows) { + DCHECK_EQ(static_cast(columns.size()), schema_->num_fields()); + + writer_->StartObject(); + writer_->Key("count"); + writer_->Int(num_rows); + + writer_->Key("columns"); + writer_->StartArray(); + + for (int i = 0; i < schema_->num_fields(); ++i) { + const std::shared_ptr& column = columns[i]; + + DCHECK_EQ(num_rows, column->length()) + << "Array length did not match record batch length"; + + RETURN_NOT_OK( + WriteJsonArray(schema_->field(i)->name, *column.get(), writer_.get())); + } + + writer_->EndArray(); + writer_->EndObject(); return Status::OK(); } @@ -61,7 +94,7 @@ class JsonWriter::JsonWriterImpl { }; JsonWriter::JsonWriter(const std::shared_ptr& schema) { - impl_.reset(new JsonWriteImpl(schema)); + impl_.reset(new JsonWriterImpl(schema)); } Status JsonWriter::Open( @@ -70,13 +103,103 @@ Status JsonWriter::Open( return (*writer)->impl_->Start(); } -Status JsonWriter::Close() { - return impl_->Close(); +Status JsonWriter::Finish(std::shared_ptr* out) { + return impl_->Finish(); } Status JsonWriter::WriteRecordBatch( const std::vector>& columns, int32_t num_rows) { + return impl_->WriteRecordBatch(columns, num_rows); +} + +// ---------------------------------------------------------------------- +// Reader implementation + +class JsonReader::JsonReaderImpl { + public: + JsonReaderImpl(MemoryPool* pool, const std::shared_ptr& data) + : pool_(pool), data_(data) {} + + Status ParseAndReadSchema() { + doc_.Parse(reinterpret_cast(data_->data()), + static_cast(data_->size())); + if 
(doc_.HasParseError()) { return Status::IOError("JSON parsing failed"); } + + auto it = doc_.FindMember("schema"); + RETURN_NOT_OBJECT("schema", it, doc_); + RETURN_NOT_OK(ReadJsonSchema(it->value, &schema_)); + + it = doc_.FindMember("batches"); + RETURN_NOT_ARRAY("batches", it, doc_); + record_batches_ = &it->value; + + return Status::OK(); + } + + Status GetRecordBatch(int i, std::shared_ptr* batch) const { + DCHECK_GT(i, 0) << "i out of bounds"; + DCHECK_LT(i, record_batches_->GetArray().Size()) << "i out of bounds"; + + const auto& batch_val = record_batches_->GetArray()[i]; + DCHECK(batch_val.IsObject()); + + const auto& batch_obj = batch_val.GetObject(); + + auto it = batch_obj.FindMember("count"); + RETURN_NOT_INT("count", it, batch_obj); + int32_t num_rows = static_cast(it->value.GetInt()); + + it = batch_obj.FindMember("columns"); + RETURN_NOT_ARRAY("columns", it, batch_obj); + const auto& json_columns = it->value.GetArray(); + + std::vector> columns(json_columns.Size()); + for (size_t i = 0; i < columns.size(); ++i) { + const std::shared_ptr& type = schema_->field(i)->type; + RETURN_NOT_OK(ReadJsonArray(pool_, json_columns[i], type, &columns[i])); + } + + *batch = std::make_shared(schema_, num_rows, columns); + return Status::OK(); + } + + std::shared_ptr schema() const { return schema_; } + + int num_record_batches() const { + return static_cast(record_batches_->GetArray().Size()); + } + + private: + MemoryPool* pool_; + std::shared_ptr data_; + rj::Document doc_; + + const rj::Value* record_batches_; + + std::shared_ptr schema_; +}; + +JsonReader::JsonReader(MemoryPool* pool, const std::shared_ptr& data) { + impl_.reset(new JsonReaderImpl(pool, data)); +} + +Status JsonReader::Open( + const std::shared_ptr& data, std::unique_ptr* reader) { + return Open(default_memory_pool(), data, reader); +} + +Status JsonReader::Open(MemoryPool* pool, const std::shared_ptr& data, + std::unique_ptr* reader) { + *reader = std::unique_ptr(new JsonReader(pool, data)); + 
return (*reader)->impl_->ParseAndReadSchema(); +} + +std::shared_ptr JsonReader::schema() const { + return impl_->schema(); +} +int JsonReader::num_record_batches() const { + return impl_->num_record_batches(); } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 01771846584..d830b9e2a31 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -27,11 +27,6 @@ #include "arrow/util/visibility.h" namespace arrow { - -class MemoryPool; -class RecordBatch; -class Schema; - namespace io { class OutputStream; @@ -43,14 +38,15 @@ namespace ipc { class ARROW_EXPORT JsonWriter { public: - static Status Open(const std::shared_ptr& schema, std::unique_ptr* out); + static Status Open( + const std::shared_ptr& schema, std::unique_ptr* out); // TODO(wesm): Write dictionaries Status WriteRecordBatch( const std::vector>& columns, int32_t num_rows); - Status Close(); + Status Finish(std::shared_ptr* result); private: explicit JsonWriter(const std::shared_ptr& schema); @@ -67,7 +63,8 @@ class ARROW_EXPORT JsonReader { std::unique_ptr* reader); // Use the default memory pool - static Status Open(const std::shared_ptr& data, std::unique_ptr* reader); + static Status Open( + const std::shared_ptr& data, std::unique_ptr* reader); std::shared_ptr schema() const; @@ -77,7 +74,7 @@ class ARROW_EXPORT JsonReader { Status GetRecordBatch(int i, std::shared_ptr* batch); private: - explicit JsonReader(const std::shared_ptr& data); + JsonReader(MemoryPool* pool, const std::shared_ptr& data); // Hide RapidJSON details from public API class JsonReaderImpl; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 76987b7475d..6d660f4fdee 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -27,7 +27,9 @@ class Array; class ArrayBuilder; struct Field; +class Buffer; class MemoryPool; +class RecordBatch; class Schema; struct NullType; From 3d9fcc229f7df783a1ae359d720e9ca4e59cff3d Mon Sep 17 00:00:00 2001 From: Wes 
McKinney Date: Fri, 18 Nov 2016 13:14:55 -0500 Subject: [PATCH 24/27] Complete round trip json file test with multiple record batches Change-Id: I56d3222db251c99af5c8a3536909e45b429c8150 --- cpp/src/arrow/ipc/ipc-json-test.cc | 92 ++++++++++++++++++++++-------- cpp/src/arrow/ipc/json.cc | 20 +++++-- cpp/src/arrow/ipc/json.h | 8 ++- cpp/src/arrow/test-util.h | 32 ++++++++++- 4 files changed, 120 insertions(+), 32 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index b5fea6fda78..b15efd2a64c 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -28,6 +28,7 @@ #include "arrow/array.h" #include "arrow/ipc/json-internal.h" #include "arrow/ipc/json.h" +#include "arrow/table.h" #include "arrow/test-util.h" #include "arrow/type.h" #include "arrow/type_traits.h" @@ -96,11 +97,12 @@ void CheckPrimitive(const std::shared_ptr& type, } template -void MakeArray(const std::shared_ptr& type, - const std::vector& is_valid, const std::vector& values, - std::shared_ptr* out) { - std::shared_ptr values_buffer = test::GetBufferFromVector(values); +void MakeArray(const std::shared_ptr& type, const std::vector& is_valid, + const std::vector& values, std::shared_ptr* out) { + std::shared_ptr values_buffer; std::shared_ptr values_bitmap; + + ASSERT_OK(test::CopyBufferFromVector(values, &values_buffer)); ASSERT_OK(test::GetBitmapFromBoolVector(is_valid, &values_bitmap)); using ArrayType = typename TypeTraits::ArrayType; @@ -193,42 +195,84 @@ TEST(TestJsonArrayWriter, NestedTypes) { TestArrayRoundTrip(struct_array); } -TEST(TestJsonFileReadWrite, BasicRoundTrip) { - auto v1_type = int8(); - auto v2_type = int32(); - auto v3_type = utf8(); +// Data generation for test case below +void MakeBatchArrays(const std::shared_ptr& schema, const int num_rows, + std::vector>* arrays) { + std::vector is_valid; + test::random_is_valid(num_rows, 0.25, &is_valid); + + std::vector v1_values; + std::vector v2_values; - 
std::vector is_valid = {true, false, true, true, false, true, true}; + test::randint(num_rows, 0, 100, &v1_values); + test::randint(num_rows, 0, 100, &v2_values); - std::vector v1_values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr v1; - MakeArray(v1_type, is_valid, v1_values, &v1); + MakeArray(schema->field(0)->type, is_valid, v1_values, &v1); - std::vector v2_values = {0, 1, 2, 3, 4, 5, 6}; std::shared_ptr v2; - MakeArray(v2_type, is_valid, v2_values, &v2); - - std::vector v3_values = {"foo", "bar", "", "", "", "baz", "qux"}; + MakeArray(schema->field(1)->type, is_valid, v2_values, &v2); + + static const int kBufferSize = 10; + static uint8_t buffer[kBufferSize]; + static uint32_t seed = 0; + StringBuilder string_builder(default_memory_pool(), utf8()); + for (int i = 0; i < num_rows; ++i) { + if (!is_valid[i]) { + string_builder.AppendNull(); + } else { + test::random_ascii(kBufferSize, seed++, buffer); + string_builder.Append(buffer, kBufferSize); + } + } std::shared_ptr v3; - MakeArray(v3_type, is_valid, v3_values, &v3); + ASSERT_OK(string_builder.Finish(&v3)); - std::shared_ptr schema({field("f1", v1_type), field("f2", v2_type), - field("f3", v3_type)}); + arrays->emplace_back(v1); + arrays->emplace_back(v2); + arrays->emplace_back(v3); +} - std::vector> arrays = {v1, v2, v3} +TEST(TestJsonFileReadWrite, BasicRoundTrip) { + auto v1_type = int8(); + auto v2_type = int32(); + auto v3_type = utf8(); + + std::shared_ptr schema( + new Schema({field("f1", v1_type), field("f2", v2_type), field("f3", v3_type)})); std::unique_ptr writer; ASSERT_OK(JsonWriter::Open(schema, &writer)); const int nbatches = 3; - const int32_t num_rows = static_cast(v1_values.size()); - + std::vector> batches; for (int i = 0; i < nbatches; ++i) { - ASSERT_OK(writer_->WriteRecordBatch(arrays, num_rows)); + int32_t num_rows = 5 + i * 5; + std::vector> arrays; + + MakeBatchArrays(schema, num_rows, &arrays); + batches.emplace_back(std::make_shared(schema, num_rows, arrays)); + 
ASSERT_OK(writer->WriteRecordBatch(arrays, num_rows)); } - std::shared_ptr data; - ASSERT_OK(writer->Finish(&data)); + std::string result; + ASSERT_OK(writer->Finish(&result)); + + std::unique_ptr reader; + + auto buffer = std::make_shared( + reinterpret_cast(result.c_str()), static_cast(result.size())); + + ASSERT_OK(JsonReader::Open(buffer, &reader)); + ASSERT_TRUE(reader->schema()->Equals(*schema.get())); + + ASSERT_EQ(nbatches, reader->num_record_batches()); + + for (int i = 0; i < nbatches; ++i) { + std::shared_ptr batch; + ASSERT_OK(reader->GetRecordBatch(i, &batch)); + ASSERT_TRUE(batch->Equals(*batches[i].get())); + } } } // namespace ipc diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 1148fd69f77..26c43364271 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -54,9 +54,11 @@ class JsonWriter::JsonWriterImpl { return Status::OK(); } - Status Finish() { + Status Finish(std::string* result) { writer_->EndArray(); // Record batches writer_->EndObject(); + + *result = string_buffer_.GetString(); return Status::OK(); } @@ -75,7 +77,7 @@ class JsonWriter::JsonWriterImpl { const std::shared_ptr& column = columns[i]; DCHECK_EQ(num_rows, column->length()) - << "Array length did not match record batch length"; + << "Array length did not match record batch length"; RETURN_NOT_OK( WriteJsonArray(schema_->field(i)->name, *column.get(), writer_.get())); @@ -97,14 +99,16 @@ JsonWriter::JsonWriter(const std::shared_ptr& schema) { impl_.reset(new JsonWriterImpl(schema)); } +JsonWriter::~JsonWriter() {} + Status JsonWriter::Open( const std::shared_ptr& schema, std::unique_ptr* writer) { *writer = std::unique_ptr(new JsonWriter(schema)); return (*writer)->impl_->Start(); } -Status JsonWriter::Finish(std::shared_ptr* out) { - return impl_->Finish(); +Status JsonWriter::Finish(std::string* result) { + return impl_->Finish(result); } Status JsonWriter::WriteRecordBatch( @@ -137,7 +141,7 @@ class JsonReader::JsonReaderImpl { } 
Status GetRecordBatch(int i, std::shared_ptr* batch) const { - DCHECK_GT(i, 0) << "i out of bounds"; + DCHECK_GE(i, 0) << "i out of bounds"; DCHECK_LT(i, record_batches_->GetArray().Size()) << "i out of bounds"; const auto& batch_val = record_batches_->GetArray()[i]; @@ -183,6 +187,8 @@ JsonReader::JsonReader(MemoryPool* pool, const std::shared_ptr& data) { impl_.reset(new JsonReaderImpl(pool, data)); } +JsonReader::~JsonReader() {} + Status JsonReader::Open( const std::shared_ptr& data, std::unique_ptr* reader) { return Open(default_memory_pool(), data, reader); @@ -202,5 +208,9 @@ int JsonReader::num_record_batches() const { return impl_->num_record_batches(); } +Status JsonReader::GetRecordBatch(int i, std::shared_ptr* batch) const { + return impl_->GetRecordBatch(i, batch); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index d830b9e2a31..0194098219e 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -38,6 +38,8 @@ namespace ipc { class ARROW_EXPORT JsonWriter { public: + ~JsonWriter(); + static Status Open( const std::shared_ptr& schema, std::unique_ptr* out); @@ -46,7 +48,7 @@ class ARROW_EXPORT JsonWriter { Status WriteRecordBatch( const std::vector>& columns, int32_t num_rows); - Status Finish(std::shared_ptr* result); + Status Finish(std::string* result); private: explicit JsonWriter(const std::shared_ptr& schema); @@ -59,6 +61,8 @@ class ARROW_EXPORT JsonWriter { // TODO(wesm): Read from a file stream rather than an in-memory buffer class ARROW_EXPORT JsonReader { public: + ~JsonReader(); + static Status Open(MemoryPool* pool, const std::shared_ptr& data, std::unique_ptr* reader); @@ -71,7 +75,7 @@ class ARROW_EXPORT JsonReader { int num_record_batches() const; // Read a record batch from the file - Status GetRecordBatch(int i, std::shared_ptr* batch); + Status GetRecordBatch(int i, std::shared_ptr* batch) const; private: JsonReader(MemoryPool* pool, const std::shared_ptr& 
data); diff --git a/cpp/src/arrow/test-util.h b/cpp/src/arrow/test-util.h index 808510a38ee..ab4b980b3be 100644 --- a/cpp/src/arrow/test-util.h +++ b/cpp/src/arrow/test-util.h @@ -108,6 +108,19 @@ std::shared_ptr GetBufferFromVector(const std::vector& values) { reinterpret_cast(values.data()), values.size() * sizeof(T)); } +template +inline Status CopyBufferFromVector( + const std::vector& values, std::shared_ptr* result) { + int64_t nbytes = static_cast(values.size()) * sizeof(T); + + auto buffer = std::make_shared(default_memory_pool()); + RETURN_NOT_OK(buffer->Resize(nbytes)); + memcpy(buffer->mutable_data(), values.data(), nbytes); + + *result = buffer; + return Status::OK(); +} + static inline Status GetBitmapFromBoolVector( const std::vector& is_valid, std::shared_ptr* result) { int length = static_cast(is_valid.size()); @@ -126,13 +139,21 @@ static inline Status GetBitmapFromBoolVector( // Sets approximately pct_null of the first n bytes in null_bytes to zero // and the rest to non-zero (true) values. 
-void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { +static inline void random_null_bytes(int64_t n, double pct_null, uint8_t* null_bytes) { Random rng(random_seed()); for (int i = 0; i < n; ++i) { null_bytes[i] = rng.NextDoubleFraction() > pct_null; } } +static inline void random_is_valid( + int64_t n, double pct_null, std::vector* is_valid) { + Random rng(random_seed()); + for (int i = 0; i < n; ++i) { + is_valid->push_back(rng.NextDoubleFraction() > pct_null); + } +} + static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { std::mt19937 gen(seed); std::uniform_int_distribution d(0, 255); @@ -142,6 +163,15 @@ static inline void random_bytes(int n, uint32_t seed, uint8_t* out) { } } +static inline void random_ascii(int n, uint32_t seed, uint8_t* out) { + std::mt19937 gen(seed); + std::uniform_int_distribution d(65, 122); + + for (int i = 0; i < n; ++i) { + out[i] = d(gen) & 0xFF; + } +} + template void rand_uniform_int(int n, uint32_t seed, T min_value, T max_value, T* out) { DCHECK(out); From a2cf47baf380948ad791998963698535baafb668 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 18 Nov 2016 13:17:09 -0500 Subject: [PATCH 25/27] cpplint Change-Id: Id46ffdfac630f983fe8d97e4cd5c933bee174614 --- cpp/src/arrow/ipc/json.cc | 4 +++- cpp/src/arrow/ipc/json.h | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index 26c43364271..eaaec4dbda8 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -19,6 +19,8 @@ #include #include +#include +#include #include "arrow/array.h" #include "arrow/ipc/json-internal.h" @@ -38,7 +40,7 @@ namespace ipc { class JsonWriter::JsonWriterImpl { public: - JsonWriterImpl(const std::shared_ptr& schema) : schema_(schema) { + explicit JsonWriterImpl(const std::shared_ptr& schema) : schema_(schema) { writer_.reset(new RjWriter(string_buffer_)); } diff --git a/cpp/src/arrow/ipc/json.h b/cpp/src/arrow/ipc/json.h index 
0194098219e..7395be43b96 100644 --- a/cpp/src/arrow/ipc/json.h +++ b/cpp/src/arrow/ipc/json.h @@ -21,6 +21,7 @@ #define ARROW_IPC_JSON_H #include +#include #include #include "arrow/type_fwd.h" From 72c24fe62a95625b070a7306b0c4c6384b5722fe Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 18 Nov 2016 14:03:25 -0500 Subject: [PATCH 26/27] Add a minimal literal JSON example Change-Id: Icd4cd4b58cb0ce392a856f83f89dd3e8a01a54b9 --- cpp/src/arrow/ipc/ipc-json-test.cc | 74 ++++++++++++++++++++++++++++++ cpp/src/arrow/ipc/json-internal.h | 60 ++++++++++++------------ 2 files changed, 104 insertions(+), 30 deletions(-) diff --git a/cpp/src/arrow/ipc/ipc-json-test.cc b/cpp/src/arrow/ipc/ipc-json-test.cc index b15efd2a64c..a51371c6200 100644 --- a/cpp/src/arrow/ipc/ipc-json-test.cc +++ b/cpp/src/arrow/ipc/ipc-json-test.cc @@ -275,5 +275,79 @@ TEST(TestJsonFileReadWrite, BasicRoundTrip) { } } +TEST(TestJsonFileReadWrite, MinimalFormatExample) { + static const char* example = R"example( +{ + "schema": { + "fields": [ + { + "name": "foo", + "type": {"name": "int", "isSigned": true, "bitWidth": 32}, + "nullable": true, "children": [], + "typeLayout": [ + {"type": "VALIDITY", "typeBitWidth": 1}, + {"type": "DATA", "typeBitWidth": 32} + ] + }, + { + "name": "bar", + "type": {"name": "floatingpoint", "precision": "DOUBLE"}, + "nullable": true, "children": [], + "typeLayout": [ + {"type": "VALIDITY", "typeBitWidth": 1}, + {"type": "DATA", "typeBitWidth": 64} + ] + } + ] + }, + "batches": [ + { + "count": 5, + "columns": [ + { + "name": "foo", + "count": 5, + "DATA": [1, 2, 3, 4, 5], + "VALIDITY": [1, 0, 1, 1, 1] + }, + { + "name": "bar", + "count": 5, + "DATA": [1.0, 2.0, 3.0, 4.0, 5.0], + "VALIDITY": [1, 0, 0, 1, 1] + } + ] + } + ] +} +)example"; + + auto buffer = std::make_shared( + reinterpret_cast(example), strlen(example)); + + std::unique_ptr reader; + ASSERT_OK(JsonReader::Open(buffer, &reader)); + + Schema ex_schema({field("foo", int32()), field("bar", float64())}); + + 
ASSERT_TRUE(reader->schema()->Equals(ex_schema)); + ASSERT_EQ(1, reader->num_record_batches()); + + std::shared_ptr batch; + ASSERT_OK(reader->GetRecordBatch(0, &batch)); + + std::vector foo_valid = {true, false, true, true, true}; + std::vector foo_values = {1, 2, 3, 4, 5}; + std::shared_ptr foo; + MakeArray(int32(), foo_valid, foo_values, &foo); + ASSERT_TRUE(batch->column(0)->Equals(foo)); + + std::vector bar_valid = {true, false, false, true, true}; + std::vector bar_values = {1, 2, 3, 4, 5}; + std::shared_ptr bar; + MakeArray(float64(), bar_valid, bar_values, &bar); + ASSERT_TRUE(batch->column(1)->Equals(bar)); +} + } // namespace ipc } // namespace arrow diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index c41ba26c480..3d7912f029f 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -43,44 +43,44 @@ using RjWriter = rj::Writer; return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string" << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean" << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if 
(!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int" << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an array" << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object"; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object" << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } namespace arrow { From d13a05fb75a4eec68faf6711bad072cc87263c6d Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Fri, 18 Nov 2016 14:09:51 -0500 Subject: [PATCH 27/27] Compiler warning Change-Id: I200b04cababa0d02db39de764aea79e201372700 --- cpp/src/arrow/ipc/json-internal.h | 65 +++++++++++++++++-------------- cpp/src/arrow/ipc/json.cc | 3 +- 2 files changed, 37 insertions(+), 31 deletions(-) diff --git a/cpp/src/arrow/ipc/json-internal.h b/cpp/src/arrow/ipc/json-internal.h index 3d7912f029f..0c167a4ec53 100644 --- a/cpp/src/arrow/ipc/json-internal.h +++ b/cpp/src/arrow/ipc/json-internal.h @@ -43,44 +43,49 @@ using RjWriter = rj::Writer; return Status::Invalid(ss.str()); \ } 
-#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsString()) { \ - std::stringstream ss; \ - ss << "field was not a string" << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_STRING(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsString()) { \ + std::stringstream ss; \ + ss << "field was not a string" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsBool()) { \ - std::stringstream ss; \ - ss << "field was not a boolean" << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_BOOL(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsBool()) { \ + std::stringstream ss; \ + ss << "field was not a boolean" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_INT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsInt()) { \ - std::stringstream ss; \ - ss << "field was not an int" << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_INT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsInt()) { \ + std::stringstream ss; \ + ss << "field was not an int" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsArray()) { \ - std::stringstream ss; \ - ss << "field was not an array" << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_ARRAY(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsArray()) { \ + std::stringstream ss; \ + ss << "field was not an array" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } -#define 
RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ - RETURN_NOT_FOUND(TOK, NAME, PARENT); \ - if (!NAME->value.IsObject()) { \ - std::stringstream ss; \ - ss << "field was not an object" << " line " << __LINE__; \ - return Status::Invalid(ss.str()); \ +#define RETURN_NOT_OBJECT(TOK, NAME, PARENT) \ + RETURN_NOT_FOUND(TOK, NAME, PARENT); \ + if (!NAME->value.IsObject()) { \ + std::stringstream ss; \ + ss << "field was not an object" \ + << " line " << __LINE__; \ + return Status::Invalid(ss.str()); \ } namespace arrow { diff --git a/cpp/src/arrow/ipc/json.cc b/cpp/src/arrow/ipc/json.cc index eaaec4dbda8..2281611f8b8 100644 --- a/cpp/src/arrow/ipc/json.cc +++ b/cpp/src/arrow/ipc/json.cc @@ -144,7 +144,8 @@ class JsonReader::JsonReaderImpl { Status GetRecordBatch(int i, std::shared_ptr* batch) const { DCHECK_GE(i, 0) << "i out of bounds"; - DCHECK_LT(i, record_batches_->GetArray().Size()) << "i out of bounds"; + DCHECK_LT(i, static_cast(record_batches_->GetArray().Size())) + << "i out of bounds"; const auto& batch_val = record_batches_->GetArray()[i]; DCHECK(batch_val.IsObject());