diff --git a/LICENSE.txt b/LICENSE.txt index cb359c05720..755db54ae5c 100644 --- a/LICENSE.txt +++ b/LICENSE.txt @@ -1874,3 +1874,31 @@ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +---------------------------------------------------------------------- + +cpp/src/arrow/vendored/base64.cpp has the following license + +ZLIB License + +Copyright (C) 2004-2017 René Nyffenegger + +This source code is provided 'as-is', without any express or implied +warranty. In no event will the author be held liable for any damages arising +from the use of this software. + +Permission is granted to anyone to use this software for any purpose, including +commercial applications, and to alter it and redistribute it freely, subject to +the following restrictions: + +1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + +2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + +3. This notice may not be removed or altered from any source distribution. + +René Nyffenegger rene.nyffenegger@adp-gmbh.ch diff --git a/cpp/src/arrow/CMakeLists.txt b/cpp/src/arrow/CMakeLists.txt index f1968af592d..d575faff822 100644 --- a/cpp/src/arrow/CMakeLists.txt +++ b/cpp/src/arrow/CMakeLists.txt @@ -145,6 +145,7 @@ set(ARROW_SRCS util/thread_pool.cc util/trie.cc util/utf8.cc + vendored/base64.cpp vendored/datetime/tz.cpp) # Add dependencies for third-party allocators. diff --git a/cpp/src/arrow/util/base64.h b/cpp/src/arrow/util/base64.h new file mode 100644 index 00000000000..9ab41412ac3 --- /dev/null +++ b/cpp/src/arrow/util/base64.h @@ -0,0 +1,34 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include + +#include "arrow/util/visibility.h" + +namespace arrow { +namespace util { + +ARROW_EXPORT +std::string base64_encode(unsigned char const*, unsigned int len); + +ARROW_EXPORT +std::string base64_decode(std::string const& s); + +} // namespace util +} // namespace arrow diff --git a/cpp/src/arrow/vendored/base64.cpp b/cpp/src/arrow/vendored/base64.cpp new file mode 100644 index 00000000000..50ece19455e --- /dev/null +++ b/cpp/src/arrow/vendored/base64.cpp @@ -0,0 +1,128 @@ +/* + base64.cpp and base64.h + + base64 encoding and decoding with C++. + + Version: 1.01.00 + + Copyright (C) 2004-2017 René Nyffenegger + + This source code is provided 'as-is', without any express or implied + warranty. In no event will the author be held liable for any damages + arising from the use of this software. + + Permission is granted to anyone to use this software for any purpose, + including commercial applications, and to alter it and redistribute it + freely, subject to the following restrictions: + + 1. The origin of this source code must not be misrepresented; you must not + claim that you wrote the original source code. If you use this source code + in a product, an acknowledgment in the product documentation would be + appreciated but is not required. + + 2. Altered source versions must be plainly marked as such, and must not be + misrepresented as being the original source code. + + 3. This notice may not be removed or altered from any source distribution. + + René Nyffenegger rene.nyffenegger@adp-gmbh.ch + +*/ + +#include "arrow/util/base64.h" +#include + +namespace arrow { +namespace util { + +static const std::string base64_chars = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + + +static inline bool is_base64(unsigned char c) { + return (isalnum(c) || (c == '+') || (c == '/')); +} + +std::string base64_encode(unsigned char const* bytes_to_encode, unsigned int in_len) { + std::string ret; + int i = 0; + int j = 0; + unsigned char char_array_3[3]; + unsigned char char_array_4[4]; + + while (in_len--) { + char_array_3[i++] = *(bytes_to_encode++); + if (i == 3) { + char_array_4[0] = (char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + char_array_4[3] = char_array_3[2] & 0x3f; + + for(i = 0; (i <4) ; i++) + ret += base64_chars[char_array_4[i]]; + i = 0; + } + } + + if (i) + { + for(j = i; j < 3; j++) + char_array_3[j] = '\0'; + + char_array_4[0] = ( char_array_3[0] & 0xfc) >> 2; + char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4); + char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6); + + for (j = 0; (j < i + 1); j++) + ret += base64_chars[char_array_4[j]]; + + while((i++ < 3)) + ret += '='; + + } + + return ret; + +} + +std::string base64_decode(std::string const& encoded_string) { + size_t in_len = encoded_string.size(); + int i = 0; + int j = 0; + int in_ = 0; + unsigned char char_array_4[4], char_array_3[3]; + std::string ret; + + while (in_len-- && ( encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { + char_array_4[i++] = encoded_string[in_]; in_++; + if (i ==4) { + for (i = 0; i <4; i++) + char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff; + + char_array_3[0] = ( char_array_4[0] << 2 ) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; + + for (i = 0; (i < 3); i++) + ret += char_array_3[i]; + i = 0; + } + } + + if (i) { + for (j = 0; j < i; j++) + char_array_4[j] = base64_chars.find(char_array_4[j]) & 0xff; + + char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); + char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); + + for (j = 0; (j < i - 1); j++) ret += char_array_3[j]; + } + + return ret; +} + +} // namespace util +} // namespace arrow diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index f8307c54d0d..7b803fa261f 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -39,6 +39,7 @@ #include "arrow/table.h" #include "arrow/type.h" #include "arrow/type_traits.h" +#include "arrow/util/base64.h" #include "arrow/util/checked_cast.h" #include "arrow/util/int_util.h" #include "arrow/util/logging.h" @@ -576,7 +577,8 @@ Status GetOriginSchema(const std::shared_ptr& metadata, // The original Arrow schema was serialized using the store_schema option. We // deserialize it here and use it to inform read options such as // dictionary-encoded fields - auto schema_buf = std::make_shared(metadata->value(schema_index)); + auto decoded = ::arrow::util::base64_decode(metadata->value(schema_index)); + auto schema_buf = std::make_shared(decoded); ::arrow::ipc::DictionaryMemo dict_memo; ::arrow::io::BufferReader input(schema_buf); diff --git a/cpp/src/parquet/arrow/writer.cc b/cpp/src/parquet/arrow/writer.cc index cfd58b2a452..18c01a527ea 100644 --- a/cpp/src/parquet/arrow/writer.cc +++ b/cpp/src/parquet/arrow/writer.cc @@ -30,6 +30,7 @@ #include "arrow/ipc/writer.h" #include "arrow/table.h" #include "arrow/type.h" +#include "arrow/util/base64.h" #include "arrow/visitor_inline.h" #include "parquet/arrow/reader_internal.h" @@ -577,7 +578,13 @@ Status GetSchemaMetadata(const ::arrow::Schema& schema, ::arrow::MemoryPool* poo ::arrow::ipc::DictionaryMemo dict_memo; std::shared_ptr serialized; RETURN_NOT_OK(::arrow::ipc::SerializeSchema(schema, &dict_memo, pool, &serialized)); - result->Append(kArrowSchemaKey, serialized->ToString()); + + // The serialized schema is not UTF-8, which is required for Thrift + std::string schema_as_string = serialized->ToString(); + std::string schema_base64 = ::arrow::util::base64_encode( + reinterpret_cast(schema_as_string.data()), + static_cast(schema_as_string.size())); + result->Append(kArrowSchemaKey, schema_base64); *out = result; return Status::OK(); } diff --git a/python/pyarrow/tests/test_extension_type.py b/python/pyarrow/tests/test_extension_type.py index dd9208549f9..35a40241450 100644 --- a/python/pyarrow/tests/test_extension_type.py +++ b/python/pyarrow/tests/test_extension_type.py @@ -372,7 +372,10 @@ def test_parquet(tmpdir, registered_period_type): meta = pq.read_metadata(filename) assert meta.schema.column(0).physical_type == "INT64" assert b"ARROW:schema" in meta.metadata - schema = pa.read_schema(pa.BufferReader(meta.metadata[b"ARROW:schema"])) + + import base64 + decoded_schema = base64.b64decode(meta.metadata[b"ARROW:schema"]) + schema = pa.read_schema(pa.BufferReader(decoded_schema)) assert schema.field("ext").metadata == { b'ARROW:extension:metadata': b'freq=D', b'ARROW:extension:name': b'pandas.period'}