#!/usr/bin/env bash
#
# build_release.sh -- build the x86_64 RELEASE Gandiva artifacts end to end:
#   1. plain C++ build of Arrow + Gandiva in cpp/debug
#   2. static, bundled-dependency C++ build installed into java-dist
#   3. Java JNI libraries installed into java-dist
#   4. Maven build of the Java modules, then copy of the Gandiva jar
#
# Every step aborts the script on failure. The original only checked the
# "make"/"cmake --build" steps, so a failed configure, cd or mvn run fell
# through and the final cp could ship a stale jar.
#
# Optional environment overrides (defaults keep the original behaviour):
#   MVN               Maven executable
#   GANDIVA_JAR_DEST  directory that receives arrow-gandiva-12.0.1.jar

set -euo pipefail
trap 'echo "failed"; exit 1' ERR

# Run from the directory holding this script (the repository root), so the
# relative paths below do not depend on the caller's working directory.
cd "$(dirname "${BASH_SOURCE[0]}")"
ROOT="$PWD"

MVN="${MVN:-/opt/homebrew/bin/mvn}"
GANDIVA_JAR_DEST="${GANDIVA_JAR_DEST:-/Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/}"

rm -rf cpp-jni java-dist java-jni cpp/debug
mkdir -p cpp/debug

# --- 1. C++ build ----------------------------------------------------------
cd cpp/debug
arch -x86_64 cmake -DCMAKE_BUILD_TYPE=RELEASE -DARROW_GANDIVA=ON -DARROW_JEMALLOC=OFF \
  -DARROW_GANDIVA_JAVA=ON -DARROW_BUILD_TESTS=OFF ..
arch -x86_64 make -j 8

cd "$ROOT"
mkdir -p java-jni cpp-jni

# --- 2. static C++ libraries for JNI ---------------------------------------
arch -x86_64 cmake -S cpp -B cpp-jni \
  -DARROW_BUILD_SHARED=OFF -DARROW_JEMALLOC=OFF -DARROW_CSV=ON -DARROW_DATASET=ON \
  -DARROW_DEPENDENCY_SOURCE=BUNDLED -DARROW_DEPENDENCY_USE_SHARED=OFF \
  -DARROW_FILESYSTEM=ON -DARROW_GANDIVA=ON -DARROW_GANDIVA_STATIC_LIBSTDCPP=ON \
  -DARROW_ORC=ON -DARROW_PARQUET=ON -DARROW_S3=ON -DARROW_USE_CCACHE=ON \
  -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 \
  -DCMAKE_INSTALL_PREFIX=java-dist -DCMAKE_UNITY_BUILD=ON
arch -x86_64 cmake --build cpp-jni --target install --config Release

# --- 3. Java JNI libraries -------------------------------------------------
arch -x86_64 cmake -S java -B java-jni \
  -DARROW_JAVA_JNI_ENABLE_C=OFF -DARROW_JEMALLOC=OFF -DARROW_JAVA_JNI_ENABLE_DEFAULT=ON \
  -DBUILD_TESTING=OFF -DCMAKE_BUILD_TYPE=RELEASE -DCMAKE_INSTALL_LIBDIR=lib/x86_64 \
  -DCMAKE_INSTALL_PREFIX=java-dist \
  -DCMAKE_PREFIX_PATH="$ROOT/java-dist/lib/x86_64/cmake"
arch -x86_64 cmake --build java-jni --target install --config Release

# --- 4. jars ---------------------------------------------------------------
# The JNI dist dir is derived from the checkout instead of a hard-coded
# /Users/<name>/... path, so the script works from any clone.
cd java
"$MVN" -DskipTests \
  -Darrow.c.jni.dist.dir="$ROOT/java-dist/lib" \
  -Darrow.cpp.build.dir="$ROOT/java-dist/lib" \
  -Parrow-jni clean install
cp gandiva/target/arrow-gandiva-12.0.1.jar "$GANDIVA_JAR_DEST"
-ne 0 ] +then + echo "failed" + exit 1 +fi + +echo "====JARS====" +cd java +/opt/homebrew/bin/mvn -DskipTests -Darrow.c.jni.dist.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Darrow.cpp.build.dir=/Users/logan.riggs/github/arrow-fork/arrow/java-dist/lib -Parrow-jni clean install +cp java/gandiva/target/arrow-gandiva-12.0.1.jar /Users/logan.riggs/github/dremio/enterprise/distribution/server/target/dremio-enterprise-24.3.0-SNAPSHOT/dremio-enterprise-24.3.0-SNAPSHOT/jars/3rdparty/ diff --git a/cpp/src/arrow/buffer.h b/cpp/src/arrow/buffer.h index 9270c4dea3f..66da004c2be 100644 --- a/cpp/src/arrow/buffer.h +++ b/cpp/src/arrow/buffer.h @@ -444,10 +444,21 @@ class ARROW_EXPORT ResizableBuffer : public MutableBuffer { return Reserve(sizeof(T) * new_nb_elements); } + public: + uint8_t* offsetBuffer; + int64_t offsetCapacity; + protected: - ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) {} + ResizableBuffer(uint8_t* data, int64_t size) : MutableBuffer(data, size) { + offsetBuffer = nullptr; + offsetCapacity = 0; + + } ResizableBuffer(uint8_t* data, int64_t size, std::shared_ptr mm) - : MutableBuffer(data, size, std::move(mm)) {} + : MutableBuffer(data, size, std::move(mm)) { + offsetBuffer = nullptr; + offsetCapacity = 0; + } }; /// \defgroup buffer-allocation-functions Functions for allocating buffers diff --git a/cpp/src/arrow/c/bridge.cc b/cpp/src/arrow/c/bridge.cc index 85a5156d11d..32dbc088a71 100644 --- a/cpp/src/arrow/c/bridge.cc +++ b/cpp/src/arrow/c/bridge.cc @@ -195,7 +195,7 @@ struct SchemaExporter { } Status ExportSchema(const Schema& schema) { - static const StructType dummy_struct_type({}); + static const StructType dummy_struct_type = StructType(); flags_ = 0; RETURN_NOT_OK(ExportFormat(dummy_struct_type)); diff --git a/cpp/src/arrow/type.cc b/cpp/src/arrow/type.cc index 4804570bdf5..60b71cbb71d 100644 --- a/cpp/src/arrow/type.cc +++ b/cpp/src/arrow/type.cc @@ -856,6 +856,10 @@ StructType::StructType(const 
std::vector>& fields) children_ = fields; } +StructType::StructType() + : NestedType(Type::STRUCT) { +} + StructType::~StructType() {} std::string StructType::ToString() const { @@ -2527,6 +2531,7 @@ TYPE_FACTORY(float16, HalfFloatType) TYPE_FACTORY(float32, FloatType) TYPE_FACTORY(float64, DoubleType) TYPE_FACTORY(utf8, StringType) +TYPE_FACTORY(structType, StructType) TYPE_FACTORY(large_utf8, LargeStringType) TYPE_FACTORY(binary, BinaryType) TYPE_FACTORY(large_binary, LargeBinaryType) diff --git a/cpp/src/arrow/type.h b/cpp/src/arrow/type.h index 560805535dc..ddeb45b721f 100644 --- a/cpp/src/arrow/type.h +++ b/cpp/src/arrow/type.h @@ -1079,6 +1079,9 @@ class ARROW_EXPORT StructType : public NestedType { static constexpr const char* type_name() { return "struct"; } explicit StructType(const std::vector>& fields); + explicit StructType(); + StructType(const StructType& rhs) = delete; + StructType& operator=(const StructType& rhs) = delete; ~StructType() override; diff --git a/cpp/src/arrow/type_fwd.h b/cpp/src/arrow/type_fwd.h index 657abbaecc4..450ed9a136d 100644 --- a/cpp/src/arrow/type_fwd.h +++ b/cpp/src/arrow/type_fwd.h @@ -123,6 +123,11 @@ class StringArray; class StringBuilder; struct StringScalar; +class StructType; +class StructArray; +class StructBuilder; +struct StructScalar; + class LargeStringType; class LargeStringArray; class LargeStringBuilder; @@ -454,6 +459,7 @@ ARROW_EXPORT const std::shared_ptr& float32(); ARROW_EXPORT const std::shared_ptr& float64(); /// \brief Return a StringType instance ARROW_EXPORT const std::shared_ptr& utf8(); +ARROW_EXPORT const std::shared_ptr& structType(); /// \brief Return a LargeStringType instance ARROW_EXPORT const std::shared_ptr& large_utf8(); /// \brief Return a BinaryType instance diff --git a/cpp/src/gandiva/CMakeLists.txt b/cpp/src/gandiva/CMakeLists.txt index 6a92224e911..dc0c427f48d 100644 --- a/cpp/src/gandiva/CMakeLists.txt +++ b/cpp/src/gandiva/CMakeLists.txt @@ -61,9 +61,11 @@ set(SRC_FILES 
expression_registry.cc exported_funcs_registry.cc filter.cc + array_ops.cc function_ir_builder.cc function_registry.cc function_registry_arithmetic.cc + function_registry_array.cc function_registry_datetime.cc function_registry_hash.cc function_registry_math_ops.cc @@ -249,7 +251,8 @@ add_gandiva_test(internals-test random_generator_holder_test.cc hash_utils_test.cc gdv_function_stubs_test.cc - interval_holder_test.cc) + interval_holder_test.cc + array_ops_test.cc) add_subdirectory(precompiled) add_subdirectory(tests) diff --git a/cpp/src/gandiva/annotator.cc b/cpp/src/gandiva/annotator.cc index b341fdde3a3..4cc0e1dc29b 100644 --- a/cpp/src/gandiva/annotator.cc +++ b/cpp/src/gandiva/annotator.cc @@ -17,6 +17,7 @@ #include "gandiva/annotator.h" +#include #include #include @@ -46,15 +47,24 @@ FieldDescriptorPtr Annotator::MakeDesc(FieldPtr field, bool is_output) { int data_idx = buffer_count_++; int validity_idx = buffer_count_++; int offsets_idx = FieldDescriptor::kInvalidIdx; + int child_offsets_idx = FieldDescriptor::kInvalidIdx; if (arrow::is_binary_like(field->type()->id())) { offsets_idx = buffer_count_++; } + + if (field->type()->id() == arrow::Type::LIST) { + std::cout << "LR Annotator::MakeDesc 1" << std::endl; + offsets_idx = buffer_count_++; + if (arrow::is_binary_like(field->type()->field(0)->type()->id())) { + child_offsets_idx = buffer_count_++; + } + } int data_buffer_ptr_idx = FieldDescriptor::kInvalidIdx; if (is_output) { data_buffer_ptr_idx = buffer_count_++; } return std::make_shared(field, data_idx, validity_idx, offsets_idx, - data_buffer_ptr_idx); + data_buffer_ptr_idx, child_offsets_idx); } int Annotator::AddHolderPointer(void* holder) { @@ -71,26 +81,90 @@ void Annotator::PrepareBuffersForField(const FieldDescriptor& desc, // The validity buffer is optional. Use nullptr if it does not have one. 
if (array_data.buffers[buffer_idx]) { uint8_t* validity_buf = const_cast(array_data.buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -6 " << &validity_buf << std::endl; eval_batch->SetBuffer(desc.validity_idx(), validity_buf, array_data.offset); } else { + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -5 null " << std::endl; eval_batch->SetBuffer(desc.validity_idx(), nullptr, array_data.offset); } ++buffer_idx; if (desc.HasOffsetsIdx()) { uint8_t* offsets_buf = const_cast(array_data.buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -4 " << &offsets_buf << " using idx=" << buffer_idx << std::endl; eval_batch->SetBuffer(desc.offsets_idx(), offsets_buf, array_data.offset); - ++buffer_idx; + + if (desc.HasChildOffsetsIdx()) { + //std::cout << "LR Annotator::PrepareBuffersForField 1 for field " << desc.Name() << " type is " << array_data.type->id() << std::endl; + if (is_output) { + // if list field is output field, we should put buffer pointer into eval batch + // for resizing + uint8_t* child_offsets_buf = reinterpret_cast( + array_data.child_data.at(0)->buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -3 " << &child_offsets_buf << std::endl; + eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, + array_data.child_data.at(0)->offset); + } else { + //std::cout << "LR Annotator::PrepareBuffersForField 2" << std::endl; + // if list field is input field, just put buffer data into eval batch + uint8_t* child_offsets_buf = const_cast( + array_data.child_data.at(0)->buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -2 " << &child_offsets_buf << std::endl; + eval_batch->SetBuffer(desc.child_data_offsets_idx(), child_offsets_buf, + array_data.child_data.at(0)->offset); + } + } + if (array_data.type->id() != arrow::Type::LIST 
|| + arrow::is_binary_like(array_data.type->field(0)->type()->id())) { + //std::cout << "LR Annotator::PrepareBuffersForField 3" << std::endl; + + // primitive type list data buffer index is 1 + // binary like type list data buffer index is 2 + ++buffer_idx; + } + } + + if (array_data.type->id() != arrow::Type::LIST) { + //std::cout << "LR Annotator::PrepareBuffersForField 4" << std::endl; + + //std::cout << "LR Annotator::PrepareBuffersForField 4 buffer_idx " << buffer_idx << std::endl; + uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); + //std::cout << "LR Annotator::PrepareBuffersForField 4a" << std::endl; + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer -1 " << &data_buf << std::endl; + eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); + //std::cout << "LR Annotator::PrepareBuffersForField 4b" << std::endl; + } else { + //std::cout << "LR Annotator::PrepareBuffersForField 5 " << desc.Name() << " buffer_idx " << buffer_idx << std::endl; + //std::cout << "LR Annotator::PrepareBuffersForField 5 array_data child size " << array_data.child_data.size() << std::endl; + + uint8_t* data_buf = + const_cast(array_data.child_data.at(0)->buffers[buffer_idx]->data()); + std::cout << "LR Annotator::PrepareBuffersForField setting offset eval buffer idx=" << buffer_idx << " data=" << &data_buf << std::endl; + eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.child_data.at(0)->offset); + //std::cout << "LR Annotator::PrepareBuffersForField 5a" << std::endl; } - uint8_t* data_buf = const_cast(array_data.buffers[buffer_idx]->data()); - eval_batch->SetBuffer(desc.data_idx(), data_buf, array_data.offset); if (is_output) { // pass in the Buffer object for output data buffers. Can be used for resizing. 
- uint8_t* data_buf_ptr = - reinterpret_cast(array_data.buffers[buffer_idx].get()); - eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + + if (array_data.type->id() != arrow::Type::LIST) { + uint8_t* data_buf_ptr = + reinterpret_cast(array_data.buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval buffer 1 " << &data_buf_ptr << std::endl; + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, array_data.offset); + } else { + //std::cout << "LR Annotator::PrepareBuffersForField is_output index " << desc.data_buffer_ptr_idx() << std::endl; + + // list data buffer is in child data buffer + uint8_t* data_buf_ptr = reinterpret_cast( + array_data.child_data.at(0)->buffers[buffer_idx].get()); + std::cout << "LR Annotator::PrepareBuffersForField setting eval data buffer " << buffer_idx << " data=" << &data_buf_ptr << std::endl; + + eval_batch->SetBuffer(desc.data_buffer_ptr_idx(), data_buf_ptr, + array_data.child_data.at(0)->offset); + } } + } EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, @@ -98,6 +172,7 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, EvalBatchPtr eval_batch = std::make_shared( record_batch.num_rows(), buffer_count_, local_bitmap_count_); + //std::cout << "LR PrepareEvalBatch 1" << std::endl; // Fill in the entries for the input fields. 
for (int i = 0; i < record_batch.num_columns(); ++i) { const std::string& name = record_batch.column_name(i); @@ -107,17 +182,27 @@ EvalBatchPtr Annotator::PrepareEvalBatch(const arrow::RecordBatch& record_batch, continue; } + /*std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch schema " << record_batch.schema()->ToString() + << " num rows " << record_batch.num_rows() + << " num columns " << record_batch.num_columns() + << " data size " << record_batch.column_data().size() + << " col 1 " << record_batch.column(0)->ToString() + << std::endl;*/ + + //std::cout << "LR PrepareEvalBatch 1a i=" << i << " record batch data " << record_batch.ToString() << std::endl; PrepareBuffersForField(*(found->second), *(record_batch.column_data(i)), eval_batch.get(), false /*is_output*/); } // Fill in the entries for the output fields. + //std::cout << "LR PrepareEvalBatch preparing output fields" << std::endl; int idx = 0; for (auto& arraydata : out_vector) { const FieldDescriptorPtr& desc = out_descs_.at(idx); PrepareBuffersForField(*desc, *arraydata, eval_batch.get(), true /*is_output*/); ++idx; } + //std::cout << "LR PrepareEvalBatch 2" << std::endl; return eval_batch; } diff --git a/cpp/src/gandiva/array_ops.cc b/cpp/src/gandiva/array_ops.cc new file mode 100644 index 00000000000..0cbac7942bb --- /dev/null +++ b/cpp/src/gandiva/array_ops.cc @@ -0,0 +1,228 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
/// \brief Report whether one list<utf8> entry contains a given string.
///
/// \param context_ptr           execution context handle (unused here)
/// \param entry_buf             bytes of the entry's elements, laid out back to
///                              back; must point at the first element
/// \param entry_child_offsets   element offsets; only the differences between
///                              consecutive values are used, so
///                              entry_offsets_len + 1 values must be readable
/// \param entry_offsets_len     number of elements in the entry
/// \param contains_data         bytes of the value to look for
/// \param contains_data_length  length of contains_data in bytes
/// \return true if some element has exactly the same length and bytes
///
/// Elements are compared with memcmp rather than strncmp: strncmp stops at the
/// first NUL byte, so two values that agree up to an embedded NUL would be
/// reported equal even when their remaining bytes differ.
extern "C" bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf,
                                         int32_t* entry_child_offsets,
                                         int32_t entry_offsets_len,
                                         const char* contains_data,
                                         int32_t contains_data_length) {
  (void)context_ptr;  // nothing to allocate or report

  const char* element = entry_buf;
  for (int32_t i = 0; i < entry_offsets_len; ++i) {
    const int32_t element_len = entry_child_offsets[i + 1] - entry_child_offsets[i];
    if (element_len == contains_data_length) {
      // A zero-length element matches a zero-length needle without touching
      // either pointer (memcmp must not be handed a null pointer).
      if (element_len == 0) {
        return true;
      }
      if (element_len > 0 &&
          memcmp(element, contains_data, static_cast<size_t>(element_len)) == 0) {
        return true;
      }
    }
    element += element_len;
  }
  return false;
}
/// \brief Report whether one list<int64> entry contains a given value.
///
/// \param context_ptr        execution context handle (unused here)
/// \param entry_buf          the entry's values, stored contiguously
/// \param entry_offsets_len  number of values in the entry
/// \param contains_data      value to look for
/// \return true if any of the first entry_offsets_len values equals
///         contains_data
///
/// The values are indexed with a stride of one element. The previous
/// `i * 2` stride skipped every odd element and read past the end of the
/// buffer; pointer arithmetic on an int64_t* already advances by
/// sizeof(int64_t).
extern "C" bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf,
                                           int32_t entry_offsets_len,
                                           int64_t contains_data) {
  (void)context_ptr;  // nothing to allocate or report

  for (int32_t i = 0; i < entry_offsets_len; ++i) {
    if (entry_buf[i] == contains_data) {
      return true;
    }
  }
  return false;
}
+ uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); + memcpy(ret, integers, *out_len * 4); + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} +/* +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { + //std::cout << "LR array_int32_remove data=" << remove_data + // << " entry_offsets_len " << entry_offsets_len << std::endl; + + //LR sizes are HACK + int* integers = new int[5]; + int j = 0; + for (int i = 0; i < entry_offsets_len; i++) { + //std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_len = *(entry_buf + (i * 1)); + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + if (entry_len == remove_data) { + continue; + } else { + integers[j++] = entry_len; + } + } + + *out_len = 5;// * 4; + //length is number of items, but buffers must account for byte size. 
+ uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, *out_len * 4); + memcpy(ret, integers, *out_len * 4); + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + delete [] integers; + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} +*/ +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, int32_t remove_data, int32_t* out_len) { + //std::cout << "LR array_int32_remove data=" << remove_data + // << " entry_offsets_len " << entry_offsets_len << std::endl; + + std::vector newInts; + + for (int i = 0; i < entry_offsets_len; i++) { + //std::cout << "LR going to check " << entry_buf + i << std::endl; + int32_t entry_item = *(entry_buf + (i * 1)); + //std::cout << "LR checking value " << entry_len << " against target " << remove_data << std::endl; + if (entry_item == remove_data) { + continue; + } else { + newInts.push_back(entry_item); + } + } + + *out_len = newInts.size(); + int32_t outBufferLength = *out_len * sizeof(int); + //length is number of items, but buffers must account for byte size. 
+ uint8_t* ret = gdv_fn_context_arena_malloc(context_ptr, outBufferLength); + memcpy(ret, newInts.data(), outBufferLength); + //std::cout << "LR made a buffer length" << *out_len * 4 << " item 3 is = " << int32_t(ret[3*4]) << std::endl; + + + //return reinterpret_cast(ret); + return reinterpret_cast(ret); +} + +int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len) { + int64_t res = entry_offsets_len; + return res; +} +} + +namespace gandiva { +void ExportedArrayFunctions::AddMappings(Engine* engine) const { + std::vector args; + auto types = engine->types(); + + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i32_type()}; // int32_t child offsets length + + engine->AddGlobalMappingForFunc("array_utf8_length", types->i64_type() /*return_type*/, + args, reinterpret_cast(array_utf8_length)); + + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i32_type(), // int32_t child offsets length + types->i8_ptr_type(), // const char* contains data buf + types->i32_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_utf8_contains_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_utf8_contains_utf8)); + + args = {types->i64_type(), // int64_t execution_context + types->i32_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + types->i32_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_int32_contains_int32", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_int32_contains_int32)); + + args = {types->i64_type(), // int64_t execution_context + types->i64_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + 
types->i64_type()}; // int32_t contains data length + + engine->AddGlobalMappingForFunc("array_int64_contains_int64", + types->i1_type() /*return_type*/, args, + reinterpret_cast(array_int64_contains_int64)); + + + args = {types->i64_type(), // int64_t execution_context + types->i32_type(), // array item input + types->i32_ptr_type()}; // out array length + + engine->AddGlobalMappingForFunc("array_int32_make_array", + types->i32_ptr_type(), args, + reinterpret_cast(array_int32_make_array)); + + args = {types->i64_type(), // int64_t execution_context + types->i32_ptr_type(), // int8_t* data ptr + types->i32_type(), // int32_t child offsets length + types->i32_type(), //value to remove from input + types->i32_ptr_type()}; // out array length + + engine->AddGlobalMappingForFunc("array_int32_remove", + types->i32_ptr_type(), args, + reinterpret_cast(array_int32_remove)); + +} +} // namespace gandiva diff --git a/cpp/src/gandiva/array_ops.h b/cpp/src/gandiva/array_ops.h new file mode 100644 index 00000000000..76c158f0e27 --- /dev/null +++ b/cpp/src/gandiva/array_ops.h @@ -0,0 +1,56 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "gandiva/visibility.h" + +namespace llvm { +class VectorType; +} + +/// Array functions that can be accessed from LLVM. +extern "C" { +GANDIVA_EXPORT +bool array_utf8_contains_utf8(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len, + const char* contains_data, int32_t contains_data_length); +GANDIVA_EXPORT +int64_t array_utf8_length(int64_t context_ptr, const char* entry_buf, + int32_t* entry_child_offsets, int32_t entry_offsets_len); +GANDIVA_EXPORT +bool array_int32_contains_int32(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, + int32_t contains_data); +GANDIVA_EXPORT +bool array_int64_contains_int64(int64_t context_ptr, const int64_t* entry_buf, + int32_t entry_offsets_len, + int64_t contains_data); + +GANDIVA_EXPORT +int32_t* array_int32_make_array(int64_t context_ptr, + int32_t contains_data, + int32_t* out_len); + +GANDIVA_EXPORT +int32_t* array_int32_remove(int64_t context_ptr, const int32_t* entry_buf, + int32_t entry_offsets_len, + int32_t remove_data, + int32_t* out_len); +} diff --git a/cpp/src/gandiva/array_ops_test.cc b/cpp/src/gandiva/array_ops_test.cc new file mode 100644 index 00000000000..12dd6f9c56d --- /dev/null +++ b/cpp/src/gandiva/array_ops_test.cc @@ -0,0 +1,65 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include + +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" + +namespace gandiva { + +TEST(TestArrayOps, TestInt32ContainsInt32) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + int32_t data[] = {1, 2, 3, 4}; + int32_t entry_offsets_len = 3; + int32_t contains_data = 2; + + EXPECT_EQ( + array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, + contains_data), + true); +} + +TEST(TestArrayOps, TestUtf8ContainsUtf8) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + const char* entry_buf = "trianglecirclerectangle"; + int32_t entry_child_offsets[] = {0, 8, 14, 24}; + int32_t entry_offsets_len = 3; + const char* contains_data = "triangle"; + int32_t contains_data_length = 8; + + EXPECT_EQ( + array_utf8_contains_utf8(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len, + contains_data, contains_data_length), + true); +} + +TEST(TestArrayOps, TestUtf8Length) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + const char* entry_buf = "trianglecirclerectangle"; + int32_t entry_child_offsets[] = {0, 8, 14, 24}; + int32_t entry_offsets_len = 3; + + EXPECT_EQ(array_utf8_length(ctx_ptr, entry_buf, entry_child_offsets, entry_offsets_len), + 3); +} + +} // namespace gandiva diff --git a/cpp/src/gandiva/dex.h b/cpp/src/gandiva/dex.h index 2998c213176..c35ee93dc03 100644 --- a/cpp/src/gandiva/dex.h +++ b/cpp/src/gandiva/dex.h @@ -80,6 +80,19 @@ class GANDIVA_EXPORT VectorReadFixedLenValueDex : public 
VectorReadBaseDex { void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } }; +/// value component of a fixed-len list ValueVector +class GANDIVA_EXPORT VectorReadFixedLenValueListDex : public VectorReadBaseDex { + public: + explicit VectorReadFixedLenValueListDex(FieldDescriptorPtr field_desc) + : VectorReadBaseDex(field_desc) {} + + int DataIdx() const { return field_desc_->data_idx(); } + + int OffsetsIdx() const { return field_desc_->offsets_idx(); } + + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } +}; + /// value component of a variable-len ValueVector class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex { public: @@ -93,6 +106,21 @@ class GANDIVA_EXPORT VectorReadVarLenValueDex : public VectorReadBaseDex { void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } }; +/// value component of a variable-len list ValueVector +class GANDIVA_EXPORT VectorReadVarLenValueListDex : public VectorReadBaseDex { + public: + explicit VectorReadVarLenValueListDex(FieldDescriptorPtr field_desc) + : VectorReadBaseDex(field_desc) {} + + int DataIdx() const { return field_desc_->data_idx(); } + + int OffsetsIdx() const { return field_desc_->offsets_idx(); } + + int ChildOffsetsIdx() const { return field_desc_->child_data_offsets_idx(); } + + void Accept(DexVisitor& visitor) override { visitor.Visit(*this); } +}; + /// validity based on a local bitmap. 
class GANDIVA_EXPORT LocalBitMapValidityDex : public Dex { public: diff --git a/cpp/src/gandiva/dex_visitor.h b/cpp/src/gandiva/dex_visitor.h index 5d160bb22ca..4a03b9c21fc 100644 --- a/cpp/src/gandiva/dex_visitor.h +++ b/cpp/src/gandiva/dex_visitor.h @@ -28,7 +28,9 @@ namespace gandiva { class VectorReadValidityDex; class VectorReadFixedLenValueDex; +class VectorReadFixedLenValueListDex; class VectorReadVarLenValueDex; +class VectorReadVarLenValueListDex; class LocalBitMapValidityDex; class LiteralDex; class TrueDex; @@ -49,7 +51,9 @@ class GANDIVA_EXPORT DexVisitor { virtual void Visit(const VectorReadValidityDex& dex) = 0; virtual void Visit(const VectorReadFixedLenValueDex& dex) = 0; + virtual void Visit(const VectorReadFixedLenValueListDex& dex) = 0; virtual void Visit(const VectorReadVarLenValueDex& dex) = 0; + virtual void Visit(const VectorReadVarLenValueListDex& dex) = 0; virtual void Visit(const LocalBitMapValidityDex& dex) = 0; virtual void Visit(const TrueDex& dex) = 0; virtual void Visit(const FalseDex& dex) = 0; @@ -75,7 +79,9 @@ class GANDIVA_EXPORT DexVisitor { class GANDIVA_EXPORT DexDefaultVisitor : public DexVisitor { VISIT_DCHECK(VectorReadValidityDex) VISIT_DCHECK(VectorReadFixedLenValueDex) + VISIT_DCHECK(VectorReadFixedLenValueListDex) VISIT_DCHECK(VectorReadVarLenValueDex) + VISIT_DCHECK(VectorReadVarLenValueListDex) VISIT_DCHECK(LocalBitMapValidityDex) VISIT_DCHECK(TrueDex) VISIT_DCHECK(FalseDex) diff --git a/cpp/src/gandiva/engine.cc b/cpp/src/gandiva/engine.cc index f5f9460ddd1..80e60ab7ba7 100644 --- a/cpp/src/gandiva/engine.cc +++ b/cpp/src/gandiva/engine.cc @@ -300,6 +300,7 @@ Status Engine::FinalizeModule() { if (!cached_) { ARROW_RETURN_NOT_OK(RemoveUnusedFunctions()); + //LR Turning this off seems to provide better error messages with compilation/generation failures. if (optimize_) { // misc passes to allow for inlining, vectorization, .. 
std::unique_ptr pass_manager( diff --git a/cpp/src/gandiva/exported_funcs.h b/cpp/src/gandiva/exported_funcs.h index 5a14c521621..55145b301e7 100644 --- a/cpp/src/gandiva/exported_funcs.h +++ b/cpp/src/gandiva/exported_funcs.h @@ -32,6 +32,12 @@ class ExportedFuncsBase { virtual void AddMappings(Engine* engine) const = 0; }; +// Class for exporting Array functions +class ExportedArrayFunctions : public ExportedFuncsBase { + void AddMappings(Engine* engine) const override; +}; +REGISTER_EXPORTED_FUNCS(ExportedArrayFunctions); + // Class for exporting Stub functions class ExportedStubFunctions : public ExportedFuncsBase { void AddMappings(Engine* engine) const override; diff --git a/cpp/src/gandiva/expr_decomposer.cc b/cpp/src/gandiva/expr_decomposer.cc index 957d9d046bd..72c992df11c 100644 --- a/cpp/src/gandiva/expr_decomposer.cc +++ b/cpp/src/gandiva/expr_decomposer.cc @@ -17,6 +17,7 @@ #include "gandiva/expr_decomposer.h" +#include #include #include #include @@ -37,11 +38,28 @@ namespace gandiva { Status ExprDecomposer::Visit(const FieldNode& node) { auto desc = annotator_.CheckAndAddInputFieldDescriptor(node.field()); + //std::cout << "LR ExprDecomposer" << std::endl; DexPtr validity_dex = std::make_shared(desc); DexPtr value_dex; - if (desc->HasOffsetsIdx()) { - value_dex = std::make_shared(desc); + if (desc->HasChildOffsetsIdx()) { + //std::cout << "LR ExprDecomposer 1" << std::endl; + // handle list type + value_dex = std::make_shared(desc); + } else if (desc->HasOffsetsIdx()) { + //std::cout << "LR ExprDecomposer 2" << std::endl; + if (desc->field()->type()->id() == arrow::Type::LIST) { + // handle list type + //std::cout << "LR ExprDecomposer 3" << std::endl; + auto p = std::make_shared(desc); + value_dex = p; + //int v = p->DataIdx(); + //std::cout << "LR primitive list type " v << " " << + } else { + //std::cout << "LR ExprDecomposer 4" << std::endl; + value_dex = std::make_shared(desc); + } } else { + //std::cout << "LR ExprDecomposer 5" << std::endl; 
value_dex = std::make_shared(desc); } result_ = std::make_shared(validity_dex, value_dex); diff --git a/cpp/src/gandiva/expr_validator.cc b/cpp/src/gandiva/expr_validator.cc index 35a13494523..265f2c119cd 100644 --- a/cpp/src/gandiva/expr_validator.cc +++ b/cpp/src/gandiva/expr_validator.cc @@ -67,7 +67,7 @@ Status ExprValidator::Validate(const ExpressionPtr& expr) { } Status ExprValidator::Visit(const FieldNode& node) { - auto llvm_type = types_->IRType(node.return_type()->id()); + auto llvm_type = types_->DataVecType(node.return_type()); ARROW_RETURN_IF(llvm_type == nullptr, Status::ExpressionValidationError("Field ", node.field()->name(), " has unsupported data type ", @@ -136,7 +136,7 @@ Status ExprValidator::Visit(const IfNode& node) { } Status ExprValidator::Visit(const LiteralNode& node) { - auto llvm_type = types_->IRType(node.return_type()->id()); + auto llvm_type = types_->DataVecType(node.return_type()); ARROW_RETURN_IF(llvm_type == nullptr, Status::ExpressionValidationError("Value ", ToString(node.holder()), " has unsupported data type ", diff --git a/cpp/src/gandiva/expression_registry.cc b/cpp/src/gandiva/expression_registry.cc index 9bff97f5ad2..20be12548e0 100644 --- a/cpp/src/gandiva/expression_registry.cc +++ b/cpp/src/gandiva/expression_registry.cc @@ -166,6 +166,13 @@ static void AddArrowTypesToVector(arrow::Type::type type, DataTypeVector& vector case arrow::Type::type::INTERVAL_DAY_TIME: vector.push_back(arrow::day_time_interval()); break; + case arrow::Type::type::STRUCT: + vector.push_back(arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)})); + break; + case arrow::Type::type::LIST: + //vector.push_back(arrow::list(arrow::utf8())); + vector.push_back(arrow::list(arrow::int32())); + break; default: // Unsupported types. test ensures that // when one of these are added build breaks. 
diff --git a/cpp/src/gandiva/field_descriptor.h b/cpp/src/gandiva/field_descriptor.h index 0fe6fe37f4d..7b2d0c3b4fa 100644 --- a/cpp/src/gandiva/field_descriptor.h +++ b/cpp/src/gandiva/field_descriptor.h @@ -30,12 +30,14 @@ class FieldDescriptor { static const int kInvalidIdx = -1; FieldDescriptor(FieldPtr field, int data_idx, int validity_idx = kInvalidIdx, - int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx) + int offsets_idx = kInvalidIdx, int data_buffer_ptr_idx = kInvalidIdx, + int child_offsets_idx = kInvalidIdx) : field_(field), data_idx_(data_idx), validity_idx_(validity_idx), offsets_idx_(offsets_idx), - data_buffer_ptr_idx_(data_buffer_ptr_idx) {} + data_buffer_ptr_idx_(data_buffer_ptr_idx), + child_offsets_idx_(child_offsets_idx) {} /// Index of validity array in the array-of-buffers int validity_idx() const { return validity_idx_; } @@ -49,6 +51,9 @@ class FieldDescriptor { /// Index of data buffer pointer in the array-of-buffers int data_buffer_ptr_idx() const { return data_buffer_ptr_idx_; } + /// Index of list type child data offsets + int child_data_offsets_idx() const { return child_offsets_idx_; } + FieldPtr field() const { return field_; } const std::string& Name() const { return field_->name(); } @@ -58,12 +63,15 @@ class FieldDescriptor { bool HasDataBufferPtrIdx() const { return data_buffer_ptr_idx_ != kInvalidIdx; } + bool HasChildOffsetsIdx() const { return child_offsets_idx_ != kInvalidIdx; } + private: FieldPtr field_; int data_idx_; int validity_idx_; int offsets_idx_; int data_buffer_ptr_idx_; + int child_offsets_idx_; }; } // namespace gandiva diff --git a/cpp/src/gandiva/function_registry.cc b/cpp/src/gandiva/function_registry.cc index 67b7b404b32..021100678a0 100644 --- a/cpp/src/gandiva/function_registry.cc +++ b/cpp/src/gandiva/function_registry.cc @@ -16,17 +16,20 @@ // under the License. 
#include "gandiva/function_registry.h" + +#include +#include +#include +#include + #include "gandiva/function_registry_arithmetic.h" +#include "gandiva/function_registry_array.h" #include "gandiva/function_registry_datetime.h" #include "gandiva/function_registry_hash.h" #include "gandiva/function_registry_math_ops.h" #include "gandiva/function_registry_string.h" #include "gandiva/function_registry_timestamp_arithmetic.h" -#include -#include -#include - namespace gandiva { FunctionRegistry::iterator FunctionRegistry::begin() const { @@ -64,8 +67,15 @@ SignatureMap FunctionRegistry::InitPCMap() { auto v6 = GetDateTimeArithmeticFunctionRegistry(); pc_registry_.insert(std::end(pc_registry_), v6.begin(), v6.end()); + + auto v7 = GetArrayFunctionRegistry(); + pc_registry_.insert(std::end(pc_registry_), v7.begin(), v7.end()); + for (auto& elem : pc_registry_) { + //std::cout << "LR pc_registry_ item " << elem.pc_name() << " first signature name " << elem.signatures()[0].base_name() << std::endl; for (auto& func_signature : elem.signatures()) { + //std::cout << "LR Adding function to map " << func_signature.base_name() << std::endl; + //std::cout << " LR args " << func_signature.param_types map.insert(std::make_pair(&(func_signature), &elem)); } } diff --git a/cpp/src/gandiva/function_registry_array.cc b/cpp/src/gandiva/function_registry_array.cc new file mode 100644 index 00000000000..dc81b6b4601 --- /dev/null +++ b/cpp/src/gandiva/function_registry_array.cc @@ -0,0 +1,50 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "gandiva/function_registry_array.h" + +#include "gandiva/function_registry_common.h" + +namespace gandiva { +std::vector GetArrayFunctionRegistry() { + static std::vector array_fn_registry_ = { + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(utf8()), utf8()}, + boolean(), kResultNullIfNull, "array_utf8_contains_utf8", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("array_lengthGandiva", {}, DataTypeVector{list(utf8())}, int64(), + kResultNullIfNull, "array_utf8_length", + NativeFunction::kNeedsContext | NativeFunction::kCanReturnErrors), + NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int32()), int32()}, + boolean(), kResultNullIfNull, "array_int32_contains_int32", + NativeFunction::kNeedsContext), + NativeFunction("array_contains", {}, DataTypeVector{list(int32()), int32()}, + boolean(), kResultNullIfNull, "array_int32_contains_int32", + NativeFunction::kNeedsContext), + NativeFunction("array_makeGandiva", {}, DataTypeVector{int32()}, + list(int32()), kResultNullIfNull, "array_int32_make_array", + NativeFunction::kNeedsContext), + NativeFunction("array_removeGandiva", {}, DataTypeVector{list(int32()), int32()}, + list(int32()), kResultNullIfNull, "array_int32_remove", + NativeFunction::kNeedsContext), + /*NativeFunction("array_containsGandiva", {}, DataTypeVector{list(int64()), int64()}, + boolean(), kResultNullIfNull, "array_int64_contains_int64", + NativeFunction::kNeedsContext),*/ + }; + return array_fn_registry_; +} + +} // namespace gandiva diff --git 
a/cpp/src/gandiva/function_registry_array.h b/cpp/src/gandiva/function_registry_array.h new file mode 100644 index 00000000000..9b8e4553702 --- /dev/null +++ b/cpp/src/gandiva/function_registry_array.h @@ -0,0 +1,28 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#pragma once + +#include + +#include "gandiva/native_function.h" + +namespace gandiva { + +std::vector GetArrayFunctionRegistry(); + +} // namespace gandiva diff --git a/cpp/src/gandiva/function_registry_string.cc b/cpp/src/gandiva/function_registry_string.cc index 2bc6936d77b..442cdecbde7 100644 --- a/cpp/src/gandiva/function_registry_string.cc +++ b/cpp/src/gandiva/function_registry_string.cc @@ -263,6 +263,15 @@ std::vector GetStringFunctionRegistry() { NativeFunction::kNeedsFunctionHolder | NativeFunction::kCanReturnErrors), + NativeFunction("st_geohash", {}, DataTypeVector{float64(), float64()}, + utf8(), kResultNullIfNull, "gdv_fn_geo_hash_encode_float64_float64", + NativeFunction::kNeedsContext), + + NativeFunction("st_fromgeohash", {}, DataTypeVector{utf8()}, + arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", + //arrow::structType(), kResultNullIfNull, "gdv_fn_geo_hash_decode_utf8", + NativeFunction::kNeedsContext), + NativeFunction("concatOperator", {}, DataTypeVector{utf8(), utf8()}, utf8(), kResultNullIfNull, "concatOperator_utf8_utf8", NativeFunction::kNeedsContext), diff --git a/cpp/src/gandiva/function_registry_test.cc b/cpp/src/gandiva/function_registry_test.cc index e3c1e85f79c..63ede751b44 100644 --- a/cpp/src/gandiva/function_registry_test.cc +++ b/cpp/src/gandiva/function_registry_test.cc @@ -93,4 +93,14 @@ TEST_F(TestFunctionRegistry, TestNoDuplicates) { "different precompiled functions:\n" << stream.str(); } + +TEST_F(TestFunctionRegistry, TestFound2) { + FunctionSignature array_length("array_lengthGandiva", {list(utf8())}, arrow::int64()); + + const NativeFunction* function = registry_.LookupSignature(array_length); + EXPECT_NE(function, nullptr); + EXPECT_THAT(function->signatures(), testing::Contains(array_length)); + EXPECT_EQ(function->pc_name(), "array_utf8_length"); +} + } // namespace gandiva diff --git 
a/cpp/src/gandiva/gdv_function_stubs.cc b/cpp/src/gandiva/gdv_function_stubs.cc index 5146f7fa199..3e506f83a33 100644 --- a/cpp/src/gandiva/gdv_function_stubs.cc +++ b/cpp/src/gandiva/gdv_function_stubs.cc @@ -36,8 +36,6 @@ #include "gandiva/random_generator_holder.h" #include "gandiva/to_date_holder.h" -/// Stub functions that can be accessed from LLVM or the pre-compiled library. - extern "C" { static char mask_array[256] = { @@ -161,6 +159,99 @@ int32_t gdv_fn_populate_varlen_vector(int64_t context_ptr, int8_t* data_ptr, return 0; } +/// Stub functions that can be accessed from LLVM or the pre-compiled library. +#define POPULATE_NUMERIC_LIST_TYPE_VECTOR(TYPE, SCALE) \ + int32_t gdv_fn_populate_list_##TYPE##_vector(int64_t context_ptr, int8_t* data_ptr, \ + int32_t* offsets, int64_t slot, \ + TYPE* entry_buf, int32_t entry_len) { \ + auto buffer = reinterpret_cast(data_ptr); \ + int32_t offset = static_cast(buffer->size()); \ + auto status = buffer->Resize(offset + entry_len * SCALE, false /*shrink*/); \ + if (!status.ok()) { \ + gandiva::ExecutionContext* context = \ + reinterpret_cast(context_ptr); \ + context->set_error_msg(status.message().c_str()); \ + return -1; \ + } \ + memcpy(buffer->mutable_data() + offset, (char*)entry_buf, entry_len * SCALE); \ + std::cout << "LR gdv_fn_populate buffer=" << buffer->data() << std::endl; \ + std::cout << " and offset=" << offsets << " * =" << *offsets << std::endl; \ + std::cout << "Setting offset slot=" << slot << "=" << offset / SCALE << std::endl; \ + std::cout << "Setting offset slot+1=" << slot + 1 << "=" << offset / SCALE + entry_len << std::endl; \ + offsets = reinterpret_cast(buffer->offsetBuffer); \ + offsets[slot] = offset / SCALE; \ + offsets[slot + 1] = offset / SCALE + entry_len; \ + return 0; \ + } + + //buffer->offsetBuffer[slot] = offset / SCALE; + //buffer->offsetBuffer[slot + 1] = offset / SCALE + entry_len; + +POPULATE_NUMERIC_LIST_TYPE_VECTOR(int32_t, 4) +POPULATE_NUMERIC_LIST_TYPE_VECTOR(int64_t, 8) 
+POPULATE_NUMERIC_LIST_TYPE_VECTOR(float, 4) +POPULATE_NUMERIC_LIST_TYPE_VECTOR(double, 8) + +int32_t gdv_fn_populate_list_varlen_vector(int64_t context_ptr, int8_t* data_ptr, + int32_t* offsets, int32_t* child_offsets, + int64_t slot, const char* entry_buf, + int32_t* entry_child_offsets, + int32_t entry_offsets_len) { + // we should calculate varlen list type varlen offset + // copy from entry child offsets + // it should be noted that, + // buffer size unit is byte(8 bit), + // offset element unit is int32(32 bit) + auto child_offsets_buffer = reinterpret_cast(child_offsets); + int32_t child_offsets_buffer_offset = + static_cast(child_offsets_buffer->size()); + + // data buffer elelment is char(8 bit) + auto data_buffer = reinterpret_cast(data_ptr); + int32_t data_buffer_offset = static_cast(data_buffer->size()); + + // sets the size in the child offsets buffer + // offsets element is int32, we should resize buffer by extra offsets_len * 4 + auto status = child_offsets_buffer->Resize( + child_offsets_buffer_offset + entry_offsets_len * 4, false /*shrink*/); + if (!status.ok()) { + gandiva::ExecutionContext* context = + reinterpret_cast(context_ptr); + + context->set_error_msg(status.message().c_str()); + return -1; + } + + // append the new child offsets entry to child offsets buffer + // offsets buffer last offset number indicating data length + // we should take this extra offset into consider + // so the initialize child_offsets_buffer length is 1(int32) + memcpy(child_offsets_buffer->mutable_data() + child_offsets_buffer_offset - 4, + (char*)entry_child_offsets, (entry_offsets_len + 1) * 4); + + // compute data length + int32_t data_length = + *(entry_child_offsets + entry_offsets_len) - *(entry_child_offsets); + + // sets the size in the child offsets buffer. 
+ status = data_buffer->Resize(data_buffer_offset + data_length, false /*shrink*/); + if (!status.ok()) { + gandiva::ExecutionContext* context = + reinterpret_cast(context_ptr); + + context->set_error_msg(status.message().c_str()); + return -1; + } + + // append the new child offsets entry to child offsets buffer + memcpy(data_buffer->mutable_data() + data_buffer_offset, entry_buf, data_length); + + // update offsets buffer. + offsets[slot] = child_offsets_buffer_offset / 4 - 1; + offsets[slot + 1] = child_offsets_buffer_offset / 4 - 1 + entry_offsets_len; + return 0; +} + #define CRC_FUNCTION(TYPE) \ GANDIVA_EXPORT \ int64_t gdv_fn_crc_32_##TYPE(int64_t ctx, const char* input, int32_t input_len) { \ @@ -838,6 +929,8 @@ const char* gdv_mask_show_last_n_utf8_int32(int64_t context, const char* data, int32_t n_to_mask = num_of_chars - n_to_show; return gdv_mask_first_n_utf8_int32(context, data, data_len, n_to_mask, out_len); } + +#undef POPULATE_NUMERIC_LIST_TYPE_VECTOR } namespace gandiva { @@ -1174,6 +1267,34 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { types->i32_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalyear_utf8)); + engine->AddGlobalMappingForFunc("gdv_fn_in_expr_lookup_utf8", + types->i1_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_in_expr_lookup_utf8)); + +#define ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(LLVM_TYPE, DATA_TYPE) \ + args = {types->i64_type(), types->i8_ptr_type(), types->i32_ptr_type(), \ + types->i64_type(), types->LLVM_TYPE##_ptr_type(), types->i32_type()}; \ + engine->AddGlobalMappingForFunc( \ + "gdv_fn_populate_list_" #DATA_TYPE "_vector", types->i32_type() /*return_type*/, \ + args, reinterpret_cast(gdv_fn_populate_list_##DATA_TYPE##_vector)); + + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i32, int32_t) + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(i64, int64_t) + ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(float, float) + 
ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION(double, double) + + // gdv_fn_populate_varlen_vector + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* offsets ptr + types->i64_type(), // int64_t slot + types->i8_ptr_type(), // const char* entry_buf + types->i32_type()}; // int32_t entry__len + + engine->AddGlobalMappingForFunc("gdv_fn_populate_varlen_vector", + types->i32_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_populate_varlen_vector)); + // gdv_fn_cast_intervalyear_utf8_int32 args = { types->i64_type(), // context @@ -1190,6 +1311,24 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { "gdv_fn_cast_intervalyear_utf8_int32", types->i32_type() /*return_type*/, args, reinterpret_cast(gdv_fn_cast_intervalyear_utf8_int32)); + // gdv_fn_populate_list_varlen_vector + args = {types->i64_type(), // int64_t execution_context + types->i8_ptr_type(), // int8_t* data ptr + types->i32_ptr_type(), // int32_t* offsets ptr + types->i32_ptr_type(), // int32_t* child offsets ptr + types->i64_type(), // int64_t slot + types->i8_ptr_type(), // const char* entry_buf + types->i32_ptr_type(), // int32_t* entry child offsets ptr + types->i32_type()}; // int32_t entry child offsets length + + engine->AddGlobalMappingForFunc( + "gdv_fn_populate_list_varlen_vector", types->i32_type() /*return_type*/, args, + reinterpret_cast(gdv_fn_populate_list_varlen_vector)); + + // gdv_fn_random + args = {types->i64_type()}; + engine->AddGlobalMappingForFunc("gdv_fn_random", types->double_type(), args, + reinterpret_cast(gdv_fn_random)); // to_utc_timezone_timestamp args = { types->i64_type(), // context @@ -1289,4 +1428,6 @@ void ExportedStubFunctions::AddMappings(Engine* engine) const { engine->AddGlobalMappingForFunc("mask_utf8", types->i8_ptr_type() /*return_type*/, args, reinterpret_cast(mask_utf8)); } + +#undef ADD_MAPPING_FOR_NUMERIC_LIST_TYPE_POPULATE_FUNCTION } // namespace 
gandiva diff --git a/cpp/src/gandiva/llvm_generator.cc b/cpp/src/gandiva/llvm_generator.cc index 1615eece1f2..a97c8b02b07 100644 --- a/cpp/src/gandiva/llvm_generator.cc +++ b/cpp/src/gandiva/llvm_generator.cc @@ -17,6 +17,7 @@ #include "gandiva/llvm_generator.h" +#include #include #include #include @@ -30,12 +31,32 @@ #include "gandiva/lvalue.h" namespace gandiva { - #define ADD_TRACE(...) \ if (enable_ir_traces_) { \ AddTrace(__VA_ARGS__); \ } +/*namespace { + std::string printType(llvm::Type* t) { + if (t == nullptr) { + return std::string("null"); + } + std::string str; + llvm::raw_string_ostream output(str); + t->print(output); + return str; + } + std::string printType(llvm::Value* t) { + if (t == nullptr) { + return std::string("null"); + } + std::string str; + llvm::raw_string_ostream output(str); + t->print(output); + return str; + } +}*/ + LLVMGenerator::LLVMGenerator(bool cached) : cached_(cached), enable_ir_traces_(false) {} Status LLVMGenerator::Make(std::shared_ptr config, bool cached, @@ -71,6 +92,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out std::unique_ptr compiled_expr(new CompiledExpr(value_validity, output)); std::string fn_name = "expr_" + std::to_string(idx) + "_" + std::to_string(static_cast(selection_vector_mode_)); + //std::cout << "LR LLVMGenerator::Add " << fn_name << std::endl; if (!cached_) { ARROW_RETURN_NOT_OK(engine_->LoadFunctionIRs()); ARROW_RETURN_NOT_OK(CodeGenExprValue(value_validity->value_expr(), @@ -79,6 +101,7 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out } compiled_expr->SetFunctionName(selection_vector_mode_, fn_name); compiled_exprs_.push_back(std::move(compiled_expr)); + //std::cout << "LR LLVMGenerator::Add Done" << std::endl; return Status::OK(); } @@ -87,14 +110,19 @@ Status LLVMGenerator::Add(const ExpressionPtr expr, const FieldDescriptorPtr out Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode mode) { 
selection_vector_mode_ = mode; + //std::cout << "LR LLVMGenerator::Build " << std::endl; for (auto& expr : exprs) { auto output = annotator_.AddOutputFieldDescriptor(expr->result()); ARROW_RETURN_NOT_OK(Add(expr, output)); } + //std::cout << "LR LLVMGenerator::Build 2" << std::endl; + //Too much logging. needle in haystack? + //std::cout << "LR LLVMGenerator::Build 2 IR is " << engine_->DumpIR() << std::endl; // Compile and inject into the process' memory the generated function. ARROW_RETURN_NOT_OK(engine_->FinalizeModule()); - + //std::cout << "LR LLVMGenerator::Build FinalizeModule" << std::endl; + // setup the jit functions for each expression. for (auto& compiled_expr : compiled_exprs_) { auto fn_name = compiled_expr->GetFunctionName(mode); @@ -102,6 +130,7 @@ Status LLVMGenerator::Build(const ExpressionVector& exprs, SelectionVector::Mode compiled_expr->SetJITFunction(selection_vector_mode_, jit_fn); } + //std::cout << "LR LLVMGenerator::Build Done" << std::endl; return Status::OK(); } @@ -123,10 +152,12 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, const SelectionVector* selection_vector, const ArrayDataVector& output_vector) const { DCHECK_GT(record_batch.num_rows(), 0); + //std::cout << "LR LLVMGenerator::Execute 1"<< std::endl; auto eval_batch = annotator_.PrepareEvalBatch(record_batch, output_vector); DCHECK_GT(eval_batch->GetNumBuffers(), 0); + //std::cout << "LR LLVMGenerator::Execute 2" << std::endl; auto mode = SelectionVector::MODE_NONE; if (selection_vector != nullptr) { mode = selection_vector->GetMode(); @@ -136,6 +167,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, selection_vector_mode_, " received vector with mode ", mode); } + //std::cout << "LR LLVMGenerator::Execute 3" << std::endl; for (auto& compiled_expr : compiled_exprs_) { // generate data/offset vectors. 
const uint8_t* selection_buffer = nullptr; @@ -145,6 +177,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, num_output_rows = selection_vector->GetNumSlots(); } + //std::cout << "LR LLVMGenerator::Execute A1" << std::endl; EvalFunc jit_function = compiled_expr->GetJITFunction(mode); jit_function(eval_batch->GetBufferArray(), eval_batch->GetBufferOffsetArray(), eval_batch->GetLocalBitMapArray(), annotator_.GetHolderPointersArray(), @@ -156,6 +189,7 @@ Status LLVMGenerator::Execute(const arrow::RecordBatch& record_batch, eval_batch->GetExecutionContext()->has_error(), Status::ExecutionError(eval_batch->GetExecutionContext()->get_error())); + //std::cout << "LR LLVMGenerator::Execute A2" << std::endl; // generate validity vectors. ComputeBitMapsForExpr(*compiled_expr, selection_vector, eval_batch.get()); } @@ -210,6 +244,14 @@ llvm::Value* LLVMGenerator::GetOffsetsReference(llvm::Value* arg_addrs, int idx, return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_oarray"); } +/// Get reference to child offsets array at specified index in the args list. +llvm::Value* LLVMGenerator::GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, + FieldPtr field) { + const std::string& name = field->name(); + llvm::Value* load = LoadVectorAtIndex(arg_addrs, types()->i64_type(), idx, name); + return ir_builder()->CreateIntToPtr(load, types()->i32_ptr_type(), name + "_coarray"); +} + /// Get reference to local bitmap array at specified index in the args list. 
llvm::Value* LLVMGenerator::GetLocalBitMapReference(llvm::Value* arg_bitmaps, int idx) { llvm::Value* load = LoadVectorAtIndex(arg_bitmaps, types()->i64_type(), idx, ""); @@ -270,6 +312,9 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, FieldDescriptorPtr output, int suffix_idx, std::string& fn_name, SelectionVector::Mode selection_vector_mode) { + //std::cout << "LR CodeGenExprValue for output field " << output->Name() + // << " type " << output->Type()->ToString() << " output type id " << output->Type()->id() << std::endl; + try { llvm::IRBuilder<>* builder = ir_builder(); // Create fn prototype : // int expr_1 (long **addrs, long *offsets, long **bitmaps, @@ -367,6 +412,7 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, } // The visitor can add code to both the entry/loop blocks. + //std::cout << "LR calling visitor to get output data for [" << fn_name << "]" << std::endl; Visitor visitor(this, fn, loop_entry, arg_addrs, arg_local_bitmaps, arg_holder_ptrs, slice_offsets, arg_context_ptr, position_var); value_expr->Accept(visitor); @@ -397,12 +443,69 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_populate_varlen_vector", types()->i32_type(), {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, loop_var, output_value->data(), output_value->length()}); + } else if (output_type_id == arrow::Type::STRUCT) { + //std::cout << "LR creating struct type to store the result." 
<< std::endl; + auto slot_offset = builder->CreateGEP(types()->IRType(output_type_id), output_ref, loop_var); + builder->CreateStore(output_value->data(), slot_offset); + } else if (output_type_id == arrow::Type::LIST) { + auto output_list_internal_type = output->Type()->field(0)->type()->id(); + //std::cout << "LR creating list type to store the result with internal type " << output_list_internal_type << std::endl; + + if (arrow::is_binary_like(output_list_internal_type)) { + auto output_list_value = std::dynamic_pointer_cast(output_value); + llvm::Value* child_output_offset_ref = GetChildOffsetsReference( + arg_addrs, output->child_data_offsets_idx(), output->field()); + AddFunctionCall( + "gdv_fn_populate_list_varlen_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + child_output_offset_ref, loop_var, output_list_value->data(), + output_list_value->child_offsets(), output_list_value->offsets_length()}); + } else if (output_list_internal_type == arrow::Type::INT32) { + + + std::string str1; + llvm::raw_string_ostream output1(str1); + output_value->data()->print(output1); + + std::string str2; + llvm::raw_string_ostream output2(str2); + output_value->length()->print(output2); + + + std::cout << "LR gdv_fn_populate_list_int32_t_vector params are " << arg_context_ptr << "," << output_buffer_ptr_ref << "," + << output_offset_ref << "," << loop_var << std::endl; + // << output_offset_ref << "," << loop_var << "[[" << str1 << "]] [[" << str2 << "]]" << std::endl; + AddFunctionCall("gdv_fn_populate_list_int32_t_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else if (output_list_internal_type == arrow::Type::INT64) { + AddFunctionCall("gdv_fn_populate_list_int64_t_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else 
if (output_list_internal_type == arrow::Type::FLOAT) { + AddFunctionCall("gdv_fn_populate_list_float_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else if (output_list_internal_type == arrow::Type::DOUBLE) { + AddFunctionCall("gdv_fn_populate_list_double_vector", types()->i32_type(), + {arg_context_ptr, output_buffer_ptr_ref, output_offset_ref, + loop_var, output_value->data(), output_value->length()}); + } else { + return Status::NotImplemented("list internal type ", + output->Type()->field(0)->type()->ToString(), + " not supported"); + } } else { return Status::NotImplemented("output type ", output->Type()->ToString(), " not supported"); } - ADD_TRACE("saving result " + output->Name() + " value %T", output_value->data()); + //LR HACK somehow this caused a crash???? + //std::cout << "LR saving result " << output->Name() << " value " << + // printType(output_value->data()) << std::endl; + //ADD_TRACE("saving result 2 " + output->Name() + " value %T", output_value->data()); + //int jello = 0; + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; if (visitor.has_arena_allocs()) { // Reset allocations to avoid excessive memory usage. 
Once the result is copied to // the output vector (store instruction above), any memory allocations in this @@ -412,20 +515,28 @@ Status LLVMGenerator::CodeGenExprValue(DexPtr value_expr, int buffer_count, AddFunctionCall("gdv_fn_context_arena_reset", types()->void_type(), reset_args); } + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // check loop_var loop_var->addIncoming(types()->i64_constant(0), loop_entry); llvm::Value* loop_update = builder->CreateAdd(loop_var, types()->i64_constant(1), "loop_var+1"); loop_var->addIncoming(loop_update, loop_body_tail); + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; llvm::Value* loop_var_check = builder->CreateICmpSLT(loop_update, arg_nrecords, "loop_var < nrec"); builder->CreateCondBr(loop_var_check, loop_body, loop_exit); + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; // Loop exit builder->SetInsertPoint(loop_exit); builder->CreateRet(types()->i32_constant(0)); + //std::cout << "LR CodeGenExprValue " << jello++ << std::endl; return Status::OK(); + } catch (std::exception& e) { + std::cout << e.what() << std::endl; + throw e; + } } /// Return value of a bit in bitMap. 
@@ -517,7 +628,7 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, llvm::Function* fn = module()->getFunction(full_name); DCHECK_NE(fn, nullptr) << "missing function " << full_name; - if (enable_ir_traces_ && !full_name.compare("printf") && + if (!full_name.compare("printf") && !full_name.compare("printff")) { // Trace for debugging ADD_TRACE("invoke native fn " + full_name); @@ -530,6 +641,15 @@ llvm::Value* LLVMGenerator::AddFunctionCall(const std::string& full_name, value = ir_builder()->CreateCall(fn, args); } else { value = ir_builder()->CreateCall(fn, args, full_name); + + std::string str; + llvm::raw_string_ostream output(str); + std::string str2; + llvm::raw_string_ostream output2(str2); + ret_type->print(output); + value->getType()->print(output2); + //std::cout << "LR addfunctioncall for " << full_name << " == value->getType " << str2 << " ret_type " << str << std::endl; + DCHECK(value->getType() == ret_type); } @@ -548,9 +668,7 @@ std::shared_ptr LLVMGenerator::BuildDecimalLValue(llvm::Value* va } #define ADD_VISITOR_TRACE(...) \ - if (generator_->enable_ir_traces_) { \ generator_->AddTrace(__VA_ARGS__); \ - } // Visitor for generating the code for a decomposed expression. 
LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* function, @@ -573,6 +691,7 @@ LLVMGenerator::Visitor::Visitor(LLVMGenerator* generator, llvm::Function* functi } void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { + ADD_VISITOR_TRACE("VectorReadFixedLenValueDex"); llvm::IRBuilder<>* builder = ir_builder(); auto types = generator_->types(); llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); @@ -580,6 +699,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { llvm::Value* slot_value; std::shared_ptr lvalue; + ADD_VISITOR_TRACE("VectorReadFixedLenValueDex"); switch (dex.FieldType()->id()) { case arrow::Type::BOOL: slot_value = generator_->GetPackedBitValue(slot_ref, slot_index); @@ -606,11 +726,67 @@ void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueDex& dex) { result_ = lvalue; } -void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { +void LLVMGenerator::Visitor::Visit(const VectorReadFixedLenValueListDex& dex) { + ADD_VISITOR_TRACE("VectorReadFixedLenValueListDex"); llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot; auto types = generator_->types(); + auto type = types->IRType(dex.FieldType()->id()); + + //std::cout << "LR Visitor::Visit(const VectorReadFixedLenValueListDex& dex)" << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex IRType is " << printType(type) << std::endl; + arrow::Type::type at = arrow::Type::INT32; + type = types->IRType(at); + //type = types->DataVecType(dex.FieldType()); + //std::cout << "LR VectorReadFixedLenValueListDex went with type " << printType(type) << std::endl; + + // compute list len from the offsets array. 
+ llvm::Value* offsets_slot_ref = + GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); + llvm::Value* offsets_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + //std::cout << "LR VectorReadFixedLenValueListDex values " << printType(offsets_slot_ref) << " [next] " << + // printType(offsets_slot_index) << std::endl; + + // => offset_start = offsets[loop_var] + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); + llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); + + // => offset_end = offsets[loop_var + 1] + llvm::Value* offsets_slot_index_next = builder->CreateAdd( + offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(type,slot, "offset_end"); + + // => offsets_len_value = offset_end - offset_start + llvm::Value* list_len = builder->CreateSub(offset_end, offset_start, "offsets_len"); + + // get data array + llvm::Value* slot_ref = GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); + // do not forget slice offset + llvm::Value* offset_start_int64 = + builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); + llvm::Value* slot_index = + builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.DataIdx())); + llvm::Value* data_list = builder->CreateGEP(type, slot_ref, slot_index); + + // TODO: handle bool type bitmap + // TODO: handle decimal precision and scale + + //std::cout << "LR VectorReadFixedLenValueListDex slot_ref " << printType(slot_ref) << std::endl; + //std::cout << "LR VectorReadFixedLenValueListDex visit fixed-len data list vector " << dex.FieldName() << + // " length " << printType(list_len) << " data_list " << printType(data_list) << std::endl; + ADD_VISITOR_TRACE("visit fixed-len data list vector " + dex.FieldName() + " length %T", + list_len); + result_.reset(new 
LValue(data_list, list_len)); +} +void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { + llvm::IRBuilder<>* builder = ir_builder(); + llvm::Value* slot; + auto types = generator_->types(); + ADD_VISITOR_TRACE("VectorReadVarLenValueDex"); // compute len from the offsets array. llvm::Value* offsets_slot_ref = GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); @@ -641,7 +817,86 @@ void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueDex& dex) { result_.reset(new LValue(data_value, len_value)); } +/* + * create list type field context for each loop + */ +void LLVMGenerator::Visitor::Visit(const VectorReadVarLenValueListDex& dex) { + /* Example + * list_data: [["var_len_val11"], ["var_len_val211", "var_len_val22"], + * ["var_len_val3331"]] loop_var: 0, 1, 2 data_buffer: + * var_len_val11var_len_val211var_len_val22var_len_val3331 offsets_buffer: 0, 1, 3, 4 + * list_element_len = offsets[loop_var+1]-offsets[loop_var] => 1, 2, 1 + * child_offsets_buffer: 0, 13, 27, 40, 55 + * for i in list_element_len: + * data_buffer[child_offsets_buffer[offsets[i+1]] - child_offsets_buffer[offsets[i]]] + * => list_data[loop_var][i] + */ + ADD_VISITOR_TRACE("VectorReadVarLenValueListDex"); + llvm::IRBuilder<>* builder = ir_builder(); + llvm::Value* slot; + auto types = generator_->types(); + auto type = types->IRType(dex.FieldType()->id()); + //std::cout << "LR dex.FieldType()->id() " << dex.FieldType()->id() << " types->DataVecType( " << printType(types->DataVecType(dex.FieldType())) << std::endl; + //std::cout << "LR IRType is " << printType(type) << std::endl; + //type = types->DataVecType(dex.FieldType()); + //LR HACK. 
Original was type = types->DataVecType(dex.FieldType()); + arrow::Type::type at = arrow::Type::INT32; + type = types->IRType(at); + + // compute list length from the offsets array + llvm::Value* offsets_slot_ref = + GetBufferReference(dex.OffsetsIdx(), kBufferTypeOffsets, dex.Field()); + llvm::Value* offsets_slot_index = + builder->CreateAdd(loop_var_, GetSliceOffset(dex.OffsetsIdx())); + + int i = 0; + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // => offset_start = offsets[loop_var] + //std::cout << "LR Type is " << printType(type) << std::endl; + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index); + std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + llvm::Value* offset_start = builder->CreateLoad(type, slot, "offset_start"); + + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // => offset_end = offsets[loop_var + 1] + llvm::Value* offsets_slot_index_next = builder->CreateAdd( + offsets_slot_index, generator_->types()->i64_constant(1), "loop_var+1"); + slot = builder->CreateGEP(type, offsets_slot_ref, offsets_slot_index_next); + llvm::Value* offset_end = builder->CreateLoad(type, slot, "offset_end"); + + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // => list_data_length = offset_end - offset_start + llvm::Value* list_data_length = + builder->CreateSub(offset_end, offset_start, "offsets_len"); + + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // get the child offsets array from the child offsets array, + // start from offset 'offset_start' + llvm::Value* child_offset_slot_ref = + GetBufferReference(dex.ChildOffsetsIdx(), kBufferTypeChildOffsets, dex.Field()); + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // do not forget slice offset + llvm::Value* offset_start_int64 = + builder->CreateIntCast(offset_start, generator_->types()->i64_type(), true); + llvm::Value* child_offset_slot_index = + 
builder->CreateAdd(offset_start_int64, GetSliceOffset(dex.ChildOffsetsIdx())); + llvm::Value* child_offsets = + builder->CreateGEP(type, child_offset_slot_ref, child_offset_slot_index); + llvm::Value* child_offset_start = + builder->CreateLoad(type, child_offsets, "child_offset_start"); + + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + // get the data array + llvm::Value* data_slot_ref = + GetBufferReference(dex.DataIdx(), kBufferTypeData, dex.Field()); + llvm::Value* data_value = builder->CreateGEP(type, data_slot_ref, child_offset_start); + + //std::cout << "VectorReadVarLenValueListDex " << i++ << std::endl; + result_.reset(new ListLValue(data_value, child_offsets, list_data_length)); +} + void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) { + ADD_VISITOR_TRACE("VectorReadValidityDex"); llvm::IRBuilder<>* builder = ir_builder(); llvm::Value* slot_ref = GetBufferReference(dex.ValidityIdx(), kBufferTypeValidity, dex.Field()); @@ -654,6 +909,7 @@ void LLVMGenerator::Visitor::Visit(const VectorReadValidityDex& dex) { } void LLVMGenerator::Visitor::Visit(const LocalBitMapValidityDex& dex) { + ADD_VISITOR_TRACE("LocalBitMapValidityDex"); llvm::Value* slot_ref = GetLocalBitMapReference(dex.local_bitmap_idx()); llvm::Value* validity = generator_->GetPackedBitValue(slot_ref, loop_var_); @@ -664,18 +920,22 @@ void LLVMGenerator::Visitor::Visit(const LocalBitMapValidityDex& dex) { } void LLVMGenerator::Visitor::Visit(const TrueDex& dex) { + ADD_VISITOR_TRACE("TrueDex"); result_.reset(new LValue(generator_->types()->true_constant())); } void LLVMGenerator::Visitor::Visit(const FalseDex& dex) { + ADD_VISITOR_TRACE("FalseDex"); result_.reset(new LValue(generator_->types()->false_constant())); } void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { + ADD_VISITOR_TRACE("LiteralDex"); LLVMTypes* types = generator_->types(); llvm::Value* value = nullptr; llvm::Value* len = nullptr; + //std::cout << "LR LiteralDex type " << 
dex.type()->id() << std::endl; switch (dex.type()->id()) { case arrow::Type::BOOL: value = types->i1_constant(std::get(dex.holder())); @@ -716,7 +976,7 @@ void LLVMGenerator::Visitor::Visit(const LiteralDex& dex) { case arrow::Type::STRING: case arrow::Type::BINARY: { const std::string& str = std::get(dex.holder()); - + //std::cout << "LR Literal string " << str << std::endl; value = ir_builder()->CreateGlobalStringPtr(str.c_str()); len = types->i32_constant(static_cast(str.length())); break; @@ -770,6 +1030,8 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { native_function->NeedsContext()); auto arrow_return_type = dex.func_descriptor()->return_type(); + //std::cout << "LR NonNullableFunc 1 result_type " << printType(generator_->types()->DataVecType(arrow_return_type)) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << printType(generator_->types()->IRType(arrow_return_type->id())) << std::endl; + if (native_function->CanReturnErrors()) { // slow path : if a function can return errors, skip invoking the function // unless all of the input args are valid. Otherwise, it can cause spurious errors. @@ -777,7 +1039,10 @@ void LLVMGenerator::Visitor::Visit(const NonNullableFuncDex& dex) { llvm::IRBuilder<>* builder = ir_builder(); LLVMTypes* types = generator_->types(); auto arrow_type_id = arrow_return_type->id(); - auto result_type = types->IRType(arrow_type_id); + auto result_type = types->DataVecType(arrow_return_type); + //Result type array/list is special. + //auto result_type = types->IRType(arrow_type_id); + //std::cout << "LR NonNullableFunc 2 result_type " << printType(result_type) << " arrow_return_type " << arrow_return_type->ToString() << " old type " << types->IRType(arrow_type_id) << std::endl; // Build combined validity of the args. 
llvm::Value* is_valid = types->true_constant(); @@ -1125,25 +1390,31 @@ void LLVMGenerator::Visitor::VisitInExpression( } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } void LLVMGenerator::Visitor::Visit(const InExprDexBase& dex) { + ADD_VISITOR_TRACE("InExprDexBase&"); VisitInExpression(dex); } @@ -1151,6 +1422,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, std::function then_func, std::function else_func, DataTypePtr result_type) { + ADD_VISITOR_TRACE("BuildIfElse"); llvm::IRBuilder<>* builder = ir_builder(); llvm::LLVMContext* context = generator_->context(); LLVMTypes* types = generator_->types(); @@ -1180,7 +1452,7 @@ LValuePtr LLVMGenerator::Visitor::BuildIfElse(llvm::Value* condition, // Emit the merge block. 
builder->SetInsertPoint(merge_bb); - auto llvm_type = types->IRType(result_type->id()); + auto llvm_type = types->DataVecType(result_type); llvm::PHINode* result_value = builder->CreatePHI(llvm_type, 2, "res_value"); result_value->addIncoming(then_lvalue->data(), then_bb); result_value->addIncoming(else_lvalue->data(), else_bb); @@ -1226,9 +1498,10 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, std::vector* params) { auto types = generator_->types(); auto arrow_return_type_id = arrow_return_type->id(); - auto llvm_return_type = types->IRType(arrow_return_type_id); + auto llvm_return_type = types->DataVecType(arrow_return_type); DecimalIR decimalIR(generator_->engine_.get()); + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall for " << func->pc_name() << " llvm return type is " << printType(llvm_return_type) << std::endl; if (arrow_return_type_id == arrow::Type::DECIMAL) { // For decimal fns, the output precision/scale are passed along as parameters. // @@ -1256,12 +1529,31 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, // add extra arg for return length for variable len return types (allocated on stack). 
llvm::AllocaInst* result_len_ptr = nullptr; if (arrow::is_binary_like(arrow_return_type_id)) { + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is binary like" << std::endl; result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, "result_len", entry_block_); params->push_back(result_len_ptr); has_arena_allocs_ = true; } + if (arrow_return_type_id == arrow::Type::LIST) { + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is list" << std::endl; + result_len_ptr = new llvm::AllocaInst(generator_->types()->i32_type(), 0, + "result_len", entry_block_); + params->push_back(result_len_ptr); + has_arena_allocs_ = true; + + + } + + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall params are: " << std::endl; + for (auto p : *params) { + std::string str1; + llvm::raw_string_ostream output1(str1); + p->print(output1); + std::cout << str1 << std::endl; + } + // Make the function call llvm::IRBuilder<>* builder = ir_builder(); auto value = @@ -1272,6 +1564,8 @@ LValuePtr LLVMGenerator::Visitor::BuildFunctionCall(const NativeFunction* func, (result_len_ptr == nullptr) ? nullptr : builder->CreateLoad(result_len_ptr->getAllocatedType(), result_len_ptr); + //std::cout << "LR LLVMGenerator::Visitor::BuildFunctionCall is DONE" << std::endl; + return std::make_shared(value, value_len); } } @@ -1281,11 +1575,13 @@ std::vector LLVMGenerator::Visitor::BuildParams( bool with_context) { std::vector params; + ADD_VISITOR_TRACE("LLVMGenerator::Visitor::BuildParams"); // add context if required. if (with_context) { params.push_back(arg_context_ptr_); } + //std::cout << "LR BuildParams1" << std::endl; // if the function has holder, add the holder pointer. 
if (holder_idx != -1) { auto builder = ir_builder(); @@ -1294,6 +1590,7 @@ std::vector LLVMGenerator::Visitor::BuildParams( llvm::BasicBlock* saved_block = builder->GetInsertBlock(); builder->SetInsertPoint(entry_block_); + //std::cout << "LR BuildParams1a" << std::endl; auto holder = generator_->LoadVectorAtIndex( arg_holder_ptrs_, generator_->types()->i64_type(), holder_idx, "holder"); @@ -1301,16 +1598,20 @@ std::vector LLVMGenerator::Visitor::BuildParams( params.push_back(holder); } + //std::cout << "LR BuildParams2" << std::endl; // build the function params, along with the validities. for (auto& pair : args) { // build value. DexPtr value_expr = pair->value_expr(); + //std::cout << "LR BuildParams2a" << std::endl; value_expr->Accept(*this); + //std::cout << "LR BuildParams2b" << std::endl; LValue& result_ref = *result(); // append all the parameters corresponding to this LValue. result_ref.AppendFunctionParams(¶ms); + //std::cout << "LR BuildParams2c" << std::endl; // build validity. if (with_validity) { llvm::Value* validity_expr = BuildCombinedValidity(pair->validity_exprs()); @@ -1356,6 +1657,10 @@ llvm::Value* LLVMGenerator::Visitor::GetBufferReference(int idx, BufferType buff case kBufferTypeOffsets: slot_ref = generator_->GetOffsetsReference(arg_addrs_, idx, field); break; + + case kBufferTypeChildOffsets: + slot_ref = generator_->GetChildOffsetsReference(arg_addrs_, idx, field); + break; } // Revert to the saved block. @@ -1440,6 +1745,7 @@ void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { dmsg = ReplaceFormatInTrace(dmsg, value, &print_fn_name); } trace_strings_.push_back(dmsg); + std::cout << dmsg << std::endl; // cast this to an llvm pointer. 
const char* str = trace_strings_.back().c_str(); @@ -1454,5 +1760,4 @@ void LLVMGenerator::AddTrace(const std::string& msg, llvm::Value* value) { } AddFunctionCall(print_fn_name, types()->i32_type(), args); } - } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_generator.h b/cpp/src/gandiva/llvm_generator.h index 04f9b854b1d..2d10871a81f 100644 --- a/cpp/src/gandiva/llvm_generator.h +++ b/cpp/src/gandiva/llvm_generator.h @@ -102,7 +102,9 @@ class GANDIVA_EXPORT LLVMGenerator { void Visit(const VectorReadValidityDex& dex) override; void Visit(const VectorReadFixedLenValueDex& dex) override; + void Visit(const VectorReadFixedLenValueListDex& dex) override; void Visit(const VectorReadVarLenValueDex& dex) override; + void Visit(const VectorReadVarLenValueListDex& dex) override; void Visit(const LocalBitMapValidityDex& dex) override; void Visit(const TrueDex& dex) override; void Visit(const FalseDex& dex) override; @@ -127,7 +129,12 @@ class GANDIVA_EXPORT LLVMGenerator { bool has_arena_allocs() { return has_arena_allocs_; } private: - enum BufferType { kBufferTypeValidity = 0, kBufferTypeData, kBufferTypeOffsets }; + enum BufferType { + kBufferTypeValidity = 0, + kBufferTypeData, + kBufferTypeOffsets, + kBufferTypeChildOffsets + }; llvm::IRBuilder<>* ir_builder() { return generator_->ir_builder(); } llvm::Module* module() { return generator_->module(); } @@ -195,6 +202,10 @@ class GANDIVA_EXPORT LLVMGenerator { /// Generate code to load the vector at specified index and cast it as offsets array. llvm::Value* GetOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field); + /// Generate code to load the vector at specified index and cast it as child offsets + /// array. + llvm::Value* GetChildOffsetsReference(llvm::Value* arg_addrs, int idx, FieldPtr field); + /// Generate code to load the vector at specified index and cast it as buffer pointer. 
llvm::Value* GetDataBufferPtrReference(llvm::Value* arg_addrs, int idx, FieldPtr field); diff --git a/cpp/src/gandiva/llvm_types.cc b/cpp/src/gandiva/llvm_types.cc index de322a8c0fc..68be62816f6 100644 --- a/cpp/src/gandiva/llvm_types.cc +++ b/cpp/src/gandiva/llvm_types.cc @@ -42,7 +42,9 @@ LLVMTypes::LLVMTypes(llvm::LLVMContext& context) : context_(context) { {arrow::Type::type::BINARY, i8_ptr_type()}, {arrow::Type::type::DECIMAL, i128_type()}, {arrow::Type::type::INTERVAL_MONTHS, i32_type()}, - {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}}; + {arrow::Type::type::STRUCT, struct_type()}, + {arrow::Type::type::INTERVAL_DAY_TIME, i64_type()}, + {arrow::Type::type::LIST, list_type()}}; } } // namespace gandiva diff --git a/cpp/src/gandiva/llvm_types.h b/cpp/src/gandiva/llvm_types.h index d6f0952713e..f2355354235 100644 --- a/cpp/src/gandiva/llvm_types.h +++ b/cpp/src/gandiva/llvm_types.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include @@ -46,6 +47,12 @@ class GANDIVA_EXPORT LLVMTypes { llvm::Type* i128_type() { return llvm::Type::getInt128Ty(context_); } + llvm::StructType* struct_type() { + return llvm::StructType::get(context_, {double_type(), double_type()}, false); + } + + llvm::VectorType* list_type() { return llvm::ScalableVectorType::get(i8_type(), (unsigned int)0); } + llvm::StructType* i128_split_type() { // struct with high/low bits (see decimal_ops.cc:DecimalSplit) return llvm::StructType::get(context_, {i64_type(), i64_type()}, false); @@ -65,6 +72,10 @@ class GANDIVA_EXPORT LLVMTypes { llvm::PointerType* i128_ptr_type() { return ptr_type(i128_type()); } + llvm::PointerType* float_ptr_type() { return ptr_type(float_type()); } + + llvm::PointerType* double_ptr_type() { return ptr_type(double_type()); } + template llvm::Constant* int_constant(ctype val) { return llvm::ConstantInt::get(context_, llvm::APInt(N, val)); @@ -87,6 +98,10 @@ class GANDIVA_EXPORT LLVMTypes { return llvm::ConstantFP::get(float_type(), val); } + llvm::LLVMContext* 
get_context() { + return &context_; + } + llvm::Constant* double_constant(double val) { return llvm::ConstantFP::get(double_type(), val); } @@ -104,6 +119,17 @@ class GANDIVA_EXPORT LLVMTypes { /// For a given data type, find the ir type used for the data vector slot. llvm::Type* DataVecType(const DataTypePtr& data_type) { + // support list type + // list type data is formed by base type buffer, wrapped with offsets buffer + // offsets buffer is to separate data into list + // not support nested list + if (data_type->id() == arrow::Type::LIST) { + //LR HACK + //std::cout << "LR Returning list type as type " << data_type->field(0)->type()->id()<< " for IR " << std::endl; + //return IRType(data_type->field(0)->type()->id()); + //return IRType(data_type->id()); + return i32_ptr_type(); + } return IRType(data_type->id()); } diff --git a/cpp/src/gandiva/llvm_types_test.cc b/cpp/src/gandiva/llvm_types_test.cc index 66696830618..665a82d133f 100644 --- a/cpp/src/gandiva/llvm_types_test.cc +++ b/cpp/src/gandiva/llvm_types_test.cc @@ -50,12 +50,22 @@ TEST_F(TestLLVMTypes, TestFound) { types_->i64_type()); EXPECT_EQ(types_->DataVecType(arrow::timestamp(arrow::TimeUnit::MILLI)), types_->i64_type()); + + EXPECT_EQ(types_->IRType(arrow::Type::STRING), types_->i8_ptr_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::boolean())), types_->i1_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::int32())), types_->i32_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::int64())), types_->i64_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::float32())), types_->float_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::float64())), types_->double_type()); + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::utf8())), types_->i8_ptr_type()); } TEST_F(TestLLVMTypes, TestNotFound) { EXPECT_EQ(types_->IRType(arrow::Type::SPARSE_UNION), nullptr); EXPECT_EQ(types_->IRType(arrow::Type::DENSE_UNION), nullptr); 
EXPECT_EQ(types_->DataVecType(arrow::null()), nullptr); + // not support nested list type + EXPECT_EQ(types_->DataVecType(arrow::list(arrow::list(arrow::utf8()))), nullptr); } } // namespace gandiva diff --git a/cpp/src/gandiva/lvalue.h b/cpp/src/gandiva/lvalue.h index df292855b69..7e6a5c2fb96 100644 --- a/cpp/src/gandiva/lvalue.h +++ b/cpp/src/gandiva/lvalue.h @@ -17,6 +17,7 @@ #pragma once +#include #include #include "arrow/util/macros.h" @@ -74,4 +75,29 @@ class GANDIVA_EXPORT DecimalLValue : public LValue { llvm::Value* scale_; }; +class GANDIVA_EXPORT ListLValue : public LValue { + public: + ListLValue(llvm::Value* data, llvm::Value* child_offsets, llvm::Value* offsets_length, + llvm::Value* validity = NULLPTR) + : LValue(data, NULLPTR, validity), + child_offsets_(child_offsets), + offsets_length_(offsets_length) { + //std::cout << "LR Creating ListLValue " << std::endl; + } + + llvm::Value* child_offsets() { return child_offsets_; } + + llvm::Value* offsets_length() { return offsets_length_; } + + void AppendFunctionParams(std::vector* params) override { + LValue::AppendFunctionParams(params); + params->push_back(child_offsets_); + params->push_back(offsets_length_); + } + + private: + llvm::Value* child_offsets_; + llvm::Value* offsets_length_; +}; + } // namespace gandiva diff --git a/cpp/src/gandiva/precompiled/string_ops.cc b/cpp/src/gandiva/precompiled/string_ops.cc index c255b9a11c0..9c4458ea1b7 100644 --- a/cpp/src/gandiva/precompiled/string_ops.cc +++ b/cpp/src/gandiva/precompiled/string_ops.cc @@ -827,6 +827,65 @@ const char* substr_utf8_int64(gdv_int64 context, const char* input, gdv_int32 in return substr_utf8_int64_int64(context, input, in_len, offset64, in_len, out_len); } +FORCE_INLINE +const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, + gdv_int32* out_len) { + //if (repeat_number == 0 || in_len <= 0) { + // *out_len = 0; + // return ""; + //} + + + //Gandiva-blarg + *out_len = 14; + char* 
ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, *out_len)); + if (ret == nullptr) { + gdv_fn_context_set_error_msg(context, "Could not allocate memory for output string"); + *out_len = 0; + return ""; + } + + std::string out_string = "Gandiva-blarg"; + memcpy(ret, out_string.c_str(), *out_len); + return ret; +} + +FORCE_INLINE +const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len) { + //gdv_struct* ret = reinterpret_cast(gdv_fn_context_arena_malloc(context, sizeof(gdv_struct))); + gdv_struct ret; + ret.lattitude = 42; + ret.longitude = 142; + return ret; + + //if (repeat_number == 0 || in_len <= 0) { + // *out_len = 0; + // return ""; + //} + + /*auto s = arrow::struct_({field("a", arrow::int32(), false), field("b", arrow::int32(), false)}); + + MemoryPool* pool_ = default_memory_pool(); + std::unique_ptr tmp; + MakeBuilder(pool_, s, &tmp); + + + +//std::vector list_lengths = {42, 43}; +//std::vector list_offsets = {142, 143}; +//410 ListBuilder* list_vb = checked_cast(builder_->field_builder(0)); + Int32Builder* int_vb = checked_cast(builder_->field_builder(0)); + Int32Builder* int_vb2 = checked_cast(builder_->field_builder(1)); +//420 ASSERT_OK(list_vb->AppendValues(list_offsets.data(), list_offsets.size(), +//421 list_is_valid.data())); + + int_vb->UnsafeAppend(42); + int_vb->UnsafeAppend(43); + int_vb2->UnsafeAppend(142); + int_vb2->UnsafeAppend(143); +*/ +} + FORCE_INLINE const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_number, gdv_int32* out_len) { diff --git a/cpp/src/gandiva/precompiled/types.h b/cpp/src/gandiva/precompiled/types.h index 83bbdee2085..15fba4867e6 100644 --- a/cpp/src/gandiva/precompiled/types.h +++ b/cpp/src/gandiva/precompiled/types.h @@ -19,6 +19,8 @@ #include + +#include "gandiva/array_ops.h" #include "gandiva/gdv_function_stubs.h" // Use the same names as in arrow data types. Makes it easy to write pre-processor macros. 
@@ -41,6 +43,13 @@ using gdv_utf8 = char*; using gdv_binary = char*; using gdv_day_time_interval = int64_t; +struct GeoStruct { + double lattitude; + double longitude; +}; + +using gdv_struct = GeoStruct; + #ifdef GANDIVA_UNIT_TEST // unit tests may be compiled without O2, so inlining may not happen. #define FORCE_INLINE @@ -464,6 +473,11 @@ gdv_int64 truncate_int64_int32(gdv_int64 in, gdv_int32 out_scale); const char* repeat_utf8_int32(gdv_int64 context, const char* in, gdv_int32 in_len, gdv_int32 repeat_times, gdv_int32* out_len); +const char* gdv_fn_geo_hash_encode_float64_float64(gdv_int64 context, gdv_float64 lat, gdv_float64 lon, + gdv_int32* out_len); + +const gdv_struct gdv_fn_geo_hash_decode_utf8(gdv_int64 context, const char* input, gdv_int32 in_len); + const char* substr_utf8_int64_int64(gdv_int64 context, const char* input, gdv_int32 in_len, gdv_int64 offset64, gdv_int64 length, gdv_int32* out_len); diff --git a/cpp/src/gandiva/projector.cc b/cpp/src/gandiva/projector.cc index 54de03963f7..97f28f652ea 100644 --- a/cpp/src/gandiva/projector.cc +++ b/cpp/src/gandiva/projector.cc @@ -17,6 +17,7 @@ #include "gandiva/projector.h" +#include #include #include #include @@ -29,6 +30,96 @@ namespace gandiva { +class ProjectorCacheKey { + public: + ProjectorCacheKey(SchemaPtr schema, std::shared_ptr configuration, + ExpressionVector expression_vector, SelectionVector::Mode mode) + : schema_(schema), configuration_(configuration), mode_(mode), uniqifier_(0) { + static const int kSeedValue = 4; + size_t result = kSeedValue; + for (auto& expr : expression_vector) { + std::string expr_as_string = expr->ToString(); + expressions_as_strings_.push_back(expr_as_string); + arrow::internal::hash_combine(result, expr_as_string); + UpdateUniqifier(expr_as_string); + } + arrow::internal::hash_combine(result, static_cast(mode)); + arrow::internal::hash_combine(result, configuration->Hash()); + arrow::internal::hash_combine(result, schema_->ToString()); + 
arrow::internal::hash_combine(result, uniqifier_); + hash_code_ = result; + } + + std::size_t Hash() const { return hash_code_; } + + bool operator==(const ProjectorCacheKey& other) const { + // arrow schema does not overload equality operators. + if (!(schema_->Equals(*other.schema().get(), true))) { + return false; + } + + if (*configuration_ != *other.configuration_) { + return false; + } + + if (expressions_as_strings_ != other.expressions_as_strings_) { + return false; + } + + if (mode_ != other.mode_) { + return false; + } + + if (uniqifier_ != other.uniqifier_) { + return false; + } + return true; + } + + bool operator!=(const ProjectorCacheKey& other) const { return !(*this == other); } + + SchemaPtr schema() const { return schema_; } + + std::string ToString() const { + std::stringstream ss; + // indent, window, indent_size, null_rep and skip new lines. + arrow::PrettyPrintOptions options{0, 10, 2, "null", true}; + DCHECK_OK(PrettyPrint(*schema_.get(), options, &ss)); + + ss << "Expressions: ["; + bool first = true; + for (auto& expr : expressions_as_strings_) { + if (first) { + first = false; + } else { + ss << ", "; + } + + ss << expr; + } + ss << "]"; + return ss.str(); + } + + private: + void UpdateUniqifier(const std::string& expr) { + if (uniqifier_ == 0) { + // caching of expressions with re2 patterns causes lock contention. So, use + // multiple instances to reduce contention. 
+ if (expr.find(" like(") != std::string::npos) { + uniqifier_ = std::hash()(std::this_thread::get_id()) % 16; + } + } + } + + const SchemaPtr schema_; + const std::shared_ptr configuration_; + SelectionVector::Mode mode_; + std::vector expressions_as_strings_; + size_t hash_code_; + uint32_t uniqifier_; +}; + Projector::Projector(std::unique_ptr llvm_generator, SchemaPtr schema, const FieldVector& output_fields, std::shared_ptr configuration) @@ -78,6 +169,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, ARROW_RETURN_IF(configuration == nullptr, Status::Invalid("Configuration cannot be null")); + //std::cout << "LR Projector::Make 1" << std::endl; // see if equivalent projector was already built std::shared_ptr>> cache = LLVMGenerator::GetCache(); @@ -100,6 +192,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, std::unique_ptr llvm_gen; ARROW_RETURN_NOT_OK(LLVMGenerator::Make(configuration, is_cached, &llvm_gen)); + //std::cout << "LR Projector::Make 2" << std::endl; if (!is_cached && sec_cache != nullptr) { std::shared_ptr arrow_buffer = sec_cache->Get(GetSecondaryCacheKey(cache_key.ToString())); @@ -117,6 +210,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, // Run the validation on the expressions. // Return if any of the expression is invalid since // we will not be able to process further. 
+ //std::cout << "LR Projector::Make 3" << std::endl; if (!is_cached) { ExprValidator expr_validator(llvm_gen->types(), schema); for (auto& expr : exprs) { @@ -136,11 +230,13 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, output_fields.push_back(expr->result()); } + //std::cout << "LR Projector::Make 4" << std::endl; // Instantiate the projector with the completely built llvm generator *projector = std::shared_ptr( new Projector(std::move(llvm_gen), schema, output_fields, configuration)); projector->get()->SetBuiltFromCache(is_cached); + //std::cout << "LR Projector::Make 5" << std::endl; if (sec_cache != nullptr && is_cached == false) { std::shared_ptr sec_cached_obj = cache->GetObjectCode(cache_key); llvm::StringRef string_buffer = sec_cached_obj->getBuffer(); @@ -149,6 +245,7 @@ Status Projector::Make(SchemaPtr schema, const ExpressionVector& exprs, sec_cache->Set(GetSecondaryCacheKey(cache_key.ToString()), arrow_buffer); } + //std::cout << "LR Projector::Make DONE" << std::endl; return Status::OK(); } @@ -162,6 +259,7 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, const ArrayDataVector& output_data_vecs) const { ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); + //std::cout << "LR the other Projector::Evaluate" << std::endl; if (output_data_vecs.size() != output_fields_.size()) { std::stringstream ss; ss << "number of buffers for output_data_vecs is " << output_data_vecs.size() @@ -169,8 +267,10 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, return Status::Invalid(ss.str()); } +//std::cout << "LR the other Projector::Evaluate 1a" << std::endl; int idx = 0; for (auto& array_data : output_data_vecs) { + //std::cout << "LR the other Projector::Evaluate checking array_data" << std::endl; if (array_data == nullptr) { std::stringstream ss; ss << "array for output field " << output_fields_[idx]->name() << "is null."; @@ -180,11 +280,59 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, auto 
num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); + //std::cout << "LR the other Projector::Evaluate about to validate capacity" << std::endl; ARROW_RETURN_NOT_OK( ValidateArrayDataCapacity(*array_data, *(output_fields_[idx]), num_rows)); ++idx; } - return llvm_generator_->Execute(batch, selection_vector, output_data_vecs); + //std::cout << "LR the other Projector::Evaluate 2" << std::endl; + ARROW_RETURN_NOT_OK( + llvm_generator_->Execute(batch, selection_vector, output_data_vecs)); + + // Create and return array arrays. + + /* for (auto& array_data : output_data_vecs) { + + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + //std::cout << "LR the other Projector::Evaluate modifying child array " << + //child_data->buffers[1]->ToString() << std::endl; + //std::cout << "LR the other Projector::Evaluate child array[3] " << + //int32_t( (*child_data->buffers[1])[3*4]) << std::endl; + //std::cout << "LR the other Projector::Evaluate modifying child0 array " << + //child_data->buffers[0]->ToString() << std::endl; + + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data->child_data.clear(); + array_data->child_data.push_back(new_child_data); + + //std::cout << "LR the other 
Projector::Evaluate child data size " << child_data_size << std::endl; + //std::cout << "LR the other Projector::Evaluate after modifying child array[3] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + + //array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + // array_data->buffers, {new_child_data}, + // array_data->null_count, array_data->offset); + } + + }*/ + + + return Status::OK(); } Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* pool, @@ -195,12 +343,14 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, arrow::MemoryPool* p Status Projector::Evaluate(const arrow::RecordBatch& batch, const SelectionVector* selection_vector, arrow::MemoryPool* pool, arrow::ArrayVector* output) const { + //std::cout << "LR Projector::Evaluate" << std::endl; ARROW_RETURN_NOT_OK(ValidateEvaluateArgsCommon(batch)); ARROW_RETURN_IF(output == nullptr, Status::Invalid("Output must be non-null.")); ARROW_RETURN_IF(pool == nullptr, Status::Invalid("Memory pool must be non-null.")); auto num_rows = selection_vector == nullptr ? batch.num_rows() : selection_vector->GetNumSlots(); + //std::cout << "LR Projector::Evaluate num_rows" << num_rows << std::endl; // Allocate the output data vecs. ArrayDataVector output_data_vecs; for (auto& field : output_fields_) { @@ -217,6 +367,36 @@ Status Projector::Evaluate(const arrow::RecordBatch& batch, // Create and return array arrays. output->clear(); for (auto& array_data : output_data_vecs) { + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. 
+ * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + array_data->buffers, {new_child_data}, + array_data->null_count, array_data->offset); + std::cout << "LR Making array data length " << array_data->length << std::endl; + } + output->push_back(arrow::MakeArray(array_data)); } return Status::OK(); @@ -229,6 +409,7 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, arrow::Status astatus; std::vector> buffers; + //std::cout << "LR Projector::AllocArrayData Enter" << std::endl; // The output vector always has a null bitmap. 
int64_t size = arrow::bit_util::BytesForBits(num_records); ARROW_ASSIGN_OR_RAISE(auto bitmap_buffer, arrow::AllocateBuffer(size, pool)); @@ -243,6 +424,23 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, buffers.push_back(std::move(offsets_buffer)); } + if (type_id == arrow::Type::LIST) { + auto offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); + + ARROW_ASSIGN_OR_RAISE(auto offsets_buffer, arrow::AllocateBuffer(offsets_len, pool)); + buffers.push_back(std::move(offsets_buffer)); + + if (arrow::is_binary_like(type->field(0)->type()->id())) { + // child offsets length is internal data length + 1 + // offsets element is int32 + // so here i just allocate extra 32 bit for extra 1 length + ARROW_ASSIGN_OR_RAISE( + auto child_offsets_buffer, + arrow::AllocateResizableBuffer(arrow::bit_util::BytesForBits(32), pool)); + buffers.push_back(std::move(child_offsets_buffer)); + } + } + // The output vector always has a data array. int64_t data_len; if (arrow::is_primitive(type_id) || type_id == arrow::Type::DECIMAL) { @@ -251,6 +449,8 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } else if (arrow::is_binary_like(type_id)) { // we don't know the expected size for varlen output vectors. data_len = 0; + } else if (type_id == arrow::Type::LIST) { + data_len = 0; } else { return Status::Invalid("Unsupported output data type " + type->ToString()); } @@ -263,7 +463,30 @@ Status Projector::AllocArrayData(const DataTypePtr& type, int64_t num_records, } buffers.push_back(std::move(data_buffer)); - *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + //std::cout << "LR Projector::AllocArrayData 1" << std::endl; + if (type->id() == arrow::Type::LIST) { + // std::cout << "LR Projector::AllocArrayData List. 
There are number of buffers=" << buffers.size() << std::endl; + auto internal_type = type->field(0)->type(); + ArrayDataPtr child_data; + if (arrow::is_primitive(internal_type->id())) { + //std::cout << "LR Projector::AllocArrayData List 1" << std::endl; + child_data = arrow::ArrayData::Make(internal_type, 0 /*initialize length*/, + {nullptr, std::move(buffers[2])}, 0); + } + if (arrow::is_binary_like(internal_type->id())) { + //std::cout << "LR Projector::AllocArrayData List 2" << std::endl; + child_data = arrow::ArrayData::Make( + internal_type, 0 /*initialize length*/, + {nullptr, std::move(buffers[2]), std::move(buffers[3])}, 0); + } + *array_data = arrow::ArrayData::Make( + type, num_records, {std::move(buffers[0]), std::move(buffers[1])}, {child_data}); + + } else { + *array_data = arrow::ArrayData::Make(type, num_records, std::move(buffers)); + } + + // std::cout << "LR Projector::AllocArrayData Done" << std::endl; return Status::OK(); } @@ -282,15 +505,20 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, ARROW_RETURN_IF(array_data.buffers.size() < 2, Status::Invalid("ArrayData must have at least 2 buffers")); +//std::cout << "LR ValidateArrayDataCapacity" << std::endl; int64_t min_bitmap_len = arrow::bit_util::BytesForBits(num_records); + //std::cout << "LR ValidateArrayDataCapacity arra_data 0 is " << array_data.buffers[0] << std::endl; int64_t bitmap_len = array_data.buffers[0]->capacity(); + //std::cout << "LR ValidateArrayDataCapacity" << std::endl; ARROW_RETURN_IF( bitmap_len < min_bitmap_len, Status::Invalid("Bitmap buffer too small for ", field.name(), " expected minimum ", min_bitmap_len, " actual size ", bitmap_len)); auto type_id = field.type()->id(); - if (arrow::is_binary_like(type_id)) { + //std::cout << "LR ValidateArrayDataCapacity" << std::endl; + //LR TODO + if (arrow::is_binary_like(type_id)) { //|| type_id == arrow::Type::LIST) { // validate size of offsets buffer. 
int64_t min_offsets_len = arrow::bit_util::BytesForBits((num_records + 1) * 32); int64_t offsets_len = array_data.buffers[1]->capacity(); @@ -312,7 +540,10 @@ Status Projector::ValidateArrayDataCapacity(const arrow::ArrayData& array_data, int64_t data_len = array_data.buffers[1]->capacity(); ARROW_RETURN_IF(data_len < min_data_len, Status::Invalid("Data buffer too small for ", field.name())); - } else { + } else if (type_id == arrow::Type::LIST) { + return Status::OK(); + } + else { return Status::Invalid("Unsupported output data type " + field.type()->ToString()); } @@ -339,4 +570,5 @@ std::shared_ptr Projector::GetSecondaryCacheKey(std::string prima return arrow::Buffer::FromString(key); } + } // namespace gandiva diff --git a/cpp/src/gandiva/projector.h b/cpp/src/gandiva/projector.h index 24ec11e3eab..53d0ef6d624 100644 --- a/cpp/src/gandiva/projector.h +++ b/cpp/src/gandiva/projector.h @@ -154,14 +154,14 @@ class GANDIVA_EXPORT Projector { bool GetBuiltFromCache(); void Clear(); + /// Allocate an ArrowData of length 'length'. + Status AllocArrayData(const DataTypePtr& type, int64_t num_records, + arrow::MemoryPool* pool, ArrayDataPtr* array_data) const; private: Projector(std::unique_ptr llvm_generator, SchemaPtr schema, const FieldVector& output_fields, std::shared_ptr); - /// Allocate an ArrowData of length 'length'. - Status AllocArrayData(const DataTypePtr& type, int64_t num_records, - arrow::MemoryPool* pool, ArrayDataPtr* array_data) const; /// Validate that the ArrayData has sufficient capacity to accommodate 'num_records'. 
Status ValidateArrayDataCapacity(const arrow::ArrayData& array_data, diff --git a/cpp/src/gandiva/tests/CMakeLists.txt b/cpp/src/gandiva/tests/CMakeLists.txt index b89c0ac2252..bc607702126 100644 --- a/cpp/src/gandiva/tests/CMakeLists.txt +++ b/cpp/src/gandiva/tests/CMakeLists.txt @@ -25,6 +25,7 @@ add_gandiva_test(binary_test) add_gandiva_test(date_time_test) add_gandiva_test(to_string_test) add_gandiva_test(utf8_test) +add_gandiva_test(list_test) add_gandiva_test(hash_test) add_gandiva_test(in_expr_test) add_gandiva_test(null_validity_test) diff --git a/cpp/src/gandiva/tests/list_test.cc b/cpp/src/gandiva/tests/list_test.cc new file mode 100644 index 00000000000..7936873d073 --- /dev/null +++ b/cpp/src/gandiva/tests/list_test.cc @@ -0,0 +1,588 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include + +#include + +#include "arrow/memory_pool.h" +#include "arrow/status.h" +#include "gandiva/execution_context.h" +#include "gandiva/precompiled/types.h" +#include "gandiva/projector.h" +#include "gandiva/tests/test_util.h" +#include "gandiva/tree_expr_builder.h" + +namespace gandiva { + +using arrow::boolean; +using arrow::float32; +using arrow::float64; +using arrow::int32; +using arrow::int64; +using arrow::utf8; +using std::string; +using std::vector; + +class TestList : public ::testing::Test { + public: + void SetUp() { pool_ = arrow::default_memory_pool(); } + + protected: + arrow::MemoryPool* pool_; +}; + +template +void _build_list_array(const vector& values, const vector& length, + const vector& validity, arrow::MemoryPool* pool, + ArrayPtr* array) { + size_t sum = 0; + for (auto& len : length) { + sum += len; + } + EXPECT_TRUE(values.size() == sum); + EXPECT_TRUE(length.size() == validity.size()); + + auto value_builder = std::make_shared(pool); + auto builder = std::make_shared(pool, value_builder); + int i = 0; + for (size_t l = 0; l < length.size(); l++) { + if (validity[l]) { + auto status = builder->Append(); + for (int j = 0; j < length[l]; j++) { + ASSERT_OK(value_builder->Append(values[i])); + i++; + } + } else { + ASSERT_OK(builder->AppendNull()); + for (int j = 0; j < length[l]; j++) { + i++; + } + } + } + ASSERT_OK(builder->Finish(array)); +} + +/* + * expression: + * input: a + * output: res + * typeof(a) can be list / list / list + */ +void _test_list_type_field_alias(DataTypePtr type, ArrayPtr array, + arrow::MemoryPool* pool) { + auto field_a = field("a", type); + auto schema = arrow::schema({field_a}); + auto result = field("res", type); + + auto num_records = 5; + assert(array->length() == num_records); + + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array}); + + // Make expression + std::cout << "Make expression" << std::endl; + auto field_a_node = TreeExprBuilder::MakeField(field_a); + auto expr = 
TreeExprBuilder::MakeExpression(field_a_node, result); + + std::cout << "Build a projector for the expressions." << std::endl; + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + std::cout << "status message: " << status.message() << std::endl; + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "Evaluate expression" << std::endl; + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "Check results" << std::endl; + EXPECT_ARROW_ARRAY_EQUALS(array, outputs[0]); + // EXPECT_ARROW_ARRAY_EQUALS will not check the length of child data, but + // ArrayData::Slice method will check length. ArrayData::ToString method will call + // ArrayData::Slice method + EXPECT_TRUE(array->ToString() == outputs[0]->ToString()); + EXPECT_TRUE(array->null_count() == outputs[0]->null_count()); +} + +/* +TEST_F(TestList, TestListUtf8) { + ArrayPtr array; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 4, 3, 2, 5}, {true, true, false, true, true}, pool_, &array); + _test_list_type_field_alias(list(utf8()), array, pool_); +} + +TEST_F(TestList, TestListUtf8WithInvalidData) { + ArrayPtr array; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 2, 3, 4, 5}, {true, false, true, true, false}, pool_, &array); + _test_list_type_field_alias(list(utf8()), array, pool_); +} + +TEST_F(TestList, TestListInt64) { + ArrayPtr array; + _build_list_array( + {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, + 50000}, + {1, 2, 5, 4, 3}, {true, true, true, true, false}, pool_, &array); + _test_list_type_field_alias(list(int64()), array, pool_); +} +*/ + + +TEST_F(TestList, TestListInt32) { + 
ArrayPtr array; + _build_list_array( + {1, 10, 20, 100, 200, 300, 1000, 2000, 3000, 4000, 10000, 20000, 30000, 40000, + 50000}, + {5, 2, 3, 4, 1}, {true, false, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(int32()), array, pool_); +} + +TEST_F(TestList, TestMakeArray) { + // schema for input fields + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_b}); + + // output fields + auto res = field("res", list(int32())); + + // Create a row-batch with some sample data + int num_records = 5; + auto array_b = + MakeArrowArrayInt32({42, 43, 44, 45, 46}); + + // expected output + auto exp1 = MakeArrowArrayInt32({ 1, 2, 3, 42, 5}, + {true, true, true, true, true}); + + // auto exp = MakeArrowArrayArray({ 42, 42, 44, 45, 46}, + // {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_b}); + + // build expressions. + // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + //std::vector field_nodes; + //auto node2 = TreeExprBuilder::MakeLiteral(42); + //field_nodes.push_back(node2); + + //auto func_node = TreeExprBuilder::MakeFunction("array_makeGandiva", {field_b}, res->type()); + //auto expr = TreeExprBuilder::MakeExpression(func_node, res); + std::cout << "LR test is about to make expression " << std::endl; + auto expr = TreeExprBuilder::MakeExpression("array_makeGandiva", {field_b}, res); + //////// + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + std::cout << "LR Test 2 " << std::endl; + //std::cout << "LR IR IS " << projector->DumpIR() << std::endl; + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp1, outputs.at(0)); + + std::cout << "LR ==============================SECOND=WAY==================================================== " << std::endl; + + + + //Try the second method. + arrow::ArrayDataVector outputs2; + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + + + int num_records2 = 5; + std::vector> buffers; + + + + //int64_t size = arrow::bit_util::BytesForBits(num_records2); + int64_t size = 20; + auto bitmap_buffer = arrow::AllocateBuffer(size, pool_); + buffers.push_back(*std::move(bitmap_buffer)); + auto offsets_len = arrow::bit_util::BytesForBits((num_records2 + 1) * 32); + + auto offsets_buffer = arrow::AllocateBuffer(offsets_len*10, pool_); + buffers.push_back(*std::move(offsets_buffer)); + + std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + //auto array_data = arrow::ArrayData::Make(dt, num_records2, buffers, 0, offsets_len); + //outputs2.push_back(array_data); + + + +std::vector> buffers2; +auto bitmap_buffer2 = arrow::AllocateBuffer(size, pool_); + buffers2.push_back(*std::move(bitmap_buffer2)); + + auto offsets_buffer2 = arrow::AllocateBuffer(offsets_len, pool_); + buffers2.push_back(*std::move(offsets_buffer2)); +std::shared_ptr dt2 = std::make_shared(); + + auto array_data_child = arrow::ArrayData::Make(dt2, num_records2, buffers2, 0, 0); + array_data_child->buffers = std::move(buffers2); + + std::vector> kids; + kids.push_back(array_data_child); + + +auto array_data = arrow::ArrayData::Make(dt, 
num_records2, buffers, kids, 0, 0); +array_data->buffers = std::move(buffers); +outputs2.push_back(array_data); + +std::cout << "LR Test " << array_data << " arra_data 0 is " << array_data->buffers[0] << std::endl; + //std::cout << "LR Test buffers [0] is " << buffers[0] << std::endl; + std::cout << "LR about to evaluate 2nd " << std::endl; + + status = projector->Evaluate(*(in_batch.get()), outputs2); + EXPECT_TRUE(status.ok()) << status.message(); + arrow::ArrayData ad = *outputs2.at(0); + arrow::ArraySpan sp(*ad.child_data.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + + + + +for (auto& array_data : outputs2) { + auto child_data = array_data->child_data[0]; + int64_t child_data_size = 1; + if (arrow::is_binary_like(child_data->type->id())) { + /* when allocate array data, child data length is an initialized value, + * after calculating, child data offsets buffer has been resized for results, + * but array data length is unchanged. + * We should recalculate child data length and make ArrayData with new length + * + * Otherwise, child data offsets buffer length is data length + 1 + * and offset data is int32_t, need use buffer->size()/4 - 1 + */ + child_data_size = child_data->buffers[1]->size() / 4 - 1; + } else if (child_data->type->id() == arrow::Type::INT32) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::INT64) { + child_data_size = child_data->buffers[1]->size() / 8; + } else if (child_data->type->id() == arrow::Type::FLOAT) { + child_data_size = child_data->buffers[1]->size() / 4; + } else if (child_data->type->id() == arrow::Type::DOUBLE) { + child_data_size = child_data->buffers[1]->size() / 8; + } + auto new_child_data = arrow::ArrayData::Make( + child_data->type, child_data_size, child_data->buffers, child_data->offset); + array_data = arrow::ArrayData::Make(array_data->type, array_data->length, + array_data->buffers, {new_child_data}, + array_data->null_count, 
array_data->offset); + + + auto newArray = arrow::MakeArray(array_data); + //arrow::ArraySpan sp(newArray); + EXPECT_ARROW_ARRAY_EQUALS(exp1, newArray); +} + + + + std::cout << "LR ====================THIRD=WAY================================== " << std::endl; + { + std::shared_ptr listDt = std::make_shared(); + std::shared_ptr dt = std::make_shared(listDt); + +ArrayDataPtr output_data; + auto s = projector->AllocArrayData(dt, num_records2, pool_, &output_data); + ArrayDataVector output_data_vecs; + output_data_vecs.push_back(output_data); + + status = projector->Evaluate(*(in_batch.get()), output_data_vecs); + EXPECT_TRUE(status.ok()) << status.message(); + arrow::ArraySpan sp(*output_data_vecs.at(0)); + EXPECT_ARROW_ARRAY_EQUALS(exp1, sp.ToArray()); + } +} + +/* +TEST_F(TestList, TestListArrayInt32) { + gandiva::ExecutionContext ctx; + uint64_t ctx_ptr = reinterpret_cast(&ctx); + int32_t data[] = {11, 2, 23, 42}; + int32_t entry_offsets_len = 4; + int32_t contains_data = 42; + + EXPECT_EQ( + array_int32_contains_int32(ctx_ptr, data, entry_offsets_len, + contains_data), + true); +} + + +TEST_F(TestList, TestListInt32LiteralContains) { + // schema for input fields + auto field_a = field("a", list(int32())); + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {1, 5, 19, 42, 57}, + {1, 1, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + + auto array_b = + MakeArrowArrayInt32({42, 42, 42, 42, 42}); + + // expected output + auto exp = MakeArrowArrayBool({false, false, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. 
+ // array_contains(a, b) + + //auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + std::vector field_nodes; + auto node = TreeExprBuilder::MakeField(field_a); + field_nodes.push_back(node); + + auto node2 = TreeExprBuilder::MakeLiteral(42); + field_nodes.push_back(node2); + + auto func_node = TreeExprBuilder::MakeFunction("array_containsGandiva", field_nodes, res->type()); + auto expr = TreeExprBuilder::MakeExpression(func_node, res); + //////// + + // Build a projector for the expressions. + std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +TEST_F(TestList, TestListInt32Contains) { + // schema for input fields + auto field_a = field("a", list(int32())); + auto field_b = field("b", int32()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {1, 5, 19, 42, 57}, + {1, 1, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + + auto array_b = + MakeArrowArrayInt32({42, 42, 42, 42, 42}); + + // expected output + auto exp = MakeArrowArrayBool({false, false, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. + // array_contains(a, b) + auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +TEST_F(TestList, TestListFloat32) { + ArrayPtr array; + _build_list_array( + {1.1f, 11.1f, 22.2f, 111.1f, 222.2f, 333.3f, 1111.1f, 2222.2f, 3333.3f, 4444.4f, + 11111.1f, 22222.2f, 33333.3f, 44444.4f, 55555.5f}, + {1, 2, 3, 4, 5}, {true, true, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(float32()), array, pool_); +} + +TEST_F(TestList, TestListFloat64) { + ArrayPtr array; + _build_list_array( + {1.1, 1.11, 2.22, 1.111, 2.222, 3.333, 1.1111, 2.2222, 3.3333, 4.4444, 1.11111, + 2.22222, 3.33333, 4.44444, 5.55555}, + {1, 2, 4, 3, 5}, {true, false, true, true, true}, pool_, &array); + _test_list_type_field_alias(list(float64()), array, pool_); +} + + +TEST_F(TestList, TestListUtf8Length) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", int64()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"a", "b", "bb", "c", "cc", "ccc", "d", "dd", "ddd", "dddd", "e", "ee", "eee", + "eeee", "eeeee"}, + {1, 2, 3, 4, 5}, {true, true, true, true, true}, pool_, &array_a); + + // expected output + auto exp = MakeArrowArrayInt64({1, 2, 3, 4, 5}, {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // build expressions. + // array_length(a) + auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + +TEST_F(TestList, TestListUtf8LengthWithInvalidData) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto schema = arrow::schema({field_a}); + + // output fields + auto res = field("res", int64()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"a", "b", "bb", "cc", "cc", "ccc", "d", "dd", "ddd"}, {1, 2, 2, 3, 1}, + {true, false, true, false, true}, pool_, &array_a); + + // expected output + auto exp = MakeArrowArrayInt64({1, 2, 2, 3, 1}, {true, false, true, false, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a}); + + // build expressions. + // array_length(a) + auto expr = TreeExprBuilder::MakeExpression("array_lengthGandiva", {field_a}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} + + +TEST_F(TestList, TestListUtf8Contains) { + // schema for input fields + auto field_a = field("a", list(utf8())); + auto field_b = field("b", utf8()); + auto schema = arrow::schema({field_a, field_b}); + + // output fields + auto res = field("res", boolean()); + + // Create a row-batch with some sample data + int num_records = 5; + ArrayPtr array_a; + _build_list_array( + {"rectangle", "circle", "rectangle", "circle", "triangle", "triangle", "circle", + "rectangle"}, + {2, 3, 1, 1, 1}, {true, true, true, true, true}, pool_, &array_a); + auto array_b = + MakeArrowArrayUtf8({"rectangle", "circle", "circle", "circle", "rectangll"}); + + // expected output + auto exp = MakeArrowArrayBool({true, true, false, true, false}, + {true, true, true, true, true}); + + // prepare input record batch + auto in_batch = arrow::RecordBatch::Make(schema, num_records, {array_a, array_b}); + + // build expressions. + // array_contains(a, b) + auto expr = TreeExprBuilder::MakeExpression("array_containsGandiva", {field_a, field_b}, res); + + // Build a projector for the expressions. 
+ std::shared_ptr projector; + auto status = Projector::Make(schema, {expr}, TestConfiguration(), &projector); + EXPECT_TRUE(status.ok()) << status.message(); + + // Evaluate expression + arrow::ArrayVector outputs; + status = projector->Evaluate(*in_batch, pool_, &outputs); + EXPECT_TRUE(status.ok()) << status.message(); + + // Validate results + EXPECT_ARROW_ARRAY_EQUALS(exp, outputs.at(0)); +} +*/ +} // namespace gandiva diff --git a/cpp/src/gandiva/tests/projector_build_validation_test.cc b/cpp/src/gandiva/tests/projector_build_validation_test.cc index 5b86844f940..82b59ef19ad 100644 --- a/cpp/src/gandiva/tests/projector_build_validation_test.cc +++ b/cpp/src/gandiva/tests/projector_build_validation_test.cc @@ -26,6 +26,7 @@ namespace gandiva { using arrow::boolean; using arrow::float32; using arrow::int32; +using arrow::utf8; class TestProjector : public ::testing::Test { public: @@ -80,7 +81,7 @@ TEST_F(TestProjector, TestNotMatchingDataType) { TEST_F(TestProjector, TestNotSupportedDataType) { // schema for input fields - auto field0 = field("f0", list(int32())); + auto field0 = field("f0", map(utf8(), int32())); auto schema = arrow::schema({field0}); // output fields @@ -94,7 +95,7 @@ TEST_F(TestProjector, TestNotSupportedDataType) { std::shared_ptr projector; auto status = Projector::Make(schema, {lt_expr}, TestConfiguration(), &projector); EXPECT_TRUE(status.IsExpressionValidationError()); - std::string expected_error = "Field f0 has unsupported data type list"; + std::string expected_error = "Field f0 has unsupported data type map"; EXPECT_TRUE(status.message().find(expected_error) != std::string::npos); } diff --git a/cpp/src/gandiva/tree_expr_builder.cc b/cpp/src/gandiva/tree_expr_builder.cc index 82bb661ecda..c43285843a1 100644 --- a/cpp/src/gandiva/tree_expr_builder.cc +++ b/cpp/src/gandiva/tree_expr_builder.cc @@ -144,12 +144,12 @@ NodePtr TreeExprBuilder::MakeOr(const NodeVector& children) { static bool print_expr = false; ExpressionPtr 
TreeExprBuilder::MakeExpression(NodePtr root_node, FieldPtr result_field) { + //std::cout << "LR Expression: " << root_node->ToString() << "\n"; + if (result_field == nullptr) { + //std::cout << "LR MakeExpression result_field is null" << std::endl; return nullptr; } - if (print_expr) { - std::cout << "Expression: " << root_node->ToString() << "\n"; - } return ExpressionPtr(new Expression(root_node, result_field)); } @@ -164,7 +164,9 @@ ExpressionPtr TreeExprBuilder::MakeExpression(const std::string& function, auto node = MakeField(field); field_nodes.push_back(node); } + //std::cout << "LR MakeExpression making function for " << function << std::endl; auto func_node = MakeFunction(function, field_nodes, out_field->type()); + //std::cout << "LR MakeExpression function is " << func_node->ToString() << std::endl; return MakeExpression(func_node, out_field); } diff --git a/java/gandiva/CMakeLists.txt b/java/gandiva/CMakeLists.txt index 629ab2fb347..60762f6307c 100644 --- a/java/gandiva/CMakeLists.txt +++ b/java/gandiva/CMakeLists.txt @@ -38,7 +38,7 @@ set(GANDIVA_PROTO_DIR ${CMAKE_CURRENT_SOURCE_DIR}/proto) get_filename_component(GANDIVA_PROTO_FILE_ABSOLUTE ${GANDIVA_PROTO_DIR}/Types.proto ABSOLUTE) -find_package(Protobuf REQUIRED) +find_package(Protobuf CONFIG REQUIRED) add_custom_command(OUTPUT ${GANDIVA_PROTO_OUTPUT_FILES} COMMAND protobuf::protoc --proto_path ${GANDIVA_PROTO_DIR} --cpp_out ${GANDIVA_PROTO_OUTPUT_DIR} ${GANDIVA_PROTO_FILE_ABSOLUTE} diff --git a/java/gandiva/src/main/cpp/expression_registry_helper.cc b/java/gandiva/src/main/cpp/expression_registry_helper.cc index 6765df3b972..0efb2e412e8 100644 --- a/java/gandiva/src/main/cpp/expression_registry_helper.cc +++ b/java/gandiva/src/main/cpp/expression_registry_helper.cc @@ -136,10 +136,18 @@ void ArrowToProtobuf(DataTypePtr type, types::ExtGandivaType* gandiva_data_type) gandiva_data_type->set_type(types::GandivaType::INTERVAL); gandiva_data_type->set_intervaltype(types::IntervalType::DAY_TIME); break; 
+ case arrow::Type::STRUCT: + gandiva_data_type->set_type(types::GandivaType::STRUCT); + break; + case arrow::Type::LIST: + gandiva_data_type->set_type(types::GandivaType::LIST); + break; default: // un-supported types. test ensures that // when one of these are added build breaks. - DCHECK(false); + //DCHECK(false); + printf("LR Found unsupported type %d\n", type->id()); + fflush(stdout); } } @@ -168,10 +176,17 @@ Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSu JNIEXPORT jbyteArray JNICALL Java_org_apache_arrow_gandiva_evaluator_ExpressionRegistryJniHelper_getGandivaSupportedFunctions( // NOLINT JNIEnv* env, jobject types_helper) { + printf("LR Entering JNI call getGandivaSupportedFunctions\n"); + fflush(stdout); + ExpressionRegistry expr_registry; types::GandivaFunctions gandiva_functions; for (auto function = expr_registry.function_signature_begin(); function != expr_registry.function_signature_end(); function++) { + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).base_name().c_str()); + printf("LR getGandivaSupportedFunctions Functions: %s\n", (*function).ToString().c_str()); + fflush(stdout); + types::FunctionSignature* function_signature = gandiva_functions.add_function(); function_signature->set_name((*function).base_name()); types::ExtGandivaType* return_type = function_signature->mutable_returntype(); diff --git a/java/gandiva/src/main/cpp/jni_common.cc b/java/gandiva/src/main/cpp/jni_common.cc index d5e54f38e36..1f647e0e279 100644 --- a/java/gandiva/src/main/cpp/jni_common.cc +++ b/java/gandiva/src/main/cpp/jni_common.cc @@ -82,10 +82,17 @@ jclass configuration_builder_class_; // refs for self. 
static jclass gandiva_exception_; static jclass vector_expander_class_; +static jclass listvector_expander_class_; static jclass vector_expander_ret_class_; +static jclass list_expander_ret_class_; static jmethodID vector_expander_method_; +static jmethodID listvector_expander_method_; static jfieldID vector_expander_ret_address_; static jfieldID vector_expander_ret_capacity_; +static jfieldID list_expander_ret_address_; +static jfieldID list_expander_ret_capacity_; +static jfieldID list_expander_offset_ret_address_; +static jfieldID list_expander_offset_ret_capacity_; static jclass secondary_cache_class_; static jmethodID cache_get_method_; @@ -125,16 +132,39 @@ jint JNI_OnLoad(JavaVM* vm, void* reserved) { vector_expander_class_, "expandOutputVectorAtIndex", "(IJ)Lorg/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult;"); + jclass local_listexpander_class = + env->FindClass("org/apache/arrow/gandiva/evaluator/ListVectorExpander"); + listvector_expander_class_ = (jclass)env->NewGlobalRef(local_listexpander_class); + env->DeleteLocalRef(local_listexpander_class); + + listvector_expander_method_ = env->GetMethodID( + listvector_expander_class_, "expandOutputVectorAtIndex", + "(IJ)Lorg/apache/arrow/gandiva/evaluator/ListVectorExpander$ExpandResult;"); + jclass local_expander_ret_class = env->FindClass("org/apache/arrow/gandiva/evaluator/VectorExpander$ExpandResult"); vector_expander_ret_class_ = (jclass)env->NewGlobalRef(local_expander_ret_class); env->DeleteLocalRef(local_expander_ret_class); + jclass local_list_expander_ret_class = + env->FindClass("org/apache/arrow/gandiva/evaluator/ListVectorExpander$ExpandResult"); + list_expander_ret_class_ = (jclass)env->NewGlobalRef(local_list_expander_ret_class); + env->DeleteLocalRef(local_list_expander_ret_class); + vector_expander_ret_address_ = env->GetFieldID(vector_expander_ret_class_, "address", "J"); vector_expander_ret_capacity_ = env->GetFieldID(vector_expander_ret_class_, "capacity", "J"); + 
list_expander_ret_address_ = + env->GetFieldID(list_expander_ret_class_, "address", "J"); + list_expander_ret_capacity_ = + env->GetFieldID(list_expander_ret_class_, "capacity", "J"); + list_expander_offset_ret_address_ = + env->GetFieldID(list_expander_ret_class_, "offsetaddress", "J"); + list_expander_offset_ret_capacity_ = + env->GetFieldID(list_expander_ret_class_, "offsetcapacity", "J"); + jclass local_cache_class = env->FindClass("org/apache/arrow/gandiva/evaluator/JavaSecondaryCacheInterface"); secondary_cache_class_ = (jclass)env->NewGlobalRef(local_cache_class); @@ -164,7 +194,9 @@ void JNI_OnUnload(JavaVM* vm, void* reserved) { env->DeleteGlobalRef(configuration_builder_class_); env->DeleteGlobalRef(gandiva_exception_); env->DeleteGlobalRef(vector_expander_class_); + env->DeleteGlobalRef(listvector_expander_class_); env->DeleteGlobalRef(vector_expander_ret_class_); + env->DeleteGlobalRef(list_expander_ret_class_); env->DeleteGlobalRef(secondary_cache_class_); env->DeleteGlobalRef(cache_buf_ret_class_); } @@ -268,9 +300,12 @@ DataTypePtr ProtoTypeToDataType(const types::ExtGandivaType& ext_type) { return ProtoTypeToTimestamp(ext_type); case types::INTERVAL: return ProtoTypeToInterval(ext_type); - case types::FIXED_SIZE_BINARY: - case types::LIST: case types::STRUCT: + return arrow::struct_({field("lattitude", arrow::float64(), false), field("longitude", arrow::float64(), false)}); + case types::LIST: + return arrow::list(arrow::int32()); + //return arrow::list(arrow::utf8()); + case types::FIXED_SIZE_BINARY: case types::UNION: case types::DICTIONARY: case types::MAP: @@ -296,6 +331,7 @@ FieldPtr ProtoTypeToField(const types::Field& f) { NodePtr ProtoTypeToFieldNode(const types::FieldNode& node) { FieldPtr field_ptr = ProtoTypeToField(node.field()); + //std::cout << "LR created field " << field_ptr->ToString(true) << std::endl; if (field_ptr == nullptr) { std::cerr << "Unable to create field node from protobuf\n"; return nullptr; @@ -467,6 +503,7 @@ NodePtr 
ProtoTypeToNullNode(const types::NullNode& node) { NodePtr ProtoTypeToNode(const types::TreeNode& node) { if (node.has_fieldnode()) { + //std::cout << "LR Found ProtoTypeToNode fieldnode " << std::endl; return ProtoTypeToFieldNode(node.fieldnode()); } @@ -515,6 +552,7 @@ NodePtr ProtoTypeToNode(const types::TreeNode& node) { } if (node.has_stringnode()) { + //std::cout << "LR Found StringNode" << std::endl; return TreeExprBuilder::MakeStringLiteral(node.stringnode().value()); } @@ -624,10 +662,78 @@ Status make_record_batch_with_buf_addrs(SchemaPtr schema, int num_rows, new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); buffers.push_back(offsets); } +////////// + + + +auto type = field->type(); +auto type_id = type->id(); +//num_rows = num_records or ?? + if (type_id == arrow::Type::LIST) { + + if (buf_idx >= in_bufs_len) { + return Status::Invalid("insufficient number of in_buf_addrs"); + } + + // add offsets buffer for variable-len fields. + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + auto offsets = std::shared_ptr( + new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + buffers.push_back(offsets); - auto array_data = arrow::ArrayData::Make(field->type(), num_rows, std::move(buffers)); + + if (arrow::is_binary_like(type->field(0)->type()->id())) { + // child offsets length is internal data length + 1 + // offsets element is int32 + // so here i just allocate extra 32 bit for extra 1 length + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + + auto child_offsets_buffer = std::shared_ptr( new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + + buffers.push_back(std::move(child_offsets_buffer)); + } + } + + + //std::cout << "LR New ArrayData 1" << std::endl; + if (type->id() == arrow::Type::LIST) { + jlong offsets_addr = in_buf_addrs[buf_idx++]; + jlong offsets_size = in_buf_sizes[sz_idx++]; + auto data_buffer = std::shared_ptr( 
new arrow::Buffer(reinterpret_cast(offsets_addr), offsets_size)); + + + //std::cout << "LR New ArrayData List" << std::endl; + auto internal_type = type->field(0)->type(); + std::shared_ptr child_data; + if (arrow::is_primitive(internal_type->id())) { + //std::cout << "LR New ArrayData List 1" << std::endl; + child_data = arrow::ArrayData::Make(internal_type, 0, + {nullptr, std::move(data_buffer)}, 0); + } + if (arrow::is_binary_like(internal_type->id())) { + //std::cout << "LR New ArrayData List NYI 2" << std::endl; + //child_data = arrow::ArrayData::Make( + // internal_type, 0, + // {nullptr, std::move(data_buffer), std::move(child_data)}, 0); + } + + auto array_data = arrow::ArrayData::Make(type, num_rows, {std::move(buffers[0]), std::move(buffers[1])}, {child_data}); + columns.push_back(array_data); + + } else { + auto array_data = arrow::ArrayData::Make(type, num_rows, std::move(buffers)); columns.push_back(array_data); } + +///////// +//TODO use unique_ptr +//Was +//auto array_data = arrow::ArrayData::Make(field->type(), num_rows, std::move(buffers)); +//columns.push_back(array_data); + + } *batch = arrow::RecordBatch::Make(schema, num_rows, columns); return Status::OK(); } @@ -775,7 +881,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build status = Projector::Make(schema_ptr, expr_vector, mode, config, sec_cache, &projector); if (!status.ok()) { - ss << "Failed to make LLVM module due to " << status.message() << "\n"; + ss << "Failed to make LLVM module [1]cdue to " << status.message() << "\n"; releaseProjectorInput(schema_arr, schema_bytes, exprs_arr, exprs_bytes, env); goto err_out; } @@ -797,12 +903,15 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build /// class JavaResizableBuffer : public arrow::ResizableBuffer { public: - JavaResizableBuffer(JNIEnv* env, jobject jexpander, int32_t vector_idx, uint8_t* buffer, - int32_t len) + JavaResizableBuffer(JNIEnv* env, jobject jexpander, jmethodID 
jmethod, int32_t vector_idx, uint8_t* buffer, + int32_t len, bool isListVec = false) : ResizableBuffer(buffer, len), env_(env), jexpander_(jexpander), - vector_idx_(vector_idx) { + vector_idx_(vector_idx), + method_(jmethod), + isList(isListVec) + { size_ = 0; } @@ -810,27 +919,60 @@ class JavaResizableBuffer : public arrow::ResizableBuffer { Status Reserve(const int64_t new_capacity) override; - private: + public: JNIEnv* env_; jobject jexpander_; + jmethodID method_; int32_t vector_idx_; + bool isList; }; Status JavaResizableBuffer::Reserve(const int64_t new_capacity) { // callback into java to expand the buffer - jobject ret = env_->CallObjectMethod(jexpander_, vector_expander_method_, vector_idx_, + + //LR TODO listvector_expander_method_ vector_expander_method_ + jobject ret = env_->CallObjectMethod(jexpander_, method_, vector_idx_, new_capacity); if (env_->ExceptionCheck()) { env_->ExceptionDescribe(); env_->ExceptionClear(); - return Status::OutOfMemory("buffer expand failed in java"); + std::cout << "Buffer expand failed. 
New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << method_ << + " jexpander_ " << jexpander_ << std::endl; + return Status::OutOfMemory("buffer expand failed in java."); } - jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); - jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); - data_ = reinterpret_cast(ret_address); - capacity_ = ret_capacity; + if (isList) { + jlong ret_address = env_->GetLongField(ret, list_expander_ret_address_); + jlong ret_capacity = env_->GetLongField(ret, list_expander_ret_capacity_); + jlong offset_ret_address = env_->GetLongField(ret, list_expander_offset_ret_address_); + jlong offset_ret_capacity = env_->GetLongField(ret, list_expander_offset_ret_capacity_); + + std::cout << "Buffer expand: New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << method_ << + " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << + " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << + " and the original offset ptr=" << reinterpret_cast(offsetBuffer) << " and the new ptr=" << offset_ret_address << std::endl; + + data_ = reinterpret_cast(ret_address); + capacity_ = ret_capacity; + + offsetBuffer = reinterpret_cast(offset_ret_address); + offsetCapacity = offset_ret_capacity; + } else { + jlong ret_address = env_->GetLongField(ret, vector_expander_ret_address_); + jlong ret_capacity = env_->GetLongField(ret, vector_expander_ret_capacity_); + + std::cout << "Buffer expand: New capacity is " << new_capacity << + " vector id " << vector_idx_ << " expander method " << method_ << + " jexpander_ " << jexpander_ << " returned size is " << ret_capacity << + " and the original buffer ptr=" << reinterpret_cast(data_) << " and the new ptr=" << ret_address << std::endl; + + data_ = reinterpret_cast(ret_address); + capacity_ = ret_capacity; + } + return Status::OK(); } @@ -859,10 
+1001,11 @@ Status JavaResizableBuffer::Resize(const int64_t new_size, bool shrink_to_fit) { JNIEXPORT void JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( - JNIEnv* env, jobject object, jobject jexpander, jlong module_id, jint num_rows, + JNIEnv* env, jobject object, jobject jexpander, jobject jListExpander, jlong module_id, jint num_rows, jlongArray buf_addrs, jlongArray buf_sizes, jint sel_vec_type, jint sel_vec_rows, jlong sel_vec_addr, jlong sel_vec_size, jlongArray out_buf_addrs, jlongArray out_buf_sizes) { + //std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " << std::endl; Status status; std::shared_ptr holder = projector_modules_.Lookup(module_id); if (holder == nullptr) { @@ -898,7 +1041,21 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( if (!status.ok()) { break; } - + /*std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " Made a recordbatch num_rows " << num_rows + << in_batch->ToString() + << " there are " << out_bufs_len << " buffers " + << std::endl;*/ + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " there are " << out_bufs_len << " buffers " + << std::endl; + for (int i = 0; i < out_bufs_len; i++) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector " + << " buffer " << i + << "length " << out_sizes[i] + << std::endl; + } + std::shared_ptr selection_vector; auto selection_buffer = std::make_shared( reinterpret_cast(sel_vec_addr), sel_vec_size); @@ -925,6 +1082,7 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( break; } + std::shared_ptr outBufJava = nullptr; auto ret_types = holder->rettypes(); ArrayDataVector output; int buf_idx = 0; @@ -933,12 +1091,14 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( for (FieldPtr field : ret_types) { std::vector> buffers; + std::cout << "LR 
Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -2 adding buffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* validity_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong bitmap_sz = out_sizes[sz_idx++]; buffers.push_back(std::make_shared(validity_buf, bitmap_sz)); if (arrow::is_binary_like(field->type()->id())) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector -1 adding bufferbuffer idx=" << buf_idx << std::endl; CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); uint8_t* offsets_buf = reinterpret_cast(out_bufs[buf_idx++]); jlong offsets_sz = out_sizes[sz_idx++]; @@ -956,22 +1116,178 @@ Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector( "null"); break; } + + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 1 adding buffer buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; buffers.push_back(std::make_shared( - env, jexpander, output_vector_idx, value_buf, data_sz)); + env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); + } else if (field->type()->id() == arrow::Type::LIST) { + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 2 adding list offset buffer idx=" << buf_idx - 1 << " size=" << data_sz << std::endl; + std::cout << " size=" << out_sizes[sz_idx - 1] << " outsize index=" << sz_idx - 1 << " address " << out_bufs[buf_idx - 1] + << " output_vector_idx=" << output_vector_idx << std::endl; + buffers.push_back(std::make_shared( + env, jexpander, vector_expander_method_, output_vector_idx, value_buf, data_sz)); } else { buffers.push_back(std::make_shared(value_buf, data_sz)); } + + + if (field->type()->id() == arrow::Type::LIST) { + + std::vector> child_buffers; + + if (jListExpander == nullptr) { + status = Status::Invalid( + "expression has variable len output columns, but the jListExpander object is " + "null"); + 
break; + } + + + //LR TODO the two buffers... + data_sz = out_sizes[sz_idx++]; + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 3 adding child nbuffer " << buf_idx + << " size=" << data_sz << std::endl; + CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); + uint8_t* child_offset_buf = reinterpret_cast(out_bufs[buf_idx++]); + child_buffers.push_back(std::make_shared( + env, jListExpander, listvector_expander_method_, output_vector_idx, child_offset_buf, data_sz)); + + + std::cout << "LR Java_org_apache_arrow_gandiva_evaluator_JniWrapper_evaluateProjector 4 adding child buffer " << buf_idx + << " size=" << out_sizes[sz_idx] << " outsize index=" << sz_idx << " address " << out_bufs[buf_idx] + << " output_vector_idx=" << output_vector_idx << std::endl; + data_sz = out_sizes[sz_idx++]; + CHECK_OUT_BUFFER_IDX_AND_BREAK(buf_idx, out_bufs_len); + uint8_t* child_data_buf = reinterpret_cast(out_bufs[buf_idx++]); + + outBufJava = std::make_shared( + env, jListExpander, listvector_expander_method_, output_vector_idx, child_data_buf, data_sz, true); + outBufJava->offsetBuffer = reinterpret_cast(out_bufs[1]); + outBufJava->offsetCapacity = out_sizes[1]; + child_buffers.push_back(outBufJava); + + std::shared_ptr dt2 = std::make_shared(); + auto array_data_child = arrow::ArrayData::Make(dt2, output_row_count, child_buffers); + //array_data_child-> + + + std::vector> kids; + kids.push_back(array_data_child); + //auto array_data = std::make_shared(field->type(), output_row_count); + auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers, kids); + array_data->child_data = std::move(kids); + output.push_back(array_data); + ++output_vector_idx; + + //std::cout << "LR jni_common there are " << buffers.size() << " buffers" << std::endl; + + } else { auto array_data = arrow::ArrayData::Make(field->type(), output_row_count, buffers); output.push_back(array_data); ++output_vector_idx; + } + } if (!status.ok()) { break; } 
+ + //std::cout << "LR jni_common calling evaluate" << std::endl; status = holder->projector()->Evaluate(*in_batch, selection_vector.get(), output); + //LRtest1 + //std::cout << "LR jni_common after evaluating the output size is " << output.size() << std::endl; + arrow::ArraySpan sp(*(output[0])); + //std::cout << "LR jni_common after evaluating the output 0 is " << sp.ToArray()->ToString() << std::endl; + auto array_data = output[0]; + if (array_data->type->id() == arrow::Type::LIST) { + auto child_data = array_data->child_data[0]; + //std::cout << "LR jni_common child array[3] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[3*4]) << std::endl; + //std::cout << "LR jni_common child array[0] " << + //int32_t( (*(array_data->child_data[0])->buffers[1])[0*4]) << std::endl; + //std::cout << "LR jni_common child via data ptr array[0] " << + //int32_t( *(*(array_data->child_data[0])->buffers[1]).data()) << std::endl; + //std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" + // << (array_data->child_data[0])->length << std::endl; + + //LRTest1 Start + int numRecords = (array_data->child_data[0])->length; + //int numRecords = (array_data->child_data[0])->length * array_data->length; + + //std::cout << "LR jni_common there are records=" << array_data->length << " and the first one is=" + // << (array_data->child_data[0])->length << " using numRecords=" << numRecords << std::endl; + //std::cout << "LR jni_common out_bufs[3]=" << out_bufs[3] << " after eval=" + // << (jlong)(array_data->child_data[0])->buffers[1]->data() << std::endl; + //LR test1 + out_bufs[3] = (jlong)(array_data->child_data[0])->buffers[1]->data(); + out_sizes[3] = (jlong)(array_data->child_data[0])->buffers[1]->capacity(); + + //Copy the new buffer ptr back to Java. The above two lines don't copy it to java, just to the local array. 
+ //env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); + //env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); + + //array_data.child_data.at(0)->offset) + + //env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); + //memcpy((void*)out_bufs[3], (array_data->child_data[0])->buffers[1]->data(), recordSize); + //out_sizes[3] = recordSize; + //int test[] = {42,21,42,21,42}; + //memcpy((void *)out_bufs[3], test, 20); + + /*out_sizes[2] = numRecords * 20; + int test[numRecords * 20]; + for (int i = 0; i < numRecords; i++) { + test[i] = 0; + } + memcpy((void *)out_bufs[2], test, numRecords*4); + */ + + //LR test1 Havent tried yet. + //out_bufs[2] = (jlong)(array_data->child_data[0])->buffers[0]->data(); + //out_sizes[2] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); + + //out_bufs[1] = (jlong)(array_data->child_data[0])->buffers[0]->data(); + //out_sizes[1] = (jlong)(array_data->child_data[0])->buffers[0]->capacity(); + + //out_bufs[1] = (jlong)(array_data)->buffers[0]->data(); + //out_sizes[1] = (jlong)(array_data)->buffers[0]->capacity(); + out_bufs[1] = (jlong) outBufJava->offsetBuffer; + out_sizes[1] = (jlong) outBufJava->offsetCapacity; + + env->SetLongArrayRegion(out_buf_addrs, 0, out_bufs_len, out_bufs); + env->SetLongArrayRegion(out_buf_sizes, 0, out_bufs_len, out_sizes); + + + + + //validity buffer? + //bool valid[] = {true, true, true, true, true}; + //memcpy(&out_bufs[2], valid, 5); + //out_sizes[2] = 5; + + + + + + + + //offset buffer is not needed. 
+ //int32_t offsetsBuffer[] = {0}; + //memcpy(&out_bufs[1], offsetsBuffer, 1 * 4); + //out_sizes[1] = 1; + + //std::cout << "LR jni_common after copy parent buff child array[0] " << + //"," << int32_t( (out_bufs[3])) << + //"," << int32_t( (out_bufs[3]+4)) << + //"," << int32_t( (out_bufs[3])+8) << + //"," << int32_t( (out_bufs[3])+12) << std::endl; + //LRTest1 End + } + } while (0); + env->ReleaseLongArrayElements(buf_addrs, in_buf_addrs, JNI_ABORT); env->ReleaseLongArrayElements(buf_sizes, in_buf_sizes, JNI_ABORT); env->ReleaseLongArrayElements(out_buf_addrs, out_bufs, JNI_ABORT); @@ -1061,7 +1377,7 @@ JNIEXPORT jlong JNICALL Java_org_apache_arrow_gandiva_evaluator_JniWrapper_build // good to invoke the filter builder now status = Filter::Make(schema_ptr, condition_ptr, config, sec_cache, &filter); if (!status.ok()) { - ss << "Failed to make LLVM module due to " << status.message() << "\n"; + ss << "Failed to make LLVM module [2] due to " << status.message() << "\n"; releaseFilterInput(schema_arr, schema_bytes, condition_arr, condition_bytes, env); goto err_out; } diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java index 0155af08234..39358a084ba 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ExpressionRegistry.java @@ -178,11 +178,13 @@ private static ArrowType getArrowType(ExtGandivaType type) { return new ArrowType.Decimal(0, 0, 128); case GandivaType.INTERVAL_VALUE: return new ArrowType.Interval(mapArrowIntervalUnit(type.getIntervalType())); + case GandivaType.STRUCT_VALUE: + return new ArrowType.Struct(); + case GandivaType.LIST_VALUE: + return new ArrowType.List(); case GandivaType.FIXED_SIZE_BINARY_VALUE: case GandivaType.MAP_VALUE: case GandivaType.DICTIONARY_VALUE: - case GandivaType.LIST_VALUE: - case 
GandivaType.STRUCT_VALUE: case GandivaType.UNION_VALUE: default: assert false; diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java index 293d51a87a5..f883ed70815 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/JniWrapper.java @@ -50,6 +50,7 @@ native long buildProjector(Object cache, byte[] schemaBuf, byte[] exprListBuf, * and store the output in ValueVectors. Throws an exception in case of errors * * @param expander VectorExpander object. Used for callbacks from cpp. + * @param listExpander ListVectorExpander object. Used for callbacks from cpp. * @param moduleId moduleId representing expressions. Created using a call to * buildNativeCode * @param numRows Number of rows in the record batch @@ -63,7 +64,7 @@ native long buildProjector(Object cache, byte[] schemaBuf, byte[] exprListBuf, * @param outSizes The allocated size of the output buffers. On successful evaluation, * the result is stored in the output buffers */ - native void evaluateProjector(Object expander, long moduleId, int numRows, + native void evaluateProjector(Object expander, Object listExpander, long moduleId, int numRows, long[] bufAddrs, long[] bufSizes, int selectionVectorType, int selectionVectorSize, long selectionVectorBufferAddr, long selectionVectorBufferSize, diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java new file mode 100644 index 00000000000..c14d2e810e8 --- /dev/null +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/ListVectorExpander.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.gandiva.evaluator; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.complex.ListVector; + +/** + * This class provides the functionality to expand output vectors using a callback mechanism from + * gandiva. + */ +public class ListVectorExpander { + private final ListVector[] vectors; + + public ListVectorExpander(ListVector[] vectors) { + this.vectors = vectors; + } + + /** + * Result of vector expansion. + */ + public static class ExpandResult { + public long address; + public long capacity; + public long offsetaddress; + public long offsetcapacity; + + /** + * fdsfsdfds. + * @param address dsfds + * @param capacity dfsdf + * @param offsetad dsfdsfsd + * @param offsetcap dfsfs + * + */ + public ExpandResult(long address, long capacity, long offsetad, long offsetcap) { + this.address = address; + this.capacity = capacity; + this.offsetaddress = offsetad; + this.offsetcapacity = offsetcap; + } + } + + /** + * Expand vector at specified index. This is used as a back call from jni, and is only + * relevant for ListVectors. + * + * @param index index of buffer in the list passed to jni. + * @param toCapacity the size to which the buffer should be expanded to. + * + * @return address and size of the buffer after expansion. 
+ */ + public ExpandResult expandOutputVectorAtIndex(int index, long toCapacity) { + if (index >= vectors.length || vectors[index] == null) { + throw new IllegalArgumentException("invalid index " + index); + } + + int valueBufferIndex = 1; + ListVector vector = vectors[index]; + while (vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity() < toCapacity) { + vector.reAlloc(); + } + System.out.println("LR Expanding ListVector. New capacity=" + + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity()); + System.out.println("LR Expanding ListVector. Offset data is "); + ArrowBuf ab = vector.getOffsetBuffer(); + String s = "offsetBuffer = ["; + for (int i = 0; i < 20; i++) { + s += ab.getInt(i) + ","; + } + System.out.println(s); + return new ExpandResult( + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).memoryAddress(), + vector.getDataVector().getFieldBuffers().get(valueBufferIndex).capacity(), + vector.getOffsetBuffer().memoryAddress(), + vector.getOffsetBuffer().capacity()); + } + +} diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java index c146fce26c1..89321f5911a 100644 --- a/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/evaluator/Projector.java @@ -22,16 +22,17 @@ import org.apache.arrow.gandiva.exceptions.EvaluatorClosedException; import org.apache.arrow.gandiva.exceptions.GandivaException; -import org.apache.arrow.gandiva.exceptions.UnsupportedTypeException; import org.apache.arrow.gandiva.expression.ArrowTypeHelper; import org.apache.arrow.gandiva.expression.ExpressionTree; import org.apache.arrow.gandiva.ipc.GandivaTypes; import org.apache.arrow.gandiva.ipc.GandivaTypes.SelectionVectorType; import org.apache.arrow.memory.ArrowBuf; import org.apache.arrow.vector.BaseVariableWidthVector; -import 
org.apache.arrow.vector.FixedWidthVector; +import org.apache.arrow.vector.BitVectorHelper; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VariableWidthVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; import org.apache.arrow.vector.ipc.message.ArrowBuffer; import org.apache.arrow.vector.ipc.message.ArrowRecordBatch; import org.apache.arrow.vector.types.pojo.Schema; @@ -332,6 +333,7 @@ private void evaluate(int numRows, List buffers, List buf throw new EvaluatorClosedException(); } + logger.error("LR Projector.java evaluate"); if (numExprs != outColumns.size()) { logger.info("Expected " + numExprs + " columns, got " + outColumns.size()); throw new GandivaException("Incorrect number of columns for the output vector"); @@ -352,21 +354,32 @@ private void evaluate(int numRows, List buffers, List buf boolean hasVariableWidthColumns = false; BaseVariableWidthVector[] resizableVectors = new BaseVariableWidthVector[outColumns.size()]; + ListVector[] resizableListVectors = new ListVector[outColumns.size()]; + long[] outAddrs = new long[3 * outColumns.size()]; long[] outSizes = new long[3 * outColumns.size()]; + idx = 0; int outColumnIdx = 0; for (ValueVector valueVector : outColumns) { - boolean isFixedWith = valueVector instanceof FixedWidthVector; + if (valueVector instanceof ListVector) { + //LR HACK there is only one column. 
+ logger.error("LR Projector.java evaluate out columns=" + outColumns.size()); + outAddrs = new long[5 * outColumns.size()]; + outSizes = new long[5 * outColumns.size()]; + } + + /*boolean isFixedWith = valueVector instanceof FixedWidthVector;*/ boolean isVarWidth = valueVector instanceof VariableWidthVector; - if (!isFixedWith && !isVarWidth) { + /*if (!isFixedWith && !isVarWidth) { throw new UnsupportedTypeException( "Unsupported value vector type " + valueVector.getField().getFieldType()); - } + }*/ outAddrs[idx] = valueVector.getValidityBuffer().memoryAddress(); outSizes[idx++] = valueVector.getValidityBuffer().capacity(); if (isVarWidth) { + logger.error("LR Projector.java evaluate isVarWidth setting buffer=" + idx); outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); hasVariableWidthColumns = true; @@ -374,19 +387,355 @@ private void evaluate(int numRows, List buffers, List buf // save vector to allow for resizing. 
resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; } - outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); - outSizes[idx++] = valueVector.getDataBuffer().capacity(); + if (valueVector instanceof StructVector) { + outAddrs[idx] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().memoryAddress(); + outSizes[idx++] = ((StructVector) valueVector).getChild("lattitude").getDataBuffer().capacity(); + } + if (valueVector instanceof ListVector) { + + /*((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc(); //100 rows + ((ListVector) valueVector).reAlloc(); + ((ListVector) valueVector).reAlloc();*/ + + hasVariableWidthColumns = true; + resizableListVectors[outColumnIdx] = (ListVector) valueVector; + //LR TODO figure out what to use here resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; + //resizableVectors[outColumnIdx] = (BaseVariableWidthVector) valueVector; + //resizeableVectors[outColumnIdx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); + + List fieldBufs = ((ListVector) valueVector).getDataVector().getFieldBuffers(); + logger.error("LR Projector.java evaluate ListVector has buffers=" + fieldBufs.size()); + + + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); + outAddrs[idx] = valueVector.getOffsetBuffer().memoryAddress(); + outSizes[idx++] = valueVector.getOffsetBuffer().capacity(); + + //vector valid + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getValidityBufferAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0).capacity(); + + + //vector offset + logger.error("LR Projector.java evaluate ListVector passing data buffer as " + idx); + + + + //This doesnt actually allocate any memory. 
+ //((ListVector) valueVector).setInitialCapacity(1000000); + //while (((ListVector) valueVector).getValueCapacity() < 1000000) { + // ((ListVector) valueVector).reAlloc(); + //} + + logger.error("LR Projector.java evaluate isVarlistvector Width setting buffer=" + idx); + //The realloc avoids dynamic resizing, will have to be fixed later. + outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).memoryAddress(); + outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1).capacity(); + //logger.error("LR Projector.java evaluate ListVector set buffer " + idx + + // " as ptr=" + outAddrs[idx - 1] + " size " + outSizes[idx - 1]); + + //vector data + //outAddrs[idx] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).memoryAddress(); + //outSizes[idx++] = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(2).capacity(); + + //LR HACK TODO ((ListVector) valueVector).getDataVector().capacity(); + + + + + + + + + + + + } else { + outAddrs[idx] = valueVector.getDataBuffer().memoryAddress(); + outSizes[idx++] = valueVector.getDataBuffer().capacity(); + } valueVector.setValueCount(selectionVectorRecordCount); outColumnIdx++; } + logger.error("LR Projector.java evaluate calling evaluateProjector with buffers=" + idx); + logger.error("LR Projector.java before evaluateProjector buffer[3]=" + outAddrs[3]); + logger.error("LR Projector.java before evaluateProjector buffer[1]=" + outAddrs[1]); wrapper.evaluateProjector( hasVariableWidthColumns ? new VectorExpander(resizableVectors) : null, + hasVariableWidthColumns ? 
new ListVectorExpander(resizableListVectors) : null, this.moduleId, numRows, bufAddrs, bufSizes, selectionVectorType, selectionVectorRecordCount, selectionVectorAddr, selectionVectorSize, outAddrs, outSizes); + + //outColumns.clear(); + //FieldType ft = new FieldType(true, int32, null); + //ListVector lv = new ListVector("res", allocator, ft, null); + //System.out.println(intVector.getDataVector()); + + + logger.error("LR Projector.java after evaluateProjector buffer[3]=" + outAddrs[3]); + logger.error("LR Projector.java after evaluateProjector buffer[1]=" + outAddrs[1]); + for (ValueVector valueVector : outColumns) { + if (valueVector instanceof ListVector) { + //LR HACK + + //int numRecordsFound = 5 * 100; + //int numRecordsFound = Math.toIntExact(outSizes[3]) / 4; + //logger.error("LR Projector.java using numRecords=" + numRecordsFound + " outSizes[3]=" + outSizes[3]); + + //LR HACK 9-13 10:34 + /*public void startList() { + vector.startNewValue(idx()); + writer.setPosition(vector.getOffsetBuffer().getInt((idx() + 1L) * OFFSET_WIDTH)); + listStarted = true; + } + + @Override + public void endList() { + vector.getOffsetBuffer().setInt((idx() + 1L) * OFFSET_WIDTH, writer.idx()); + setPosition(idx() + 1); + listStarted = false; + */ + + //ArrowBuf ab = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + + + //ArrowBuf ab2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + + logger.error("LR Projector.java using numRecords=" + + selectionVectorRecordCount + " outSizes[3]=" + outSizes[3]); + + //import org.apache.arrow.vector.complex.impl.UnionListWriter; + /*UnionListWriter writer = ((ListVector) valueVector).getWriter(); + for (int i = 0; i < selectionVectorRecordCount; i++) { + writer.startList(); + writer.setPosition(i); + for (int j = 0; j < 5; j++) { + int index = ((j + (5 * i)) * 4); + //Not sure whats going on. Buffer too small? 
+ try { + writer.writeInt(ab2.getInt(index)); + //writer.writeInt(42); + } catch (IndexOutOfBoundsException e) { + continue; + } + } + writer.setValueCount(5); + writer.endList(); + } + ((ListVector) valueVector).setValueCount(selectionVectorRecordCount);*/ + + + //offsetBuffer = [0,83886080,327680,1280,5,167772160,655360,2560,10,251658240,983040,3840,15, + //335544320,1310720,5120,20, + //419430400,1638400,6400,25,503316480,1966080,7680,30,587202560,2293760,8960,35,671088640,2621440,10240,40, + //754974720,2949120,11520, + + + + + + + + + + + + String s = ""; + List fv = ((ListVector) valueVector).getDataVector().getFieldBuffers(); + for (ArrowBuf ab : fv) { + s = ""; + for (int i = 0; i < 20; i++) { + s += ab.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. size=" + + ab.capacity() + " buffer=" + s); + } + + ArrowBuf fvv = ((ListVector) valueVector).getValidityBuffer(); + s = ""; + for (int i = 0; i < 20; i++) { + s += fvv.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. getValidityBuffer=" + + fvv.capacity() + " buffer=" + s); + + ArrowBuf fvvv = ((ListVector) valueVector).getOffsetBuffer(); + s = ""; + for (int i = 0; i < 20; i++) { + s += fvvv.getInt(i) + ","; + } + logger.error("LR Projector.java before updating listvector. getOffsetBuffer=" + + fvvv.capacity() + " buffer=" + s); + + + ((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount * 5); + + ((ListVector) valueVector).setLastSet(selectionVectorRecordCount - 1); + /* + //Validity then data. 
+ ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + List outBufsNew = new ArrayList(); + + //outBufsNew.add(ab0); + outBufsNew.add(abb); + outBufsNew.add(abb2); + ArrowFieldNode afn = new ArrowFieldNode(selectionVectorRecordCount * 5, 0); + ((ListVector) valueVector).getDataVector().clear(); + ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + //TODO Need to get validity [0] and offset [1] buffer for the listvector. + //((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + List outBufsNew2 = new ArrayList(); + + + + ArrowBuf mabb22 = new ArrowBuf(ReferenceManager.NO_OP, null, selectionVectorRecordCount, outAddrs[0]); + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(mabb22, i); + } + + ArrowBuf mabb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[1], outAddrs[1]); + //for (int i = 0; i < selectionVectorRecordCount; i++) { + // mabb2.setInt(i * 4, 5 * i); + //} + s = "offset? 
buffer mabb2, outAddrs[0]="; + for (int i = 0; i < 20; i++) { + s += mabb2.getInt(i) + ","; + } + System.out.println(s); + + outBufsNew2.add(mabb22); + outBufsNew2.add(mabb2); + ArrowFieldNode afn2 = new ArrowFieldNode(selectionVectorRecordCount, 0); + ((ListVector) valueVector).loadFieldBuffers(afn2, outBufsNew2); + + + */ + + //((ListVector) valueVector).setValueCount(selectionVectorRecordCount); + //((ListVector) valueVector).getDataVector().setValueCount(selectionVectorRecordCount); + + int simple = 0; + try { + for (int i = 0; i < selectionVectorRecordCount * 5; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + try { + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + + + + + + + /* + + + + try { + for (int i = 0; i < selectionVectorRecordCount; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); + simple++; + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + + + for (int i = 0; i < selectionVectorRecordCount; i++) { + ((ListVector) valueVector).getOffsetBuffer().setInt(i * 4, 5 * i); + } + */ + + + + + + + + //LR HACK 9-13 10:34 All the multiline comment + /* + import org.apache.arrow.memory.ReferenceManager; + import org.apache.arrow.vector.BitVectorHelper; + import org.apache.arrow.vector.ipc.message.ArrowFieldNode; + */ + //ArrowBuf ab0 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + /*ArrowBuf abb = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[2], outAddrs[2]); + ArrowBuf abb2 = new ArrowBuf(ReferenceManager.NO_OP, null, outSizes[3], outAddrs[3]); + List outBufsNew = new ArrayList(); + + StringBuilder sbb = new StringBuilder(); + abb.print(sbb, 1); + System.out.println("LR abb=" + sbb); + + 
//outBufsNew.add(ab0); + outBufsNew.add(abb); + outBufsNew.add(abb2); + ArrowFieldNode afn = new ArrowFieldNode(numRecordsFound, 0); + ((ListVector) valueVector).getDataVector().clear(); + ((ListVector) valueVector).getDataVector().loadFieldBuffers(afn, outBufsNew); + + //LR HACK 9-12 10:09 + //ArrowBuf offBuff = ((ListVector) valueVector).getOffsetBuffer(); + //for (int i = 0; i < 101; i++) { + // offBuff.setInt(i, 5 * i * 4); + //} + + + + + + //byte[] valid = new byte[outsizes[2]]; + //LR HACK + //for (int i = 0; i < outSizes[2]; i++) { + int simple = 0; + try { + for (int i = 0; i < numRecordsFound * 4; i++) { + BitVectorHelper.setBit(((ListVector) valueVector).getDataVector().getValidityBuffer(), i); + simple++; + //BitVectorHelper.setBit(((ListVector) valueVector).getValidityBuffer(), i); + } + } catch (IndexOutOfBoundsException e) { + simple = 0; + } + ArrowBuf ab3 = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(0); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf after=" + Integer.reverseBytes(ab3.getInt(i))); + System.out.println("LR arrowbuf after=" + ab3.getInt(i)); + System.out.println("LR arrowbuf after=" + ab3.getShort(i)); + } + ArrowBuf ab3a = ((ListVector) valueVector).getDataVector().getFieldBuffers().get(1); + for (int i = 0; i < 50; i++) { + System.out.println("LR arrowbuf aftera=" + Integer.reverseBytes(ab3a.getInt(i))); + System.out.println("LR arrowbuf aftera=" + ab3a.getInt(i)); + System.out.println("LR arrowbuf aftera=" + ab3a.getShort(i)); + } + IntVector iv = (IntVector) ((ListVector) valueVector).getDataVector(); + for (int i = 0; i < 50; i++) { + System.out.println("LR IntVector=" + iv.get(i)); + }*/ + } + } + } /** diff --git a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java index 90f8684b455..47d97c6b0dc 100644 --- 
a/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java +++ b/java/gandiva/src/main/java/org/apache/arrow/gandiva/expression/ArrowTypeHelper.java @@ -143,6 +143,16 @@ private static void initArrowTypeDate(ArrowType.Date dateType, } } + private static void initArrowTypeStruct(ArrowType.Struct structType, + GandivaTypes.ExtGandivaType.Builder builder) { + builder.setType(GandivaTypes.GandivaType.STRUCT); + } + + private static void initArrowTypeList(ArrowType.List listType, + GandivaTypes.ExtGandivaType.Builder builder) { + builder.setType(GandivaTypes.GandivaType.LIST); + } + private static void initArrowTypeTime(ArrowType.Time timeType, GandivaTypes.ExtGandivaType.Builder builder) { short timeUnit = timeType.getUnit().getFlatbufID(); @@ -284,9 +294,11 @@ public static GandivaTypes.ExtGandivaType arrowTypeToProtobuf(ArrowType arrowTyp break; } case Type.List: { // 12 + ArrowTypeHelper.initArrowTypeList((ArrowType.List) arrowType, builder); break; } case Type.Struct_: { // 13 + ArrowTypeHelper.initArrowTypeStruct((ArrowType.Struct) arrowType, builder); break; } case Type.Union: { // 14 diff --git a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java index 8dd759ee885..df0fd8639b2 100644 --- a/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java +++ b/java/gandiva/src/test/java/org/apache/arrow/gandiva/evaluator/ProjectorTest.java @@ -48,6 +48,7 @@ import org.apache.arrow.vector.IntervalYearVector; import org.apache.arrow.vector.ValueVector; import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.ListVector; import org.apache.arrow.vector.holders.NullableIntervalDayHolder; import org.apache.arrow.vector.holders.NullableIntervalYearHolder; import org.apache.arrow.vector.ipc.message.ArrowFieldNode; @@ -57,6 +58,7 @@ import org.apache.arrow.vector.types.TimeUnit; import 
org.apache.arrow.vector.types.pojo.ArrowType; import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; import org.apache.arrow.vector.types.pojo.Schema; import org.junit.Assert; import org.junit.Ignore; @@ -288,6 +290,54 @@ public void testEvaluate() throws GandivaException, Exception { eval.close(); } + @Test + public void testEvaluateArray() throws GandivaException, Exception { + ArrowType int32 = new ArrowType.Int(32, true); + ArrowType listInt32 = new ArrowType.List(); + + Field a = Field.nullable("a", int32); + List args = Lists.newArrayList(a); + + Field retType = Field.nullable("c", listInt32); + ExpressionTree root = TreeBuilder.makeExpression("array_makeGandiva", args, retType); + + List exprs = Lists.newArrayList(root); + + Schema schema = new Schema(args); + Projector eval = Projector.make(schema, exprs); + + int numRows = 16; + byte[] validity = new byte[]{(byte) 255, 0}; + // second half is "undefined" + int[] aValues = new int[]{1, 2, 3, 42, 5}; + + + ArrowBuf validitya = buf(validity); + ArrowBuf valuesa = intBuf(aValues); + ArrowRecordBatch batch = + new ArrowRecordBatch( + numRows, + Lists.newArrayList(new ArrowFieldNode(numRows, 5)), + Lists.newArrayList(validitya, valuesa)); + + FieldType ft = new FieldType(true, int32, null); + ListVector intVector = new ListVector("result", allocator, ft, null); + //ListVector.allocateNew(numRows); + + List output = new ArrayList(); + output.add(intVector); + eval.evaluate(batch, output); + + System.out.println(intVector.getDataVector()); + + + + // free buffers + releaseRecordBatch(batch); + releaseValueVectors(output); + eval.close(); + } + @Test public void testEvaluateDivZero() throws GandivaException, Exception { Field a = Field.nullable("a", int32); diff --git a/java/pom.xml b/java/pom.xml index 6b3a35ec978..71c750db0a2 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -713,16 +713,6 @@ - format - memory - vector - tools - adapter/jdbc - flight - performance - 
algorithm - adapter/avro - compression