From a7b09f1f560304f3dedf2dfcf86d99bf38ec02e7 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Wed, 14 Jun 2023 23:31:18 +0800 Subject: [PATCH 01/26] Feat: C++ SDK pushdown prototype Signed-off-by: Ziy1-Tan --- cpp/CMakeLists.txt | 2 + cpp/cmake/apache-arrow.cmake | 10 ++- cpp/include/gar/reader/arrow_chunk_reader.h | 25 ++++--- cpp/include/gar/utils/filesystem.h | 26 +++++++ cpp/src/arrow_chunk_reader.cc | 44 ++++++++++++ cpp/src/filesystem.cc | 77 ++++++++++++++++++++- cpp/test/test_arrow_chunk_reader.cc | 56 +++++++++++++++ 7 files changed, 225 insertions(+), 15 deletions(-) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 63440c90c..37e0e2835 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -191,11 +191,13 @@ macro(build_gar) if(APPLE) target_link_libraries(gar PRIVATE -Wl,-force_load gar_arrow_static "${GAR_PARQUET_STATIC_LIB}" + "${GAR_DATASET_STATIC_LIB}" "${GAR_ACERO_STATIC_LIB}" "${GAR_ARROW_BUNDLED_DEPS_STATIC_LIB}") else() target_link_libraries(gar PRIVATE -Wl,--exclude-libs,ALL -Wl,--whole-archive gar_arrow_static "${GAR_PARQUET_STATIC_LIB}" + "${GAR_DATASET_STATIC_LIB}" "${GAR_ARROW_ACERO_STATIC_LIB}" "${GAR_ARROW_BUNDLED_DEPS_STATIC_LIB}" -Wl,--no-whole-archive) endif() diff --git a/cpp/cmake/apache-arrow.cmake b/cpp/cmake/apache-arrow.cmake index 950f026b6..4a37486c6 100644 --- a/cpp/cmake/apache-arrow.cmake +++ b/cpp/cmake/apache-arrow.cmake @@ -44,6 +44,9 @@ function(build_arrow) set(GAR_PARQUET_STATIC_LIB_FILENAME "${CMAKE_STATIC_LIBRARY_PREFIX}parquet${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GAR_PARQUET_STATIC_LIB "${GAR_ARROW_STATIC_LIBRARY_DIR}/${GAR_PARQUET_STATIC_LIB_FILENAME}" CACHE INTERNAL "parquet lib") + set(GAR_DATASET_STATIC_LIB_FILENAME + "${CMAKE_STATIC_LIBRARY_PREFIX}arrow_dataset${CMAKE_STATIC_LIBRARY_SUFFIX}") + set(GAR_DATASET_STATIC_LIB "${GAR_ARROW_STATIC_LIBRARY_DIR}/${GAR_DATASET_STATIC_LIB_FILENAME}" CACHE INTERNAL "arrow dataset lib") set(GAR_ARROW_BUNDLED_DEPS_STATIC_LIB_FILENAME "${CMAKE_STATIC_LIBRARY_PREFIX}arrow_bundled_dependencies${CMAKE_STATIC_LIBRARY_SUFFIX}") set(GAR_ARROW_BUNDLED_DEPS_STATIC_LIB @@ -83,7 +86,7 @@ function(build_arrow) "-DARROW_S3=ON") set(GAR_ARROW_INCLUDE_DIR "${GAR_ARROW_PREFIX}/include" CACHE INTERNAL "arrow include directory") - set(GAR_ARROW_BUILD_BYPRODUCTS "${GAR_ARROW_STATIC_LIB}" "${GAR_PARQUET_STATIC_LIB}") + set(GAR_ARROW_BUILD_BYPRODUCTS "${GAR_ARROW_STATIC_LIB}" "${GAR_PARQUET_STATIC_LIB}" "${GAR_DATASET_STATIC_LIB}") find_package(Threads) find_package(Arrow QUIET) @@ -104,16 +107,21 @@ function(build_arrow) set(GAR_ARROW_LIBRARY_TARGET gar_arrow_static) set(GAR_PARQUET_LIBRARY_TARGET gar_parquet_static) + set(GAR_DATASET_LIBRARY_TARGET gar_dataset_static) file(MAKE_DIRECTORY "${GAR_ARROW_INCLUDE_DIR}") add_library(${GAR_ARROW_LIBRARY_TARGET} STATIC IMPORTED) add_library(${GAR_PARQUET_LIBRARY_TARGET} STATIC IMPORTED) + add_library(${GAR_DATASET_LIBRARY_TARGET} STATIC IMPORTED) set_target_properties(${GAR_ARROW_LIBRARY_TARGET} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR} IMPORTED_LOCATION ${GAR_ARROW_STATIC_LIB}) set_target_properties(${GAR_PARQUET_LIBRARY_TARGET} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR} IMPORTED_LOCATION ${GAR_PARQUET_STATIC_LIB}) + set_target_properties(${GAR_DATASET_LIBRARY_TARGET} + PROPERTIES INTERFACE_INCLUDE_DIRECTORIES ${GAR_ARROW_INCLUDE_DIR} + IMPORTED_LOCATION ${GAR_DATASET_STATIC_LIB}) if (ARROW_VERSION_TO_BUILD GREATER_EQUAL "12.0.0") set(GAR_ARROW_ACERO_STATIC_LIB_FILENAME "${CMAKE_STATIC_LIBRARY_PREFIX}arrow_acero${CMAKE_STATIC_LIBRARY_SUFFIX}") diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index abbd9ae9a..26f4f326b 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -33,6 +33,9 @@ limitations under the License. namespace arrow { class Array; class Table; +namespace compute { +class Expression; +} } // namespace arrow namespace GAR_NAMESPACE_INTERNAL { @@ -52,19 +55,7 @@ class VertexPropertyArrowChunkReader { VertexPropertyArrowChunkReader(const VertexInfo& vertex_info, const PropertyGroup& property_group, const std::string& prefix, - IdType chunk_index = 0) - : vertex_info_(vertex_info), - property_group_(property_group), - chunk_index_(chunk_index), - seek_id_(chunk_index * vertex_info.GetChunkSize()), - chunk_table_(nullptr) { - GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); - GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, - vertex_info.GetPathPrefix(property_group)); - std::string base_dir = prefix_ + pg_path_prefix; - GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_, - utils::GetVertexChunkNum(prefix_, vertex_info)); - } + IdType chunk_index = 0); /** * @brief Sets chunk position indicator for reader by internal vertex id. @@ -97,6 +88,11 @@ class VertexPropertyArrowChunkReader { */ Result> GetChunk() noexcept; + /** + * @brief Return the current arrow chunk table of chunk position indicator. + */ + Result> GetChunk2() noexcept; + /** * @brief Get the vertex id range of current chunk. * @@ -126,6 +122,8 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } + Status Filter(const arrow::compute::Expression& filter); + private: VertexInfo vertex_info_; PropertyGroup property_group_; @@ -134,6 +132,7 @@ class VertexPropertyArrowChunkReader { IdType seek_id_; IdType chunk_num_; std::shared_ptr chunk_table_; + std::shared_ptr filter_; std::shared_ptr fs_; }; diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 5313a4dd2..7320346ee 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -18,6 +18,7 @@ limitations under the License. #include #include +#include #include "gar/utils/file_type.h" #include "gar/utils/result.h" @@ -28,6 +29,12 @@ limitations under the License. namespace arrow { class Buffer; class Table; +namespace compute { +class Expression; +} +namespace dataset { +class FileFormat; +} namespace fs { class FileSystem; } @@ -65,6 +72,19 @@ class FileSystem { Result> ReadFileToTable( const std::string& path, FileType file_type) const noexcept; + /** + * @brief Read and filter a file as an arrow::Table. + * + * @param path The path of the file to read. + * @param file_type The type of the file to read. + * @param filters The predictors to apply to the file. + * @return A Result containing a std::shared_ptr to an arrow::Table if + * successful, or an error Status if unsuccessful. + */ + Result> ReadAndFilterFileToTable( + const std::string& path, FileType file_type, + std::shared_ptr filter) const noexcept; + /** * @brief Read a file and convert its bytes to a value of type T. * @@ -116,6 +136,12 @@ class FileSystem { Result GetFileNumOfDir(const std::string& dir_path, bool recursive = false) const noexcept; + private: + std::shared_ptr ToFileFormat( + const FileType type) const; + + Status CastTableColumnType(std::shared_ptr table) const; + private: std::shared_ptr arrow_fs_; }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index c80a774cd..37c7e7730 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -19,9 +19,32 @@ limitations under the License. #include "gar/reader/arrow_chunk_reader.h" #include "gar/utils/reader_utils.h" +#if defined(ARROW_VERSION) && ARROW_VERSION >= 12000000 +#include "arrow/compute/expression.h" +#else +#include "arrow/compute/exec/expression.h" +#endif namespace GAR_NAMESPACE_INTERNAL { +VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( + const VertexInfo& vertex_info, const PropertyGroup& property_group, + const std::string& prefix, IdType chunk_index) + : vertex_info_(vertex_info), + property_group_(property_group), + chunk_index_(chunk_index), + seek_id_(chunk_index * vertex_info.GetChunkSize()), + chunk_table_(nullptr), + filter_(std::make_shared( + arrow::compute::literal(true))) { + GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); + GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, + vertex_info.GetPathPrefix(property_group)); + std::string base_dir = prefix_ + pg_path_prefix; + GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_, + utils::GetVertexChunkNum(prefix_, vertex_info)); +} + Result> VertexPropertyArrowChunkReader::GetChunk() noexcept { if (chunk_table_ == nullptr) { @@ -36,6 +59,21 @@ VertexPropertyArrowChunkReader::GetChunk() noexcept { return chunk_table_->Slice(row_offset); } +Result> +VertexPropertyArrowChunkReader::GetChunk2() noexcept { + if (chunk_table_ == nullptr) { + GAR_ASSIGN_OR_RAISE( + auto chunk_file_path, + vertex_info_.GetFilePath(property_group_, chunk_index_)); + std::string path = prefix_ + chunk_file_path; + GAR_ASSIGN_OR_RAISE(chunk_table_, + fs_->ReadAndFilterFileToTable( + path, property_group_.GetFileType(), filter_)); + } + IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); + return chunk_table_->Slice(row_offset); +} + Result> VertexPropertyArrowChunkReader::GetRange() noexcept { if (chunk_table_ == nullptr) { @@ -48,6 +86,12 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } +Status VertexPropertyArrowChunkReader::Filter( + const arrow::compute::Expression& filter) { + filter_ = std::make_shared(filter); + return Status::OK(); +} + Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { if (adj_list_type_ != AdjListType::unordered_by_source && adj_list_type_ != AdjListType::ordered_by_source) { diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 14bb9e7db..37380c628 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -22,7 +22,12 @@ limitations under the License. #include "arrow/util/uri.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" - +#if defined(ARROW_VERSION) && ARROW_VERSION >= 12000000 +#include "arrow/compute/expression.h" +#else +#include "arrow/compute/exec/expression.h" +#endif +#include "arrow/dataset/api.h" #include "gar/utils/filesystem.h" namespace GAR_NAMESPACE_INTERNAL { @@ -81,6 +86,57 @@ Result ParseFileSystemUri(const std::string& uri_string) { } } // namespace detail +std::shared_ptr FileSystem::ToFileFormat( + const FileType type) const { + switch (type) { + case CSV: + return std::make_shared(); + case PARQUET: + return std::make_shared(); + case ORC: + return std::make_shared(); + default: + return nullptr; + } +} + +Status FileSystem::CastTableColumnType( + std::shared_ptr table) const { + // cast string array to large string array as we need concatenate chunks in + // some places, e.g., in vineyard + for (int i = 0; i < table->num_columns(); ++i) { + std::shared_ptr type = table->column(i)->type(); + if (type->id() == arrow::Type::STRING) { + type = arrow::large_utf8(); + } else if (type->id() == arrow::Type::BINARY) { + type = arrow::large_binary(); + } + if (type->Equals(table->column(i)->type())) { + continue; + } + // do casting + auto field = table->field(i)->WithType(type); + std::shared_ptr chunked_array; + if (type->Equals(arrow::large_utf8())) { + auto status = detail::CastToLargeOffsetArray( + table->column(i), type, chunked_array); + GAR_RETURN_NOT_OK(status); + } else if (type->Equals(arrow::large_binary())) { + auto status = detail::CastToLargeOffsetArray( + table->column(i), type, chunked_array); + GAR_RETURN_NOT_OK(status); + } else { + // noop + chunked_array = table->column(i); + } + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, table->RemoveColumn(i)); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( + table, table->AddColumn(i, field, chunked_array)); + } + return Status::OK(); +} Result> FileSystem::ReadFileToTable( const std::string& path, FileType file_type) const noexcept { arrow::MemoryPool* pool = arrow::default_memory_pool(); @@ -155,6 +211,25 @@ Result> FileSystem::ReadFileToTable( return table; } +Result> FileSystem::ReadAndFilterFileToTable( + const std::string& path, FileType file_type, + std::shared_ptr filter) const noexcept { + std::shared_ptr format = ToFileFormat(file_type); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( + auto factory, arrow::dataset::FileSystemDatasetFactory::Make( + arrow_fs_, {path}, format, + arrow::dataset::FileSystemFactoryOptions())); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto dataset, factory->Finish()); + // Read specified columns with a row filter + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); + RETURN_NOT_ARROW_OK(scan_builder->Filter(*filter)); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto table, scanner->ToTable()); + + GAR_RETURN_NOT_OK(CastTableColumnType(table)); + return table; +} + template Result FileSystem::ReadFileToValue(const std::string& path) const noexcept { T ret; diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 386140c2a..c6692bad0 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -31,6 +31,8 @@ limitations under the License. #define CATCH_CONFIG_MAIN #include +namespace cp = arrow::compute; + TEST_CASE("test_vertex_property_arrow_chunk_reader") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); @@ -90,6 +92,60 @@ TEST_CASE("test_vertex_property_arrow_chunk_reader") { REQUIRE(reader.seek(1024).IsIndexError()); } +TEST_CASE("test_vertex_property_pushdown") { + std::string root; + REQUIRE(GetTestResourceRoot(&root).ok()); + + // read file and construct graph info + std::string path = root + "/ldbc_sample/parquet/ldbc_sample.graph.yml"; + auto maybe_graph_info = GAR_NAMESPACE::GraphInfo::Load(path); + REQUIRE(maybe_graph_info.status().ok()); + auto graph_info = maybe_graph_info.value(); + + // construct vertex chunk reader + std::string label = "person", property_name = "gender"; + REQUIRE(graph_info.GetVertexInfo(label).status().ok()); + auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); + REQUIRE(maybe_group.status().ok()); + auto group = maybe_group.value(); + auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( + graph_info, label, group); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + + SECTION("no pushdown") { + std::cout << "Reader & no pushdown:" << std::endl; + int i = 0; + do { + auto result = reader.GetChunk2(); + REQUIRE(!result.has_error()); + auto [l, r] = reader.GetRange().value(); + auto table = result.value(); + std::cout << "Chunk Index: " << i << ",\tRow Nums: " << table->num_rows() + << ",\tTable Range: [" << l << ", " << r << "]" << '\n'; + i++; + reader.next_chunk(); + } while (i < reader.GetChunkNum()); + } + + SECTION("pushdown `gender=female`") { + std::cout << "\nReader & Pushdown `gender=female`:" << std::endl; + reader.seek(0); + reader.Filter(cp::equal(cp::field_ref("gender"), cp::literal("female"))); + int i = 0; + do { + auto result = reader.GetChunk2(); + REQUIRE(!result.has_error()); + auto [l, r] = reader.GetRange().value(); + auto table = result.value(); + std::cout << "Chunk Index: " << i << ",\tRow Nums: " << table->num_rows() + << ",\tTable Range: [" << l << ", " << r << "]" << '\n'; + i++; + reader.next_chunk(); + } while (i < reader.GetChunkNum()); + } +} + TEST_CASE("test_adj_list_arrow_chunk_reader") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); From 2760b3ec872c2c8646f92deafa11649a87be6d85 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Fri, 16 Jun 2023 11:44:10 +0800 Subject: [PATCH 02/26] Refactor: helper function support pushdown Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 20 ++--- cpp/include/gar/utils/filesystem.h | 6 +- cpp/src/arrow_chunk_reader.cc | 27 ++----- cpp/src/filesystem.cc | 87 ++++++++++----------- cpp/test/test_arrow_chunk_reader.cc | 52 ++++++------ 5 files changed, 87 insertions(+), 105 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 26f4f326b..a10fa63a2 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -52,10 +52,10 @@ class VertexPropertyArrowChunkReader { * @param property_group The property group that describes the property group. * @param prefix The absolute prefix. */ - VertexPropertyArrowChunkReader(const VertexInfo& vertex_info, - const PropertyGroup& property_group, - const std::string& prefix, - IdType chunk_index = 0); + VertexPropertyArrowChunkReader( + const VertexInfo& vertex_info, const PropertyGroup& property_group, + const std::string& prefix, IdType chunk_index = 0, + std::shared_ptr filter = nullptr); /** * @brief Sets chunk position indicator for reader by internal vertex id. @@ -88,11 +88,6 @@ class VertexPropertyArrowChunkReader { */ Result> GetChunk() noexcept; - /** - * @brief Return the current arrow chunk table of chunk position indicator. - */ - Result> GetChunk2() noexcept; - /** * @brief Get the vertex id range of current chunk. * @@ -122,7 +117,7 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - Status Filter(const arrow::compute::Expression& filter); + Status Filter(std::shared_ptr filter); private: VertexInfo vertex_info_; @@ -586,7 +581,8 @@ class AdjListPropertyArrowChunkReader { static inline Result ConstructVertexPropertyArrowChunkReader( const GraphInfo& graph_info, const std::string& label, - const PropertyGroup& property_group) noexcept { + const PropertyGroup& property_group, + std::shared_ptr filter = nullptr) noexcept { VertexInfo vertex_info; GAR_ASSIGN_OR_RAISE(vertex_info, graph_info.GetVertexInfo(label)); if (!vertex_info.ContainPropertyGroup(property_group)) { @@ -594,7 +590,7 @@ ConstructVertexPropertyArrowChunkReader( label, "."); } return VertexPropertyArrowChunkReader(vertex_info, property_group, - graph_info.GetPrefix()); + graph_info.GetPrefix(), 0, filter); } /** diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 7320346ee..5638422f0 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -137,10 +137,8 @@ class FileSystem { bool recursive = false) const noexcept; private: - std::shared_ptr ToFileFormat( - const FileType type) const; - - Status CastTableColumnType(std::shared_ptr table) const; + std::shared_ptr GetFileFormat( + const FileType file_type) const; private: std::shared_ptr arrow_fs_; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 37c7e7730..72389544d 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -26,17 +26,18 @@ limitations under the License. #endif namespace GAR_NAMESPACE_INTERNAL { - +namespace cp = arrow::compute; VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( const VertexInfo& vertex_info, const PropertyGroup& property_group, - const std::string& prefix, IdType chunk_index) + const std::string& prefix, IdType chunk_index, + std::shared_ptr filter) : vertex_info_(vertex_info), property_group_(property_group), chunk_index_(chunk_index), seek_id_(chunk_index * vertex_info.GetChunkSize()), chunk_table_(nullptr), - filter_(std::make_shared( - arrow::compute::literal(true))) { + filter_(filter ? filter + : std::make_shared(cp::literal(true))) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, vertex_info.GetPathPrefix(property_group)); @@ -47,20 +48,6 @@ VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( Result> VertexPropertyArrowChunkReader::GetChunk() noexcept { - if (chunk_table_ == nullptr) { - GAR_ASSIGN_OR_RAISE( - auto chunk_file_path, - vertex_info_.GetFilePath(property_group_, chunk_index_)); - std::string path = prefix_ + chunk_file_path; - GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadFileToTable( - path, property_group_.GetFileType())); - } - IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); - return chunk_table_->Slice(row_offset); -} - -Result> -VertexPropertyArrowChunkReader::GetChunk2() noexcept { if (chunk_table_ == nullptr) { GAR_ASSIGN_OR_RAISE( auto chunk_file_path, @@ -87,8 +74,8 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { } Status VertexPropertyArrowChunkReader::Filter( - const arrow::compute::Expression& filter) { - filter_ = std::make_shared(filter); + std::shared_ptr filter) { + filter_ = filter; return Status::OK(); } diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 37380c628..cf1904454 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -31,7 +31,8 @@ limitations under the License. #include "gar/utils/filesystem.h" namespace GAR_NAMESPACE_INTERNAL { - +namespace cp = arrow::compute; +namespace ds = arrow::dataset; namespace detail { template static Status CastToLargeOffsetArray( @@ -86,57 +87,20 @@ Result ParseFileSystemUri(const std::string& uri_string) { } } // namespace detail -std::shared_ptr FileSystem::ToFileFormat( +std::shared_ptr FileSystem::GetFileFormat( const FileType type) const { switch (type) { case CSV: - return std::make_shared(); + return std::make_shared(); case PARQUET: - return std::make_shared(); + return std::make_shared(); case ORC: - return std::make_shared(); + return std::make_shared(); default: return nullptr; } } -Status FileSystem::CastTableColumnType( - std::shared_ptr table) const { - // cast string array to large string array as we need concatenate chunks in - // some places, e.g., in vineyard - for (int i = 0; i < table->num_columns(); ++i) { - std::shared_ptr type = table->column(i)->type(); - if (type->id() == arrow::Type::STRING) { - type = arrow::large_utf8(); - } else if (type->id() == arrow::Type::BINARY) { - type = arrow::large_binary(); - } - if (type->Equals(table->column(i)->type())) { - continue; - } - // do casting - auto field = table->field(i)->WithType(type); - std::shared_ptr chunked_array; - if (type->Equals(arrow::large_utf8())) { - auto status = detail::CastToLargeOffsetArray( - table->column(i), type, chunked_array); - GAR_RETURN_NOT_OK(status); - } else if (type->Equals(arrow::large_binary())) { - auto status = detail::CastToLargeOffsetArray( - table->column(i), type, chunked_array); - GAR_RETURN_NOT_OK(status); - } else { - // noop - chunked_array = table->column(i); - } - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, table->RemoveColumn(i)); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( - table, table->AddColumn(i, field, chunked_array)); - } - return Status::OK(); -} Result> FileSystem::ReadFileToTable( const std::string& path, FileType file_type) const noexcept { arrow::MemoryPool* pool = arrow::default_memory_pool(); @@ -213,8 +177,8 @@ Result> FileSystem::ReadFileToTable( Result> FileSystem::ReadAndFilterFileToTable( const std::string& path, FileType file_type, - std::shared_ptr filter) const noexcept { - std::shared_ptr format = ToFileFormat(file_type); + std::shared_ptr filter) const noexcept { + std::shared_ptr format = GetFileFormat(file_type); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( auto factory, arrow::dataset::FileSystemDatasetFactory::Make( arrow_fs_, {path}, format, @@ -225,8 +189,39 @@ Result> FileSystem::ReadAndFilterFileToTable( RETURN_NOT_ARROW_OK(scan_builder->Filter(*filter)); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto table, scanner->ToTable()); - - GAR_RETURN_NOT_OK(CastTableColumnType(table)); + // cast string array to large string array as we need concatenate chunks in + // some places, e.g., in vineyard + for (int i = 0; i < table->num_columns(); ++i) { + std::shared_ptr type = table->column(i)->type(); + if (type->id() == arrow::Type::STRING) { + type = arrow::large_utf8(); + } else if (type->id() == arrow::Type::BINARY) { + type = arrow::large_binary(); + } + if (type->Equals(table->column(i)->type())) { + continue; + } + // do casting + auto field = table->field(i)->WithType(type); + std::shared_ptr chunked_array; + if (type->Equals(arrow::large_utf8())) { + auto status = detail::CastToLargeOffsetArray( + table->column(i), type, chunked_array); + GAR_RETURN_NOT_OK(status); + } else if (type->Equals(arrow::large_binary())) { + auto status = detail::CastToLargeOffsetArray( + table->column(i), type, chunked_array); + GAR_RETURN_NOT_OK(status); + } else { + // noop + chunked_array = table->column(i); + } + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, table->RemoveColumn(i)); + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( + table, table->AddColumn(i, field, chunked_array)); + } return table; } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index c6692bad0..1110c2c86 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -106,43 +106,49 @@ TEST_CASE("test_vertex_property_pushdown") { std::string label = "person", property_name = "gender"; REQUIRE(graph_info.GetVertexInfo(label).status().ok()); auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); + auto filter = std::make_shared( + cp::equal(cp::field_ref("gender"), cp::literal("female"))); REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( - graph_info, label, group); - REQUIRE(maybe_reader.status().ok()); - auto reader = maybe_reader.value(); - SECTION("no pushdown") { - std::cout << "Reader & no pushdown:" << std::endl; + auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int i = 0; + int sum = 0; do { - auto result = reader.GetChunk2(); + auto result = reader.GetChunk(); REQUIRE(!result.has_error()); auto [l, r] = reader.GetRange().value(); auto table = result.value(); - std::cout << "Chunk Index: " << i << ",\tRow Nums: " << table->num_rows() - << ",\tTable Range: [" << l << ", " << r << "]" << '\n'; + std::cout << "Chunk : " << i << ",\tNums: " << table->num_rows() + << ",\tRange: [" << l << ", " << r << "]" << '\n'; i++; + sum += table->num_rows(); reader.next_chunk(); } while (i < reader.GetChunkNum()); + std::cout << "item size: " << sum << "/" + << graph_info.GetVertexInfo(label)->GetChunkSize() * + reader.GetChunkNum() + << '\n'; + }; + + SECTION("filter by helper function") { + std::cout << "filter by ConstructVertexPropertyArrowChunkReader():" + << std::endl; + auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( + graph_info, label, group, filter); + REQUIRE(maybe_reader.status().ok()); + walkReader(maybe_reader.value()); } - SECTION("pushdown `gender=female`") { - std::cout << "\nReader & Pushdown `gender=female`:" << std::endl; + SECTION("filter by function Filter()") { + std::cout << "\nfilter by Filter():" << std::endl; + auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( + graph_info, label, group); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); reader.seek(0); - reader.Filter(cp::equal(cp::field_ref("gender"), cp::literal("female"))); - int i = 0; - do { - auto result = reader.GetChunk2(); - REQUIRE(!result.has_error()); - auto [l, r] = reader.GetRange().value(); - auto table = result.value(); - std::cout << "Chunk Index: " << i << ",\tRow Nums: " << table->num_rows() - << ",\tTable Range: [" << l << ", " << r << "]" << '\n'; - i++; - reader.next_chunk(); - } while (i < reader.GetChunkNum()); + reader.Filter(filter); + walkReader(reader); } } From f0c68b4e41225bdf43bc3b384337efd3b5a0845f Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Fri, 16 Jun 2023 17:14:19 +0800 Subject: [PATCH 03/26] Feat: column projection support Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 18 ++++++++++---- cpp/include/gar/utils/filesystem.h | 4 ++- cpp/src/arrow_chunk_reader.cc | 27 +++++++++++++++------ cpp/src/filesystem.cc | 6 ++++- cpp/test/test_arrow_chunk_reader.cc | 18 +++++++++++--- 5 files changed, 55 insertions(+), 18 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index a10fa63a2..f99d82639 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -17,6 +17,7 @@ limitations under the License. #define GAR_READER_ARROW_CHUNK_READER_H_ #include +#include #include #include #include @@ -55,7 +56,8 @@ class VertexPropertyArrowChunkReader { VertexPropertyArrowChunkReader( const VertexInfo& vertex_info, const PropertyGroup& property_group, const std::string& prefix, IdType chunk_index = 0, - std::shared_ptr filter = nullptr); + std::shared_ptr filter = nullptr, + std::optional> columns = std::nullopt); /** * @brief Sets chunk position indicator for reader by internal vertex id. @@ -117,7 +119,11 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - Status Filter(std::shared_ptr filter); + void Filter(std::shared_ptr filter); + void ClearFilter(); + + void Project(std::vector columns); + void ClearProjection(); private: VertexInfo vertex_info_; @@ -128,6 +134,7 @@ class VertexPropertyArrowChunkReader { IdType chunk_num_; std::shared_ptr chunk_table_; std::shared_ptr filter_; + std::optional> columns_; std::shared_ptr fs_; }; @@ -582,15 +589,16 @@ static inline Result ConstructVertexPropertyArrowChunkReader( const GraphInfo& graph_info, const std::string& label, const PropertyGroup& property_group, - std::shared_ptr filter = nullptr) noexcept { + std::shared_ptr filter = nullptr, + std::optional> columns = std::nullopt) noexcept { VertexInfo vertex_info; GAR_ASSIGN_OR_RAISE(vertex_info, graph_info.GetVertexInfo(label)); if (!vertex_info.ContainPropertyGroup(property_group)) { return Status::KeyError("No property group ", property_group, " in vertex ", label, "."); } - return VertexPropertyArrowChunkReader(vertex_info, property_group, - graph_info.GetPrefix(), 0, filter); + return VertexPropertyArrowChunkReader( + vertex_info, property_group, graph_info.GetPrefix(), 0, filter, columns); } /** diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 5638422f0..39e24b3d8 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -17,6 +17,7 @@ limitations under the License. #define GAR_UTILS_FILESYSTEM_H_ #include +#include #include #include @@ -83,7 +84,8 @@ class FileSystem { */ Result> ReadAndFilterFileToTable( const std::string& path, FileType file_type, - std::shared_ptr filter) const noexcept; + std::shared_ptr filter, + std::optional> columns) const noexcept; /** * @brief Read a file and convert its bytes to a value of type T. diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 72389544d..eb82acd7b 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -30,14 +30,16 @@ namespace cp = arrow::compute; VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( const VertexInfo& vertex_info, const PropertyGroup& property_group, const std::string& prefix, IdType chunk_index, - std::shared_ptr filter) + std::shared_ptr filter, + std::optional> columns) : vertex_info_(vertex_info), property_group_(property_group), chunk_index_(chunk_index), seek_id_(chunk_index * vertex_info.GetChunkSize()), chunk_table_(nullptr), filter_(filter ? filter - : std::make_shared(cp::literal(true))) { + : std::make_shared(cp::literal(true))), + columns_(columns) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, vertex_info.GetPathPrefix(property_group)); @@ -53,9 +55,9 @@ VertexPropertyArrowChunkReader::GetChunk() noexcept { auto chunk_file_path, vertex_info_.GetFilePath(property_group_, chunk_index_)); std::string path = prefix_ + chunk_file_path; - GAR_ASSIGN_OR_RAISE(chunk_table_, - fs_->ReadAndFilterFileToTable( - path, property_group_.GetFileType(), filter_)); + GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadAndFilterFileToTable( + path, property_group_.GetFileType(), + filter_, columns_)); } IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); return chunk_table_->Slice(row_offset); @@ -73,10 +75,21 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -Status VertexPropertyArrowChunkReader::Filter( +void VertexPropertyArrowChunkReader::Filter( std::shared_ptr filter) { filter_ = filter; - return Status::OK(); +} + +void VertexPropertyArrowChunkReader::ClearFilter() { + filter_ = std::make_shared(cp::literal(true)); +} + +void VertexPropertyArrowChunkReader::Project(std::vector columns) { + columns_ = std::optional>(columns); +} + +void VertexPropertyArrowChunkReader::ClearProjection() { + columns_ = std::nullopt; } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index cf1904454..d79e5884b 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -177,7 +177,8 @@ Result> FileSystem::ReadFileToTable( Result> FileSystem::ReadAndFilterFileToTable( const std::string& path, FileType file_type, - std::shared_ptr filter) const noexcept { + std::shared_ptr filter, + std::optional> columns) const noexcept { std::shared_ptr format = GetFileFormat(file_type); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( auto factory, arrow::dataset::FileSystemDatasetFactory::Make( @@ -187,6 +188,9 @@ Result> FileSystem::ReadAndFilterFileToTable( // Read specified columns with a row filter GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); RETURN_NOT_ARROW_OK(scan_builder->Filter(*filter)); + if (columns) { + RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); + } GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto table, scanner->ToTable()); // cast string array to large string array as we need concatenate chunks in diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 1110c2c86..b5e4f6518 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -106,19 +106,24 @@ TEST_CASE("test_vertex_property_pushdown") { std::string label = "person", property_name = "gender"; REQUIRE(graph_info.GetVertexInfo(label).status().ok()); auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); - auto filter = std::make_shared( - cp::equal(cp::field_ref("gender"), cp::literal("female"))); REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); + // pushdown options + auto filter = std::make_shared( + cp::equal(cp::field_ref("gender"), cp::literal("female"))); + std::vector column_names = {"firstName", "lastName"}; + auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int i = 0; int sum = 0; + std::vector names; do { auto result = reader.GetChunk(); REQUIRE(!result.has_error()); auto [l, r] = reader.GetRange().value(); auto table = result.value(); + names = table->ColumnNames(); std::cout << "Chunk : " << i << ",\tNums: " << table->num_rows() << ",\tRange: [" << l << ", " << r << "]" << '\n'; i++; @@ -129,13 +134,18 @@ TEST_CASE("test_vertex_property_pushdown") { << graph_info.GetVertexInfo(label)->GetChunkSize() * reader.GetChunkNum() << '\n'; + std::cout << "Column names: "; + for (const auto& n : names) { + std::cout << n << ' '; + } + std::cout << '\n'; }; SECTION("filter by helper function") { std::cout << "filter by ConstructVertexPropertyArrowChunkReader():" << std::endl; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( - graph_info, label, group, filter); + graph_info, label, group, filter, column_names); REQUIRE(maybe_reader.status().ok()); walkReader(maybe_reader.value()); } @@ -146,8 +156,8 @@ TEST_CASE("test_vertex_property_pushdown") { graph_info, label, group); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); - reader.seek(0); reader.Filter(filter); + reader.Project(column_names); walkReader(reader); } } From 68b1133a800fcc10a12f0b58bd2c01ce91de247d Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sat, 17 Jun 2023 20:54:49 +0800 Subject: [PATCH 04/26] Refactor: wrap filter and columns into FilterOptions Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 35 +++++++++++++-------- cpp/include/gar/utils/filesystem.h | 10 ++++-- cpp/src/arrow_chunk_reader.cc | 28 ++++++----------- cpp/src/filesystem.cc | 17 +++++----- cpp/test/test_arrow_chunk_reader.cc | 20 ++++++------ 5 files changed, 54 insertions(+), 56 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index f99d82639..3102b09bb 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -22,6 +22,7 @@ limitations under the License. #include #include +#include "arrow/compute/api.h" #include "gar/graph_info.h" #include "gar/utils/data_type.h" #include "gar/utils/filesystem.h" @@ -41,6 +42,16 @@ class Expression; namespace GAR_NAMESPACE_INTERNAL { +using Columns = std::vector; +struct FilterOptions { + std::optional filter; + std::optional columns; + + FilterOptions() : filter(std::nullopt), columns(std::nullopt) {} + FilterOptions(arrow::compute::Expression f, Columns cols) + : filter(f), columns(cols) {} +}; + /** * @brief The arrow chunk reader for vertex property group. */ @@ -53,11 +64,11 @@ class VertexPropertyArrowChunkReader { * @param property_group The property group that describes the property group. * @param prefix The absolute prefix. */ - VertexPropertyArrowChunkReader( - const VertexInfo& vertex_info, const PropertyGroup& property_group, - const std::string& prefix, IdType chunk_index = 0, - std::shared_ptr filter = nullptr, - std::optional> columns = std::nullopt); + VertexPropertyArrowChunkReader(const VertexInfo& vertex_info, + const PropertyGroup& property_group, + const std::string& prefix, + IdType chunk_index = 0, + const FilterOptions& opts = {}); /** * @brief Sets chunk position indicator for reader by internal vertex id. @@ -119,10 +130,10 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(std::shared_ptr filter); + void Filter(arrow::compute::Expression filter); void ClearFilter(); - void Project(std::vector columns); + void Project(Columns columns); void ClearProjection(); private: @@ -133,8 +144,7 @@ class VertexPropertyArrowChunkReader { IdType seek_id_; IdType chunk_num_; std::shared_ptr chunk_table_; - std::shared_ptr filter_; - std::optional> columns_; + FilterOptions filter_options_; std::shared_ptr fs_; }; @@ -589,16 +599,15 @@ static inline Result ConstructVertexPropertyArrowChunkReader( const GraphInfo& graph_info, const std::string& label, const PropertyGroup& property_group, - std::shared_ptr filter = nullptr, - std::optional> columns = std::nullopt) noexcept { + const FilterOptions& opts = {}) noexcept { VertexInfo vertex_info; GAR_ASSIGN_OR_RAISE(vertex_info, graph_info.GetVertexInfo(label)); if (!vertex_info.ContainPropertyGroup(property_group)) { return Status::KeyError("No property group ", property_group, " in vertex ", label, "."); } - return VertexPropertyArrowChunkReader( - vertex_info, property_group, graph_info.GetPrefix(), 0, filter, columns); + return VertexPropertyArrowChunkReader(vertex_info, property_group, + graph_info.GetPrefix(), 0, opts); } /** diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 39e24b3d8..de2add725 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -26,6 +26,9 @@ limitations under the License. #include "gar/utils/status.h" #include "gar/utils/utils.h" +#include "arrow/compute/api.h" +#include "arrow/dataset/api.h" + // forward declarations namespace arrow { class Buffer; @@ -46,6 +49,8 @@ class RandomAccessFile; namespace GAR_NAMESPACE_INTERNAL { +struct FilterOptions; + /** * This class wraps an arrow::fs::FileSystem and provides methods for * reading and writing arrow::Table objects from and to files, as well as @@ -78,14 +83,13 @@ class FileSystem { * * @param path The path of the file to read. * @param file_type The type of the file to read. - * @param filters The predictors to apply to the file. + * @param opts Filter condition and columns to be read * @return A Result containing a std::shared_ptr to an arrow::Table if * successful, or an error Status if unsuccessful. */ Result> ReadAndFilterFileToTable( const std::string& path, FileType file_type, - std::shared_ptr filter, - std::optional> columns) const noexcept; + const FilterOptions& opts) const noexcept; /** * @brief Read a file and convert its bytes to a value of type T. diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index eb82acd7b..66762a017 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -19,27 +19,18 @@ limitations under the License. #include "gar/reader/arrow_chunk_reader.h" #include "gar/utils/reader_utils.h" -#if defined(ARROW_VERSION) && ARROW_VERSION >= 12000000 -#include "arrow/compute/expression.h" -#else -#include "arrow/compute/exec/expression.h" -#endif namespace GAR_NAMESPACE_INTERNAL { namespace cp = arrow::compute; VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( const VertexInfo& vertex_info, const PropertyGroup& property_group, - const std::string& prefix, IdType chunk_index, - std::shared_ptr filter, - std::optional> columns) + const std::string& prefix, IdType chunk_index, const FilterOptions& opts) : vertex_info_(vertex_info), property_group_(property_group), chunk_index_(chunk_index), seek_id_(chunk_index * vertex_info.GetChunkSize()), chunk_table_(nullptr), - filter_(filter ? filter - : std::make_shared(cp::literal(true))), - columns_(columns) { + filter_options_(opts) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, vertex_info.GetPathPrefix(property_group)); @@ -57,7 +48,7 @@ VertexPropertyArrowChunkReader::GetChunk() noexcept { std::string path = prefix_ + chunk_file_path; GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadAndFilterFileToTable( path, property_group_.GetFileType(), - filter_, columns_)); + filter_options_)); } IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); return chunk_table_->Slice(row_offset); @@ -75,21 +66,20 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter( - std::shared_ptr filter) { - filter_ = filter; +void VertexPropertyArrowChunkReader::Filter(cp::Expression filter) { + filter_options_.filter = filter; } void VertexPropertyArrowChunkReader::ClearFilter() { - filter_ = std::make_shared(cp::literal(true)); + filter_options_.filter = {}; } -void VertexPropertyArrowChunkReader::Project(std::vector columns) { - columns_ = std::optional>(columns); +void VertexPropertyArrowChunkReader::Project(Columns columns) { + filter_options_.columns = columns; } void VertexPropertyArrowChunkReader::ClearProjection() { - columns_ = std::nullopt; + filter_options_.columns = {}; } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index d79e5884b..1d52d1731 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -22,16 +22,11 @@ limitations under the License. #include "arrow/util/uri.h" #include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#if defined(ARROW_VERSION) && ARROW_VERSION >= 12000000 -#include "arrow/compute/expression.h" -#else -#include "arrow/compute/exec/expression.h" -#endif -#include "arrow/dataset/api.h" + +#include "gar/reader/arrow_chunk_reader.h" #include "gar/utils/filesystem.h" namespace GAR_NAMESPACE_INTERNAL { -namespace cp = arrow::compute; namespace ds = arrow::dataset; namespace detail { template @@ -177,8 +172,7 @@ Result> FileSystem::ReadFileToTable( Result> FileSystem::ReadAndFilterFileToTable( const std::string& path, FileType file_type, - std::shared_ptr filter, - std::optional> columns) const noexcept { + const FilterOptions& opts) const noexcept { std::shared_ptr format = GetFileFormat(file_type); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( auto factory, arrow::dataset::FileSystemDatasetFactory::Make( @@ -187,7 +181,10 @@ Result> FileSystem::ReadAndFilterFileToTable( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto dataset, factory->Finish()); // Read specified columns with a row filter GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); - RETURN_NOT_ARROW_OK(scan_builder->Filter(*filter)); + auto&& [filter, columns] = opts; + if (filter) { + RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value())); + } if (columns) { RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index b5e4f6518..ecf4b4394 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -110,9 +110,8 @@ TEST_CASE("test_vertex_property_pushdown") { auto group = maybe_group.value(); // pushdown options - auto filter = std::make_shared( - cp::equal(cp::field_ref("gender"), cp::literal("female"))); - std::vector column_names = {"firstName", "lastName"}; + auto filter = cp::equal(cp::field_ref("gender"), cp::literal("female")); + std::vector columns = {"firstName", "lastName"}; auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int i = 0; @@ -124,7 +123,7 @@ TEST_CASE("test_vertex_property_pushdown") { auto [l, r] = reader.GetRange().value(); auto table = result.value(); names = table->ColumnNames(); - std::cout << "Chunk : " << i << ",\tNums: " << table->num_rows() + std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() << ",\tRange: [" << l << ", " << r << "]" << '\n'; i++; sum += table->num_rows(); @@ -141,23 +140,22 @@ TEST_CASE("test_vertex_property_pushdown") { std::cout << '\n'; }; - SECTION("filter by helper function") { - std::cout << "filter by ConstructVertexPropertyArrowChunkReader():" - << std::endl; + SECTION("pushdown by helper function") { + std::cout << "pushdown by helper function: \n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( - graph_info, label, group, filter, column_names); + graph_info, label, group, {filter, columns}); REQUIRE(maybe_reader.status().ok()); walkReader(maybe_reader.value()); } - SECTION("filter by function Filter()") { - std::cout << "\nfilter by Filter():" << std::endl; + SECTION("pushdown by function Filter()") { + std::cout << "\npushdown by Filter():" << std::endl; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(column_names); + reader.Project(columns); walkReader(reader); } } From 47129e8811a6cdec2e369af351e754b9466336fe Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Mon, 19 Jun 2023 23:50:28 +0800 Subject: [PATCH 05/26] Feat: Adj list filter pushdown support Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 61 ++++++++---- cpp/src/arrow_chunk_reader.cc | 45 ++++----- cpp/src/filesystem.cc | 9 +- cpp/test/test_arrow_chunk_reader.cc | 101 +++++++++++++++++--- 4 files changed, 157 insertions(+), 59 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 3102b09bb..c7613137d 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -44,12 +44,14 @@ namespace GAR_NAMESPACE_INTERNAL { using Columns = std::vector; struct FilterOptions { - std::optional filter; - std::optional columns; - - FilterOptions() : filter(std::nullopt), columns(std::nullopt) {} - FilterOptions(arrow::compute::Expression f, Columns cols) - : filter(f), columns(cols) {} + // The row filter to apply to the table. + arrow::compute::Expression* filter = nullptr; + // The columns to include in the table. select all columns by default. + Columns* columns = nullptr; + + FilterOptions() {} + FilterOptions(arrow::compute::Expression* filter, Columns* columns) + : filter(filter), columns(columns) {} }; /** @@ -68,7 +70,20 @@ class VertexPropertyArrowChunkReader { const PropertyGroup& property_group, const std::string& prefix, IdType chunk_index = 0, - const FilterOptions& opts = {}); + const FilterOptions& opts = {}) + : vertex_info_(vertex_info), + property_group_(property_group), + chunk_index_(chunk_index), + seek_id_(chunk_index * vertex_info.GetChunkSize()), + chunk_table_(nullptr), + filter_options_(opts) { + GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); + GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, + vertex_info.GetPathPrefix(property_group)); + std::string base_dir = prefix_ + pg_path_prefix; + GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_, + utils::GetVertexChunkNum(prefix_, vertex_info)); + } /** * @brief Sets chunk position indicator for reader by internal vertex id. @@ -130,10 +145,10 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(arrow::compute::Expression filter); + void Filter(arrow::compute::Expression* filter); void ClearFilter(); - void Project(Columns columns); + void Project(Columns* columns); void ClearProjection(); private: @@ -440,7 +455,8 @@ class AdjListPropertyArrowChunkReader { const PropertyGroup& property_group, AdjListType adj_list_type, const std::string prefix, - IdType vertex_chunk_index = 0) + IdType vertex_chunk_index = 0, + const FilterOptions& opts = {}) : edge_info_(edge_info), property_group_(property_group), adj_list_type_(adj_list_type), @@ -448,7 +464,8 @@ class AdjListPropertyArrowChunkReader { vertex_chunk_index_(vertex_chunk_index), chunk_index_(0), seek_offset_(0), - chunk_table_(nullptr) { + chunk_table_(nullptr), + filter_options_(opts) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR( auto pg_path_prefix, @@ -474,6 +491,7 @@ class AdjListPropertyArrowChunkReader { chunk_index_(other.chunk_index_), seek_offset_(other.seek_offset_), chunk_table_(nullptr), + filter_options_(other.filter_options_), vertex_chunk_num_(other.vertex_chunk_num_), chunk_num_(other.chunk_num_), base_dir_(other.base_dir_), @@ -575,6 +593,12 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } + void Filter(arrow::compute::Expression* filter); + void ClearFilter(); + + void Project(Columns* columns); + void ClearProjection(); + private: EdgeInfo edge_info_; PropertyGroup property_group_; @@ -583,6 +607,7 @@ class AdjListPropertyArrowChunkReader { IdType vertex_chunk_index_, chunk_index_; IdType seek_offset_; std::shared_ptr chunk_table_; + FilterOptions filter_options_; IdType vertex_chunk_num_, chunk_num_; std::string base_dir_; std::shared_ptr fs_; @@ -675,12 +700,11 @@ ConstructAdjListOffsetArrowChunkReader(const GraphInfo& graph_info, * @param adj_list_type The adj list type for the edges. */ static inline Result -ConstructAdjListPropertyArrowChunkReader(const GraphInfo& graph_info, - const std::string& src_label, - const std::string& edge_label, - const std::string& dst_label, - const PropertyGroup& property_group, - AdjListType adj_list_type) noexcept { +ConstructAdjListPropertyArrowChunkReader( + const GraphInfo& graph_info, const std::string& src_label, + const std::string& edge_label, const std::string& dst_label, + const PropertyGroup& property_group, AdjListType adj_list_type, + const FilterOptions& opts = {}) noexcept { EdgeInfo edge_info; GAR_ASSIGN_OR_RAISE(edge_info, graph_info.GetEdgeInfo(src_label, edge_label, dst_label)); @@ -695,7 +719,8 @@ ConstructAdjListPropertyArrowChunkReader(const GraphInfo& graph_info, AdjListTypeToString(adj_list_type), "."); } return AdjListPropertyArrowChunkReader(edge_info, property_group, - adj_list_type, graph_info.GetPrefix()); + adj_list_type, graph_info.GetPrefix(), + 0, opts); } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 66762a017..cef340140 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -22,22 +22,6 @@ limitations under the License. namespace GAR_NAMESPACE_INTERNAL { namespace cp = arrow::compute; -VertexPropertyArrowChunkReader::VertexPropertyArrowChunkReader( - const VertexInfo& vertex_info, const PropertyGroup& property_group, - const std::string& prefix, IdType chunk_index, const FilterOptions& opts) - : vertex_info_(vertex_info), - property_group_(property_group), - chunk_index_(chunk_index), - seek_id_(chunk_index * vertex_info.GetChunkSize()), - chunk_table_(nullptr), - filter_options_(opts) { - GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); - GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, - vertex_info.GetPathPrefix(property_group)); - std::string base_dir = prefix_ + pg_path_prefix; - GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_, - utils::GetVertexChunkNum(prefix_, vertex_info)); -} Result> VertexPropertyArrowChunkReader::GetChunk() noexcept { @@ -66,20 +50,20 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter(cp::Expression filter) { +void VertexPropertyArrowChunkReader::Filter(cp::Expression* filter) { filter_options_.filter = filter; } void VertexPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter = {}; + filter_options_.filter = nullptr; } -void VertexPropertyArrowChunkReader::Project(Columns columns) { +void VertexPropertyArrowChunkReader::Project(Columns* columns) { filter_options_.columns = columns; } void VertexPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns = {}; + filter_options_.columns = nullptr; } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { @@ -264,11 +248,28 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { edge_info_.GetPropertyFilePath(property_group_, adj_list_type_, vertex_chunk_index_, chunk_index_)); std::string path = prefix_ + chunk_file_path; - GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadFileToTable( - path, property_group_.GetFileType())); + GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadAndFilterFileToTable( + path, property_group_.GetFileType(), + filter_options_)); } IdType row_offset = seek_offset_ - chunk_index_ * edge_info_.GetChunkSize(); return chunk_table_->Slice(row_offset); } +void AdjListPropertyArrowChunkReader::Filter(cp::Expression* filter) { + filter_options_.filter = (filter); +} + +void AdjListPropertyArrowChunkReader::ClearFilter() { + filter_options_.filter = nullptr; +} + +void AdjListPropertyArrowChunkReader::Project(Columns* columns) { + filter_options_.columns = columns; +} + +void AdjListPropertyArrowChunkReader::ClearProjection() { + filter_options_.columns = nullptr; +} + } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 1d52d1731..c9ebf155d 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -181,12 +181,11 @@ Result> FileSystem::ReadAndFilterFileToTable( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto dataset, factory->Finish()); // Read specified columns with a row filter GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); - auto&& [filter, columns] = opts; - if (filter) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value())); + if (opts.filter) { + RETURN_NOT_ARROW_OK(scan_builder->Filter(*opts.filter)); } - if (columns) { - RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); + if (opts.columns) { + RETURN_NOT_ARROW_OK(scan_builder->Project(*opts.columns)); } GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto table, scanner->ToTable()); diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index ecf4b4394..32115b5ca 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -105,6 +105,7 @@ TEST_CASE("test_vertex_property_pushdown") { // construct vertex chunk reader std::string label = "person", property_name = "gender"; REQUIRE(graph_info.GetVertexInfo(label).status().ok()); + const auto chunk_size = graph_info.GetVertexInfo(label)->GetChunkSize(); auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); @@ -123,39 +124,37 @@ TEST_CASE("test_vertex_property_pushdown") { auto [l, r] = reader.GetRange().value(); auto table = result.value(); names = table->ColumnNames(); - std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() - << ",\tRange: [" << l << ", " << r << "]" << '\n'; + std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() << "/" + << chunk_size << ",\tRange: [" << l << ", " << r << "]" << '\n'; i++; sum += table->num_rows(); - reader.next_chunk(); - } while (i < reader.GetChunkNum()); + } while (!reader.next_chunk().IsOutOfRange()); std::cout << "item size: " << sum << "/" - << graph_info.GetVertexInfo(label)->GetChunkSize() * - reader.GetChunkNum() - << '\n'; + << reader.GetChunkNum() * chunk_size << '\n'; std::cout << "Column names: "; for (const auto& n : names) { std::cout << n << ' '; } - std::cout << '\n'; + std::cout << "\n\n"; }; SECTION("pushdown by helper function") { - std::cout << "pushdown by helper function: \n"; + std::cout << "vertex property pushdown by helper function: \n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( - graph_info, label, group, {filter, columns}); + graph_info, label, group, {&filter, &columns}); REQUIRE(maybe_reader.status().ok()); walkReader(maybe_reader.value()); } - SECTION("pushdown by function Filter()") { - std::cout << "\npushdown by Filter():" << std::endl; + SECTION("pushdown by function Filter() & Project()") { + std::cout << "vertex property pushdown by Filter() & Project():" + << std::endl; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); - reader.Filter(filter); - reader.Project(columns); + reader.Filter(&filter); + reader.Project(&columns); walkReader(reader); } } @@ -271,6 +270,80 @@ TEST_CASE("test_adj_list_property_arrow_chunk_reader") { REQUIRE(reader.next_chunk().IsIndexError()); } +TEST_CASE("test_adj_list_property_pushdown") { + std::string root; + REQUIRE(GetTestResourceRoot(&root).ok()); + + // read file and construct graph info + std::string path = root + "/ldbc_sample/parquet/ldbc_sample.graph.yml"; + auto maybe_graph_info = GAR_NAMESPACE::GraphInfo::Load(path); + REQUIRE(maybe_graph_info.status().ok()); + auto graph_info = maybe_graph_info.value(); + + std::string src_label = "person", edge_label = "knows", dst_label = "person", + property_name = "creationDate"; + REQUIRE( + graph_info.GetEdgeInfo(src_label, edge_label, dst_label).status().ok()); + const auto chunk_size = + graph_info.GetEdgeInfo(src_label, edge_label, dst_label)->GetChunkSize(); + auto maybe_group = graph_info.GetEdgePropertyGroup( + src_label, edge_label, dst_label, property_name, + GAR_NAMESPACE::AdjListType::ordered_by_source); + REQUIRE(maybe_group.status().ok()); + auto group = maybe_group.value(); + + auto filter = cp::greater_equal(cp::field_ref("creationDate"), + cp::literal("2012-06-02T04:30:44.526+0000")); + std::vector columns = {"creationDate"}; + GAR_NAMESPACE::FilterOptions opts{&filter, &columns}; + + auto walkReader = + [&](GAR_NAMESPACE::AdjListPropertyArrowChunkReader& reader) { + int i = 0; + int sum = 0; + std::vector names; + do { + auto result = reader.GetChunk(); + REQUIRE(!result.has_error()); + auto table = result.value(); + names = table->ColumnNames(); + std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() << "/" + << chunk_size << '\n'; + i++; + sum += table->num_rows(); + } while (!reader.next_chunk().IsOutOfRange()); + std::cout << "item size: " << sum << "/" << i * chunk_size << '\n'; + std::cout << "Column names: "; + for (const auto& n : names) { + std::cout << n << ' '; + } + std::cout << "\n\n"; + }; + + SECTION("pushdown by helper function") { + std::cout << "adj list property pushdown by helper function: \n"; + auto maybe_reader = GAR_NAMESPACE::ConstructAdjListPropertyArrowChunkReader( + graph_info, src_label, edge_label, dst_label, group, + GAR_NAMESPACE::AdjListType::ordered_by_source, opts); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + walkReader(reader); + } + + SECTION("pushdown by function Filter() & Project()") { + std::cout << "vertex property pushdown by Filter() & Project():" + << std::endl; + auto maybe_reader = GAR_NAMESPACE::ConstructAdjListPropertyArrowChunkReader( + graph_info, src_label, edge_label, dst_label, group, + GAR_NAMESPACE::AdjListType::ordered_by_source); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + reader.Filter(&filter); + reader.Project(&columns); + walkReader(reader); + } +} + TEST_CASE("test_read_adj_list_offset_chunk_example") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); From 22587ba609008f3b1e5371155db461f8badd45e7 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Thu, 22 Jun 2023 15:43:37 +0800 Subject: [PATCH 06/26] Refactor: improved the usability of filter pushdown API Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 53 +++++------- cpp/include/gar/utils/filesystem.h | 19 +---- cpp/include/gar/utils/reader_utils.h | 15 ++++ cpp/src/arrow_chunk_reader.cc | 47 ++++++----- cpp/src/filesystem.cc | 92 ++------------------- cpp/test/test_arrow_chunk_reader.cc | 24 ++++-- 6 files changed, 91 insertions(+), 159 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index c7613137d..edc47f1c3 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -22,11 +22,9 @@ limitations under the License. #include #include -#include "arrow/compute/api.h" #include "gar/graph_info.h" #include "gar/utils/data_type.h" #include "gar/utils/filesystem.h" -#include "gar/utils/reader_utils.h" #include "gar/utils/result.h" #include "gar/utils/status.h" #include "gar/utils/utils.h" @@ -42,18 +40,6 @@ class Expression; namespace GAR_NAMESPACE_INTERNAL { -using Columns = std::vector; -struct FilterOptions { - // The row filter to apply to the table. - arrow::compute::Expression* filter = nullptr; - // The columns to include in the table. select all columns by default. - Columns* columns = nullptr; - - FilterOptions() {} - FilterOptions(arrow::compute::Expression* filter, Columns* columns) - : filter(filter), columns(columns) {} -}; - /** * @brief The arrow chunk reader for vertex property group. */ @@ -70,13 +56,13 @@ class VertexPropertyArrowChunkReader { const PropertyGroup& property_group, const std::string& prefix, IdType chunk_index = 0, - const FilterOptions& opts = {}) + const utils::FilterOptions& options = {}) : vertex_info_(vertex_info), property_group_(property_group), chunk_index_(chunk_index), seek_id_(chunk_index * vertex_info.GetChunkSize()), chunk_table_(nullptr), - filter_options_(opts) { + filter_options_(options) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR(auto pg_path_prefix, vertex_info.GetPathPrefix(property_group)); @@ -145,10 +131,11 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(arrow::compute::Expression* filter); + void Filter(const utils::RowFilter& filter); void ClearFilter(); - void Project(Columns* columns); + void Project(const utils::ColumnNames& columns); + void Project(const std::string& column); void ClearProjection(); private: @@ -159,7 +146,7 @@ class VertexPropertyArrowChunkReader { IdType seek_id_; IdType chunk_num_; std::shared_ptr chunk_table_; - FilterOptions filter_options_; + utils::FilterOptions filter_options_; std::shared_ptr fs_; }; @@ -253,7 +240,8 @@ class AdjListArrowChunkReader { } /** - * @brief Return the current chunk of chunk position indicator as arrow::Table + * @brief Return the current chunk of chunk position indicator as + * arrow::Table */ Result> GetChunk() noexcept; @@ -446,7 +434,8 @@ class AdjListPropertyArrowChunkReader { * @brief Initialize the AdjListPropertyArrowChunkReader. * * @param edge_info The edge info that describes the edge type. - * @param property_group The property group that describes the property group. + * @param property_group The property group that describes the property + * group. * @param adj_list_type The adj list type for the edges. * @param prefix The absolute prefix. * @param vertex_chunk_index The vertex chunk index, default is 0. @@ -456,7 +445,7 @@ class AdjListPropertyArrowChunkReader { AdjListType adj_list_type, const std::string prefix, IdType vertex_chunk_index = 0, - const FilterOptions& opts = {}) + const utils::FilterOptions& options = {}) : edge_info_(edge_info), property_group_(property_group), adj_list_type_(adj_list_type), @@ -465,7 +454,7 @@ class AdjListPropertyArrowChunkReader { chunk_index_(0), seek_offset_(0), chunk_table_(nullptr), - filter_options_(opts) { + filter_options_(options) { GAR_ASSIGN_OR_RAISE_ERROR(fs_, FileSystemFromUriOrPath(prefix, &prefix_)); GAR_ASSIGN_OR_RAISE_ERROR( auto pg_path_prefix, @@ -535,7 +524,8 @@ class AdjListPropertyArrowChunkReader { } /** - * @brief Return the current chunk of chunk position indicator as arrow::Table + * @brief Return the current chunk of chunk position indicator as + * arrow::Table */ Result> GetChunk() noexcept; @@ -593,10 +583,11 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } - void Filter(arrow::compute::Expression* filter); + void Filter(const utils::RowFilter& filter); void ClearFilter(); - void Project(Columns* columns); + void Project(const utils::ColumnNames& columns); + void Project(const std::string& column); void ClearProjection(); private: @@ -607,7 +598,7 @@ class AdjListPropertyArrowChunkReader { IdType vertex_chunk_index_, chunk_index_; IdType seek_offset_; std::shared_ptr chunk_table_; - FilterOptions filter_options_; + utils::FilterOptions filter_options_; IdType vertex_chunk_num_, chunk_num_; std::string base_dir_; std::shared_ptr fs_; @@ -624,7 +615,7 @@ static inline Result ConstructVertexPropertyArrowChunkReader( const GraphInfo& graph_info, const std::string& label, const PropertyGroup& property_group, - const FilterOptions& opts = {}) noexcept { + const utils::FilterOptions& options = {}) noexcept { VertexInfo vertex_info; GAR_ASSIGN_OR_RAISE(vertex_info, graph_info.GetVertexInfo(label)); if (!vertex_info.ContainPropertyGroup(property_group)) { @@ -632,7 +623,7 @@ ConstructVertexPropertyArrowChunkReader( label, "."); } return VertexPropertyArrowChunkReader(vertex_info, property_group, - graph_info.GetPrefix(), 0, opts); + graph_info.GetPrefix(), 0, options); } /** @@ -704,7 +695,7 @@ ConstructAdjListPropertyArrowChunkReader( const GraphInfo& graph_info, const std::string& src_label, const std::string& edge_label, const std::string& dst_label, const PropertyGroup& property_group, AdjListType adj_list_type, - const FilterOptions& opts = {}) noexcept { + const utils::FilterOptions& options = {}) noexcept { EdgeInfo edge_info; GAR_ASSIGN_OR_RAISE(edge_info, graph_info.GetEdgeInfo(src_label, edge_label, dst_label)); @@ -720,7 +711,7 @@ ConstructAdjListPropertyArrowChunkReader( } return AdjListPropertyArrowChunkReader(edge_info, property_group, adj_list_type, graph_info.GetPrefix(), - 0, opts); + 0, options); } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index de2add725..2c6b232f5 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -26,8 +26,8 @@ limitations under the License. #include "gar/utils/status.h" #include "gar/utils/utils.h" -#include "arrow/compute/api.h" #include "arrow/dataset/api.h" +#include "gar/utils/reader_utils.h" // forward declarations namespace arrow { @@ -67,29 +67,18 @@ class FileSystem { ~FileSystem() = default; - /** - * @brief Read a file as an arrow::Table. - * - * @param path The path of the file to read. - * @param file_type The type of the file to read. - * @return A Result containing a std::shared_ptr to an arrow::Table if - * successful, or an error Status if unsuccessful. - */ - Result> ReadFileToTable( - const std::string& path, FileType file_type) const noexcept; - /** * @brief Read and filter a file as an arrow::Table. * * @param path The path of the file to read. * @param file_type The type of the file to read. - * @param opts Filter condition and columns to be read + * @param options Filter condition and columns to be read * @return A Result containing a std::shared_ptr to an arrow::Table if * successful, or an error Status if unsuccessful. */ - Result> ReadAndFilterFileToTable( + Result> ReadFileToTable( const std::string& path, FileType file_type, - const FilterOptions& opts) const noexcept; + const utils::FilterOptions& options = {}) const noexcept; /** * @brief Read a file and convert its bytes to a value of type T. diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index c63f36011..1acd70c26 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -18,13 +18,28 @@ limitations under the License. #include #include +#include +#include "arrow/compute/api.h" #include "gar/graph_info.h" namespace GAR_NAMESPACE_INTERNAL { namespace utils { +using RowFilter = arrow::compute::Expression; +using ColumnNames = std::vector; +struct FilterOptions { + // The row filter to apply to the table. + std::optional filter = std::nullopt; + // The columns to include in the table. Select all columns by default. + std::optional columns = std::nullopt; + + FilterOptions() {} + explicit FilterOptions(const RowFilter& filter, const ColumnNames& columns) + : filter(filter), columns(columns) {} +}; + Result> GetAdjListOffsetOfVertex( const EdgeInfo& edge_info, const std::string& prefix, AdjListType adj_list_type, IdType vid) noexcept; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index cef340140..028b6193c 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -13,15 +13,12 @@ See the License for the specific language governing permissions and limitations under the License. */ -#include - #include "arrow/api.h" #include "gar/reader/arrow_chunk_reader.h" #include "gar/utils/reader_utils.h" namespace GAR_NAMESPACE_INTERNAL { -namespace cp = arrow::compute; Result> VertexPropertyArrowChunkReader::GetChunk() noexcept { @@ -30,9 +27,9 @@ VertexPropertyArrowChunkReader::GetChunk() noexcept { auto chunk_file_path, vertex_info_.GetFilePath(property_group_, chunk_index_)); std::string path = prefix_ + chunk_file_path; - GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadAndFilterFileToTable( - path, property_group_.GetFileType(), - filter_options_)); + GAR_ASSIGN_OR_RAISE( + chunk_table_, fs_->ReadFileToTable(path, property_group_.GetFileType(), + filter_options_)); } IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); return chunk_table_->Slice(row_offset); @@ -50,20 +47,25 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter(cp::Expression* filter) { - filter_options_.filter = filter; +void VertexPropertyArrowChunkReader::Filter(const utils::RowFilter& filter) { + filter_options_.filter = std::make_optional(filter); } void VertexPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter = nullptr; + filter_options_.filter.reset(); } -void VertexPropertyArrowChunkReader::Project(Columns* columns) { - filter_options_.columns = columns; +void VertexPropertyArrowChunkReader::Project( + const utils::ColumnNames& columns) { + filter_options_.columns = std::make_optional(columns); +} + +void VertexPropertyArrowChunkReader::Project(const std::string& column) { + Project({column}); } void VertexPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns = nullptr; + filter_options_.columns.reset(); } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { @@ -248,28 +250,33 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { edge_info_.GetPropertyFilePath(property_group_, adj_list_type_, vertex_chunk_index_, chunk_index_)); std::string path = prefix_ + chunk_file_path; - GAR_ASSIGN_OR_RAISE(chunk_table_, fs_->ReadAndFilterFileToTable( - path, property_group_.GetFileType(), - filter_options_)); + GAR_ASSIGN_OR_RAISE( + chunk_table_, fs_->ReadFileToTable(path, property_group_.GetFileType(), + filter_options_)); } IdType row_offset = seek_offset_ - chunk_index_ * edge_info_.GetChunkSize(); return chunk_table_->Slice(row_offset); } -void AdjListPropertyArrowChunkReader::Filter(cp::Expression* filter) { - filter_options_.filter = (filter); +void AdjListPropertyArrowChunkReader::Filter(const utils::RowFilter& filter) { + filter_options_.filter = filter; } void AdjListPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter = nullptr; + filter_options_.filter.reset(); } -void AdjListPropertyArrowChunkReader::Project(Columns* columns) { +void AdjListPropertyArrowChunkReader::Project( + const utils::ColumnNames& columns) { filter_options_.columns = columns; } +void AdjListPropertyArrowChunkReader::Project(const std::string& column) { + Project({column}); +} + void AdjListPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns = nullptr; + filter_options_.columns.reset(); } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index c9ebf155d..53bef8710 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -17,13 +17,9 @@ limitations under the License. #include "arrow/api.h" #include "arrow/csv/api.h" #include "arrow/filesystem/api.h" -#include "arrow/io/api.h" #include "arrow/ipc/writer.h" -#include "arrow/util/uri.h" -#include "parquet/arrow/reader.h" #include "parquet/arrow/writer.h" -#include "gar/reader/arrow_chunk_reader.h" #include "gar/utils/filesystem.h" namespace GAR_NAMESPACE_INTERNAL { @@ -97,96 +93,24 @@ std::shared_ptr FileSystem::GetFileFormat( } Result> FileSystem::ReadFileToTable( - const std::string& path, FileType file_type) const noexcept { - arrow::MemoryPool* pool = arrow::default_memory_pool(); - std::shared_ptr table; - switch (file_type) { - case FileType::CSV: { - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto is, - arrow_fs_->OpenInputStream(path)); - auto read_options = arrow::csv::ReadOptions::Defaults(); - auto parse_options = arrow::csv::ParseOptions::Defaults(); - auto convert_options = arrow::csv::ConvertOptions::Defaults(); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( - auto reader, arrow::csv::TableReader::Make( - arrow::io::IOContext(pool), is, read_options, - parse_options, convert_options)); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, reader->Read()); - break; - } - case FileType::PARQUET: { - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto input, - arrow_fs_->OpenInputFile(path)); - std::unique_ptr reader; - RETURN_NOT_ARROW_OK(parquet::arrow::OpenFile(input, pool, &reader)); - RETURN_NOT_ARROW_OK(reader->ReadTable(&table)); - break; - } - case FileType::ORC: { - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto input, - arrow_fs_->OpenInputFile(path)); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( - auto reader, arrow::adapters::orc::ORCFileReader::Open(input, pool)); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, reader->Read()); - break; - } - default: - return Status::Invalid("Unsupported file type: ", - FileTypeToString(file_type)); - } - // cast string array to large string array as we need concatenate chunks in - // some places, e.g., in vineyard - for (int i = 0; i < table->num_columns(); ++i) { - std::shared_ptr type = table->column(i)->type(); - if (type->id() == arrow::Type::STRING) { - type = arrow::large_utf8(); - } else if (type->id() == arrow::Type::BINARY) { - type = arrow::large_binary(); - } - if (type->Equals(table->column(i)->type())) { - continue; - } - // do casting - auto field = table->field(i)->WithType(type); - std::shared_ptr chunked_array; - if (type->Equals(arrow::large_utf8())) { - auto status = detail::CastToLargeOffsetArray( - table->column(i), type, chunked_array); - GAR_RETURN_NOT_OK(status); - } else if (type->Equals(arrow::large_binary())) { - auto status = detail::CastToLargeOffsetArray( - table->column(i), type, chunked_array); - GAR_RETURN_NOT_OK(status); - } else { - // noop - chunked_array = table->column(i); - } - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(table, table->RemoveColumn(i)); - GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( - table, table->AddColumn(i, field, chunked_array)); - } - return table; -} - -Result> FileSystem::ReadAndFilterFileToTable( const std::string& path, FileType file_type, - const FilterOptions& opts) const noexcept { + const utils::FilterOptions& options) const noexcept { std::shared_ptr format = GetFileFormat(file_type); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( auto factory, arrow::dataset::FileSystemDatasetFactory::Make( arrow_fs_, {path}, format, arrow::dataset::FileSystemFactoryOptions())); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto dataset, factory->Finish()); - // Read specified columns with a row filter GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); - if (opts.filter) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(*opts.filter)); + + // Read specified columns with a row filter + if (auto filter = options.filter; filter.has_value()) { + RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value())); } - if (opts.columns) { - RETURN_NOT_ARROW_OK(scan_builder->Project(*opts.columns)); + if (auto columns = options.columns; columns.has_value()) { + RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); } + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto table, scanner->ToTable()); // cast string array to large string array as we need concatenate chunks in diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 32115b5ca..d6cf20eaf 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -112,7 +112,11 @@ TEST_CASE("test_vertex_property_pushdown") { // pushdown options auto filter = cp::equal(cp::field_ref("gender"), cp::literal("female")); - std::vector columns = {"firstName", "lastName"}; + std::vector columns{"firstName", "lastName"}; + + GAR_NAMESPACE::utils::FilterOptions options; + options.filter = filter; + options.columns = columns; auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int i = 0; @@ -141,7 +145,7 @@ TEST_CASE("test_vertex_property_pushdown") { SECTION("pushdown by helper function") { std::cout << "vertex property pushdown by helper function: \n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( - graph_info, label, group, {&filter, &columns}); + graph_info, label, group, options); REQUIRE(maybe_reader.status().ok()); walkReader(maybe_reader.value()); } @@ -153,8 +157,8 @@ TEST_CASE("test_vertex_property_pushdown") { graph_info, label, group); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); - reader.Filter(&filter); - reader.Project(&columns); + reader.Filter(filter); + reader.Project(columns); walkReader(reader); } } @@ -294,8 +298,10 @@ TEST_CASE("test_adj_list_property_pushdown") { auto filter = cp::greater_equal(cp::field_ref("creationDate"), cp::literal("2012-06-02T04:30:44.526+0000")); - std::vector columns = {"creationDate"}; - GAR_NAMESPACE::FilterOptions opts{&filter, &columns}; + std::vector columns{"creationDate"}; + GAR_NAMESPACE::utils::FilterOptions options; + options.filter = filter; + options.columns = columns; auto walkReader = [&](GAR_NAMESPACE::AdjListPropertyArrowChunkReader& reader) { @@ -324,7 +330,7 @@ TEST_CASE("test_adj_list_property_pushdown") { std::cout << "adj list property pushdown by helper function: \n"; auto maybe_reader = GAR_NAMESPACE::ConstructAdjListPropertyArrowChunkReader( graph_info, src_label, edge_label, dst_label, group, - GAR_NAMESPACE::AdjListType::ordered_by_source, opts); + GAR_NAMESPACE::AdjListType::ordered_by_source, options); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); walkReader(reader); @@ -338,8 +344,8 @@ TEST_CASE("test_adj_list_property_pushdown") { GAR_NAMESPACE::AdjListType::ordered_by_source); REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); - reader.Filter(&filter); - reader.Project(&columns); + reader.Filter(filter); + reader.Project(columns); walkReader(reader); } } From 3bd19317d57f3e8a577422dd9425531b822f7824 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Fri, 23 Jun 2023 16:14:36 +0800 Subject: [PATCH 07/26] Refactor: wrapper for arrow::compute::Expression Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 3 - cpp/include/gar/utils/expression.h | 108 ++++++++++++++++++++ cpp/include/gar/utils/filesystem.h | 8 -- cpp/include/gar/utils/reader_utils.h | 7 +- cpp/src/arrow_chunk_reader.cc | 2 +- cpp/src/filesystem.cc | 2 +- cpp/test/test_arrow_chunk_reader.cc | 10 +- 7 files changed, 119 insertions(+), 21 deletions(-) create mode 100644 cpp/include/gar/utils/expression.h diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index edc47f1c3..018184896 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -33,9 +33,6 @@ limitations under the License. namespace arrow { class Array; class Table; -namespace compute { -class Expression; -} } // namespace arrow namespace GAR_NAMESPACE_INTERNAL { diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h new file mode 100644 index 000000000..c1fa609a6 --- /dev/null +++ b/cpp/include/gar/utils/expression.h @@ -0,0 +1,108 @@ +/** Copyright 2022 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +#include + +#include "arrow/compute/api.h" + +#ifndef GAR_UTILS_EXPRESSION_H_ +#define GAR_UTILS_EXPRESSION_H_ + +namespace GAR_NAMESPACE_INTERNAL { + +/** + * CompareOperator is an enum class that represents the relational operators + * that can be used to compare two values. + */ +enum class CompareOperator : std::uint8_t { + equal, //"=" + not_equal, //"<>" + less, //"<" + less_equal, //"<=" + greater, //">" + greater_equal, //">=" +}; + +/** + * This class wraps an arrow::compute::Expression and provides methods for + * reading arrow::compute::Expression objects + */ +class Expression { + friend class FilterBuilder; + + public: + Expression() = default; + Expression(const Expression& other) : expr_(other.expr_) {} + ~Expression() = default; + + bool Equals(const Expression& other) { return expr_.Equals(other.expr_); } + arrow::compute::Expression GetExpr() const { return expr_; } + + private: + explicit Expression(arrow::compute::Expression expr) : expr_(expr) {} + + arrow::compute::Expression expr_; +}; + +/** + * This class builds an expression tree for a filter. + */ +class FilterBuilder { + public: + FilterBuilder() = default; + ~FilterBuilder() = default; + + static auto OperatorTypeToArrowOpFunc(CompareOperator op) { + switch (op) { + case CompareOperator::equal: + return arrow::compute::equal; + case CompareOperator::not_equal: + return arrow::compute::not_equal; + case CompareOperator::less: + return arrow::compute::less; + case CompareOperator::less_equal: + return arrow::compute::less_equal; + case CompareOperator::greater: + return arrow::compute::greater; + case CompareOperator::greater_equal: + return arrow::compute::greater_equal; + } + } + + template + static Expression Make(const std::string& property, CompareOperator op, + const T value) { + auto func = OperatorTypeToArrowOpFunc(op); + return Expression(func(arrow::compute::field_ref(property), + arrow::compute::literal(value))); + } + + static Expression And(const Expression& left, const Expression& right) { + return Expression(arrow::compute::and_(left.GetExpr(), right.GetExpr())); + } + + static Expression Or(const Expression& left, const Expression& right) { + return Expression(arrow::compute::or_(left.GetExpr(), right.GetExpr())); + } + + static Expression Not(const Expression& expr) { + return Expression(arrow::compute::not_(expr.GetExpr())); + } + + private: + Expression expr_; +}; +} // namespace GAR_NAMESPACE_INTERNAL +#endif // GAR_UTILS_EXPRESSION_H_ diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 2c6b232f5..97b8a3bae 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -33,12 +33,6 @@ limitations under the License. namespace arrow { class Buffer; class Table; -namespace compute { -class Expression; -} -namespace dataset { -class FileFormat; -} namespace fs { class FileSystem; } @@ -49,8 +43,6 @@ class RandomAccessFile; namespace GAR_NAMESPACE_INTERNAL { -struct FilterOptions; - /** * This class wraps an arrow::fs::FileSystem and provides methods for * reading and writing arrow::Table objects from and to files, as well as diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 1acd70c26..0a3a8ccf4 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -20,15 +20,16 @@ limitations under the License. #include #include -#include "arrow/compute/api.h" #include "gar/graph_info.h" +#include "gar/utils/expression.h" namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using RowFilter = arrow::compute::Expression; +using RowFilter = Expression; using ColumnNames = std::vector; + struct FilterOptions { // The row filter to apply to the table. std::optional filter = std::nullopt; @@ -36,7 +37,7 @@ struct FilterOptions { std::optional columns = std::nullopt; FilterOptions() {} - explicit FilterOptions(const RowFilter& filter, const ColumnNames& columns) + FilterOptions(const RowFilter& filter, const ColumnNames& columns) : filter(filter), columns(columns) {} }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 028b6193c..1ace816bb 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -259,7 +259,7 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { } void AdjListPropertyArrowChunkReader::Filter(const utils::RowFilter& filter) { - filter_options_.filter = filter; + filter_options_.filter = std::make_optional(filter); } void AdjListPropertyArrowChunkReader::ClearFilter() { diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 53bef8710..5501b3ecc 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -105,7 +105,7 @@ Result> FileSystem::ReadFileToTable( // Read specified columns with a row filter if (auto filter = options.filter; filter.has_value()) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value())); + RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value().GetExpr())); } if (auto columns = options.columns; columns.has_value()) { RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index d6cf20eaf..f895cddc0 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -31,8 +31,6 @@ limitations under the License. #define CATCH_CONFIG_MAIN #include -namespace cp = arrow::compute; - TEST_CASE("test_vertex_property_arrow_chunk_reader") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); @@ -111,7 +109,8 @@ TEST_CASE("test_vertex_property_pushdown") { auto group = maybe_group.value(); // pushdown options - auto filter = cp::equal(cp::field_ref("gender"), cp::literal("female")); + auto filter = GAR_NAMESPACE::FilterBuilder::Make( + "gender", GAR_NAMESPACE::CompareOperator::equal, "female"); std::vector columns{"firstName", "lastName"}; GAR_NAMESPACE::utils::FilterOptions options; @@ -296,8 +295,9 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - auto filter = cp::greater_equal(cp::field_ref("creationDate"), - cp::literal("2012-06-02T04:30:44.526+0000")); + auto filter = GAR_NAMESPACE::FilterBuilder::Make( + "creationDate", GAR_NAMESPACE::CompareOperator::greater_equal, + "2012-06-02T04:30:44.526+0000"); std::vector columns{"creationDate"}; GAR_NAMESPACE::utils::FilterOptions options; options.filter = filter; From 5036b2a36fc9d595e47d698ed9941d629c74f63d Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sat, 24 Jun 2023 00:26:59 +0800 Subject: [PATCH 08/26] Refactor: make friend class Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 30 ++++++++++++++--------------- cpp/src/filesystem.cc | 2 +- cpp/test/test_arrow_chunk_reader.cc | 6 +++++- 3 files changed, 20 insertions(+), 18 deletions(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index c1fa609a6..c824c10c9 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -41,29 +41,29 @@ enum class CompareOperator : std::uint8_t { */ class Expression { friend class FilterBuilder; + friend class FileSystem; public: Expression() = default; - Expression(const Expression& other) : expr_(other.expr_) {} + Expression(const Expression& other) = default; ~Expression() = default; - bool Equals(const Expression& other) { return expr_.Equals(other.expr_); } - arrow::compute::Expression GetExpr() const { return expr_; } + bool Equals(const Expression& other) { + return arrow_expr_.Equals(other.arrow_expr_); + } private: - explicit Expression(arrow::compute::Expression expr) : expr_(expr) {} + explicit Expression(arrow::compute::Expression expr) + : arrow_expr_(std::move(expr)) {} - arrow::compute::Expression expr_; + arrow::compute::Expression arrow_expr_; }; /** * This class builds an expression tree for a filter. */ class FilterBuilder { - public: - FilterBuilder() = default; - ~FilterBuilder() = default; - + private: static auto OperatorTypeToArrowOpFunc(CompareOperator op) { switch (op) { case CompareOperator::equal: @@ -81,6 +81,7 @@ class FilterBuilder { } } + public: template static Expression Make(const std::string& property, CompareOperator op, const T value) { @@ -88,21 +89,18 @@ class FilterBuilder { return Expression(func(arrow::compute::field_ref(property), arrow::compute::literal(value))); } - static Expression And(const Expression& left, const Expression& right) { - return Expression(arrow::compute::and_(left.GetExpr(), right.GetExpr())); + return Expression( + arrow::compute::and_(left.arrow_expr_, right.arrow_expr_)); } static Expression Or(const Expression& left, const Expression& right) { - return Expression(arrow::compute::or_(left.GetExpr(), right.GetExpr())); + return Expression(arrow::compute::or_(left.arrow_expr_, right.arrow_expr_)); } static Expression Not(const Expression& expr) { - return Expression(arrow::compute::not_(expr.GetExpr())); + return Expression(arrow::compute::not_(expr.arrow_expr_)); } - - private: - Expression expr_; }; } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 5501b3ecc..0d00c4181 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -105,7 +105,7 @@ Result> FileSystem::ReadFileToTable( // Read specified columns with a row filter if (auto filter = options.filter; filter.has_value()) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value().GetExpr())); + RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value().arrow_expr_)); } if (auto columns = options.columns; columns.has_value()) { RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index f895cddc0..d0e26f270 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -108,7 +108,7 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - // pushdown options + // construct pushdown options auto filter = GAR_NAMESPACE::FilterBuilder::Make( "gender", GAR_NAMESPACE::CompareOperator::equal, "female"); std::vector columns{"firstName", "lastName"}; @@ -117,6 +117,7 @@ TEST_CASE("test_vertex_property_pushdown") { options.filter = filter; options.columns = columns; + // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int i = 0; int sum = 0; @@ -295,14 +296,17 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); + // construct pushdown options auto filter = GAR_NAMESPACE::FilterBuilder::Make( "creationDate", GAR_NAMESPACE::CompareOperator::greater_equal, "2012-06-02T04:30:44.526+0000"); std::vector columns{"creationDate"}; + GAR_NAMESPACE::utils::FilterOptions options; options.filter = filter; options.columns = columns; + // print reader result auto walkReader = [&](GAR_NAMESPACE::AdjListPropertyArrowChunkReader& reader) { int i = 0; From bdb84830d3f5a6cd10f4b54c2ca738baffc135bc Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sat, 24 Jun 2023 14:37:40 +0800 Subject: [PATCH 09/26] Refactor: Fix style Signed-off-by: Ziy1-Tan --- cpp/include/gar/graph_info.h | 5 ++ cpp/include/gar/utils/expression.h | 91 ++++++++++++++++------------- cpp/test/test_arrow_chunk_reader.cc | 18 ++++-- 3 files changed, 68 insertions(+), 46 deletions(-) diff --git a/cpp/include/gar/graph_info.h b/cpp/include/gar/graph_info.h index 657321abe..98c7599f8 100644 --- a/cpp/include/gar/graph_info.h +++ b/cpp/include/gar/graph_info.h @@ -43,6 +43,11 @@ struct Property { std::string name; // property name DataType type; // property data type bool is_primary; // primary key tag + + Property() {} + Property(const std::string& name) : name(name) {} + Property(const std::string& name, const DataType& type, bool is_primary) + : name(name), type(type), is_primary(is_primary) {} }; static bool operator==(const Property& lhs, const Property& rhs) { diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index c824c10c9..7397044e5 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -17,6 +17,8 @@ limitations under the License. #include "arrow/compute/api.h" +#include "gar/graph_info.h" + #ifndef GAR_UTILS_EXPRESSION_H_ #define GAR_UTILS_EXPRESSION_H_ @@ -27,12 +29,12 @@ namespace GAR_NAMESPACE_INTERNAL { * that can be used to compare two values. */ enum class CompareOperator : std::uint8_t { - equal, //"=" - not_equal, //"<>" - less, //"<" - less_equal, //"<=" - greater, //">" - greater_equal, //">=" + EQUAL, //"=" + NOT_EQUAL, //"<>" + LESS, //"<" + LESS_EQUAL, //"<=" + GREATER, //">" + GREATER_EQUAL, //">=" }; /** @@ -40,67 +42,74 @@ enum class CompareOperator : std::uint8_t { * reading arrow::compute::Expression objects */ class Expression { - friend class FilterBuilder; friend class FileSystem; + friend Expression And(const Expression& lhs, const Expression& rhs); + friend Expression Or(const Expression& lhs, const Expression& rhs); + friend Expression Not(const Expression& expr); public: Expression() = default; Expression(const Expression& other) = default; ~Expression() = default; - - bool Equals(const Expression& other) { - return arrow_expr_.Equals(other.arrow_expr_); - } - - private: - explicit Expression(arrow::compute::Expression expr) - : arrow_expr_(std::move(expr)) {} - - arrow::compute::Expression arrow_expr_; -}; - -/** - * This class builds an expression tree for a filter. - */ -class FilterBuilder { - private: static auto OperatorTypeToArrowOpFunc(CompareOperator op) { switch (op) { - case CompareOperator::equal: + case CompareOperator::EQUAL: return arrow::compute::equal; - case CompareOperator::not_equal: + case CompareOperator::NOT_EQUAL: return arrow::compute::not_equal; - case CompareOperator::less: + case CompareOperator::LESS: return arrow::compute::less; - case CompareOperator::less_equal: + case CompareOperator::LESS_EQUAL: return arrow::compute::less_equal; - case CompareOperator::greater: + case CompareOperator::GREATER: return arrow::compute::greater; - case CompareOperator::greater_equal: + case CompareOperator::GREATER_EQUAL: return arrow::compute::greater_equal; } } - public: template - static Expression Make(const std::string& property, CompareOperator op, + static Expression Make(const Property& property, CompareOperator op, const T value) { auto func = OperatorTypeToArrowOpFunc(op); - return Expression(func(arrow::compute::field_ref(property), + return Expression(func(arrow::compute::field_ref(property.name), arrow::compute::literal(value))); } - static Expression And(const Expression& left, const Expression& right) { - return Expression( - arrow::compute::and_(left.arrow_expr_, right.arrow_expr_)); - } - static Expression Or(const Expression& left, const Expression& right) { - return Expression(arrow::compute::or_(left.arrow_expr_, right.arrow_expr_)); + template + static Expression Make(const T value, CompareOperator op, + const Property& property) { + return Make(property, op, value); } - static Expression Not(const Expression& expr) { - return Expression(arrow::compute::not_(expr.arrow_expr_)); + static Expression Make(const Property& lhs, CompareOperator op, + const Property& rhs) { + auto func = OperatorTypeToArrowOpFunc(op); + return Expression(func(arrow::compute::field_ref(lhs.name), + arrow::compute::field_ref(rhs.name))); } + + private: + explicit Expression(arrow::compute::Expression expr) + : arrow_expr_(std::move(expr)) {} + + arrow::compute::Expression arrow_expr_; }; + +/** + * This class builds an expression tree for a filter. + */ + +inline Expression And(const Expression& lhs, const Expression& rhs) { + return Expression(arrow::compute::and_(lhs.arrow_expr_, rhs.arrow_expr_)); +} + +inline Expression Or(const Expression& lhs, const Expression& rhs) { + return Expression(arrow::compute::or_(lhs.arrow_expr_, rhs.arrow_expr_)); +} + +inline Expression Not(const Expression& expr) { + return Expression(arrow::compute::not_(expr.arrow_expr_)); +} } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index d0e26f270..cf4aab25e 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -108,9 +108,12 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); + GAR_NAMESPACE::Property prop("gender"); + GAR_NAMESPACE::CompareOperator op = GAR_NAMESPACE::CompareOperator::EQUAL; + std::string val("female"); + // construct pushdown options - auto filter = GAR_NAMESPACE::FilterBuilder::Make( - "gender", GAR_NAMESPACE::CompareOperator::equal, "female"); + auto filter = GAR_NAMESPACE::Expression::Make(prop, op, val); std::vector columns{"firstName", "lastName"}; GAR_NAMESPACE::utils::FilterOptions options; @@ -296,10 +299,15 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); + GAR_NAMESPACE::Property prop("creationDate"); + GAR_NAMESPACE::CompareOperator op = + GAR_NAMESPACE::CompareOperator::GREATER_EQUAL; + std::string val("2012-06-02T04:30:44.526+0000"); // construct pushdown options - auto filter = GAR_NAMESPACE::FilterBuilder::Make( - "creationDate", GAR_NAMESPACE::CompareOperator::greater_equal, - "2012-06-02T04:30:44.526+0000"); + auto filter = GAR_NAMESPACE::And( + GAR_NAMESPACE::Expression::Make(val, op, prop), + GAR_NAMESPACE::Expression::Make( + prop, GAR_NAMESPACE::CompareOperator::EQUAL, prop)); std::vector columns{"creationDate"}; GAR_NAMESPACE::utils::FilterOptions options; From 11e00f44a39fec705edb8d1200fdb0a60ead381d Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sat, 24 Jun 2023 14:48:35 +0800 Subject: [PATCH 10/26] Refactor: Fix style Signed-off-by: Ziy1-Tan --- cpp/include/gar/graph_info.h | 2 +- cpp/include/gar/utils/expression.h | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/include/gar/graph_info.h b/cpp/include/gar/graph_info.h index 98c7599f8..55846ea68 100644 --- a/cpp/include/gar/graph_info.h +++ b/cpp/include/gar/graph_info.h @@ -45,7 +45,7 @@ struct Property { bool is_primary; // primary key tag Property() {} - Property(const std::string& name) : name(name) {} + explicit Property(const std::string& name) : name(name) {} Property(const std::string& name, const DataType& type, bool is_primary) : name(name), type(type), is_primary(is_primary) {} }; diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 7397044e5..e3d5bb439 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -14,6 +14,7 @@ limitations under the License. */ #include +#include #include "arrow/compute/api.h" From b1d536a857d0d3d5554cfd0be782e606a095053f Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sun, 25 Jun 2023 00:44:08 +0800 Subject: [PATCH 11/26] Refactor: more friendly Expression API Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 8 +- cpp/include/gar/utils/expression.h | 297 ++++++++++++++++---- cpp/include/gar/utils/reader_utils.h | 10 +- cpp/src/arrow_chunk_reader.cc | 24 +- cpp/src/expression.cc | 55 ++++ cpp/src/filesystem.cc | 8 +- cpp/test/test_arrow_chunk_reader.cc | 28 +- 7 files changed, 336 insertions(+), 94 deletions(-) create mode 100644 cpp/src/expression.cc diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 018184896..8edff87ef 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -128,10 +128,10 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(const utils::RowFilter& filter); + void Filter(utils::RowFilter filter); void ClearFilter(); - void Project(const utils::ColumnNames& columns); + void Project(utils::ColumnNames columns); void Project(const std::string& column); void ClearProjection(); @@ -580,10 +580,10 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } - void Filter(const utils::RowFilter& filter); + void Filter(utils::RowFilter filter); void ClearFilter(); - void Project(const utils::ColumnNames& columns); + void Project(utils::ColumnNames columns); void Project(const std::string& column); void ClearProjection(); diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index e3d5bb439..4f027458c 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -1,4 +1,4 @@ -/** Copyright 2022 Alibaba Group Holding Limited. +/** Copyright 2023 Alibaba Group Holding Limited. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include -#include #include "arrow/compute/api.h" @@ -29,13 +29,19 @@ namespace GAR_NAMESPACE_INTERNAL { * CompareOperator is an enum class that represents the relational operators * that can be used to compare two values. */ -enum class CompareOperator : std::uint8_t { - EQUAL, //"=" - NOT_EQUAL, //"<>" - LESS, //"<" - LESS_EQUAL, //"<=" - GREATER, //">" - GREATER_EQUAL, //">=" + +using ArrowExpression = arrow::compute::Expression; +enum class Operator : std::uint8_t { + Equal, // "=" + NotEqual, // "<>" + Less, // "<" + LessEqual, // "<=" + Greater, // ">" + GreaterEqual, // ">=" + And, // "and" + Or, // "or" + Not, // "not" + IsNull // "is null" }; /** @@ -43,74 +49,255 @@ enum class CompareOperator : std::uint8_t { * reading arrow::compute::Expression objects */ class Expression { - friend class FileSystem; - friend Expression And(const Expression& lhs, const Expression& rhs); - friend Expression Or(const Expression& lhs, const Expression& rhs); - friend Expression Not(const Expression& expr); - public: Expression() = default; Expression(const Expression& other) = default; - ~Expression() = default; - static auto OperatorTypeToArrowOpFunc(CompareOperator op) { - switch (op) { - case CompareOperator::EQUAL: - return arrow::compute::equal; - case CompareOperator::NOT_EQUAL: - return arrow::compute::not_equal; - case CompareOperator::LESS: - return arrow::compute::less; - case CompareOperator::LESS_EQUAL: - return arrow::compute::less_equal; - case CompareOperator::GREATER: - return arrow::compute::greater; - case CompareOperator::GREATER_EQUAL: - return arrow::compute::greater_equal; - } - } + virtual ~Expression() = default; template - static Expression Make(const Property& property, CompareOperator op, - const T value) { - auto func = OperatorTypeToArrowOpFunc(op); - return Expression(func(arrow::compute::field_ref(property.name), - arrow::compute::literal(value))); - } + static inline Result Make(const Property& property, Operator op, + const T& value); template - static Expression Make(const T value, CompareOperator op, - const Property& property) { - return Make(property, op, value); + static inline Result Make(const T& value, Operator op, + const Property& property); + + static inline Result Make(const Property& p1, Operator op, + const Property& p2); + + virtual ArrowExpression Evaluate() = 0; +}; + +class ExpressionProperty : public Expression { + public: + explicit ExpressionProperty(const Property& property) : property_(property) {} + ExpressionProperty(const ExpressionProperty& other) = default; + ~ExpressionProperty() = default; + + ArrowExpression Evaluate() override; + + private: + Property property_; +}; + +template +class ExpressionLiteral : public Expression { + public: + explicit ExpressionLiteral(const T value) : value_(value) {} + ExpressionLiteral(const ExpressionLiteral& other) = default; + ~ExpressionLiteral() = default; + + ArrowExpression Evaluate() { return arrow::compute::literal(value_); } + + private: + T value_; +}; + +class UnaryOperator : public Expression { + public: + UnaryOperator() = default; + explicit UnaryOperator(Expression* expr) : expr_(expr) {} + UnaryOperator(const UnaryOperator& other) = default; + virtual ~UnaryOperator() {} + + protected: + std::shared_ptr expr_; +}; + +class OperatorNot : public UnaryOperator { + public: + OperatorNot() = default; + explicit OperatorNot(Expression* expr) : UnaryOperator(expr) {} + OperatorNot(const OperatorNot& other) = default; + ~OperatorNot() = default; + + ArrowExpression Evaluate() override { + return arrow::compute::not_(expr_->Evaluate()); } +}; + +class OperatorIsNull : public UnaryOperator { + public: + OperatorIsNull() = default; + explicit OperatorIsNull(Expression* expr, bool nan_is_null = false) + : UnaryOperator(expr), nan_is_null_(nan_is_null) {} + OperatorIsNull(const OperatorIsNull& other) = default; + ~OperatorIsNull() = default; - static Expression Make(const Property& lhs, CompareOperator op, - const Property& rhs) { - auto func = OperatorTypeToArrowOpFunc(op); - return Expression(func(arrow::compute::field_ref(lhs.name), - arrow::compute::field_ref(rhs.name))); + ArrowExpression Evaluate() override { + return arrow::compute::is_null(expr_->Evaluate(), nan_is_null_); } private: - explicit Expression(arrow::compute::Expression expr) - : arrow_expr_(std::move(expr)) {} + bool nan_is_null_; +}; + +class BinaryOperator : public Expression { + public: + BinaryOperator() = default; + BinaryOperator(Expression* lhs, Expression* rhs) : lhs_(lhs), rhs_(rhs) {} + BinaryOperator(const BinaryOperator& other) = default; + ~BinaryOperator() = default; + + protected: + std::shared_ptr lhs_; + std::shared_ptr rhs_; +}; + +class OperatorEqual : public BinaryOperator { + public: + OperatorEqual() = default; + OperatorEqual(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} + OperatorEqual(const OperatorEqual& other) = default; + ~OperatorEqual() = default; + + ArrowExpression Evaluate() override; +}; - arrow::compute::Expression arrow_expr_; +class OperatorNotEqual : public BinaryOperator { + public: + OperatorNotEqual() = default; + OperatorNotEqual(Expression* lhs, Expression* rhs) + : BinaryOperator(lhs, rhs) {} + OperatorNotEqual(const OperatorNotEqual& other) = default; + ~OperatorNotEqual() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorGreater : public BinaryOperator { + public: + OperatorGreater() = default; + OperatorGreater(Expression* lhs, Expression* rhs) + : BinaryOperator(lhs, rhs) {} + OperatorGreater(const OperatorGreater& other) = default; + ~OperatorGreater() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorGreaterEqual : public BinaryOperator { + public: + OperatorGreaterEqual() = default; + OperatorGreaterEqual(Expression* lhs, Expression* rhs) + : BinaryOperator(lhs, rhs) {} + OperatorGreaterEqual(const OperatorGreaterEqual& other) = default; + ~OperatorGreaterEqual() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorLess : public BinaryOperator { + public: + OperatorLess() = default; + OperatorLess(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} + OperatorLess(const OperatorLess& other) = default; + ~OperatorLess() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorLessEqual : public BinaryOperator { + public: + OperatorLessEqual() = default; + OperatorLessEqual(Expression* lhs, Expression* rhs) + : BinaryOperator(lhs, rhs) {} + OperatorLessEqual(const OperatorLessEqual& other) = default; + ~OperatorLessEqual() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorAnd : public BinaryOperator { + public: + OperatorAnd() = default; + OperatorAnd(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} + OperatorAnd(const OperatorAnd& other) = default; + ~OperatorAnd() = default; + + ArrowExpression Evaluate() override; +}; + +class OperatorOr : public BinaryOperator { + public: + OperatorOr() = default; + OperatorOr(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} + OperatorOr(const OperatorOr& other) = default; + ~OperatorOr() = default; + + ArrowExpression Evaluate() override; }; /** - * This class builds an expression tree for a filter. + * Helper functions to Construct Expression. */ +template +inline Result Expression::Make(const Property& property, + Operator op, const T& value) { + switch (op) { +#define TO_OPERATOR_CASE_PV(_op) \ + case Operator::_op: { \ + return new Operator##_op(new ExpressionProperty((property)), \ + new ExpressionLiteral((value))); \ + } + TO_OPERATOR_CASE_PV(Equal) + TO_OPERATOR_CASE_PV(NotEqual) + TO_OPERATOR_CASE_PV(Less) + TO_OPERATOR_CASE_PV(LessEqual) + TO_OPERATOR_CASE_PV(Greater) + TO_OPERATOR_CASE_PV(GreaterEqual) + TO_OPERATOR_CASE_PV(And) + TO_OPERATOR_CASE_PV(Or) + default: + break; + } + return Status::Invalid("Unrecognized binary operator"); +} -inline Expression And(const Expression& lhs, const Expression& rhs) { - return Expression(arrow::compute::and_(lhs.arrow_expr_, rhs.arrow_expr_)); +template +inline Result Expression::Make(const T& value, Operator op, + const Property& property) { + return Expression::Make(property, op, value); } -inline Expression Or(const Expression& lhs, const Expression& rhs) { - return Expression(arrow::compute::or_(lhs.arrow_expr_, rhs.arrow_expr_)); +inline Result Expression::Make(const Property& p1, Operator op, + const Property& p2) { + switch (op) { +#define TO_OPERATOR_CASE_PP(_op) \ + case Operator::_op: { \ + return new Operator##_op(new ExpressionProperty((p1)), \ + new ExpressionProperty((p2))); \ + } + TO_OPERATOR_CASE_PP(Equal); + TO_OPERATOR_CASE_PP(NotEqual); + TO_OPERATOR_CASE_PP(Less); + TO_OPERATOR_CASE_PP(LessEqual); + TO_OPERATOR_CASE_PP(Greater); + TO_OPERATOR_CASE_PP(GreaterEqual); + TO_OPERATOR_CASE_PP(And); + TO_OPERATOR_CASE_PP(Or); + default: + break; + } + return Status::Invalid("Unrecognized binary operator"); } -inline Expression Not(const Expression& expr) { - return Expression(arrow::compute::not_(expr.arrow_expr_)); +static inline Result Not(Expression* expr) { + return new OperatorNot(expr); } + +static inline Result IsNull(Expression* expr, + bool nan_is_null = false) { + return new OperatorIsNull(expr, nan_is_null); +} + +static inline Result And(Expression* lhs, Expression* rhs) { + return new OperatorAnd(lhs, rhs); +} + +static inline Result Or(Expression* lhs, Expression* rhs) { + return new OperatorOr(lhs, rhs); +} + } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 0a3a8ccf4..d998e98ed 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -27,17 +27,17 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using RowFilter = Expression; -using ColumnNames = std::vector; +using RowFilter = Expression*; +using ColumnNames = std::vector*; struct FilterOptions { // The row filter to apply to the table. - std::optional filter = std::nullopt; + RowFilter filter = nullptr; // The columns to include in the table. Select all columns by default. - std::optional columns = std::nullopt; + ColumnNames columns = nullptr; FilterOptions() {} - FilterOptions(const RowFilter& filter, const ColumnNames& columns) + FilterOptions(RowFilter filter, ColumnNames columns) : filter(filter), columns(columns) {} }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 1ace816bb..410917030 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -47,17 +47,16 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter(const utils::RowFilter& filter) { - filter_options_.filter = std::make_optional(filter); +void VertexPropertyArrowChunkReader::Filter(utils::RowFilter filter) { + filter_options_.filter = filter; } void VertexPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter.reset(); + filter_options_.filter = nullptr; } -void VertexPropertyArrowChunkReader::Project( - const utils::ColumnNames& columns) { - filter_options_.columns = std::make_optional(columns); +void VertexPropertyArrowChunkReader::Project(utils::ColumnNames columns) { + filter_options_.columns = columns; } void VertexPropertyArrowChunkReader::Project(const std::string& column) { @@ -65,7 +64,7 @@ void VertexPropertyArrowChunkReader::Project(const std::string& column) { } void VertexPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns.reset(); + filter_options_.columns = nullptr; } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { @@ -258,16 +257,15 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { return chunk_table_->Slice(row_offset); } -void AdjListPropertyArrowChunkReader::Filter(const utils::RowFilter& filter) { - filter_options_.filter = std::make_optional(filter); +void AdjListPropertyArrowChunkReader::Filter(utils::RowFilter filter) { + filter_options_.filter = filter; } void AdjListPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter.reset(); + filter_options_.filter = nullptr; } -void AdjListPropertyArrowChunkReader::Project( - const utils::ColumnNames& columns) { +void AdjListPropertyArrowChunkReader::Project(utils::ColumnNames columns) { filter_options_.columns = columns; } @@ -276,7 +274,7 @@ void AdjListPropertyArrowChunkReader::Project(const std::string& column) { } void AdjListPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns.reset(); + filter_options_.columns = nullptr; } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc new file mode 100644 index 000000000..22de971bd --- /dev/null +++ b/cpp/src/expression.cc @@ -0,0 +1,55 @@ +/** Copyright 2023 Alibaba Group Holding Limited. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ +#include "gar/utils/expression.h" + +namespace GAR_NAMESPACE_INTERNAL { + +ArrowExpression ExpressionProperty::Evaluate() { + return arrow::compute::field_ref(property_.name); +} + +ArrowExpression OperatorEqual::Evaluate() { + return arrow::compute::equal(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorNotEqual::Evaluate() { + return arrow::compute::not_equal(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorGreater::Evaluate() { + return arrow::compute::greater(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorGreaterEqual::Evaluate() { + return arrow::compute::greater_equal(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorLess::Evaluate() { + return arrow::compute::less(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorLessEqual::Evaluate() { + return arrow::compute::less_equal(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorAnd::Evaluate() { + return arrow::compute::and_(lhs_->Evaluate(), rhs_->Evaluate()); +} + +ArrowExpression OperatorOr::Evaluate() { + return arrow::compute::or_(lhs_->Evaluate(), rhs_->Evaluate()); +} + +} // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 0d00c4181..8a541d6a2 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -104,11 +104,11 @@ Result> FileSystem::ReadFileToTable( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); // Read specified columns with a row filter - if (auto filter = options.filter; filter.has_value()) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(filter.value().arrow_expr_)); + if (options.filter) { + RETURN_NOT_ARROW_OK(scan_builder->Filter(options.filter->Evaluate())); } - if (auto columns = options.columns; columns.has_value()) { - RETURN_NOT_ARROW_OK(scan_builder->Project(columns.value())); + if (options.columns) { + RETURN_NOT_ARROW_OK(scan_builder->Project(*options.columns)); } GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scanner, scan_builder->Finish()); diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index cf4aab25e..6dc546ef1 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -26,6 +26,7 @@ limitations under the License. #include "./util.h" #include "gar/reader/arrow_chunk_reader.h" +#include "gar/utils/expression.h" #include "gar/writer/arrow_chunk_writer.h" #define CATCH_CONFIG_MAIN @@ -109,16 +110,16 @@ TEST_CASE("test_vertex_property_pushdown") { auto group = maybe_group.value(); GAR_NAMESPACE::Property prop("gender"); - GAR_NAMESPACE::CompareOperator op = GAR_NAMESPACE::CompareOperator::EQUAL; + GAR_NAMESPACE::Operator op = GAR_NAMESPACE::Operator::Equal; std::string val("female"); // construct pushdown options - auto filter = GAR_NAMESPACE::Expression::Make(prop, op, val); + auto filter = GAR_NAMESPACE::Expression::Make(prop, op, val).value(); std::vector columns{"firstName", "lastName"}; GAR_NAMESPACE::utils::FilterOptions options; options.filter = filter; - options.columns = columns; + options.columns = &columns; // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { @@ -161,9 +162,10 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(columns); + reader.Project(&columns); walkReader(reader); } + delete filter; } TEST_CASE("test_adj_list_arrow_chunk_reader") { @@ -299,20 +301,19 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); + // construct pushdown options GAR_NAMESPACE::Property prop("creationDate"); - GAR_NAMESPACE::CompareOperator op = - GAR_NAMESPACE::CompareOperator::GREATER_EQUAL; + GAR_NAMESPACE::Operator op1 = GAR_NAMESPACE::Operator::GreaterEqual; + GAR_NAMESPACE::Operator op2 = GAR_NAMESPACE::Operator::Equal; std::string val("2012-06-02T04:30:44.526+0000"); - // construct pushdown options - auto filter = GAR_NAMESPACE::And( - GAR_NAMESPACE::Expression::Make(val, op, prop), - GAR_NAMESPACE::Expression::Make( - prop, GAR_NAMESPACE::CompareOperator::EQUAL, prop)); + auto f1 = GAR_NAMESPACE::Expression::Make(val, op1, prop).value(); + auto f2 = GAR_NAMESPACE::Expression::Make(prop, op2, prop).value(); + auto filter = GAR_NAMESPACE::And(f1, f2).value(); std::vector columns{"creationDate"}; GAR_NAMESPACE::utils::FilterOptions options; options.filter = filter; - options.columns = columns; + options.columns = &columns; // print reader result auto walkReader = @@ -357,9 +358,10 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(columns); + reader.Project(&columns); walkReader(reader); } + delete filter; } TEST_CASE("test_read_adj_list_offset_chunk_example") { From b9cb893c3194671c865ea73f9469d575198d028c Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Sun, 25 Jun 2023 13:38:54 +0800 Subject: [PATCH 12/26] Refactor: switch case to template call Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 8 +- cpp/include/gar/utils/expression.h | 148 ++++++++++---------- cpp/include/gar/utils/reader_utils.h | 10 +- cpp/src/arrow_chunk_reader.cc | 8 +- cpp/test/test_arrow_chunk_reader.cc | 60 ++++---- 5 files changed, 119 insertions(+), 115 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 8edff87ef..b94c37da2 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -128,10 +128,10 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(utils::RowFilter filter); + void Filter(utils::ExpressionPtr filter); void ClearFilter(); - void Project(utils::ColumnNames columns); + void Project(utils::VectorPtr columns); void Project(const std::string& column); void ClearProjection(); @@ -580,10 +580,10 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } - void Filter(utils::RowFilter filter); + void Filter(utils::ExpressionPtr filter); void ClearFilter(); - void Project(utils::ColumnNames columns); + void Project(utils::VectorPtr columns); void Project(const std::string& column); void ClearProjection(); diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 4f027458c..61b71f10e 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -25,24 +25,8 @@ limitations under the License. namespace GAR_NAMESPACE_INTERNAL { -/** - * CompareOperator is an enum class that represents the relational operators - * that can be used to compare two values. - */ - using ArrowExpression = arrow::compute::Expression; -enum class Operator : std::uint8_t { - Equal, // "=" - NotEqual, // "<>" - Less, // "<" - LessEqual, // "<=" - Greater, // ">" - GreaterEqual, // ">=" - And, // "and" - Or, // "or" - Not, // "not" - IsNull // "is null" -}; +class BinaryOperator; /** * This class wraps an arrow::compute::Expression and provides methods for @@ -54,16 +38,51 @@ class Expression { Expression(const Expression& other) = default; virtual ~Expression() = default; - template - static inline Result Make(const Property& property, Operator op, - const T& value); - - template - static inline Result Make(const T& value, Operator op, - const Property& property); - - static inline Result Make(const Property& p1, Operator op, - const Property& p2); + /** + * @brief Make a new expression from a property and a value + * + * @tparam OpType The type of the operator, only binary operators are allowed + * e.g. OperatorEq + * @tparam ValType The type of the value, e.g. int64_t + * @param property The property to compare + * @param value The value to compare + * @return A predicate expression for filter pushdown + */ + template < + typename OpType, + typename = std::enable_if_t>, + typename ValType> + static inline Expression* Make(const Property& property, + const ValType& value); + + /** + * @brief Make a new expression from a property and a value + * + * @tparam OpType The type of the operator, only binary operators are allowed + * e.g. OperatorEQ + * @tparam ValType The type of the value, e.g. int64_t + * @param value The value to compare + * @param property The property to compare + * @return A predicate expression for filter pushdown + */ + template < + typename OpType, + typename = std::enable_if_t>, + typename ValType> + static inline Expression* Make(const ValType& value, + const Property& property); + /** + * @brief Make a new expression from a property and a value + * + * @tparam OpType The type of the operator, only binary operators are allowed + * e.g. OperatorEq + * @param p1 The first property to compare + * @param p2 The second property to compare + * @return A predicate expression for filter pushdown + */ + template >> + static inline Expression* Make(const Property& p1, const Property& p2); virtual ArrowExpression Evaluate() = 0; }; @@ -228,74 +247,49 @@ class OperatorOr : public BinaryOperator { ArrowExpression Evaluate() override; }; +using Equal = OperatorEqual; +using NotEqual = OperatorNotEqual; +using Greater = OperatorGreater; +using GreaterThan = OperatorGreaterEqual; +using Less = OperatorLess; +using LessEqual = OperatorLessEqual; + /** * Helper functions to Construct Expression. */ -template -inline Result Expression::Make(const Property& property, - Operator op, const T& value) { - switch (op) { -#define TO_OPERATOR_CASE_PV(_op) \ - case Operator::_op: { \ - return new Operator##_op(new ExpressionProperty((property)), \ - new ExpressionLiteral((value))); \ - } - TO_OPERATOR_CASE_PV(Equal) - TO_OPERATOR_CASE_PV(NotEqual) - TO_OPERATOR_CASE_PV(Less) - TO_OPERATOR_CASE_PV(LessEqual) - TO_OPERATOR_CASE_PV(Greater) - TO_OPERATOR_CASE_PV(GreaterEqual) - TO_OPERATOR_CASE_PV(And) - TO_OPERATOR_CASE_PV(Or) - default: - break; - } - return Status::Invalid("Unrecognized binary operator"); +template +inline Expression* Expression::Make(const Property& property, + const ValType& value) { + return new OpType(new ExpressionProperty(property), + new ExpressionLiteral(value)); } -template -inline Result Expression::Make(const T& value, Operator op, - const Property& property) { - return Expression::Make(property, op, value); +template + +inline Expression* Expression::Make(const ValType& value, + const Property& property) { + return new OpType(new ExpressionLiteral(value), + new ExpressionProperty(property)); } -inline Result Expression::Make(const Property& p1, Operator op, - const Property& p2) { - switch (op) { -#define TO_OPERATOR_CASE_PP(_op) \ - case Operator::_op: { \ - return new Operator##_op(new ExpressionProperty((p1)), \ - new ExpressionProperty((p2))); \ - } - TO_OPERATOR_CASE_PP(Equal); - TO_OPERATOR_CASE_PP(NotEqual); - TO_OPERATOR_CASE_PP(Less); - TO_OPERATOR_CASE_PP(LessEqual); - TO_OPERATOR_CASE_PP(Greater); - TO_OPERATOR_CASE_PP(GreaterEqual); - TO_OPERATOR_CASE_PP(And); - TO_OPERATOR_CASE_PP(Or); - default: - break; - } - return Status::Invalid("Unrecognized binary operator"); +template +inline Expression* Expression::Make(const Property& p1, const Property& p2) { + return new OpType(new ExpressionProperty(p1), new ExpressionProperty(p2)); } -static inline Result Not(Expression* expr) { +static inline Expression* Not(Expression* expr) { return new OperatorNot(expr); } -static inline Result IsNull(Expression* expr, - bool nan_is_null = false) { +static inline Expression* IsNull(Expression* expr, bool nan_is_null = false) { return new OperatorIsNull(expr, nan_is_null); } -static inline Result And(Expression* lhs, Expression* rhs) { +static inline Expression* And(Expression* lhs, Expression* rhs) { return new OperatorAnd(lhs, rhs); } -static inline Result Or(Expression* lhs, Expression* rhs) { +static inline Expression* Or(Expression* lhs, Expression* rhs) { return new OperatorOr(lhs, rhs); } diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index d998e98ed..84a36aa1c 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -27,17 +27,17 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using RowFilter = Expression*; -using ColumnNames = std::vector*; +using ExpressionPtr = Expression*; +using VectorPtr = std::vector*; struct FilterOptions { // The row filter to apply to the table. - RowFilter filter = nullptr; + ExpressionPtr filter = nullptr; // The columns to include in the table. Select all columns by default. - ColumnNames columns = nullptr; + VectorPtr columns = nullptr; FilterOptions() {} - FilterOptions(RowFilter filter, ColumnNames columns) + FilterOptions(ExpressionPtr filter, VectorPtr columns) : filter(filter), columns(columns) {} }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 410917030..b5a33f48f 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -47,7 +47,7 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter(utils::RowFilter filter) { +void VertexPropertyArrowChunkReader::Filter(utils::ExpressionPtr filter) { filter_options_.filter = filter; } @@ -55,7 +55,7 @@ void VertexPropertyArrowChunkReader::ClearFilter() { filter_options_.filter = nullptr; } -void VertexPropertyArrowChunkReader::Project(utils::ColumnNames columns) { +void VertexPropertyArrowChunkReader::Project(utils::VectorPtr columns) { filter_options_.columns = columns; } @@ -257,7 +257,7 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { return chunk_table_->Slice(row_offset); } -void AdjListPropertyArrowChunkReader::Filter(utils::RowFilter filter) { +void AdjListPropertyArrowChunkReader::Filter(utils::ExpressionPtr filter) { filter_options_.filter = filter; } @@ -265,7 +265,7 @@ void AdjListPropertyArrowChunkReader::ClearFilter() { filter_options_.filter = nullptr; } -void AdjListPropertyArrowChunkReader::Project(utils::ColumnNames columns) { +void AdjListPropertyArrowChunkReader::Project(utils::VectorPtr columns) { filter_options_.columns = columns; } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 6dc546ef1..463ef3afc 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -32,6 +32,12 @@ limitations under the License. #define CATCH_CONFIG_MAIN #include +using GAR_NAMESPACE::Equal; +using GAR_NAMESPACE::Expression; +using GAR_NAMESPACE::Less; +using GAR_NAMESPACE::Property; +using GAR_NAMESPACE::utils::FilterOptions; + TEST_CASE("test_vertex_property_arrow_chunk_reader") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); @@ -109,15 +115,15 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - GAR_NAMESPACE::Property prop("gender"); - GAR_NAMESPACE::Operator op = GAR_NAMESPACE::Operator::Equal; + // construct pushdown options + Property prop("gender"); std::string val("female"); - // construct pushdown options - auto filter = GAR_NAMESPACE::Expression::Make(prop, op, val).value(); + auto filter = Expression::Make(prop, val); + auto defer = std::unique_ptr(filter); std::vector columns{"firstName", "lastName"}; - GAR_NAMESPACE::utils::FilterOptions options; + FilterOptions options; options.filter = filter; options.columns = &columns; @@ -126,6 +132,7 @@ TEST_CASE("test_vertex_property_pushdown") { int i = 0; int sum = 0; std::vector names; + do { auto result = reader.GetChunk(); REQUIRE(!result.has_error()); @@ -137,17 +144,19 @@ TEST_CASE("test_vertex_property_pushdown") { i++; sum += table->num_rows(); } while (!reader.next_chunk().IsOutOfRange()); - std::cout << "item size: " << sum << "/" + + std::cout << "Total Nums: " << sum << "/" << reader.GetChunkNum() * chunk_size << '\n'; - std::cout << "Column names: "; - for (const auto& n : names) { - std::cout << n << ' '; + std::cout << "Column Nums: " << names.size() << "\n"; + std::cout << "Column Names: "; + for (const auto& name : names) { + std::cout << "`" << name << "` "; } std::cout << "\n\n"; }; SECTION("pushdown by helper function") { - std::cout << "vertex property pushdown by helper function: \n"; + std::cout << "vertex property pushdown by helper function:\n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group, options); REQUIRE(maybe_reader.status().ok()); @@ -155,8 +164,7 @@ TEST_CASE("test_vertex_property_pushdown") { } SECTION("pushdown by function Filter() & Project()") { - std::cout << "vertex property pushdown by Filter() & Project():" - << std::endl; + std::cout << "vertex property pushdown by Filter() & Project():\n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group); REQUIRE(maybe_reader.status().ok()); @@ -165,7 +173,6 @@ TEST_CASE("test_vertex_property_pushdown") { reader.Project(&columns); walkReader(reader); } - delete filter; } TEST_CASE("test_adj_list_arrow_chunk_reader") { @@ -302,16 +309,17 @@ TEST_CASE("test_adj_list_property_pushdown") { auto group = maybe_group.value(); // construct pushdown options - GAR_NAMESPACE::Property prop("creationDate"); - GAR_NAMESPACE::Operator op1 = GAR_NAMESPACE::Operator::GreaterEqual; - GAR_NAMESPACE::Operator op2 = GAR_NAMESPACE::Operator::Equal; + Property prop("creationDate"); std::string val("2012-06-02T04:30:44.526+0000"); - auto f1 = GAR_NAMESPACE::Expression::Make(val, op1, prop).value(); - auto f2 = GAR_NAMESPACE::Expression::Make(prop, op2, prop).value(); - auto filter = GAR_NAMESPACE::And(f1, f2).value(); + + auto expr1 = Expression::Make(val, prop); + auto expr2 = Expression::Make(prop, prop); + auto filter = And(expr1, expr2); + auto defer = std::unique_ptr(filter); + std::vector columns{"creationDate"}; - GAR_NAMESPACE::utils::FilterOptions options; + FilterOptions options; options.filter = filter; options.columns = &columns; @@ -321,6 +329,7 @@ TEST_CASE("test_adj_list_property_pushdown") { int i = 0; int sum = 0; std::vector names; + do { auto result = reader.GetChunk(); REQUIRE(!result.has_error()); @@ -331,10 +340,12 @@ TEST_CASE("test_adj_list_property_pushdown") { i++; sum += table->num_rows(); } while (!reader.next_chunk().IsOutOfRange()); - std::cout << "item size: " << sum << "/" << i * chunk_size << '\n'; - std::cout << "Column names: "; - for (const auto& n : names) { - std::cout << n << ' '; + + std::cout << "Total Nums: " << sum << "/" << i * chunk_size << '\n'; + std::cout << "Column Nums: " << names.size() << "\n"; + std::cout << "Column Names: "; + for (const auto& name : names) { + std::cout << "`" << name << "` "; } std::cout << "\n\n"; }; @@ -361,7 +372,6 @@ TEST_CASE("test_adj_list_property_pushdown") { reader.Project(&columns); walkReader(reader); } - delete filter; } TEST_CASE("test_read_adj_list_offset_chunk_example") { From ee09d895e8833403b33f3a007d59928fdc0d173c Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Mon, 26 Jun 2023 11:42:57 +0800 Subject: [PATCH 13/26] Docs: filter Expression Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 7 ++----- cpp/include/gar/utils/expression.h | 22 ++++++++++++--------- cpp/include/gar/utils/filesystem.h | 2 +- cpp/include/gar/utils/reader_utils.h | 6 +++--- cpp/src/arrow_chunk_reader.cc | 12 ++--------- cpp/src/expression.cc | 7 +++++++ cpp/test/test_arrow_chunk_reader.cc | 8 ++------ 7 files changed, 30 insertions(+), 34 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index b94c37da2..0d5faf961 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -17,7 +17,6 @@ limitations under the License. #define GAR_READER_ARROW_CHUNK_READER_H_ #include -#include #include #include #include @@ -128,11 +127,10 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(utils::ExpressionPtr filter); + void Filter(utils::FilterPtr filter); void ClearFilter(); void Project(utils::VectorPtr columns); - void Project(const std::string& column); void ClearProjection(); private: @@ -580,11 +578,10 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } - void Filter(utils::ExpressionPtr filter); + void Filter(utils::FilterPtr filter); void ClearFilter(); void Project(utils::VectorPtr columns); - void Project(const std::string& column); void ClearProjection(); private: diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 61b71f10e..5487360e3 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -42,7 +42,7 @@ class Expression { * @brief Make a new expression from a property and a value * * @tparam OpType The type of the operator, only binary operators are allowed - * e.g. OperatorEq + * e.g. OperatorEqual * @tparam ValType The type of the value, e.g. int64_t * @param property The property to compare * @param value The value to compare @@ -59,7 +59,7 @@ class Expression { * @brief Make a new expression from a property and a value * * @tparam OpType The type of the operator, only binary operators are allowed - * e.g. OperatorEQ + * e.g. OperatorEqual * @tparam ValType The type of the value, e.g. int64_t * @param value The value to compare * @param property The property to compare @@ -84,6 +84,15 @@ class Expression { std::is_base_of_v>> static inline Expression* Make(const Property& p1, const Property& p2); + /** + * @brief Parse predicates based on attributes, operators, and values e,g. new + * OperatorEqual(new ExpressionProperty(Property("a")), new + * ExpressionLiteral(1)) will be parsed as + * arrow::compute::equal(arrow::compute::field_ref("a"), + * arrow::compute::literal(1)) + * + * @return The arrow::compute::Expression object + */ virtual ArrowExpression Evaluate() = 0; }; @@ -130,9 +139,7 @@ class OperatorNot : public UnaryOperator { OperatorNot(const OperatorNot& other) = default; ~OperatorNot() = default; - ArrowExpression Evaluate() override { - return arrow::compute::not_(expr_->Evaluate()); - } + ArrowExpression Evaluate() override; }; class OperatorIsNull : public UnaryOperator { @@ -143,9 +150,7 @@ class OperatorIsNull : public UnaryOperator { OperatorIsNull(const OperatorIsNull& other) = default; ~OperatorIsNull() = default; - ArrowExpression Evaluate() override { - return arrow::compute::is_null(expr_->Evaluate(), nan_is_null_); - } + ArrowExpression Evaluate() override; private: bool nan_is_null_; @@ -265,7 +270,6 @@ inline Expression* Expression::Make(const Property& property, } template - inline Expression* Expression::Make(const ValType& value, const Property& property) { return new OpType(new ExpressionLiteral(value), diff --git a/cpp/include/gar/utils/filesystem.h b/cpp/include/gar/utils/filesystem.h index 97b8a3bae..34afccd6c 100644 --- a/cpp/include/gar/utils/filesystem.h +++ b/cpp/include/gar/utils/filesystem.h @@ -64,7 +64,7 @@ class FileSystem { * * @param path The path of the file to read. * @param file_type The type of the file to read. - * @param options Filter condition and columns to be read + * @param options Row filter and columns to be selected * @return A Result containing a std::shared_ptr to an arrow::Table if * successful, or an error Status if unsuccessful. */ diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 84a36aa1c..02b993bd5 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -27,17 +27,17 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using ExpressionPtr = Expression*; +using FilterPtr = Expression*; using VectorPtr = std::vector*; struct FilterOptions { // The row filter to apply to the table. - ExpressionPtr filter = nullptr; + FilterPtr filter = nullptr; // The columns to include in the table. Select all columns by default. VectorPtr columns = nullptr; FilterOptions() {} - FilterOptions(ExpressionPtr filter, VectorPtr columns) + FilterOptions(FilterPtr filter, VectorPtr columns) : filter(filter), columns(columns) {} }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index b5a33f48f..db9425205 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -47,7 +47,7 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { seek_id_ + chunk_table_->num_rows() - row_offset); } -void VertexPropertyArrowChunkReader::Filter(utils::ExpressionPtr filter) { +void VertexPropertyArrowChunkReader::Filter(utils::FilterPtr filter) { filter_options_.filter = filter; } @@ -59,10 +59,6 @@ void VertexPropertyArrowChunkReader::Project(utils::VectorPtr columns) { filter_options_.columns = columns; } -void VertexPropertyArrowChunkReader::Project(const std::string& column) { - Project({column}); -} - void VertexPropertyArrowChunkReader::ClearProjection() { filter_options_.columns = nullptr; } @@ -257,7 +253,7 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { return chunk_table_->Slice(row_offset); } -void AdjListPropertyArrowChunkReader::Filter(utils::ExpressionPtr filter) { +void AdjListPropertyArrowChunkReader::Filter(utils::FilterPtr filter) { filter_options_.filter = filter; } @@ -269,10 +265,6 @@ void AdjListPropertyArrowChunkReader::Project(utils::VectorPtr columns) { filter_options_.columns = columns; } -void AdjListPropertyArrowChunkReader::Project(const std::string& column) { - Project({column}); -} - void AdjListPropertyArrowChunkReader::ClearProjection() { filter_options_.columns = nullptr; } diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc index 22de971bd..8350c29d1 100644 --- a/cpp/src/expression.cc +++ b/cpp/src/expression.cc @@ -19,6 +19,13 @@ namespace GAR_NAMESPACE_INTERNAL { ArrowExpression ExpressionProperty::Evaluate() { return arrow::compute::field_ref(property_.name); } +ArrowExpression OperatorNot::Evaluate() { + return arrow::compute::not_(expr_->Evaluate()); +} + +ArrowExpression OperatorIsNull::Evaluate() { + return arrow::compute::is_null(expr_->Evaluate()); +} ArrowExpression OperatorEqual::Evaluate() { return arrow::compute::equal(lhs_->Evaluate(), rhs_->Evaluate()); diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 463ef3afc..df16f55ec 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -116,10 +116,7 @@ TEST_CASE("test_vertex_property_pushdown") { auto group = maybe_group.value(); // construct pushdown options - Property prop("gender"); - std::string val("female"); - - auto filter = Expression::Make(prop, val); + auto filter = Expression::Make(Property("gender"), "female"); auto defer = std::unique_ptr(filter); std::vector columns{"firstName", "lastName"}; @@ -310,9 +307,8 @@ TEST_CASE("test_adj_list_property_pushdown") { // construct pushdown options Property prop("creationDate"); - std::string val("2012-06-02T04:30:44.526+0000"); - auto expr1 = Expression::Make(val, prop); + auto expr1 = Expression::Make("2012-06-02T04:30:44.526+0000", prop); auto expr2 = Expression::Make(prop, prop); auto filter = And(expr1, expr2); auto defer = std::unique_ptr(filter); From c3fdd5e76c4ae303dcac1709bdb04ad1a3fbae7b Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Mon, 26 Jun 2023 14:06:47 +0800 Subject: [PATCH 14/26] Bugfix: type casting when table is empty Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 14 +++++--------- cpp/src/filesystem.cc | 6 +++++- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 5487360e3..c550ce3d9 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -52,8 +52,7 @@ class Expression { typename OpType, typename = std::enable_if_t>, typename ValType> - static inline Expression* Make(const Property& property, - const ValType& value); + static inline Expression* Make(const Property& property, ValType value); /** * @brief Make a new expression from a property and a value @@ -69,8 +68,7 @@ class Expression { typename OpType, typename = std::enable_if_t>, typename ValType> - static inline Expression* Make(const ValType& value, - const Property& property); + static inline Expression* Make(ValType value, const Property& property); /** * @brief Make a new expression from a property and a value * @@ -111,7 +109,7 @@ class ExpressionProperty : public Expression { template class ExpressionLiteral : public Expression { public: - explicit ExpressionLiteral(const T value) : value_(value) {} + explicit ExpressionLiteral(T value) : value_(value) {} ExpressionLiteral(const ExpressionLiteral& other) = default; ~ExpressionLiteral() = default; @@ -263,15 +261,13 @@ using LessEqual = OperatorLessEqual; * Helper functions to Construct Expression. */ template -inline Expression* Expression::Make(const Property& property, - const ValType& value) { +inline Expression* Expression::Make(const Property& property, ValType value) { return new OpType(new ExpressionProperty(property), new ExpressionLiteral(value)); } template -inline Expression* Expression::Make(const ValType& value, - const Property& property) { +inline Expression* Expression::Make(ValType value, const Property& property) { return new OpType(new ExpressionLiteral(value), new ExpressionProperty(property)); } diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 8a541d6a2..36d855335 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -128,7 +128,11 @@ Result> FileSystem::ReadFileToTable( // do casting auto field = table->field(i)->WithType(type); std::shared_ptr chunked_array; - if (type->Equals(arrow::large_utf8())) { + + if (table->num_rows() == 0) { + GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN( + chunked_array, arrow::ChunkedArray::MakeEmpty(type)); + } else if (type->Equals(arrow::large_utf8())) { auto status = detail::CastToLargeOffsetArray( table->column(i), type, chunked_array); From 52b2092a022346592589b7b940d9b3342c4e2402 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Tue, 27 Jun 2023 10:54:58 +0800 Subject: [PATCH 15/26] Refactor: clean header files Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 27 ++++++++++++++------------- cpp/src/expression.cc | 4 ++-- cpp/test/test_arrow_chunk_reader.cc | 13 ++----------- 3 files changed, 18 insertions(+), 26 deletions(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index c550ce3d9..cbe638bb4 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -187,13 +187,13 @@ class OperatorNotEqual : public BinaryOperator { ArrowExpression Evaluate() override; }; -class OperatorGreater : public BinaryOperator { +class OperatorGreaterThan : public BinaryOperator { public: - OperatorGreater() = default; - OperatorGreater(Expression* lhs, Expression* rhs) + OperatorGreaterThan() = default; + OperatorGreaterThan(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} - OperatorGreater(const OperatorGreater& other) = default; - ~OperatorGreater() = default; + OperatorGreaterThan(const OperatorGreaterThan& other) = default; + ~OperatorGreaterThan() = default; ArrowExpression Evaluate() override; }; @@ -209,12 +209,13 @@ class OperatorGreaterEqual : public BinaryOperator { ArrowExpression Evaluate() override; }; -class OperatorLess : public BinaryOperator { +class OperatorLessThan : public BinaryOperator { public: - OperatorLess() = default; - OperatorLess(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} - OperatorLess(const OperatorLess& other) = default; - ~OperatorLess() = default; + OperatorLessThan() = default; + OperatorLessThan(Expression* lhs, Expression* rhs) + : BinaryOperator(lhs, rhs) {} + OperatorLessThan(const OperatorLessThan& other) = default; + ~OperatorLessThan() = default; ArrowExpression Evaluate() override; }; @@ -252,9 +253,9 @@ class OperatorOr : public BinaryOperator { using Equal = OperatorEqual; using NotEqual = OperatorNotEqual; -using Greater = OperatorGreater; -using GreaterThan = OperatorGreaterEqual; -using Less = OperatorLess; +using GreaterThan = OperatorGreaterThan; +using GreaterEqual = OperatorGreaterEqual; +using LessThan = OperatorLessThan; using LessEqual = OperatorLessEqual; /** diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc index 8350c29d1..3194d8ecd 100644 --- a/cpp/src/expression.cc +++ b/cpp/src/expression.cc @@ -35,7 +35,7 @@ ArrowExpression OperatorNotEqual::Evaluate() { return arrow::compute::not_equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorGreater::Evaluate() { +ArrowExpression OperatorGreaterThan::Evaluate() { return arrow::compute::greater(lhs_->Evaluate(), rhs_->Evaluate()); } @@ -43,7 +43,7 @@ ArrowExpression OperatorGreaterEqual::Evaluate() { return arrow::compute::greater_equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorLess::Evaluate() { +ArrowExpression OperatorLessThan::Evaluate() { return arrow::compute::less(lhs_->Evaluate(), rhs_->Evaluate()); } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index df16f55ec..088d6f4aa 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -15,26 +15,17 @@ limitations under the License. #include -#include "arrow/adapters/orc/adapter.h" #include "arrow/api.h" -#include "arrow/csv/api.h" -#include "arrow/filesystem/api.h" -#include "arrow/io/api.h" -#include "arrow/stl.h" -#include "arrow/util/uri.h" -#include "parquet/arrow/writer.h" #include "./util.h" #include "gar/reader/arrow_chunk_reader.h" -#include "gar/utils/expression.h" -#include "gar/writer/arrow_chunk_writer.h" #define CATCH_CONFIG_MAIN #include using GAR_NAMESPACE::Equal; using GAR_NAMESPACE::Expression; -using GAR_NAMESPACE::Less; +using GAR_NAMESPACE::LessThan; using GAR_NAMESPACE::Property; using GAR_NAMESPACE::utils::FilterOptions; @@ -308,7 +299,7 @@ TEST_CASE("test_adj_list_property_pushdown") { // construct pushdown options Property prop("creationDate"); - auto expr1 = Expression::Make("2012-06-02T04:30:44.526+0000", prop); + auto expr1 = Expression::Make("2012-06-02T04:30:44.526+0000", prop); auto expr2 = Expression::Make(prop, prop); auto filter = And(expr1, expr2); auto defer = std::unique_ptr(filter); From 70ae4fcafd90da3ed992734c3102a2d18f64b2a9 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Fri, 30 Jun 2023 22:48:59 +0800 Subject: [PATCH 16/26] Refactor: Fix style Signed-off-by: Ziy1-Tan --- cpp/src/filesystem.cc | 2 +- cpp/test/test_arrow_chunk_reader.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 36d855335..463189ea0 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -103,7 +103,7 @@ Result> FileSystem::ReadFileToTable( GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto dataset, factory->Finish()); GAR_RETURN_ON_ARROW_ERROR_AND_ASSIGN(auto scan_builder, dataset->NewScan()); - // Read specified columns with a row filter + // Apply the row filter and select the specified columns if (options.filter) { RETURN_NOT_ARROW_OK(scan_builder->Filter(options.filter->Evaluate())); } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 088d6f4aa..8a093a70d 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -131,7 +131,7 @@ TEST_CASE("test_vertex_property_pushdown") { << chunk_size << ",\tRange: [" << l << ", " << r << "]" << '\n'; i++; sum += table->num_rows(); - } while (!reader.next_chunk().IsOutOfRange()); + } while (!reader.next_chunk().IsIndexError()); std::cout << "Total Nums: " << sum << "/" << reader.GetChunkNum() * chunk_size << '\n'; @@ -326,7 +326,7 @@ TEST_CASE("test_adj_list_property_pushdown") { << chunk_size << '\n'; i++; sum += table->num_rows(); - } while (!reader.next_chunk().IsOutOfRange()); + } while (!reader.next_chunk().IsIndexError()); std::cout << "Total Nums: " << sum << "/" << i * chunk_size << '\n'; std::cout << "Column Nums: " << names.size() << "\n"; From d24d26997ecff30e101a9499c130b82af1a6fadb Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Tue, 4 Jul 2023 19:51:26 +0800 Subject: [PATCH 17/26] Refactor: ut for reader Signed-off-by: Ziy1-Tan --- cpp/test/test_arrow_chunk_reader.cc | 75 +++++++++++++++-------------- 1 file changed, 39 insertions(+), 36 deletions(-) diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 8a093a70d..9f90c5f2b 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -91,54 +91,56 @@ TEST_CASE("test_vertex_property_arrow_chunk_reader") { TEST_CASE("test_vertex_property_pushdown") { std::string root; REQUIRE(GetTestResourceRoot(&root).ok()); + std::string path = root + "/ldbc_sample/parquet/ldbc_sample.graph.yml"; + std::string label = "person", property_name = "gender"; + + Property prop("gender"); + std::string value("female"); + auto filter = Expression::Make(prop, value); + auto defer = std::unique_ptr(filter); + std::vector expected_cols{"firstName", "lastName"}; // read file and construct graph info - std::string path = root + "/ldbc_sample/parquet/ldbc_sample.graph.yml"; auto maybe_graph_info = GAR_NAMESPACE::GraphInfo::Load(path); REQUIRE(maybe_graph_info.status().ok()); auto graph_info = maybe_graph_info.value(); - // construct vertex chunk reader - std::string label = "person", property_name = "gender"; REQUIRE(graph_info.GetVertexInfo(label).status().ok()); const auto chunk_size = graph_info.GetVertexInfo(label)->GetChunkSize(); auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - // construct pushdown options - auto filter = Expression::Make(Property("gender"), "female"); - auto defer = std::unique_ptr(filter); - std::vector columns{"firstName", "lastName"}; - FilterOptions options; options.filter = filter; - options.columns = &columns; + options.columns = &expected_cols; // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { - int i = 0; - int sum = 0; - std::vector names; + int idx = 0, sum = 0; + std::shared_ptr table = nullptr; do { auto result = reader.GetChunk(); REQUIRE(!result.has_error()); - auto [l, r] = reader.GetRange().value(); - auto table = result.value(); - names = table->ColumnNames(); - std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() << "/" - << chunk_size << ",\tRange: [" << l << ", " << r << "]" << '\n'; - i++; + table = result.value(); + auto [start, end] = reader.GetRange().value(); + std::cout << "Chunk: " << idx << ",\tNums: " << table->num_rows() << "/" + << chunk_size << ",\tRange: (" << start << ", " << end << "]" + << '\n'; + idx++; sum += table->num_rows(); } while (!reader.next_chunk().IsIndexError()); + REQUIRE(idx == reader.GetChunkNum()); + REQUIRE(table->num_columns() == (int) expected_cols.size()); std::cout << "Total Nums: " << sum << "/" << reader.GetChunkNum() * chunk_size << '\n'; - std::cout << "Column Nums: " << names.size() << "\n"; + std::cout << "Column Nums: " << table->num_columns() << "\n"; std::cout << "Column Names: "; - for (const auto& name : names) { - std::cout << "`" << name << "` "; + for (int i = 0; i < table->num_columns(); i++) { + REQUIRE(table->ColumnNames()[i] == expected_cols[i]); + std::cout << "`" << table->ColumnNames()[i] << "` "; } std::cout << "\n\n"; }; @@ -158,7 +160,7 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(&columns); + reader.Project(&expected_cols); walkReader(reader); } } @@ -304,35 +306,36 @@ TEST_CASE("test_adj_list_property_pushdown") { auto filter = And(expr1, expr2); auto defer = std::unique_ptr(filter); - std::vector columns{"creationDate"}; + std::vector expected_cols{"creationDate"}; FilterOptions options; options.filter = filter; - options.columns = &columns; + options.columns = &expected_cols; // print reader result auto walkReader = [&](GAR_NAMESPACE::AdjListPropertyArrowChunkReader& reader) { - int i = 0; + int idx = 0; int sum = 0; - std::vector names; + std::shared_ptr table = nullptr; do { auto result = reader.GetChunk(); REQUIRE(!result.has_error()); - auto table = result.value(); - names = table->ColumnNames(); - std::cout << "Chunk: " << i << ",\tNums: " << table->num_rows() << "/" - << chunk_size << '\n'; - i++; + table = result.value(); + std::cout << "Chunk: " << idx << ",\tNums: " << table->num_rows() + << "/" << chunk_size << '\n'; + idx++; sum += table->num_rows(); } while (!reader.next_chunk().IsIndexError()); + REQUIRE(table->num_columns() == (int) expected_cols.size()); - std::cout << "Total Nums: " << sum << "/" << i * chunk_size << '\n'; - std::cout << "Column Nums: " << names.size() << "\n"; + std::cout << "Total Nums: " << sum << "/" << idx * chunk_size << '\n'; + std::cout << "Column Nums: " << table->num_columns() << "\n"; std::cout << "Column Names: "; - for (const auto& name : names) { - std::cout << "`" << name << "` "; + for (int i = 0; i < table->num_columns(); i++) { + REQUIRE(table->ColumnNames()[i] == expected_cols[i]); + std::cout << "`" << table->ColumnNames()[i] << "` "; } std::cout << "\n\n"; }; @@ -356,7 +359,7 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(&columns); + reader.Project(&expected_cols); walkReader(reader); } } From 94aae0209325eedd4315bc7bb16eaf3c26f84fbb Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Wed, 5 Jul 2023 00:32:28 +0800 Subject: [PATCH 18/26] Refactor: fix the GetRange() Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 3 +++ cpp/src/arrow_chunk_reader.cc | 10 +++++++--- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 0d5faf961..5ba8e83ec 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -65,6 +65,8 @@ class VertexPropertyArrowChunkReader { std::string base_dir = prefix_ + pg_path_prefix; GAR_ASSIGN_OR_RAISE_ERROR(chunk_num_, utils::GetVertexChunkNum(prefix_, vertex_info)); + GAR_ASSIGN_OR_RAISE_ERROR(vertex_num_, + utils::GetVertexNum(prefix_, vertex_info_)); } /** @@ -140,6 +142,7 @@ class VertexPropertyArrowChunkReader { IdType chunk_index_; IdType seek_id_; IdType chunk_num_; + IdType vertex_num_; std::shared_ptr chunk_table_; utils::FilterOptions filter_options_; std::shared_ptr fs_; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index db9425205..59617d171 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -42,9 +42,13 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { "The chunk table is not initialized, please call " "GetChunk() first."); } - IdType row_offset = seek_id_ - chunk_index_ * vertex_info_.GetChunkSize(); - return std::make_pair(seek_id_, - seek_id_ + chunk_table_->num_rows() - row_offset); + const auto chunk_size = vertex_info_.GetChunkSize(); + IdType row_offset = seek_id_ - chunk_index_ * chunk_size; + bool is_last_chunk = (chunk_index_ == chunk_num_ - 1); + const auto curr_chunk_size = + is_last_chunk ? (vertex_num_ - chunk_index_ * chunk_size) : chunk_size; + + return std::make_pair(seek_id_, seek_id_ + curr_chunk_size - row_offset); } void VertexPropertyArrowChunkReader::Filter(utils::FilterPtr filter) { From 037553802719b9c70b60cda8ff3fdd79390585c5 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Wed, 5 Jul 2023 15:44:03 +0800 Subject: [PATCH 19/26] Refactor: improve API docs Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 24 ++++++++++++------ cpp/include/gar/utils/reader_utils.h | 10 ++++---- cpp/src/arrow_chunk_reader.cc | 28 +++++---------------- cpp/test/test_arrow_chunk_reader.cc | 4 +-- 4 files changed, 29 insertions(+), 37 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index 5ba8e83ec..cb174a3aa 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -129,11 +129,21 @@ class VertexPropertyArrowChunkReader { */ IdType GetChunkNum() const noexcept { return chunk_num_; } - void Filter(utils::FilterPtr filter); - void ClearFilter(); + /** + * @brief Apply the row filter to the table. No parameter call Filter() or + * Filter(nullptr) will clear the filter. + * + * @param filter Predicate expression to filter rows. + */ + void Filter(utils::Filter filter = nullptr); - void Project(utils::VectorPtr columns); - void ClearProjection(); + /** + * @brief Apply the projection to the table to be read. No parameter call + * Select() or Select(nullptr) will clear the projection. + * + * @param column_names The name of columns to be selected. + */ + void Select(utils::ColumnNames column_names = nullptr); private: VertexInfo vertex_info_; @@ -581,11 +591,9 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } - void Filter(utils::FilterPtr filter); - void ClearFilter(); + void Filter(utils::Filter filter = nullptr); - void Project(utils::VectorPtr columns); - void ClearProjection(); + void Select(utils::ColumnNames column_names = nullptr); private: EdgeInfo edge_info_; diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 02b993bd5..95ed9f7b1 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -27,17 +27,17 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using FilterPtr = Expression*; -using VectorPtr = std::vector*; +using Filter = Expression*; +using ColumnNames = std::vector*; struct FilterOptions { // The row filter to apply to the table. - FilterPtr filter = nullptr; + Filter filter = nullptr; // The columns to include in the table. Select all columns by default. - VectorPtr columns = nullptr; + ColumnNames columns = nullptr; FilterOptions() {} - FilterOptions(FilterPtr filter, VectorPtr columns) + FilterOptions(Filter filter, ColumnNames columns) : filter(filter), columns(columns) {} }; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index 59617d171..cc099f3cd 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -51,20 +51,12 @@ VertexPropertyArrowChunkReader::GetRange() noexcept { return std::make_pair(seek_id_, seek_id_ + curr_chunk_size - row_offset); } -void VertexPropertyArrowChunkReader::Filter(utils::FilterPtr filter) { +void VertexPropertyArrowChunkReader::Filter(utils::Filter filter) { filter_options_.filter = filter; } -void VertexPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter = nullptr; -} - -void VertexPropertyArrowChunkReader::Project(utils::VectorPtr columns) { - filter_options_.columns = columns; -} - -void VertexPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns = nullptr; +void VertexPropertyArrowChunkReader::Select(utils::ColumnNames column_names) { + filter_options_.columns = column_names; } Status AdjListArrowChunkReader::seek_src(IdType id) noexcept { @@ -257,20 +249,12 @@ AdjListPropertyArrowChunkReader::GetChunk() noexcept { return chunk_table_->Slice(row_offset); } -void AdjListPropertyArrowChunkReader::Filter(utils::FilterPtr filter) { +void AdjListPropertyArrowChunkReader::Filter(utils::Filter filter) { filter_options_.filter = filter; } -void AdjListPropertyArrowChunkReader::ClearFilter() { - filter_options_.filter = nullptr; -} - -void AdjListPropertyArrowChunkReader::Project(utils::VectorPtr columns) { - filter_options_.columns = columns; -} - -void AdjListPropertyArrowChunkReader::ClearProjection() { - filter_options_.columns = nullptr; +void AdjListPropertyArrowChunkReader::Select(utils::ColumnNames column_names) { + filter_options_.columns = column_names; } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 9f90c5f2b..4492d9042 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -160,7 +160,7 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(&expected_cols); + reader.Select(&expected_cols); walkReader(reader); } } @@ -359,7 +359,7 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Project(&expected_cols); + reader.Select(&expected_cols); walkReader(reader); } } From 8e59514c984cec040f23c1ee3b49350bc57ee25c Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Thu, 6 Jul 2023 00:47:45 +0800 Subject: [PATCH 20/26] Refactor: improve expression usability Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 245 +++++++++++++-------------- cpp/include/gar/utils/reader_utils.h | 3 +- cpp/src/expression.cc | 20 +-- cpp/test/test_arrow_chunk_reader.cc | 27 ++- 4 files changed, 143 insertions(+), 152 deletions(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index cbe638bb4..2bc1dae65 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -26,7 +26,6 @@ limitations under the License. namespace GAR_NAMESPACE_INTERNAL { using ArrowExpression = arrow::compute::Expression; -class BinaryOperator; /** * This class wraps an arrow::compute::Expression and provides methods for @@ -39,57 +38,13 @@ class Expression { virtual ~Expression() = default; /** - * @brief Make a new expression from a property and a value - * - * @tparam OpType The type of the operator, only binary operators are allowed - * e.g. OperatorEqual - * @tparam ValType The type of the value, e.g. int64_t - * @param property The property to compare - * @param value The value to compare - * @return A predicate expression for filter pushdown - */ - template < - typename OpType, - typename = std::enable_if_t>, - typename ValType> - static inline Expression* Make(const Property& property, ValType value); - - /** - * @brief Make a new expression from a property and a value - * - * @tparam OpType The type of the operator, only binary operators are allowed - * e.g. OperatorEqual - * @tparam ValType The type of the value, e.g. int64_t - * @param value The value to compare - * @param property The property to compare - * @return A predicate expression for filter pushdown - */ - template < - typename OpType, - typename = std::enable_if_t>, - typename ValType> - static inline Expression* Make(ValType value, const Property& property); - /** - * @brief Make a new expression from a property and a value - * - * @tparam OpType The type of the operator, only binary operators are allowed - * e.g. OperatorEq - * @param p1 The first property to compare - * @param p2 The second property to compare - * @return A predicate expression for filter pushdown - */ - template >> - static inline Expression* Make(const Property& p1, const Property& p2); - - /** - * @brief Parse predicates based on attributes, operators, and values e,g. new - * OperatorEqual(new ExpressionProperty(Property("a")), new - * ExpressionLiteral(1)) will be parsed as + * @brief Evaluate Expression as arrow::compute::Expression e.g. new + * ExpressionEqual(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) will be parsed as * arrow::compute::equal(arrow::compute::field_ref("a"), * arrow::compute::literal(1)) * - * @return The arrow::compute::Expression object + * @return The arrow::compute::Expression instance */ virtual ArrowExpression Evaluate() = 0; }; @@ -97,6 +52,8 @@ class Expression { class ExpressionProperty : public Expression { public: explicit ExpressionProperty(const Property& property) : property_(property) {} + explicit ExpressionProperty(const std::string& name) + : property_(Property(name)) {} ExpressionProperty(const ExpressionProperty& other) = default; ~ExpressionProperty() = default; @@ -122,7 +79,7 @@ class ExpressionLiteral : public Expression { class UnaryOperator : public Expression { public: UnaryOperator() = default; - explicit UnaryOperator(Expression* expr) : expr_(expr) {} + explicit UnaryOperator(std::shared_ptr expr) : expr_(expr) {} UnaryOperator(const UnaryOperator& other) = default; virtual ~UnaryOperator() {} @@ -130,23 +87,25 @@ class UnaryOperator : public Expression { std::shared_ptr expr_; }; -class OperatorNot : public UnaryOperator { +class ExpressionNot : public UnaryOperator { public: - OperatorNot() = default; - explicit OperatorNot(Expression* expr) : UnaryOperator(expr) {} - OperatorNot(const OperatorNot& other) = default; - ~OperatorNot() = default; + ExpressionNot() = default; + explicit ExpressionNot(std::shared_ptr expr) + : UnaryOperator(expr) {} + ExpressionNot(const ExpressionNot& other) = default; + ~ExpressionNot() = default; ArrowExpression Evaluate() override; }; -class OperatorIsNull : public UnaryOperator { +class ExpressionIsNull : public UnaryOperator { public: - OperatorIsNull() = default; - explicit OperatorIsNull(Expression* expr, bool nan_is_null = false) + ExpressionIsNull() = default; + explicit ExpressionIsNull(std::shared_ptr expr, + bool nan_is_null = false) : UnaryOperator(expr), nan_is_null_(nan_is_null) {} - OperatorIsNull(const OperatorIsNull& other) = default; - ~OperatorIsNull() = default; + ExpressionIsNull(const ExpressionIsNull& other) = default; + ~ExpressionIsNull() = default; ArrowExpression Evaluate() override; @@ -157,7 +116,9 @@ class OperatorIsNull : public UnaryOperator { class BinaryOperator : public Expression { public: BinaryOperator() = default; - BinaryOperator(Expression* lhs, Expression* rhs) : lhs_(lhs), rhs_(rhs) {} + BinaryOperator(std::shared_ptr lhs, + std::shared_ptr rhs) + : lhs_(lhs), rhs_(rhs) {} BinaryOperator(const BinaryOperator& other) = default; ~BinaryOperator() = default; @@ -166,133 +127,163 @@ class BinaryOperator : public Expression { std::shared_ptr rhs_; }; -class OperatorEqual : public BinaryOperator { +class ExpressionEqual : public BinaryOperator { public: - OperatorEqual() = default; - OperatorEqual(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} - OperatorEqual(const OperatorEqual& other) = default; - ~OperatorEqual() = default; + ExpressionEqual() = default; + ExpressionEqual(std::shared_ptr lhs, + std::shared_ptr rhs) + : BinaryOperator(lhs, rhs) {} + ExpressionEqual(const ExpressionEqual& other) = default; + ~ExpressionEqual() = default; ArrowExpression Evaluate() override; }; -class OperatorNotEqual : public BinaryOperator { +class ExpressionNotEqual : public BinaryOperator { public: - OperatorNotEqual() = default; - OperatorNotEqual(Expression* lhs, Expression* rhs) + ExpressionNotEqual() = default; + ExpressionNotEqual(std::shared_ptr lhs, + std::shared_ptr rhs) : BinaryOperator(lhs, rhs) {} - OperatorNotEqual(const OperatorNotEqual& other) = default; - ~OperatorNotEqual() = default; + ExpressionNotEqual(const ExpressionNotEqual& other) = default; + ~ExpressionNotEqual() = default; ArrowExpression Evaluate() override; }; -class OperatorGreaterThan : public BinaryOperator { +class ExpressionGreaterThan : public BinaryOperator { public: - OperatorGreaterThan() = default; - OperatorGreaterThan(Expression* lhs, Expression* rhs) + ExpressionGreaterThan() = default; + ExpressionGreaterThan(std::shared_ptr lhs, + std::shared_ptr rhs) : BinaryOperator(lhs, rhs) {} - OperatorGreaterThan(const OperatorGreaterThan& other) = default; - ~OperatorGreaterThan() = default; + ExpressionGreaterThan(const ExpressionGreaterThan& other) = default; + ~ExpressionGreaterThan() = default; ArrowExpression Evaluate() override; }; -class OperatorGreaterEqual : public BinaryOperator { +class ExpressionGreaterEqual : public BinaryOperator { public: - OperatorGreaterEqual() = default; - OperatorGreaterEqual(Expression* lhs, Expression* rhs) + ExpressionGreaterEqual() = default; + ExpressionGreaterEqual(std::shared_ptr lhs, + std::shared_ptr rhs) : BinaryOperator(lhs, rhs) {} - OperatorGreaterEqual(const OperatorGreaterEqual& other) = default; - ~OperatorGreaterEqual() = default; + ExpressionGreaterEqual(const ExpressionGreaterEqual& other) = default; + ~ExpressionGreaterEqual() = default; ArrowExpression Evaluate() override; }; -class OperatorLessThan : public BinaryOperator { +class ExpressionLessThan : public BinaryOperator { public: - OperatorLessThan() = default; - OperatorLessThan(Expression* lhs, Expression* rhs) + ExpressionLessThan() = default; + ExpressionLessThan(std::shared_ptr lhs, + std::shared_ptr rhs) : BinaryOperator(lhs, rhs) {} - OperatorLessThan(const OperatorLessThan& other) = default; - ~OperatorLessThan() = default; + ExpressionLessThan(const ExpressionLessThan& other) = default; + ~ExpressionLessThan() = default; ArrowExpression Evaluate() override; }; -class OperatorLessEqual : public BinaryOperator { +class ExpressionLessEqual : public BinaryOperator { public: - OperatorLessEqual() = default; - OperatorLessEqual(Expression* lhs, Expression* rhs) + ExpressionLessEqual() = default; + ExpressionLessEqual(std::shared_ptr lhs, + std::shared_ptr rhs) : BinaryOperator(lhs, rhs) {} - OperatorLessEqual(const OperatorLessEqual& other) = default; - ~OperatorLessEqual() = default; + ExpressionLessEqual(const ExpressionLessEqual& other) = default; + ~ExpressionLessEqual() = default; ArrowExpression Evaluate() override; }; -class OperatorAnd : public BinaryOperator { +class ExpressionAnd : public BinaryOperator { public: - OperatorAnd() = default; - OperatorAnd(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} - OperatorAnd(const OperatorAnd& other) = default; - ~OperatorAnd() = default; + ExpressionAnd() = default; + ExpressionAnd(std::shared_ptr lhs, + std::shared_ptr rhs) + : BinaryOperator(lhs, rhs) {} + ExpressionAnd(const ExpressionAnd& other) = default; + ~ExpressionAnd() = default; ArrowExpression Evaluate() override; }; -class OperatorOr : public BinaryOperator { +class ExpressionOr : public BinaryOperator { public: - OperatorOr() = default; - OperatorOr(Expression* lhs, Expression* rhs) : BinaryOperator(lhs, rhs) {} - OperatorOr(const OperatorOr& other) = default; - ~OperatorOr() = default; + ExpressionOr() = default; + ExpressionOr(std::shared_ptr lhs, std::shared_ptr rhs) + : BinaryOperator(lhs, rhs) {} + ExpressionOr(const ExpressionOr& other) = default; + ~ExpressionOr() = default; ArrowExpression Evaluate() override; }; -using Equal = OperatorEqual; -using NotEqual = OperatorNotEqual; -using GreaterThan = OperatorGreaterThan; -using GreaterEqual = OperatorGreaterEqual; -using LessThan = OperatorLessThan; -using LessEqual = OperatorLessEqual; - /** * Helper functions to Construct Expression. */ -template -inline Expression* Expression::Make(const Property& property, ValType value) { - return new OpType(new ExpressionProperty(property), - new ExpressionLiteral(value)); +[[nodiscard]] static inline std::shared_ptr _Property( + const Property& property) { + return std::make_shared(property); } -template -inline Expression* Expression::Make(ValType value, const Property& property) { - return new OpType(new ExpressionLiteral(value), - new ExpressionProperty(property)); +[[nodiscard]] static inline std::shared_ptr _Property( + const std::string& name) { + return std::make_shared(name); } -template -inline Expression* Expression::Make(const Property& p1, const Property& p2) { - return new OpType(new ExpressionProperty(p1), new ExpressionProperty(p2)); +template +[[nodiscard]] static inline std::shared_ptr _Literal(T value) { + return std::make_shared>(value); } -static inline Expression* Not(Expression* expr) { - return new OperatorNot(expr); +[[nodiscard]] static inline std::shared_ptr _Not( + std::shared_ptr expr) { + return std::make_shared(expr); } -static inline Expression* IsNull(Expression* expr, bool nan_is_null = false) { - return new OperatorIsNull(expr, nan_is_null); +[[nodiscard]] static inline std::shared_ptr _IsNull( + std::shared_ptr expr, bool nan_is_null = false) { + return std::make_shared(expr, nan_is_null); } -static inline Expression* And(Expression* lhs, Expression* rhs) { - return new OperatorAnd(lhs, rhs); +[[nodiscard]] static inline std::shared_ptr _Equal( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} +[[nodiscard]] static inline std::shared_ptr _NotEqual( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} +[[nodiscard]] static inline std::shared_ptr _GreaterThan( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} +[[nodiscard]] static inline std::shared_ptr _GreaterEqual( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} +[[nodiscard]] static inline std::shared_ptr _LessThan( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); } -static inline Expression* Or(Expression* lhs, Expression* rhs) { - return new OperatorOr(lhs, rhs); +[[nodiscard]] static inline std::shared_ptr _LessEqual( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); } +[[nodiscard]] static inline std::shared_ptr _And( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} + +[[nodiscard]] static inline std::shared_ptr Or( + std::shared_ptr lhs, std::shared_ptr rhs) { + return std::make_shared(lhs, rhs); +} } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 95ed9f7b1..8b475a2cc 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -16,6 +16,7 @@ limitations under the License. #ifndef GAR_UTILS_READER_UTILS_H_ #define GAR_UTILS_READER_UTILS_H_ +#include #include #include #include @@ -27,7 +28,7 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { -using Filter = Expression*; +using Filter = std::shared_ptr; using ColumnNames = std::vector*; struct FilterOptions { diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc index 3194d8ecd..3d0ad5525 100644 --- a/cpp/src/expression.cc +++ b/cpp/src/expression.cc @@ -19,43 +19,43 @@ namespace GAR_NAMESPACE_INTERNAL { ArrowExpression ExpressionProperty::Evaluate() { return arrow::compute::field_ref(property_.name); } -ArrowExpression OperatorNot::Evaluate() { +ArrowExpression ExpressionNot::Evaluate() { return arrow::compute::not_(expr_->Evaluate()); } -ArrowExpression OperatorIsNull::Evaluate() { +ArrowExpression ExpressionIsNull::Evaluate() { return arrow::compute::is_null(expr_->Evaluate()); } -ArrowExpression OperatorEqual::Evaluate() { +ArrowExpression ExpressionEqual::Evaluate() { return arrow::compute::equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorNotEqual::Evaluate() { +ArrowExpression ExpressionNotEqual::Evaluate() { return arrow::compute::not_equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorGreaterThan::Evaluate() { +ArrowExpression ExpressionGreaterThan::Evaluate() { return arrow::compute::greater(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorGreaterEqual::Evaluate() { +ArrowExpression ExpressionGreaterEqual::Evaluate() { return arrow::compute::greater_equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorLessThan::Evaluate() { +ArrowExpression ExpressionLessThan::Evaluate() { return arrow::compute::less(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorLessEqual::Evaluate() { +ArrowExpression ExpressionLessEqual::Evaluate() { return arrow::compute::less_equal(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorAnd::Evaluate() { +ArrowExpression ExpressionAnd::Evaluate() { return arrow::compute::and_(lhs_->Evaluate(), rhs_->Evaluate()); } -ArrowExpression OperatorOr::Evaluate() { +ArrowExpression ExpressionOr::Evaluate() { return arrow::compute::or_(lhs_->Evaluate(), rhs_->Evaluate()); } diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 4492d9042..3406e1b47 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -23,9 +23,12 @@ limitations under the License. #define CATCH_CONFIG_MAIN #include -using GAR_NAMESPACE::Equal; +using GAR_NAMESPACE::_And; +using GAR_NAMESPACE::_Equal; +using GAR_NAMESPACE::_LessThan; +using GAR_NAMESPACE::_Literal; +using GAR_NAMESPACE::_Property; using GAR_NAMESPACE::Expression; -using GAR_NAMESPACE::LessThan; using GAR_NAMESPACE::Property; using GAR_NAMESPACE::utils::FilterOptions; @@ -94,10 +97,7 @@ TEST_CASE("test_vertex_property_pushdown") { std::string path = root + "/ldbc_sample/parquet/ldbc_sample.graph.yml"; std::string label = "person", property_name = "gender"; - Property prop("gender"); - std::string value("female"); - auto filter = Expression::Make(prop, value); - auto defer = std::unique_ptr(filter); + auto filter = _Equal(_Property("gender"), _Literal("female")); std::vector expected_cols{"firstName", "lastName"}; // read file and construct graph info @@ -118,7 +118,7 @@ TEST_CASE("test_vertex_property_pushdown") { // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { int idx = 0, sum = 0; - std::shared_ptr table = nullptr; + std::shared_ptr table; do { auto result = reader.GetChunk(); @@ -301,10 +301,10 @@ TEST_CASE("test_adj_list_property_pushdown") { // construct pushdown options Property prop("creationDate"); - auto expr1 = Expression::Make("2012-06-02T04:30:44.526+0000", prop); - auto expr2 = Expression::Make(prop, prop); - auto filter = And(expr1, expr2); - auto defer = std::unique_ptr(filter); + auto expr1 = + _LessThan(_Literal("2012-06-02T04:30:44.526+0000"), _Property(prop)); + auto expr2 = _Equal(_Property(prop), _Property(prop)); + auto filter = _And(expr1, expr2); std::vector expected_cols{"creationDate"}; @@ -315,9 +315,8 @@ TEST_CASE("test_adj_list_property_pushdown") { // print reader result auto walkReader = [&](GAR_NAMESPACE::AdjListPropertyArrowChunkReader& reader) { - int idx = 0; - int sum = 0; - std::shared_ptr table = nullptr; + int idx = 0, sum = 0; + std::shared_ptr table; do { auto result = reader.GetChunk(); From 2246136329e0a67cde8e8668dc20b80f466daf3f Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Thu, 6 Jul 2023 10:15:12 +0800 Subject: [PATCH 21/26] Refactor: fix style Signed-off-by: Ziy1-Tan --- cpp/test/test_arrow_chunk_reader.cc | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 3406e1b47..e480614a5 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -28,8 +28,6 @@ using GAR_NAMESPACE::_Equal; using GAR_NAMESPACE::_LessThan; using GAR_NAMESPACE::_Literal; using GAR_NAMESPACE::_Property; -using GAR_NAMESPACE::Expression; -using GAR_NAMESPACE::Property; using GAR_NAMESPACE::utils::FilterOptions; TEST_CASE("test_vertex_property_arrow_chunk_reader") { @@ -146,15 +144,15 @@ TEST_CASE("test_vertex_property_pushdown") { }; SECTION("pushdown by helper function") { - std::cout << "vertex property pushdown by helper function:\n"; + std::cout << "Vertex property pushdown by helper function:\n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group, options); REQUIRE(maybe_reader.status().ok()); walkReader(maybe_reader.value()); } - SECTION("pushdown by function Filter() & Project()") { - std::cout << "vertex property pushdown by Filter() & Project():\n"; + SECTION("pushdown by function Filter() & Select()") { + std::cout << "Vertex property pushdown by Filter() & Select():\n"; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group); REQUIRE(maybe_reader.status().ok()); @@ -299,11 +297,9 @@ TEST_CASE("test_adj_list_property_pushdown") { auto group = maybe_group.value(); // construct pushdown options - Property prop("creationDate"); - - auto expr1 = - _LessThan(_Literal("2012-06-02T04:30:44.526+0000"), _Property(prop)); - auto expr2 = _Equal(_Property(prop), _Property(prop)); + auto expr1 = _LessThan(_Literal("2012-06-02T04:30:44.526+0000"), + _Property(property_name)); + auto expr2 = _Equal(_Property(property_name), _Property(property_name)); auto filter = _And(expr1, expr2); std::vector expected_cols{"creationDate"}; @@ -340,7 +336,7 @@ TEST_CASE("test_adj_list_property_pushdown") { }; SECTION("pushdown by helper function") { - std::cout << "adj list property pushdown by helper function: \n"; + std::cout << "Adj list property pushdown by helper function: \n"; auto maybe_reader = GAR_NAMESPACE::ConstructAdjListPropertyArrowChunkReader( graph_info, src_label, edge_label, dst_label, group, GAR_NAMESPACE::AdjListType::ordered_by_source, options); @@ -349,8 +345,8 @@ TEST_CASE("test_adj_list_property_pushdown") { walkReader(reader); } - SECTION("pushdown by function Filter() & Project()") { - std::cout << "vertex property pushdown by Filter() & Project():" + SECTION("pushdown by function Filter() & Select()") { + std::cout << "Adj list property pushdown by Filter() & Select():" << std::endl; auto maybe_reader = GAR_NAMESPACE::ConstructAdjListPropertyArrowChunkReader( graph_info, src_label, edge_label, dst_label, group, From 0d03fe77df06d57d258d9ccffa13f63290457e6c Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Thu, 6 Jul 2023 10:49:56 +0800 Subject: [PATCH 22/26] Refactor: fix style Signed-off-by: Ziy1-Tan --- cpp/include/gar/reader/arrow_chunk_reader.h | 22 ++++++++++++++++----- cpp/include/gar/utils/expression.h | 4 ++++ cpp/include/gar/utils/reader_utils.h | 5 +++-- cpp/test/test_arrow_chunk_reader.cc | 8 ++++---- 4 files changed, 28 insertions(+), 11 deletions(-) diff --git a/cpp/include/gar/reader/arrow_chunk_reader.h b/cpp/include/gar/reader/arrow_chunk_reader.h index cb174a3aa..e350d9376 100644 --- a/cpp/include/gar/reader/arrow_chunk_reader.h +++ b/cpp/include/gar/reader/arrow_chunk_reader.h @@ -130,8 +130,8 @@ class VertexPropertyArrowChunkReader { IdType GetChunkNum() const noexcept { return chunk_num_; } /** - * @brief Apply the row filter to the table. No parameter call Filter() or - * Filter(nullptr) will clear the filter. + * @brief Apply the row filter to the table. No parameter call Filter() will + * clear the filter. * * @param filter Predicate expression to filter rows. */ @@ -139,11 +139,11 @@ class VertexPropertyArrowChunkReader { /** * @brief Apply the projection to the table to be read. No parameter call - * Select() or Select(nullptr) will clear the projection. + * Select() will clear the projection. * * @param column_names The name of columns to be selected. */ - void Select(utils::ColumnNames column_names = nullptr); + void Select(utils::ColumnNames column_names = std::nullopt); private: VertexInfo vertex_info_; @@ -591,9 +591,21 @@ class AdjListPropertyArrowChunkReader { return Status::OK(); } + /** + * @brief Apply the row filter to the table. No parameter call Filter() will + * clear the filter. + * + * @param filter Predicate expression to filter rows. + */ void Filter(utils::Filter filter = nullptr); - void Select(utils::ColumnNames column_names = nullptr); + /** + * @brief Apply the projection to the table to be read. No parameter call + * Select() will clear the projection. + * + * @param column_names The name of columns to be selected. + */ + void Select(utils::ColumnNames column_names = std::nullopt); private: EdgeInfo edge_info_; diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 2bc1dae65..fb56a9a7e 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -254,18 +254,22 @@ template std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); } + [[nodiscard]] static inline std::shared_ptr _NotEqual( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); } + [[nodiscard]] static inline std::shared_ptr _GreaterThan( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); } + [[nodiscard]] static inline std::shared_ptr _GreaterEqual( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); } + [[nodiscard]] static inline std::shared_ptr _LessThan( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 8b475a2cc..28a1df892 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -29,13 +29,14 @@ namespace GAR_NAMESPACE_INTERNAL { namespace utils { using Filter = std::shared_ptr; -using ColumnNames = std::vector*; +using ColumnNames = + std::optional>>; struct FilterOptions { // The row filter to apply to the table. Filter filter = nullptr; // The columns to include in the table. Select all columns by default. - ColumnNames columns = nullptr; + ColumnNames columns = std::nullopt; FilterOptions() {} FilterOptions(Filter filter, ColumnNames columns) diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index e480614a5..13e26e08a 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -111,7 +111,7 @@ TEST_CASE("test_vertex_property_pushdown") { // construct pushdown options FilterOptions options; options.filter = filter; - options.columns = &expected_cols; + options.columns = expected_cols; // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { @@ -158,7 +158,7 @@ TEST_CASE("test_vertex_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Select(&expected_cols); + reader.Select(expected_cols); walkReader(reader); } } @@ -306,7 +306,7 @@ TEST_CASE("test_adj_list_property_pushdown") { FilterOptions options; options.filter = filter; - options.columns = &expected_cols; + options.columns = expected_cols; // print reader result auto walkReader = @@ -354,7 +354,7 @@ TEST_CASE("test_adj_list_property_pushdown") { REQUIRE(maybe_reader.status().ok()); auto reader = maybe_reader.value(); reader.Filter(filter); - reader.Select(&expected_cols); + reader.Select(expected_cols); walkReader(reader); } } From 8e27aff4718e3da48dd5a2bcc033133726b30ec2 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Wed, 12 Jul 2023 23:27:34 +0800 Subject: [PATCH 23/26] Docs: Expression Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 133 ++++++++++++++++++++------- cpp/src/expression.cc | 2 +- docs/reference/api-reference-cpp.rst | 70 ++++++++++++++ 3 files changed, 170 insertions(+), 35 deletions(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index fb56a9a7e..b972a08c9 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -28,8 +28,8 @@ namespace GAR_NAMESPACE_INTERNAL { using ArrowExpression = arrow::compute::Expression; /** - * This class wraps an arrow::compute::Expression and provides methods for - * reading arrow::compute::Expression objects + * This class wraps an arrow::compute::Expression and provides a way to + * construct it */ class Expression { public: @@ -49,6 +49,10 @@ class Expression { virtual ArrowExpression Evaluate() = 0; }; +/** + * This class wraps the Property and provides a way to construct property + * expression + */ class ExpressionProperty : public Expression { public: explicit ExpressionProperty(const Property& property) : property_(property) {} @@ -63,6 +67,10 @@ class ExpressionProperty : public Expression { Property property_; }; +/** + * This class wraps the literal. Only bool, int32, int64, float, double and + * string are allowed. + */ template class ExpressionLiteral : public Expression { public: @@ -76,34 +84,46 @@ class ExpressionLiteral : public Expression { T value_; }; -class UnaryOperator : public Expression { +/** + * This class constructs a unary operator expression that accepts only one + * expression + */ +class ExpressionUnaryOp : public Expression { public: - UnaryOperator() = default; - explicit UnaryOperator(std::shared_ptr expr) : expr_(expr) {} - UnaryOperator(const UnaryOperator& other) = default; - virtual ~UnaryOperator() {} + ExpressionUnaryOp() = default; + explicit ExpressionUnaryOp(std::shared_ptr expr) : expr_(expr) {} + ExpressionUnaryOp(const ExpressionUnaryOp& other) = default; + virtual ~ExpressionUnaryOp() {} protected: std::shared_ptr expr_; }; -class ExpressionNot : public UnaryOperator { +/** + * This class constructs a NOT operator expression. e.g. new ExpressionNot(new + * ExpressionLiteral(true)) => NOT TRUE + */ +class ExpressionNot : public ExpressionUnaryOp { public: ExpressionNot() = default; explicit ExpressionNot(std::shared_ptr expr) - : UnaryOperator(expr) {} + : ExpressionUnaryOp(expr) {} ExpressionNot(const ExpressionNot& other) = default; ~ExpressionNot() = default; ArrowExpression Evaluate() override; }; -class ExpressionIsNull : public UnaryOperator { +/** + * This class constructs a IS NULL operator expression. e.g. new + * ExpressionIsNull(new ExpressionProperty("a")) => a IS NULL + */ +class ExpressionIsNull : public ExpressionUnaryOp { public: ExpressionIsNull() = default; explicit ExpressionIsNull(std::shared_ptr expr, bool nan_is_null = false) - : UnaryOperator(expr), nan_is_null_(nan_is_null) {} + : ExpressionUnaryOp(expr), nan_is_null_(nan_is_null) {} ExpressionIsNull(const ExpressionIsNull& other) = default; ~ExpressionIsNull() = default; @@ -113,109 +133,153 @@ class ExpressionIsNull : public UnaryOperator { bool nan_is_null_; }; -class BinaryOperator : public Expression { +/** + * This class constructs a binary operator expression that accepts two + * expressions e.g. a = 1, a > 1, a AND b, a OR b + */ +class ExpressionBinaryOp : public Expression { public: - BinaryOperator() = default; - BinaryOperator(std::shared_ptr lhs, - std::shared_ptr rhs) + ExpressionBinaryOp() = default; + ExpressionBinaryOp(std::shared_ptr lhs, + std::shared_ptr rhs) : lhs_(lhs), rhs_(rhs) {} - BinaryOperator(const BinaryOperator& other) = default; - ~BinaryOperator() = default; + ExpressionBinaryOp(const ExpressionBinaryOp& other) = default; + ~ExpressionBinaryOp() = default; protected: std::shared_ptr lhs_; std::shared_ptr rhs_; }; -class ExpressionEqual : public BinaryOperator { +/** + * This class constructs a EQUAL operator expression. + * e.g. new ExpressionEqual(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a = 1 + */ +class ExpressionEqual : public ExpressionBinaryOp { public: ExpressionEqual() = default; ExpressionEqual(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionEqual(const ExpressionEqual& other) = default; ~ExpressionEqual() = default; ArrowExpression Evaluate() override; }; -class ExpressionNotEqual : public BinaryOperator { +/** + * This class constructs a NOT EQUAL operator expression. + * e.g. new ExpressionNotEqual(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a != 1 + */ +class ExpressionNotEqual : public ExpressionBinaryOp { public: ExpressionNotEqual() = default; ExpressionNotEqual(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionNotEqual(const ExpressionNotEqual& other) = default; ~ExpressionNotEqual() = default; ArrowExpression Evaluate() override; }; -class ExpressionGreaterThan : public BinaryOperator { +/** + * This class constructs a GREATER THAN operator expression. + * e.g. new ExpressionGreaterThan(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a > 1 + */ +class ExpressionGreaterThan : public ExpressionBinaryOp { public: ExpressionGreaterThan() = default; ExpressionGreaterThan(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionGreaterThan(const ExpressionGreaterThan& other) = default; ~ExpressionGreaterThan() = default; ArrowExpression Evaluate() override; }; -class ExpressionGreaterEqual : public BinaryOperator { +/** + * This class constructs a GREATER EQUAL operator expression. + * e.g. new ExpressionGreaterEqual(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a >= 1 + */ +class ExpressionGreaterEqual : public ExpressionBinaryOp { public: ExpressionGreaterEqual() = default; ExpressionGreaterEqual(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionGreaterEqual(const ExpressionGreaterEqual& other) = default; ~ExpressionGreaterEqual() = default; ArrowExpression Evaluate() override; }; -class ExpressionLessThan : public BinaryOperator { +/** + * This class constructs a LESS THAN operator expression. + * e.g. new ExpressionLessThan(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a < 1 + */ +class ExpressionLessThan : public ExpressionBinaryOp { public: ExpressionLessThan() = default; ExpressionLessThan(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionLessThan(const ExpressionLessThan& other) = default; ~ExpressionLessThan() = default; ArrowExpression Evaluate() override; }; -class ExpressionLessEqual : public BinaryOperator { +/** + * This class constructs a LESS EQUAL operator expression. + * e.g. new ExpressionLessEqual(new ExpressionProperty("a"), new + * ExpressionLiteral(1)) => a <= 1 + */ +class ExpressionLessEqual : public ExpressionBinaryOp { public: ExpressionLessEqual() = default; ExpressionLessEqual(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionLessEqual(const ExpressionLessEqual& other) = default; ~ExpressionLessEqual() = default; ArrowExpression Evaluate() override; }; -class ExpressionAnd : public BinaryOperator { +/** + * This class constructs a AND operator expression. + * e.g. new ExpressionAnd(new ExpressionLiteral(true), new + * ExpressionLiteral(1)) => TRUE AND 1 + */ +class ExpressionAnd : public ExpressionBinaryOp { public: ExpressionAnd() = default; ExpressionAnd(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionAnd(const ExpressionAnd& other) = default; ~ExpressionAnd() = default; ArrowExpression Evaluate() override; }; -class ExpressionOr : public BinaryOperator { +/** + * This class constructs a OR operator expression. + * e.g. new ExpressionOr(new ExpressionLiteral(0), new + * ExpressionLiteral(true)) => 0 OR TRUE + */ +class ExpressionOr : public ExpressionBinaryOp { public: ExpressionOr() = default; ExpressionOr(std::shared_ptr lhs, std::shared_ptr rhs) - : BinaryOperator(lhs, rhs) {} + : ExpressionBinaryOp(lhs, rhs) {} ExpressionOr(const ExpressionOr& other) = default; ~ExpressionOr() = default; @@ -223,7 +287,7 @@ class ExpressionOr : public BinaryOperator { }; /** - * Helper functions to Construct Expression. + * Helper functions to construct a Expression. */ [[nodiscard]] static inline std::shared_ptr _Property( const Property& property) { @@ -291,3 +355,4 @@ template } } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ + diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc index 3d0ad5525..2b9f061c0 100644 --- a/cpp/src/expression.cc +++ b/cpp/src/expression.cc @@ -24,7 +24,7 @@ ArrowExpression ExpressionNot::Evaluate() { } ArrowExpression ExpressionIsNull::Evaluate() { - return arrow::compute::is_null(expr_->Evaluate()); + return arrow::compute::is_null(expr_->Evaluate(), nan_is_null_); } ArrowExpression ExpressionEqual::Evaluate() { diff --git a/docs/reference/api-reference-cpp.rst b/docs/reference/api-reference-cpp.rst index 8066772c8..5ee9c4796 100644 --- a/docs/reference/api-reference-cpp.rst +++ b/docs/reference/api-reference-cpp.rst @@ -220,3 +220,73 @@ Info Version .. doxygenclass:: GraphArchive::InfoVersion :members: :undoc-members: + +Expression +~~~~~~~~~~~~~~~~~~~ + +.. doxygenclass:: GraphArchive::Expression + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionProperty + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionLiteral + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionNot + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionIsNull + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionEqual + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionNotEqual + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionGreaterThan + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionGreaterEqual + :members: + :undoc-members: + + +.. doxygenclass:: GraphArchive::ExpressionLessThan + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionLessEqual + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionAnd + :members: + :undoc-members: + +.. doxygenclass:: GraphArchive::ExpressionOr + :members: + :undoc-members: + +.. doxygenfunction:: GraphArchive::_Property +.. doxygenfunction:: GraphArchive::_Literal +.. doxygenfunction:: GraphArchive::_Not +.. doxygenfunction:: GraphArchive::_IsNull +.. doxygenfunction:: GraphArchive::_Equal +.. doxygenfunction:: GraphArchive::_NotEqual +.. doxygenfunction:: GraphArchive::_GreaterThan +.. doxygenfunction:: GraphArchive::_GreaterEqual +.. doxygenfunction:: GraphArchive::_LessThan +.. doxygenfunction:: GraphArchive::_LessEqual +.. doxygenfunction:: GraphArchive::_And +.. doxygenfunction:: GraphArchive::_Or + From bf47cadfe992fc2a15554cba5daf3e242802bd55 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Wed, 12 Jul 2023 23:47:53 +0800 Subject: [PATCH 24/26] Refactor: fix style Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 1 - 1 file changed, 1 deletion(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index b972a08c9..d382a3141 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -355,4 +355,3 @@ template } } // namespace GAR_NAMESPACE_INTERNAL #endif // GAR_UTILS_EXPRESSION_H_ - From 38282a1c46330a3cc382dbdb2754dfec95507a5a Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Mon, 24 Jul 2023 00:43:58 +0800 Subject: [PATCH 25/26] Refactor: filter expression validation & ut Signed-off-by: Ziy1-Tan --- cpp/include/gar/graph_info.h | 7 +++ cpp/include/gar/utils/expression.h | 75 ++++++++++++++-------------- cpp/include/gar/utils/reader_utils.h | 3 ++ cpp/src/arrow_chunk_reader.cc | 4 ++ cpp/src/expression.cc | 67 ++++++++++++++++--------- cpp/src/filesystem.cc | 3 +- cpp/src/reader_utils.cc | 34 +++++++++++++ cpp/test/test_arrow_chunk_reader.cc | 39 +++++++++++++-- 8 files changed, 166 insertions(+), 66 deletions(-) diff --git a/cpp/include/gar/graph_info.h b/cpp/include/gar/graph_info.h index 39351a3bf..dd230ce5f 100644 --- a/cpp/include/gar/graph_info.h +++ b/cpp/include/gar/graph_info.h @@ -19,6 +19,7 @@ limitations under the License. #include #include #include +#include #include #include "utils/adj_list_type.h" @@ -92,6 +93,7 @@ class PropertyGroup { std::vector names; for (auto& property : properties_) { names.push_back(property.name); + property_names_.insert(property.name); } prefix_ = util::ConcatStringWithDelimiter(names, REGULAR_SEPERATOR) + "/"; } @@ -126,6 +128,10 @@ class PropertyGroup { return properties_; } + inline bool ContainProperty(const std::string& property_name) const { + return property_names_.find(property_name) != property_names_.end(); + } + /** Get the file type of property group chunk file. * * @return The file type of group. @@ -151,6 +157,7 @@ class PropertyGroup { private: std::vector properties_; + std::unordered_set property_names_; FileType file_type_; std::string prefix_; }; diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index d382a3141..427720c7e 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -46,7 +46,7 @@ class Expression { * * @return The arrow::compute::Expression instance */ - virtual ArrowExpression Evaluate() = 0; + virtual Result Evaluate() = 0; }; /** @@ -61,7 +61,7 @@ class ExpressionProperty : public Expression { ExpressionProperty(const ExpressionProperty& other) = default; ~ExpressionProperty() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; private: Property property_; @@ -71,14 +71,21 @@ class ExpressionProperty : public Expression { * This class wraps the literal. Only bool, int32, int64, float, double and * string are allowed. */ -template +template || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || + std::is_same_v, + typename = std::enable_if_t> class ExpressionLiteral : public Expression { public: explicit ExpressionLiteral(T value) : value_(value) {} ExpressionLiteral(const ExpressionLiteral& other) = default; ~ExpressionLiteral() = default; - ArrowExpression Evaluate() { return arrow::compute::literal(value_); } + Result Evaluate() { return arrow::compute::literal(value_); } private: T value_; @@ -111,26 +118,7 @@ class ExpressionNot : public ExpressionUnaryOp { ExpressionNot(const ExpressionNot& other) = default; ~ExpressionNot() = default; - ArrowExpression Evaluate() override; -}; - -/** - * This class constructs a IS NULL operator expression. e.g. new - * ExpressionIsNull(new ExpressionProperty("a")) => a IS NULL - */ -class ExpressionIsNull : public ExpressionUnaryOp { - public: - ExpressionIsNull() = default; - explicit ExpressionIsNull(std::shared_ptr expr, - bool nan_is_null = false) - : ExpressionUnaryOp(expr), nan_is_null_(nan_is_null) {} - ExpressionIsNull(const ExpressionIsNull& other) = default; - ~ExpressionIsNull() = default; - - ArrowExpression Evaluate() override; - - private: - bool nan_is_null_; + Result Evaluate() override; }; /** @@ -146,6 +134,15 @@ class ExpressionBinaryOp : public Expression { ExpressionBinaryOp(const ExpressionBinaryOp& other) = default; ~ExpressionBinaryOp() = default; + protected: + inline Status CheckNullArgs(std::shared_ptr lhs, + std::shared_ptr rhs) noexcept { + if (lhs == nullptr || rhs == nullptr) { + return Status::Invalid("Invalid expression: lhs or rhs is null"); + } + return Status::OK(); + } + protected: std::shared_ptr lhs_; std::shared_ptr rhs_; @@ -165,7 +162,7 @@ class ExpressionEqual : public ExpressionBinaryOp { ExpressionEqual(const ExpressionEqual& other) = default; ~ExpressionEqual() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -182,7 +179,7 @@ class ExpressionNotEqual : public ExpressionBinaryOp { ExpressionNotEqual(const ExpressionNotEqual& other) = default; ~ExpressionNotEqual() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -199,7 +196,7 @@ class ExpressionGreaterThan : public ExpressionBinaryOp { ExpressionGreaterThan(const ExpressionGreaterThan& other) = default; ~ExpressionGreaterThan() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -216,7 +213,7 @@ class ExpressionGreaterEqual : public ExpressionBinaryOp { ExpressionGreaterEqual(const ExpressionGreaterEqual& other) = default; ~ExpressionGreaterEqual() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -233,7 +230,7 @@ class ExpressionLessThan : public ExpressionBinaryOp { ExpressionLessThan(const ExpressionLessThan& other) = default; ~ExpressionLessThan() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -250,7 +247,7 @@ class ExpressionLessEqual : public ExpressionBinaryOp { ExpressionLessEqual(const ExpressionLessEqual& other) = default; ~ExpressionLessEqual() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -267,7 +264,7 @@ class ExpressionAnd : public ExpressionBinaryOp { ExpressionAnd(const ExpressionAnd& other) = default; ~ExpressionAnd() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -283,7 +280,7 @@ class ExpressionOr : public ExpressionBinaryOp { ExpressionOr(const ExpressionOr& other) = default; ~ExpressionOr() = default; - ArrowExpression Evaluate() override; + Result Evaluate() override; }; /** @@ -299,7 +296,14 @@ class ExpressionOr : public ExpressionBinaryOp { return std::make_shared(name); } -template +template || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || std::is_same_v || + std::is_same_v || + std::is_same_v, + typename = std::enable_if_t> [[nodiscard]] static inline std::shared_ptr _Literal(T value) { return std::make_shared>(value); } @@ -309,11 +313,6 @@ template return std::make_shared(expr); } -[[nodiscard]] static inline std::shared_ptr _IsNull( - std::shared_ptr expr, bool nan_is_null = false) { - return std::make_shared(expr, nan_is_null); -} - [[nodiscard]] static inline std::shared_ptr _Equal( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); diff --git a/cpp/include/gar/utils/reader_utils.h b/cpp/include/gar/utils/reader_utils.h index 28a1df892..88d80e17a 100644 --- a/cpp/include/gar/utils/reader_utils.h +++ b/cpp/include/gar/utils/reader_utils.h @@ -43,6 +43,9 @@ struct FilterOptions { : filter(filter), columns(columns) {} }; +Status CheckFilterOptions(const FilterOptions& filter_options, + const PropertyGroup& property_group) noexcept; + Result> GetAdjListOffsetOfVertex( const EdgeInfo& edge_info, const std::string& prefix, AdjListType adj_list_type, IdType vid) noexcept; diff --git a/cpp/src/arrow_chunk_reader.cc b/cpp/src/arrow_chunk_reader.cc index cc099f3cd..ec1e9a1f0 100644 --- a/cpp/src/arrow_chunk_reader.cc +++ b/cpp/src/arrow_chunk_reader.cc @@ -22,6 +22,8 @@ namespace GAR_NAMESPACE_INTERNAL { Result> VertexPropertyArrowChunkReader::GetChunk() noexcept { + GAR_RETURN_NOT_OK( + utils::CheckFilterOptions(filter_options_, property_group_)); if (chunk_table_ == nullptr) { GAR_ASSIGN_OR_RAISE( auto chunk_file_path, @@ -235,6 +237,8 @@ AdjListOffsetArrowChunkReader::GetChunk() noexcept { Result> AdjListPropertyArrowChunkReader::GetChunk() noexcept { + GAR_RETURN_NOT_OK( + utils::CheckFilterOptions(filter_options_, property_group_)); if (chunk_table_ == nullptr) { GAR_ASSIGN_OR_RAISE( auto chunk_file_path, diff --git a/cpp/src/expression.cc b/cpp/src/expression.cc index 2b9f061c0..e968efa34 100644 --- a/cpp/src/expression.cc +++ b/cpp/src/expression.cc @@ -16,47 +16,68 @@ limitations under the License. namespace GAR_NAMESPACE_INTERNAL { -ArrowExpression ExpressionProperty::Evaluate() { +Result ExpressionProperty::Evaluate() { return arrow::compute::field_ref(property_.name); } -ArrowExpression ExpressionNot::Evaluate() { - return arrow::compute::not_(expr_->Evaluate()); +Result ExpressionNot::Evaluate() { + GAR_ASSIGN_OR_RAISE(auto expr, expr_->Evaluate()); + return arrow::compute::not_(expr); } -ArrowExpression ExpressionIsNull::Evaluate() { - return arrow::compute::is_null(expr_->Evaluate(), nan_is_null_); +Result ExpressionEqual::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::equal(lexpr, rexpr); } -ArrowExpression ExpressionEqual::Evaluate() { - return arrow::compute::equal(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionNotEqual::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::not_equal(lexpr, rexpr); } -ArrowExpression ExpressionNotEqual::Evaluate() { - return arrow::compute::not_equal(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionGreaterThan::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::greater(lexpr, rexpr); } -ArrowExpression ExpressionGreaterThan::Evaluate() { - return arrow::compute::greater(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionGreaterEqual::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::greater_equal(lexpr, rexpr); } -ArrowExpression ExpressionGreaterEqual::Evaluate() { - return arrow::compute::greater_equal(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionLessThan::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::less(lexpr, rexpr); } -ArrowExpression ExpressionLessThan::Evaluate() { - return arrow::compute::less(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionLessEqual::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::less_equal(lexpr, rexpr); } -ArrowExpression ExpressionLessEqual::Evaluate() { - return arrow::compute::less_equal(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionAnd::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::and_(lexpr, rexpr); } -ArrowExpression ExpressionAnd::Evaluate() { - return arrow::compute::and_(lhs_->Evaluate(), rhs_->Evaluate()); -} - -ArrowExpression ExpressionOr::Evaluate() { - return arrow::compute::or_(lhs_->Evaluate(), rhs_->Evaluate()); +Result ExpressionOr::Evaluate() { + GAR_RETURN_NOT_OK(CheckNullArgs(lhs_, rhs_)); + GAR_ASSIGN_OR_RAISE(auto lexpr, lhs_->Evaluate()); + GAR_ASSIGN_OR_RAISE(auto rexpr, rhs_->Evaluate()); + return arrow::compute::or_(lexpr, rexpr); } } // namespace GAR_NAMESPACE_INTERNAL diff --git a/cpp/src/filesystem.cc b/cpp/src/filesystem.cc index 463189ea0..d10b45c3b 100644 --- a/cpp/src/filesystem.cc +++ b/cpp/src/filesystem.cc @@ -105,7 +105,8 @@ Result> FileSystem::ReadFileToTable( // Apply the row filter and select the specified columns if (options.filter) { - RETURN_NOT_ARROW_OK(scan_builder->Filter(options.filter->Evaluate())); + GAR_ASSIGN_OR_RAISE(auto filter, options.filter->Evaluate()); + RETURN_NOT_ARROW_OK(scan_builder->Filter(filter)); } if (options.columns) { RETURN_NOT_ARROW_OK(scan_builder->Project(*options.columns)); diff --git a/cpp/src/reader_utils.cc b/cpp/src/reader_utils.cc index 08b0cc49f..a0e6ea5e2 100644 --- a/cpp/src/reader_utils.cc +++ b/cpp/src/reader_utils.cc @@ -27,6 +27,40 @@ limitations under the License. namespace GAR_NAMESPACE_INTERNAL { namespace utils { + +/** + * @brief Checks whether the property names in the FilterOptions match the + * properties in the property group + * + * @param filter_options filter options + * @param property_group property group + * @return Status error if the property names in the FilterOptions do not match + */ +Status CheckFilterOptions(const FilterOptions& filter_options, + const PropertyGroup& property_group) noexcept { + if (filter_options.filter) { + GAR_ASSIGN_OR_RAISE(auto filter, filter_options.filter->Evaluate()); + for (const auto& field : arrow::compute::FieldsInExpression(filter)) { + auto property_name = *field.name(); + if (!property_group.ContainProperty(property_name)) { + return Status::Invalid( + property_name, " in the filter does not match the property group: ", + property_group); + } + } + } + if (filter_options.columns.has_value()) { + for (const auto& col : filter_options.columns.value().get()) { + if (!property_group.ContainProperty(col)) { + return Status::Invalid( + col, " in the columns does not match the property group: ", + property_group); + } + } + } + return Status::OK(); +} + /** * @brief parse the vertex id to related adj list offset * diff --git a/cpp/test/test_arrow_chunk_reader.cc b/cpp/test/test_arrow_chunk_reader.cc index 13e26e08a..2801e2c08 100644 --- a/cpp/test/test_arrow_chunk_reader.cc +++ b/cpp/test/test_arrow_chunk_reader.cc @@ -108,10 +108,6 @@ TEST_CASE("test_vertex_property_pushdown") { auto maybe_group = graph_info.GetVertexPropertyGroup(label, property_name); REQUIRE(maybe_group.status().ok()); auto group = maybe_group.value(); - // construct pushdown options - FilterOptions options; - options.filter = filter; - options.columns = expected_cols; // print reader result auto walkReader = [&](GAR_NAMESPACE::VertexPropertyArrowChunkReader& reader) { @@ -145,6 +141,10 @@ TEST_CASE("test_vertex_property_pushdown") { SECTION("pushdown by helper function") { std::cout << "Vertex property pushdown by helper function:\n"; + // construct pushdown options + FilterOptions options; + options.filter = filter; + options.columns = expected_cols; auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( graph_info, label, group, options); REQUIRE(maybe_reader.status().ok()); @@ -161,6 +161,37 @@ TEST_CASE("test_vertex_property_pushdown") { reader.Select(expected_cols); walkReader(reader); } + + SECTION("pushdown property that don't exist") { + std::cout << "Vertex property pushdown property that don't exist:\n"; + auto filter = _Equal(_Property("id"), _Literal(933)); + FilterOptions options; + options.filter = filter; + options.columns = expected_cols; + auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( + graph_info, label, group, options); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + auto result = reader.GetChunk(); + REQUIRE(result.error().IsInvalid()); + std::cerr << result.error().message() << std::endl; + } + + SECTION("pushdown column that don't exist") { + std::cout << "Vertex property pushdown column that don't exist:\n"; + auto filter = _Literal(true); + std::vector expected_cols{"id"}; + FilterOptions options; + options.filter = filter; + options.columns = expected_cols; + auto maybe_reader = GAR_NAMESPACE::ConstructVertexPropertyArrowChunkReader( + graph_info, label, group, options); + REQUIRE(maybe_reader.status().ok()); + auto reader = maybe_reader.value(); + auto result = reader.GetChunk(); + REQUIRE(result.error().IsInvalid()); + std::cerr << result.error().message() << std::endl; + } } TEST_CASE("test_adj_list_arrow_chunk_reader") { From 8524fbf3a09e6280483e594672492c2febd95dc7 Mon Sep 17 00:00:00 2001 From: Ziy1-Tan Date: Mon, 24 Jul 2023 00:49:23 +0800 Subject: [PATCH 26/26] Refactor: fix style Signed-off-by: Ziy1-Tan --- cpp/include/gar/utils/expression.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/gar/utils/expression.h b/cpp/include/gar/utils/expression.h index 427720c7e..cc53505fc 100644 --- a/cpp/include/gar/utils/expression.h +++ b/cpp/include/gar/utils/expression.h @@ -348,7 +348,7 @@ template (lhs, rhs); } -[[nodiscard]] static inline std::shared_ptr Or( +[[nodiscard]] static inline std::shared_ptr _Or( std::shared_ptr lhs, std::shared_ptr rhs) { return std::make_shared(lhs, rhs); }