diff --git a/.travis.yml b/.travis.yml index 87ee43b9..8fe291cb 100644 --- a/.travis.yml +++ b/.travis.yml @@ -28,7 +28,7 @@ matrix: os: linux before_script: - source $TRAVIS_BUILD_DIR/ci/before_script_travis.sh - - cmake -DCMAKE_CXX_FLAGS="-Werror" -DPARQUET_TEST_MEMCHECK=ON -DPARQUET_GENERATE_COVERAGE=1 $TRAVIS_BUILD_DIR + - cmake -DCMAKE_CXX_FLAGS="-Werror" -DPARQUET_TEST_MEMCHECK=ON -DPARQUET_BUILD_BENCHMARKS=ON -DPARQUET_GENERATE_COVERAGE=1 $TRAVIS_BUILD_DIR - export PARQUET_TEST_DATA=$TRAVIS_BUILD_DIR/data - compiler: clang os: linux diff --git a/CMakeLists.txt b/CMakeLists.txt index 56e9dea8..39f75853 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,9 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}") option(PARQUET_USE_SSE "Build with SSE4 optimizations" OFF) + option(PARQUET_BUILD_BENCHMARKS + "Build the libparquet benchmark suite" + OFF) option(PARQUET_BUILD_TESTS "Build the libparquet test suite" ON) @@ -102,6 +105,60 @@ else() set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}") endif() +############################################################ +# Benchmarking +############################################################ +# Add a new micro benchmark, with or without an executable that should be built. +# If benchmarks are enabled then they will be run along side unit tests with ctest. +# 'make runbenchmark' and 'make unittest' to build/run only benchmark or unittests, +# respectively. +# +# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component +# (e.g. monotime-benchmark) or contain additional components (e.g. +# net/net_util-benchmark). Either way, the last component must be a globally +# unique name. + +# The benchmark will registered as unit test with ctest with a label +# of 'benchmark'. +# +# Arguments after the test name will be passed to set_tests_properties(). 
+function(ADD_PARQUET_BENCHMARK REL_BENCHMARK_NAME) + if(NOT PARQUET_BUILD_BENCHMARKS) + return() + endif() + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc) + # This benchmark has a corresponding .cc file, set it up as an executable. + set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}") + add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc") + target_link_libraries(${BENCHMARK_NAME} ${PARQUET_BENCHMARK_LINK_LIBS}) + add_dependencies(runbenchmark ${BENCHMARK_NAME}) + set(NO_COLOR "--color_print=false") + else() + # No executable, just invoke the benchmark (probably a script) directly. + set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}) + set(NO_COLOR "") + endif() + + add_test(${BENCHMARK_NAME} + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR}) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark") + if(ARGN) + set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN}) + endif() +endfunction() + +# A wrapper for add_dependencies() that is compatible with PARQUET_BUILD_BENCHMARKS=OFF. +function(ADD_PARQUET_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME) + if(NOT PARQUET_BUILD_BENCHMARKS) + return() + endif() + get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE) + + add_dependencies(${BENCHMARK_NAME} ${ARGN}) +endfunction() + ############################################################ # Testing ############################################################ @@ -113,6 +170,9 @@ endif() # net/net_util-test). Either way, the last component must be a globally # unique name. # +# The unit test is added with a label of "unittest" to support filtering with +# ctest. +# +# Arguments after the test name will be passed to set_tests_properties(). 
function(ADD_PARQUET_TEST REL_TEST_NAME) if(NOT PARQUET_BUILD_TESTS) @@ -124,6 +184,7 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) # This test has a corresponding .cc file, set it up as an executable. set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}") add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc") + add_dependencies(unittest ${TEST_NAME}) if(APPLE) # On OS X / Thrift >= 0.9.2, tr1/tuple.h is not in libc++ @@ -149,8 +210,9 @@ function(ADD_PARQUET_TEST REL_TEST_NAME) valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH}) else() add_test(${TEST_NAME} - ${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH}) + ${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH}) endif() + set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest") if(ARGN) set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN}) endif() @@ -213,11 +275,26 @@ add_library(zlibstatic STATIC IMPORTED) set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB}) ## GTest +add_custom_target(unittest ctest -L unittest) find_package(GTest REQUIRED) include_directories(SYSTEM ${GTEST_INCLUDE_DIR}) add_library(gtest STATIC IMPORTED) set_target_properties(gtest PROPERTIES IMPORTED_LOCATION ${GTEST_STATIC_LIB}) +## Google Benchmark +if ("$ENV{GBENCHMARK_HOME}" STREQUAL "") + set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed) +endif() + +if(PARQUET_BUILD_BENCHMARKS) + add_custom_target(runbenchmark ctest -L benchmark) + find_package(GBenchmark REQUIRED) + include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR}) + message(${GBENCHMARK_STATIC_LIB}) + add_library(gbenchmark STATIC IMPORTED) + set_target_properties(gbenchmark PROPERTIES IMPORTED_LOCATION ${GBENCHMARK_STATIC_LIB}) +endif() + # Thrift requires these definitions for some types that we use add_definitions(-DHAVE_INTTYPES_H -DHAVE_NETINET_IN_H -DHAVE_NETDB_H) add_definitions(-fPIC) @@ -331,6 +408,11 @@ set(PARQUET_MIN_TEST_LIBS parquet) set(PARQUET_TEST_LINK_LIBS ${PARQUET_MIN_TEST_LIBS}) 
+############################################################# +# Benchmark linking + +set(PARQUET_BENCHMARK_LINK_LIBS parquet parquet_benchmark_main) + ############################################################# # Code coverage diff --git a/README.md b/README.md index a38b555e..cf48ff78 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,7 @@ - zlib - thrift 0.7+ [install instructions](https://thrift.apache.org/docs/install/) - googletest 1.7.0 (cannot be installed with package managers) +- Google Benchmark (only required if building benchmarks) You can install these dependencies using a package manager or using the `thirdparty/` scripts in this repository. On Homebrew, you can run: @@ -87,7 +88,7 @@ This library uses Google's `googletest` unit test framework. After building with `make`, you can run the test suite by running ``` -ctest +make unittest ``` The test suite relies on an environment variable `PARQUET_TEST_DATA` pointing @@ -107,6 +108,19 @@ you can use valgrind with ctest to look for memory leaks: valgrind --tool=memcheck --leak-check=yes ctest ``` +## Building/Running benchmarks + +Follow the directions for simple build except run cmake +with the `-DPARQUET_BUILD_BENCHMARKS` parameter set correctly: + + cmake -DPARQUET_BUILD_BENCHMARKS=ON .. + +and instead of make unittest run either `make; ctest` to run both unit tests +and benchmarks or `make runbenchmark` to run only the benchmark tests. + +Benchmark logs will be placed in the build directory under `build/benchmark-logs`. + + ## Out-of-source builds parquet-cpp supports out of source builds. For example: @@ -116,7 +130,7 @@ mkdir test-build cd test-build cmake .. 
make -ctest +ctest -L unittest ``` By using out-of-source builds you can preserve your current build state in case @@ -172,7 +186,7 @@ mkdir coverage-build cd coverage-build cmake -DPARQUET_GENERATE_COVERAGE=1 make -j$PARALLEL -ctest +ctest -L unittest ``` The `gcov` artifacts are not located in a place that works well with either @@ -205,4 +219,4 @@ coveralls -t $PARQUET_CPP_COVERAGE_TOKEN --gcov-options '\-l' -r $PARQUET_ROOT - Note that `gcov` throws off artifacts from the STL, so I excluded my toolchain -root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report. \ No newline at end of file +root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report. diff --git a/build-support/run-test.sh b/build-support/run-test.sh index 889e2a23..7c3b570c 100755 --- a/build-support/run-test.sh +++ b/build-support/run-test.sh @@ -20,15 +20,23 @@ # Script which wraps running a test and redirects its output to a # test log directory. # -# If PARQUET_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be -# gzip-compressed while they are written. +# Arguments: +# $1 - Base path for logs/artifacts. +# $2 - type of test (e.g. test or benchmark) +# $3 - path to executable +# $ARGN - arguments for executable +# +OUTPUT_ROOT=$1 +shift ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd) -TEST_LOGDIR=$ROOT/build/test-logs +TEST_LOGDIR=$OUTPUT_ROOT/build/$1-logs mkdir -p $TEST_LOGDIR -TEST_DEBUGDIR=$ROOT/build/test-debug +RUN_TYPE=$1 +shift +TEST_DEBUGDIR=$OUTPUT_ROOT/build/$RUN_TYPE-debug mkdir -p $TEST_DEBUGDIR TEST_DIRNAME=$(cd $(dirname $1); pwd) @@ -37,11 +45,8 @@ shift TEST_EXECUTABLE="$TEST_DIRNAME/$TEST_FILENAME" TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if any). -TEST_EXECUTION_ATTEMPTS=1 - - # We run each test in its own subdir to avoid core file related races. 
-TEST_WORKDIR=$ROOT/build/test-work/$TEST_NAME +TEST_WORKDIR=$OUTPUT_ROOT/build/test-work/$TEST_NAME mkdir -p $TEST_WORKDIR pushd $TEST_WORKDIR >/dev/null || exit 1 rm -f * @@ -51,40 +56,57 @@ set -o pipefail LOGFILE=$TEST_LOGDIR/$TEST_NAME.txt XMLFILE=$TEST_LOGDIR/$TEST_NAME.xml -# Remove both the compressed and uncompressed output, so the developer -# doesn't accidentally get confused and read output from a prior test -# run. +TEST_EXECUTION_ATTEMPTS=1 + +# Remove both the uncompressed output, so the developer doesn't accidentally get confused +# and read output from a prior test run. rm -f $LOGFILE $LOGFILE.gz -if [ -n "$PARQUET_COMPRESS_TEST_OUTPUT" ] && [ "$PARQUET_COMPRESS_TEST_OUTPUT" -ne 0 ] ; then - pipe_cmd=gzip - LOGFILE=${LOGFILE}.gz -else - pipe_cmd=cat -fi +pipe_cmd=cat # Allow for collecting core dumps. PARQUET_TEST_ULIMIT_CORE=${PARQUET_TEST_ULIMIT_CORE:-0} ulimit -c $PARQUET_TEST_ULIMIT_CORE -# Run the actual test. -for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do - if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then - # If the test fails, the test output may or may not be left behind, - # depending on whether the test cleaned up or exited immediately. Either - # way we need to clean it up. We do this by comparing the data directory - # contents before and after the test runs, and deleting anything new. - # - # The comm program requires that its two inputs be sorted. - TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + +function setup_sanitizers() { + # Sets environment variables for different sanitizers (it configures how) the run_tests. Function works. + + # Configure TSAN (ignored if this isn't a TSAN build). + # + # Deadlock detection (new in clang 3.5) is disabled because: + # 1. The clang 3.5 deadlock detector crashes in some unit tests. It + # needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others. + # 2. 
Many unit tests report lock-order-inversion warnings; they should be + # fixed before reenabling the detector. + TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0" + TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt" + TSAN_OPTIONS="$TSAN_OPTIONS history_size=7" + export TSAN_OPTIONS + + # Enable leak detection even under LLVM 3.4, where it was disabled by default. + # This flag only takes effect when running an ASAN build. + ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1" + export ASAN_OPTIONS + + # Set up suppressions for LeakSanitizer + LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt" + export LSAN_OPTIONS + + # Suppressions require symbolization. We'll default to using the symbolizer in + # thirdparty. + if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then + export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer) fi +} + +function run_test() { + # Run gtest style tests with sanitizers if they are setup appropriately. # gtest won't overwrite old junit test files, resulting in a build failure # even when retries are successful. rm -f $XMLFILE - echo "Running $TEST_NAME, redirecting output into $LOGFILE" \ - "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)" $TEST_EXECUTABLE "$@" 2>&1 \ | $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \ | $pipe_cmd > $LOGFILE @@ -104,6 +126,46 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do STATUS=1 rm -f $XMLFILE fi +} + +function post_process_tests() { + # If we have a LeakSanitizer report, and XML reporting is configured, add a new test + # case result to the XML file for the leak report. Otherwise Jenkins won't show + # us which tests had LSAN errors. + if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then + echo Test had memory leaks. 
Editing XML + perl -p -i -e ' + if (m##) { + print "\n"; + print " \n"; + print " See txt log file for details\n"; + print " \n"; + print "\n"; + }' $XMLFILE + fi +} + +function run_other() { + # Generic run function for test like executables that aren't actually gtest + $TEST_EXECUTABLE "$@" 2>&1 | $pipe_cmd > $LOGFILE + STATUS=$? +} + +if [ $RUN_TYPE = "test" ]; then + setup_sanitizers +fi + +# Run the actual test. +for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do + if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then + # If the test fails, the test output may or may not be left behind, + # depending on whether the test cleaned up or exited immediately. Either + # way we need to clean it up. We do this by comparing the data directory + # contents before and after the test runs, and deleting anything new. + # + # The comm program requires that its two inputs be sorted. + TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort) + fi if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then # Now delete any new test output. @@ -123,7 +185,13 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do fi done fi - + echo "Running $TEST_NAME, redirecting output into $LOGFILE" \ + "(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)" + if [ $RUN_TYPE = "test" ]; then + run_test $* + else + run_other $* + fi if [ "$STATUS" -eq "0" ]; then break elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then @@ -132,6 +200,10 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do fi done +if [ $RUN_TYPE = "test" ]; then + post_process_tests +fi + # Capture and compress core file and binary. 
COREFILES=$(ls | grep ^core) if [ -n "$COREFILES" ]; then diff --git a/ci/travis_script_cpp.sh b/ci/travis_script_cpp.sh index c8c0ac07..16291a92 100755 --- a/ci/travis_script_cpp.sh +++ b/ci/travis_script_cpp.sh @@ -14,13 +14,13 @@ fi if [ $TRAVIS_OS_NAME == "linux" ]; then make -j4 || exit 1 - ctest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } + ctest -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } sudo pip install cpp_coveralls export PARQUET_ROOT=$TRAVIS_BUILD_DIR $TRAVIS_BUILD_DIR/ci/upload_coverage.sh else make -j4 || exit 1 - ctest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } + ctest -L unittest || { cat $TRAVIS_BUILD_DIR/parquet-build/Testing/Temporary/LastTest.log; exit 1; } fi popd diff --git a/cmake_modules/FindGBenchmark.cmake b/cmake_modules/FindGBenchmark.cmake new file mode 100644 index 00000000..3e46a60f --- /dev/null +++ b/cmake_modules/FindGBenchmark.cmake @@ -0,0 +1,88 @@ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Tries to find Google benchmark headers and libraries. +# +# Usage of this module as follows: +# +# find_package(GBenchark) +# +# Variables used by this module, they can change the default behaviour and need +# to be set before calling find_package: +# +# GBenchmark_HOME - When set, this path is inspected instead of standard library +# locations as the root of the benchark installation. 
+# The environment variable GBENCHMARK_HOME overrides this variable. +# +# This module defines +# GBENCHMARK_INCLUDE_DIR, directory containing benchmark header directory +# GBENCHMARK_LIBS, directory containing benchmark libraries +# GBENCHMARK_STATIC_LIB, path to libbenchmark.a +# GBENCHMARK_FOUND, whether gbenchmark has been found + +if( NOT "$ENV{GBENCHMARK_HOME}" STREQUAL "") + file( TO_CMAKE_PATH "$ENV{GBENCHMARK_HOME}" _native_path ) + list( APPEND _gbenchmark_roots ${_native_path} ) +elseif ( GBenchmark_HOME ) + list( APPEND _gbenchmark_roots ${GBenchmark_HOME} ) +endif() + +# Try the parameterized roots, if they exist +if ( _gbenchmark_roots ) + find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.h + PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "include" ) + find_library( GBENCHMARK_LIBRARIES NAMES benchmark + PATHS ${_gbenchmark_roots} NO_DEFAULT_PATH + PATH_SUFFIXES "lib" ) +else () + find_path( GBENCHMARK_INCLUDE_DIR NAMES benchmark/benchmark.h ) + find_library( GBENCHMARK_LIBRARIES NAMES benchmark ) +endif () + + +if (GBENCHMARK_INCLUDE_DIR AND GBENCHMARK_LIBRARIES) + set(GBENCHMARK_FOUND TRUE) + get_filename_component( GBENCHMARK_LIBS ${GBENCHMARK_LIBRARIES} PATH ) + set(GBENCHMARK_LIB_NAME libbenchmark) + set(GBENCHMARK_STATIC_LIB ${GBENCHMARK_LIBS}/${GBENCHMARK_LIB_NAME}.a) +else () + set(GBENCHMARK_FOUND FALSE) +endif () + +if (GBENCHMARK_FOUND) + if (NOT GBenchmark_FIND_QUIETLY) + message(STATUS "Found the GBenchmark library: ${GBENCHMARK_LIBRARIES}") + endif () +else () + if (NOT GBenchmark_FIND_QUIETLY) + set(GBENCHMARK_ERR_MSG "Could not find the GBenchmark library. 
Looked in ") + if ( _gbenchmark_roots ) + set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} in ${_gbenchmark_roots}.") + else () + set(GBENCHMARK_ERR_MSG "${GBENCHMARK_ERR_MSG} system search paths.") + endif () + if (GBenchmark_FIND_REQUIRED) + message(FATAL_ERROR "${GBENCHMARK_ERR_MSG}") + else (GBenchmark_FIND_REQUIRED) + message(STATUS "${GBENCHMARK_ERR_MSG}") + endif (GBenchmark_FIND_REQUIRED) + endif () +endif () + +mark_as_advanced( + GBENCHMARK_INCLUDE_DIR + GBENCHMARK_LIBS + GBENCHMARK_LIBRARIES + GBENCHMARK_STATIC_LIB +) diff --git a/src/parquet/column/CMakeLists.txt b/src/parquet/column/CMakeLists.txt index e11c7a81..ace00720 100644 --- a/src/parquet/column/CMakeLists.txt +++ b/src/parquet/column/CMakeLists.txt @@ -27,3 +27,5 @@ ADD_PARQUET_TEST(column-reader-test) ADD_PARQUET_TEST(column-writer-test) ADD_PARQUET_TEST(levels-test) ADD_PARQUET_TEST(scanner-test) + +ADD_PARQUET_BENCHMARK(column-io-benchmark) diff --git a/src/parquet/column/column-io-benchmark.cc b/src/parquet/column/column-io-benchmark.cc new file mode 100644 index 00000000..8007ed55 --- /dev/null +++ b/src/parquet/column/column-io-benchmark.cc @@ -0,0 +1,119 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "benchmark/benchmark.h" + +#include "parquet/file/reader-internal.h" +#include "parquet/file/writer-internal.h" +#include "parquet/column/reader.h" +#include "parquet/column/writer.h" +#include "parquet/util/input.h" + +namespace parquet { + +using format::ColumnChunk; +using schema::PrimitiveNode; + +namespace benchmark { + +std::unique_ptr BuildWriter(int64_t output_size, OutputStream* dst, + ColumnChunk* metadata, ColumnDescriptor* schema) { + std::unique_ptr pager( + new SerializedPageWriter(dst, Compression::UNCOMPRESSED, metadata)); + return std::unique_ptr( + new Int64Writer(schema, std::move(pager), output_size)); +} + +std::shared_ptr Int64Schema(Repetition::type repetition) { + auto node = PrimitiveNode::Make("int64", repetition, Type::INT64); + return std::make_shared( + node, repetition != Repetition::REQUIRED, repetition == Repetition::REPEATED); +} + +template +static void BM_WriteInt64Column(::benchmark::State& state) { + format::ColumnChunk metadata; + std::vector values(state.range_x(), 128); + std::vector definition_levels(state.range_x(), 1); + std::vector repetition_levels(state.range_x(), 0); + std::shared_ptr schema = Int64Schema(repetition); + + while (state.KeepRunning()) { + InMemoryOutputStream dst; + std::unique_ptr writer = + BuildWriter(state.range_x(), &dst, &metadata, schema.get()); + writer->WriteBatch( + values.size(), definition_levels.data(), repetition_levels.data(), values.data()); + writer->Close(); + } +} + +BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::REQUIRED)->Range(1024, 65536); + +BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::OPTIONAL)->Range(1024, 65536); + +BENCHMARK_TEMPLATE(BM_WriteInt64Column, Repetition::REPEATED)->Range(1024, 65536); + +std::unique_ptr BuildReader( + std::shared_ptr& buffer, ColumnDescriptor* schema) { + std::unique_ptr source(new InMemoryInputStream(buffer)); + std::unique_ptr page_reader( + new SerializedPageReader(std::move(source), Compression::UNCOMPRESSED)); + return 
std::unique_ptr(new Int64Reader(schema, std::move(page_reader))); +} + +template +static void BM_ReadInt64Column(::benchmark::State& state) { + format::ColumnChunk metadata; + std::vector values(state.range_x(), 128); + std::vector definition_levels(state.range_x(), 1); + std::vector repetition_levels(state.range_x(), 0); + std::shared_ptr schema = Int64Schema(repetition); + + InMemoryOutputStream dst; + std::unique_ptr writer = + BuildWriter(state.range_x(), &dst, &metadata, schema.get()); + writer->WriteBatch( + values.size(), definition_levels.data(), repetition_levels.data(), values.data()); + writer->Close(); + + std::shared_ptr src = dst.GetBuffer(); + std::vector values_out(state.range_y()); + std::vector definition_levels_out(state.range_y()); + std::vector repetition_levels_out(state.range_y()); + while (state.KeepRunning()) { + std::unique_ptr reader = BuildReader(src, schema.get()); + int64_t values_read = 0; + for (size_t i = 0; i < values.size(); i += values_read) { + reader->ReadBatch(values_out.size(), definition_levels_out.data(), + repetition_levels_out.data(), values_out.data(), &values_read); + } + } +} + +BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::REQUIRED) + ->RangePair(1024, 65536, 1, 1024); + +BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::OPTIONAL) + ->RangePair(1024, 65536, 1, 1024); + +BENCHMARK_TEMPLATE(BM_ReadInt64Column, Repetition::REPEATED) + ->RangePair(1024, 65536, 1, 1024); + +} // namespace benchmark + +} // namespace parquet diff --git a/src/parquet/encodings/CMakeLists.txt b/src/parquet/encodings/CMakeLists.txt index eb4cc3cf..00565b25 100644 --- a/src/parquet/encodings/CMakeLists.txt +++ b/src/parquet/encodings/CMakeLists.txt @@ -27,3 +27,4 @@ install(FILES DESTINATION include/parquet/encodings) ADD_PARQUET_TEST(encoding-test) +ADD_PARQUET_BENCHMARK(encoding-benchmark) diff --git a/src/parquet/encodings/encoding-benchmark.cc b/src/parquet/encodings/encoding-benchmark.cc new file mode 100644 index 00000000..92bc29e6 
--- /dev/null +++ b/src/parquet/encodings/encoding-benchmark.cc @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +#include "parquet/encodings/plain-encoding.h" + +namespace parquet { + +namespace benchmark { + +static void BM_PlainEncodingBoolean(::benchmark::State& state) { + std::vector values(state.range_x(), 64); + PlainEncoder encoder(nullptr); + + while (state.KeepRunning()) { + InMemoryOutputStream dst; + encoder.Encode(values, values.size(), &dst); + } +} + +BENCHMARK(BM_PlainEncodingBoolean)->Range(1024, 65536); + +static void BM_PlainDecodingBoolean(::benchmark::State& state) { + std::vector values(state.range_x(), 64); + bool* output = new bool[state.range_x()]; + PlainEncoder encoder(nullptr); + InMemoryOutputStream dst; + encoder.Encode(values, values.size(), &dst); + std::shared_ptr buf = dst.GetBuffer(); + + while (state.KeepRunning()) { + PlainDecoder decoder(nullptr); + decoder.SetData(values.size(), buf->data(), buf->size()); + decoder.Decode(output, values.size()); + } + + delete[] output; +} + +BENCHMARK(BM_PlainDecodingBoolean)->Range(1024, 65536); + +static void BM_PlainEncodingInt64(::benchmark::State& state) { + std::vector 
values(state.range_x(), 64); + PlainEncoder encoder(nullptr); + + while (state.KeepRunning()) { + InMemoryOutputStream dst; + encoder.Encode(values.data(), values.size(), &dst); + } +} + +BENCHMARK(BM_PlainEncodingInt64)->Range(1024, 65536); + +static void BM_PlainDecodingInt64(::benchmark::State& state) { + std::vector values(state.range_x(), 64); + PlainEncoder encoder(nullptr); + InMemoryOutputStream dst; + encoder.Encode(values.data(), values.size(), &dst); + std::shared_ptr buf = dst.GetBuffer(); + + while (state.KeepRunning()) { + PlainDecoder decoder(nullptr); + decoder.SetData(values.size(), buf->data(), buf->size()); + decoder.Decode(values.data(), values.size()); + } +} + +BENCHMARK(BM_PlainDecodingInt64)->Range(1024, 65536); + +} // namespace benchmark + +} // namespace parquet diff --git a/src/parquet/util/CMakeLists.txt b/src/parquet/util/CMakeLists.txt index b4faaa14..171c0541 100644 --- a/src/parquet/util/CMakeLists.txt +++ b/src/parquet/util/CMakeLists.txt @@ -63,6 +63,20 @@ if(PARQUET_BUILD_TESTS) endif() endif() +if (PARQUET_BUILD_BENCHMARKS) + add_library(parquet_benchmark_main benchmark_main.cc) + if (APPLE) + target_link_libraries(parquet_benchmark_main + gbenchmark + ) + else() + target_link_libraries(parquet_benchmark_main + gbenchmark + pthread + ) + endif() +endif() + ADD_PARQUET_TEST(bit-util-test) ADD_PARQUET_TEST(buffer-test) ADD_PARQUET_TEST(input-output-test) diff --git a/src/parquet/util/benchmark_main.cc b/src/parquet/util/benchmark_main.cc new file mode 100644 index 00000000..c9739af0 --- /dev/null +++ b/src/parquet/util/benchmark_main.cc @@ -0,0 +1,24 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. 
The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include "benchmark/benchmark.h" + +int main(int argc, char** argv) { + benchmark::Initialize(&argc, argv); + benchmark::RunSpecifiedBenchmarks(); + return 0; +} diff --git a/thirdparty/build_thirdparty.sh b/thirdparty/build_thirdparty.sh index 5f00055a..b637a366 100755 --- a/thirdparty/build_thirdparty.sh +++ b/thirdparty/build_thirdparty.sh @@ -17,6 +17,7 @@ else case $arg in "lz4") F_LZ4=1 ;; "zlib") F_ZLIB=1 ;; + "gbenchmark") F_GBENCHMARK=1 ;; "gtest") F_GTEST=1 ;; "snappy") F_SNAPPY=1 ;; "thrift") F_THRIFT=1 ;; @@ -55,17 +56,34 @@ if [ -n "$F_ALL" -o -n "$F_SNAPPY" ]; then make -j$PARALLEL install fi +STANDARD_DARWIN_FLAGS="-std=c++11 -stdlib=libc++" + # build googletest +GOOGLETEST_ERROR="failed for googletest!" if [ -n "$F_ALL" -o -n "$F_GTEST" ]; then cd $TP_DIR/$GTEST_BASEDIR if [[ "$OSTYPE" == "darwin"* ]]; then - cmake -DCMAKE_CXX_FLAGS="-fPIC -std=c++11 -stdlib=libc++ -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" + CXXFLAGS=-fPIC cmake -DCMAKE_CXX_FLAGS="$STANDARD_DARWIN_FLAGS -DGTEST_USE_OWN_TR1_TUPLE=1 -Wno-unused-value -Wno-ignored-attributes" || { echo "cmake $GOOGLETEST_ERROR" ; exit 1; } else - CXXFLAGS=-fPIC cmake -DCMAKE_INSTALL_PREFIX:PATH=$PREFIX . + CXXFLAGS=-fPIC cmake . 
|| { echo "cmake $GOOGLETEST_ERROR"; exit 1; } + fi + + make VERBOSE=1 || { echo "Make $GOOGLETEST_ERROR" ; exit 1; } +fi + +# build google benchmark +GBENCHMARK_ERROR="failed for google benchmark" +if [ -n "$F_ALL" -o -n "$F_GBENCHMARK" ]; then + cd $TP_DIR/$GBENCHMARK_BASEDIR + + CMAKE_CXX_FLAGS="--std=c++11" + if [[ "$OSTYPE" == "darwin"* ]]; then + CMAKE_CXX_FLAGS=$STANDARD_DARWIN_FLAGS fi + cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=$PREFIX -DCMAKE_CXX_FLAGS="-fPIC $CMAKE_CXX_FLAGS" . || { echo "cmake $GBENCHMARK_ERROR" ; exit 1; } - make + make VERBOSE=1 install || { echo "make $GBENCHMARK_ERROR" ; exit 1; } fi # build lz4 diff --git a/thirdparty/download_thirdparty.sh b/thirdparty/download_thirdparty.sh index 1ea2eba5..a0bd14db 100755 --- a/thirdparty/download_thirdparty.sh +++ b/thirdparty/download_thirdparty.sh @@ -29,6 +29,11 @@ if [ ! -d ${GTEST_BASEDIR} ]; then download_extract_and_cleanup $GTEST_URL fi +if [ ! -d ${GBENCHMARK_BASEDIR} ]; then + echo "Fetching google benchmark" + download_extract_and_cleanup $GBENCHMARK_URL +fi + +if [ ! 
-d ${THRIFT_BASEDIR} ]; then echo "Fetching thrift" download_extract_and_cleanup $THRIFT_URL diff --git a/thirdparty/set_thirdparty_env.sh b/thirdparty/set_thirdparty_env.sh index 72b7074b..52b705da 100644 --- a/thirdparty/set_thirdparty_env.sh +++ b/thirdparty/set_thirdparty_env.sh @@ -16,3 +16,4 @@ if [ "$(uname)" != "Darwin" ]; then fi export GTEST_HOME=$THIRDPARTY_DIR/$GTEST_BASEDIR +export GBENCHMARK_HOME=$THIRDPARTY_DIR/installed diff --git a/thirdparty/versions.sh b/thirdparty/versions.sh index 8c222656..83805803 100755 --- a/thirdparty/versions.sh +++ b/thirdparty/versions.sh @@ -10,6 +10,11 @@ THRIFT_VERSION=0.9.1 THRIFT_URL="http://archive.apache.org/dist/thrift/${THRIFT_VERSION}/thrift-${THRIFT_VERSION}.tar.gz" THRIFT_BASEDIR=thrift-$THRIFT_VERSION + +GBENCHMARK_VERSION=1.0.0 +GBENCHMARK_URL="https://github.com/google/benchmark/archive/v${GBENCHMARK_VERSION}.tar.gz" +GBENCHMARK_BASEDIR=benchmark-$GBENCHMARK_VERSION + GTEST_VERSION=1.7.0 GTEST_URL="https://github.com/google/googletest/archive/release-${GTEST_VERSION}.tar.gz" GTEST_BASEDIR=googletest-release-$GTEST_VERSION