This repository was archived by the owner on May 10, 2024. It is now read-only.
84 changes: 83 additions & 1 deletion CMakeLists.txt
@@ -74,6 +74,9 @@ if ("${CMAKE_SOURCE_DIR}" STREQUAL "${CMAKE_CURRENT_SOURCE_DIR}")
option(PARQUET_USE_SSE
"Build with SSE4 optimizations"
OFF)
option(PARQUET_BUILD_BENCHMARKS
"Build the libparquet benchmark suite"
OFF)
option(PARQUET_BUILD_TESTS
"Build the libparquet test suite"
ON)
@@ -102,6 +105,60 @@ else()
set(BUILD_OUTPUT_ROOT_DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/${BUILD_SUBDIR_NAME}")
endif()

############################################################
# Benchmarking
############################################################
# Add a new micro benchmark, with or without an executable that should be built.
# If benchmarks are enabled, they will be run alongside the unit tests with ctest.
# Use 'make runbenchmark' and 'make unittest' to build/run only the benchmarks or
# unit tests, respectively.
#
# REL_BENCHMARK_NAME is the name of the benchmark app. It may be a single component
# (e.g. monotime-benchmark) or contain additional components (e.g.
# net/net_util-benchmark). Either way, the last component must be a globally
# unique name.

# The benchmark will be registered with ctest as a test carrying the label
# 'benchmark'.
#
# Arguments after the test name will be passed to set_tests_properties().
function(ADD_PARQUET_BENCHMARK REL_BENCHMARK_NAME)
if(NOT PARQUET_BUILD_BENCHMARKS)
return()
endif()
get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE)

if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME}.cc)
# This benchmark has a corresponding .cc file, set it up as an executable.
set(BENCHMARK_PATH "${EXECUTABLE_OUTPUT_PATH}/${BENCHMARK_NAME}")
add_executable(${BENCHMARK_NAME} "${REL_BENCHMARK_NAME}.cc")
target_link_libraries(${BENCHMARK_NAME} ${PARQUET_BENCHMARK_LINK_LIBS})
add_dependencies(runbenchmark ${BENCHMARK_NAME})
set(NO_COLOR "--color_print=false")
else()
# No executable, just invoke the benchmark (probably a script) directly.
set(BENCHMARK_PATH ${CMAKE_CURRENT_SOURCE_DIR}/${REL_BENCHMARK_NAME})
set(NO_COLOR "")
endif()

add_test(${BENCHMARK_NAME}
${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} benchmark ${BENCHMARK_PATH} ${NO_COLOR})
set_tests_properties(${BENCHMARK_NAME} PROPERTIES LABELS "benchmark")
if(ARGN)
set_tests_properties(${BENCHMARK_NAME} PROPERTIES ${ARGN})
endif()
endfunction()
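
# For example, a subdirectory CMakeLists.txt could register a benchmark built
# from a corresponding .cc file like so (hypothetical benchmark name):
#
#   ADD_PARQUET_BENCHMARK(encoding-benchmark)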

# A wrapper for add_dependencies() that is a no-op when benchmarks are
# disabled (PARQUET_BUILD_BENCHMARKS=OFF).
function(ADD_PARQUET_BENCHMARK_DEPENDENCIES REL_BENCHMARK_NAME)
if(NOT PARQUET_BUILD_BENCHMARKS)
return()
endif()
get_filename_component(BENCHMARK_NAME ${REL_BENCHMARK_NAME} NAME_WE)

add_dependencies(${BENCHMARK_NAME} ${ARGN})
endfunction()
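
# Example pairing of the two helpers (hypothetical target names); the extra
# dependency is only wired up when PARQUET_BUILD_BENCHMARKS is ON:
#
#   ADD_PARQUET_BENCHMARK(encoding-benchmark)
#   ADD_PARQUET_BENCHMARK_DEPENDENCIES(encoding-benchmark parquet_thrift)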

############################################################
# Testing
############################################################
@@ -113,6 +170,9 @@ endif()
# net/net_util-test). Either way, the last component must be a globally
# unique name.
#
# The unit test is added with a label of "unittest" to support filtering with
# ctest.
#
# Arguments after the test name will be passed to set_tests_properties().
function(ADD_PARQUET_TEST REL_TEST_NAME)
if(NOT PARQUET_BUILD_TESTS)
@@ -124,6 +184,7 @@ function(ADD_PARQUET_TEST REL_TEST_NAME)
# This test has a corresponding .cc file, set it up as an executable.
set(TEST_PATH "${EXECUTABLE_OUTPUT_PATH}/${TEST_NAME}")
add_executable(${TEST_NAME} "${REL_TEST_NAME}.cc")
add_dependencies(unittest ${TEST_NAME})

if(APPLE)
# On OS X / Thrift >= 0.9.2, tr1/tuple.h is not in libc++
@@ -149,8 +210,9 @@ function(ADD_PARQUET_TEST REL_TEST_NAME)
valgrind --tool=memcheck --leak-check=full --error-exitcode=1 ${TEST_PATH})
else()
add_test(${TEST_NAME}
${BUILD_SUPPORT_DIR}/run-test.sh ${TEST_PATH})
${BUILD_SUPPORT_DIR}/run-test.sh ${CMAKE_BINARY_DIR} test ${TEST_PATH})
endif()
set_tests_properties(${TEST_NAME} PROPERTIES LABELS "unittest")
if(ARGN)
set_tests_properties(${TEST_NAME} PROPERTIES ${ARGN})
endif()
@@ -213,11 +275,26 @@ add_library(zlibstatic STATIC IMPORTED)
set_target_properties(zlibstatic PROPERTIES IMPORTED_LOCATION ${ZLIB_STATIC_LIB})

## GTest
add_custom_target(unittest ctest -L unittest)
find_package(GTest REQUIRED)
include_directories(SYSTEM ${GTEST_INCLUDE_DIR})
add_library(gtest STATIC IMPORTED)
set_target_properties(gtest PROPERTIES IMPORTED_LOCATION ${GTEST_STATIC_LIB})

## Google Benchmark
if ("$ENV{GBENCHMARK_HOME}" STREQUAL "")
set(GBENCHMARK_HOME ${THIRDPARTY_DIR}/installed)
endif()

if(PARQUET_BUILD_BENCHMARKS)
add_custom_target(runbenchmark ctest -L benchmark)
find_package(GBenchmark REQUIRED)
include_directories(SYSTEM ${GBENCHMARK_INCLUDE_DIR})
message(${GBENCHMARK_STATIC_LIB})
add_library(gbenchmark STATIC IMPORTED)
set_target_properties(gbenchmark PROPERTIES IMPORTED_LOCATION ${GBENCHMARK_STATIC_LIB})
endif()
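
# Sketch of pointing the build at an external Google Benchmark install at
# configure time (hypothetical path); otherwise GBENCHMARK_HOME defaults to
# thirdparty/installed as set above:
#
#   GBENCHMARK_HOME=/opt/gbenchmark cmake -DPARQUET_BUILD_BENCHMARKS=ON ..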

# Thrift requires these definitions for some types that we use
add_definitions(-DHAVE_INTTYPES_H -DHAVE_NETINET_IN_H -DHAVE_NETDB_H)
add_definitions(-fPIC)
@@ -331,6 +408,11 @@ set(PARQUET_MIN_TEST_LIBS
parquet)
set(PARQUET_TEST_LINK_LIBS ${PARQUET_MIN_TEST_LIBS})

#############################################################
# Benchmark linking

set(PARQUET_BENCHMARK_LINK_LIBS parquet parquet_benchmark_main)

#############################################################
# Code coverage

22 changes: 18 additions & 4 deletions README.md
@@ -24,6 +24,7 @@
- zlib
- thrift 0.7+ [install instructions](https://thrift.apache.org/docs/install/)
- googletest 1.7.0 (cannot be installed with package managers)
- Google Benchmark (only required if building benchmarks)

You can install these dependencies using a package manager or using the
`thirdparty/` scripts in this repository. On Homebrew, you can run:
@@ -87,7 +88,7 @@ This library uses Google's `googletest` unit test framework. After building
with `make`, you can run the test suite by running

```
ctest
make unittest
```

The test suite relies on an environment variable `PARQUET_TEST_DATA` pointing
@@ -107,6 +108,19 @@ you can use valgrind with ctest to look for memory leaks:
valgrind --tool=memcheck --leak-check=yes ctest
```

## Building/Running benchmarks

Follow the directions for the simple build, except run cmake
with the `PARQUET_BUILD_BENCHMARKS` option turned on:

cmake -DPARQUET_BUILD_BENCHMARKS=ON ..

Then, instead of `make unittest`, run either `make; ctest` to run both unit tests
and benchmarks, or `make runbenchmark` to run only the benchmarks.
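
Both targets are thin wrappers around ctest labels, so the following commands
(a sketch, relying on the `unittest` and `benchmark` labels attached by
`ADD_PARQUET_TEST` and `ADD_PARQUET_BENCHMARK`) select what runs:

```
make runbenchmark    # builds the benchmarks, then runs: ctest -L benchmark
ctest -L benchmark   # run only tests labeled 'benchmark'
ctest -L unittest    # run only the unit tests
```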

Benchmark logs will be placed in the build directory under `build/benchmark-logs`.


## Out-of-source builds

parquet-cpp supports out of source builds. For example:
@@ -116,7 +130,7 @@ mkdir test-build
cd test-build
cmake ..
make
ctest
ctest -L unittest
```

By using out-of-source builds you can preserve your current build state in case
@@ -172,7 +186,7 @@ mkdir coverage-build
cd coverage-build
cmake -DPARQUET_GENERATE_COVERAGE=1
make -j$PARALLEL
ctest
ctest -L unittest
```

The `gcov` artifacts are not located in a place that works well with either
Expand Down Expand Up @@ -205,4 +219,4 @@ coveralls -t $PARQUET_CPP_COVERAGE_TOKEN --gcov-options '\-l' -r $PARQUET_ROOT -


Note that `gcov` throws off artifacts from the STL, so I excluded my toolchain
root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report.
root stored in `$NATIVE_TOOLCHAIN` to avoid a cluttered coverage report.
136 changes: 104 additions & 32 deletions build-support/run-test.sh
@@ -20,15 +20,23 @@
# Script which wraps running a test and redirects its output to a
# test log directory.
#
# If PARQUET_COMPRESS_TEST_OUTPUT is non-empty, then the logs will be
# gzip-compressed while they are written.
# Arguments:
# $1 - base path for logs/artifacts
# $2 - run type (e.g. 'test' or 'benchmark')
# $3 - path to the executable
# $ARGN - remaining arguments are passed through to the executable
#
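# For example, CMake ends up invoking this script along these lines
# (hypothetical paths):
#
#   build-support/run-test.sh /path/to/build benchmark \
#       /path/to/build/release/encoding-benchmark --color_print=false
#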

OUTPUT_ROOT=$1
shift
ROOT=$(cd $(dirname $BASH_SOURCE)/..; pwd)

TEST_LOGDIR=$ROOT/build/test-logs
TEST_LOGDIR=$OUTPUT_ROOT/build/$1-logs
mkdir -p $TEST_LOGDIR

TEST_DEBUGDIR=$ROOT/build/test-debug
RUN_TYPE=$1
shift
TEST_DEBUGDIR=$OUTPUT_ROOT/build/$RUN_TYPE-debug
mkdir -p $TEST_DEBUGDIR

TEST_DIRNAME=$(cd $(dirname $1); pwd)
@@ -37,11 +45,8 @@ shift
TEST_EXECUTABLE="$TEST_DIRNAME/$TEST_FILENAME"
TEST_NAME=$(echo $TEST_FILENAME | perl -pe 's/\..+?$//') # Remove path and extension (if any).

TEST_EXECUTION_ATTEMPTS=1


# We run each test in its own subdir to avoid core file related races.
TEST_WORKDIR=$ROOT/build/test-work/$TEST_NAME
TEST_WORKDIR=$OUTPUT_ROOT/build/test-work/$TEST_NAME
mkdir -p $TEST_WORKDIR
pushd $TEST_WORKDIR >/dev/null || exit 1
rm -f *
@@ -51,40 +56,57 @@ set -o pipefail
LOGFILE=$TEST_LOGDIR/$TEST_NAME.txt
XMLFILE=$TEST_LOGDIR/$TEST_NAME.xml

# Remove both the compressed and uncompressed output, so the developer
# doesn't accidentally get confused and read output from a prior test
# run.
TEST_EXECUTION_ATTEMPTS=1

# Remove old output so the developer doesn't accidentally get confused
# and read output from a prior test run.
rm -f $LOGFILE $LOGFILE.gz

if [ -n "$PARQUET_COMPRESS_TEST_OUTPUT" ] && [ "$PARQUET_COMPRESS_TEST_OUTPUT" -ne 0 ] ; then
pipe_cmd=gzip
LOGFILE=${LOGFILE}.gz
else
pipe_cmd=cat
fi
pipe_cmd=cat

# Allow for collecting core dumps.
PARQUET_TEST_ULIMIT_CORE=${PARQUET_TEST_ULIMIT_CORE:-0}
ulimit -c $PARQUET_TEST_ULIMIT_CORE
ARROW_TEST_ULIMIT_CORE=${ARROW_TEST_ULIMIT_CORE:-0}
ulimit -c $ARROW_TEST_ULIMIT_CORE
Review comment (Member): Buglet from Arrow porting


# Run the actual test.
for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
# If the test fails, the test output may or may not be left behind,
# depending on whether the test cleaned up or exited immediately. Either
# way we need to clean it up. We do this by comparing the data directory
# contents before and after the test runs, and deleting anything new.
#
# The comm program requires that its two inputs be sorted.
TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)

function setup_sanitizers() {
# Sets environment variables that configure how the sanitizers behave when the tests run.

# Configure TSAN (ignored if this isn't a TSAN build).
#
# Deadlock detection (new in clang 3.5) is disabled because:
# 1. The clang 3.5 deadlock detector crashes in some unit tests. It
# needs compiler-rt commits c4c3dfd, 9a8efe3, and possibly others.
# 2. Many unit tests report lock-order-inversion warnings; they should be
# fixed before reenabling the detector.
TSAN_OPTIONS="$TSAN_OPTIONS detect_deadlocks=0"
TSAN_OPTIONS="$TSAN_OPTIONS suppressions=$ROOT/build-support/tsan-suppressions.txt"
TSAN_OPTIONS="$TSAN_OPTIONS history_size=7"
export TSAN_OPTIONS

# Enable leak detection even under LLVM 3.4, where it was disabled by default.
# This flag only takes effect when running an ASAN build.
ASAN_OPTIONS="$ASAN_OPTIONS detect_leaks=1"
export ASAN_OPTIONS

# Set up suppressions for LeakSanitizer
LSAN_OPTIONS="$LSAN_OPTIONS suppressions=$ROOT/build-support/lsan-suppressions.txt"
export LSAN_OPTIONS

# Suppressions require symbolization. We'll default to using the symbolizer in
# thirdparty.
if [ -z "$ASAN_SYMBOLIZER_PATH" ]; then
export ASAN_SYMBOLIZER_PATH=$(find $NATIVE_TOOLCHAIN/llvm-3.7.0/bin -name llvm-symbolizer)
fi
}
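
# A pre-set ASAN_SYMBOLIZER_PATH is honored above; a sketch of overriding the
# symbolizer for a single run (hypothetical paths):
#
#   ASAN_SYMBOLIZER_PATH=/usr/bin/llvm-symbolizer \
#       build-support/run-test.sh /path/to/build test /path/to/build/release/reader-test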

function run_test() {
# Run gtest-style tests, with sanitizers if they are set up appropriately.

# gtest won't overwrite old junit test files, resulting in a build failure
# even when retries are successful.
rm -f $XMLFILE

echo "Running $TEST_NAME, redirecting output into $LOGFILE" \
"(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)"
$TEST_EXECUTABLE "$@" 2>&1 \
| $ROOT/build-support/stacktrace_addr2line.pl $TEST_EXECUTABLE \
| $pipe_cmd > $LOGFILE
@@ -104,6 +126,46 @@
STATUS=1
rm -f $XMLFILE
fi
}

function post_process_tests() {
# If we have a LeakSanitizer report, and XML reporting is configured, add a new test
# case result to the XML file for the leak report. Otherwise Jenkins won't show
# us which tests had LSAN errors.
if zgrep --silent "ERROR: LeakSanitizer: detected memory leaks" $LOGFILE ; then
echo Test had memory leaks. Editing XML
perl -p -i -e '
if (m#</testsuite>#) {
print "<testcase name=\"LeakSanitizer\" status=\"run\" classname=\"LSAN\">\n";
print " <failure message=\"LeakSanitizer failed\" type=\"\">\n";
print " See txt log file for details\n";
print " </failure>\n";
print "</testcase>\n";
}' $XMLFILE
fi
}

function run_other() {
# Generic run function for test-like executables that aren't actually gtest binaries.
$TEST_EXECUTABLE "$@" 2>&1 | $pipe_cmd > $LOGFILE
STATUS=$?
}

if [ $RUN_TYPE = "test" ]; then
setup_sanitizers
fi

# Run the actual test.
for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
# If the test fails, the test output may or may not be left behind,
# depending on whether the test cleaned up or exited immediately. Either
# way we need to clean it up. We do this by comparing the data directory
# contents before and after the test runs, and deleting anything new.
#
# The comm program requires that its two inputs be sorted.
TEST_TMPDIR_BEFORE=$(find $TEST_TMPDIR -maxdepth 1 -type d | sort)
fi

if [ $ATTEMPT_NUMBER -lt $TEST_EXECUTION_ATTEMPTS ]; then
# Now delete any new test output.
@@ -123,7 +185,13 @@
fi
done
fi

echo "Running $TEST_NAME, redirecting output into $LOGFILE" \
"(attempt ${ATTEMPT_NUMBER}/$TEST_EXECUTION_ATTEMPTS)"
if [ $RUN_TYPE = "test" ]; then
run_test "$@"
else
run_other "$@"
fi
if [ "$STATUS" -eq "0" ]; then
break
elif [ "$ATTEMPT_NUMBER" -lt "$TEST_EXECUTION_ATTEMPTS" ]; then
Expand All @@ -132,6 +200,10 @@ for ATTEMPT_NUMBER in $(seq 1 $TEST_EXECUTION_ATTEMPTS) ; do
fi
done

if [ $RUN_TYPE = "test" ]; then
post_process_tests
fi

# Capture and compress core file and binary.
COREFILES=$(ls | grep ^core)
if [ -n "$COREFILES" ]; then