diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml new file mode 100644 index 00000000..4cce4036 --- /dev/null +++ b/.github/workflows/benchmarks.yml @@ -0,0 +1,25 @@ +name: benchmarks + +on: workflow_dispatch + +jobs: + benchmarks: + runs-on: [macos-13] + + steps: + - name: Checkout + uses: actions/checkout@v4 + with: + sparse-checkout: | + scripts/ci-run-benchmarks.sh + + - name: Cache + uses: actions/cache@v4 + with: + key: ${{ runner.os }}-benchmarks + path: | + **/*.csv + **/*.tar.gz + + - name: Run benchmarks + run: ./scripts/ci-run-benchmarks.sh diff --git a/.gitignore b/.gitignore index dd0f528d..9b769b4e 100644 --- a/.gitignore +++ b/.gitignore @@ -49,6 +49,7 @@ build app/external install.sh .artifacts +.benchmarks .rpmbuild amd64-linux-* amd64-windows-* diff --git a/app/benchmark/README.md b/app/benchmark/README.md index 83d3de3c..fc81984a 100644 --- a/app/benchmark/README.md +++ b/app/benchmark/README.md @@ -10,7 +10,7 @@ by ~1.5x-27x, by similar or smaller margins (in each case we tested, by at least 20%) on other operating systems - Four utilities were tested: `zsv`, `xsv`, `tsv-utils` and `mlr` -- The below figured were based on results from runs on OSX (Intel). Similar +- The below figures were based on results from runs on OSX (Intel). Similar results were observed on other operating systems, but in some cases the difference was significantly smaller (for example, `zsv`) - On most platforms, `zsv` performed about 2x as fast as xsv, 1.5-2x as fast as @@ -66,30 +66,30 @@ Below are reported from tests run on OSX (Intel). Similar results were achieved on Windows, Linux and FreeBSD. See above note for results on M1. In some cases, especially on Windows, compiler settings had a significant -impact. If you observe results that materially differ, in terms of zsv vs other -utility performance, from what shown below, please let us know. +impact. 
If you observe results that materially differ, in terms of `zsv` vs +other utility performance, from what is shown below, please let us know. ## Utilities compared The following utilities were compared: -- `xsv`: version 0.13.0, installed via brew +- `xsv` (0.13.0): installed via brew - `tsv-utils` (v2.2.1): installed via download of pre-built PGO-optimized binaries -- `mlr` (5.10.2): installed via brew (not shown in graph-- very slow compared to others) +- `mlr` (5.10.2): installed via brew (not shown in graph - very slow compared to others) - `zsv` (alpha): built from source using the default `configure` settings -- `csvcut` (1.0.6) (not shown in graph-- very slow compared to others) +- `csvcut` (1.0.6): (not shown in graph - very slow compared to others) ## Further notes -- `tsv-util` using a comma delimiter does *not* handle quoted data, unlike `xsv` - (and `zsv`), and thus its output may be incorrect. For this reason, these - tests ran `tsv-utils` both using a custom delimiter, and also on TSV data that - had been converted from the original CSV data. The performance in either case - was effectively the same +- `tsv-utils` using a comma delimiter does *not* handle quoted data, unlike + `xsv` (and `zsv`), and thus its output may be incorrect. For this reason, + these tests ran `tsv-utils` both using a custom delimiter, and also on TSV + data that had been converted from the original CSV data. The performance in + either case was effectively the same - `mlr` and `csvcut` are not shown in the graph since their performance was well over 10x slower than the others. `mlr` was included in the test was to compare with another solution written in the same language (i.e. C) as `zsv`, since `tsv-utils`, `xsv` and `zsv` are all written in different languages, and `csvcut` was included since `csvcut`/`csvkit` seem to be fairly commonly used for CSV processing. 
#!/bin/sh
#
# scripts/ci-run-benchmarks.sh
#
# Downloads a sample CSV file plus prebuilt macOS release archives of zsv,
# tsv-utils and xsv into .benchmarks/, times `count` and `select` style
# commands of each tool against the CSV, and writes the collected timings
# to .benchmarks/benchmarks.md. When running on GitHub Actions the Markdown
# report is also appended to the job's step summary.
#
# Environment:
#   CI                  - "true" when running under CI
#   GITHUB_STEP_SUMMARY - file the Markdown report is appended to (if set)

set -e

echo "[INF] Running $0"

BENCHMARKS_DIR=".benchmarks"
mkdir -p "$BENCHMARKS_DIR"
cd "$BENCHMARKS_DIR"

# Download URL ($1) into the current directory under its basename.
# The data goes to a temporary ".part" file that is renamed only after wget
# succeeds, so an interrupted transfer is never mistaken for a complete file
# by the "already exists" checks of a later run.
download() {
  _url="$1"
  _file="${_url##*/}"
  wget -q -O "$_file.part" "$_url" && mv "$_file.part" "$_file"
}

# Time a command RUNS times and append one result line per run to a file.
#   $1    - output file the result lines are appended to
#   $2    - tool label printed at the start of each result line
#   $3... - command and arguments to time; the command reads $CSV on stdin
run_benchmark() {
  _out="$1"
  _tool="$2"
  shift 2

  # The pipeline below yields the exit status of xargs/tee, not of the timed
  # command, so a missing binary would only show up as an error string in
  # the results. Check up front and fail instead.
  if [ ! -x "$1" ]; then
    echo "[ERR] Tool not found or not executable! [$1]" >&2
    exit 1
  fi

  _i=1
  while [ "$_i" -le "$RUNS" ]; do
    {
      printf "%d | %s : " "$_i" "$_tool"
      # `time` reports on stderr; xargs joins its lines into a single line.
      (time "$@" <"$CSV" >/dev/null) 2>&1 | xargs
    } | tee -a "$_out"
    _i=$((_i + 1))
  done
}

CSV_URL="https://burntsushi.net/stuff/worldcitiespop_mil.csv"
CSV="${CSV_URL##*/}"
echo "[INF] Downloading CSV file... [$CSV]"
if [ ! -f "$CSV" ]; then
  download "$CSV_URL"
  echo "[INF] Downloaded successfully!"
else
  echo "[INF] Download skipped! CSV file already exists! [$CSV]"
fi

ls -Gghl "$CSV"

ZSV_TAR_URL="https://github.com/liquidaty/zsv/releases/download/v0.3.9-alpha/zsv-0.3.9-alpha-amd64-macosx-gcc.tar.gz"
TSV_TAR_URL="https://github.com/eBay/tsv-utils/releases/download/v2.2.1/tsv-utils-v2.2.1_osx-x86_64_ldc2.tar.gz"
XSV_TAR_URL="https://github.com/BurntSushi/xsv/releases/download/0.13.0/xsv-0.13.0-x86_64-apple-darwin.tar.gz"

for URL in "$ZSV_TAR_URL" "$TSV_TAR_URL" "$XSV_TAR_URL"; do
  TAR="${URL##*/}"
  echo "[INF] Downloading... [$TAR]"
  if [ ! -f "$TAR" ]; then
    download "$URL"
    echo "[INF] Downloaded successfully! [$TAR]"
  else
    echo "[INF] Download skipped! Archive already exists! [$TAR]"
  fi
done

ls -Gghl ./*.tar.gz

# Extract only the archives pinned above; other *.tar.gz files that may be
# lying around (e.g. older releases restored from a cache) are ignored so
# their binaries cannot end up in the tools directory.
for URL in "$ZSV_TAR_URL" "$TSV_TAR_URL" "$XSV_TAR_URL"; do
  TAR="${URL##*/}"
  echo "[INF] Extracting... [$TAR]"
  tar xf "$TAR"
done

TOOLS_DIR="tools"
rm -rf ./"$TOOLS_DIR"
mkdir -p "$TOOLS_DIR"

# Collect every owner-executable regular file into the tools directory.
# find hands each path to cp directly, so names containing whitespace or
# glob characters are safe; the tools directory itself is pruned so files
# already copied are not visited again.
find . -path "./$TOOLS_DIR" -prune -o -type f -perm -100 \
  -exec cp {} "$TOOLS_DIR" \;

ls -Gghl "$TOOLS_DIR"

COUNT_OUTPUT_FILE="count.out"
SELECT_OUTPUT_FILE="select.out"

rm -f "$COUNT_OUTPUT_FILE" "$SELECT_OUTPUT_FILE"

RUNS=6

echo "[INF] Running count benchmarks..."
run_benchmark "$COUNT_OUTPUT_FILE" zsv "$TOOLS_DIR/zsv" count
run_benchmark "$COUNT_OUTPUT_FILE" xsv "$TOOLS_DIR/xsv" count
run_benchmark "$COUNT_OUTPUT_FILE" tsv "$TOOLS_DIR/number-lines" -d,

echo "[INF] Running select benchmarks..."
# NOTE(review): tsv-select is given fields 1-7 while zsv/xsv are given the
# reordered 2,1,3-7 - confirm the difference is intentional.
run_benchmark "$SELECT_OUTPUT_FILE" zsv "$TOOLS_DIR/zsv" select -W -n -- 2 1 3-7
run_benchmark "$SELECT_OUTPUT_FILE" xsv "$TOOLS_DIR/xsv" select 2,1,3-7
run_benchmark "$SELECT_OUTPUT_FILE" tsv "$TOOLS_DIR/tsv-select" -d, -f 1-7

MARKDOWN_OUTPUT="benchmarks.md"
echo "[INF] Generating Markdown output... [$MARKDOWN_OUTPUT]"
TIMESTAMP="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
{
  echo '# Benchmarks'
  echo
  echo "- Timestamp UTC: \`$TIMESTAMP\`"
  echo
  echo "## Releases Used"
  echo
  echo "- <$ZSV_TAR_URL>"
  echo "- <$TSV_TAR_URL>"
  echo "- <$XSV_TAR_URL>"
  echo
  echo '## Results'
  echo
  echo '### count'
  echo
  echo '```'
  cat "$COUNT_OUTPUT_FILE"
  echo '```'
  echo
  echo '### select'
  echo
  echo '```'
  cat "$SELECT_OUTPUT_FILE"
  echo '```'
  echo
} >"$MARKDOWN_OUTPUT"
echo "[INF] Generated Markdown output successfully!"

# GitHub Actions: publish the report as the step summary. CI=true is also
# set by other CI systems, so additionally require the summary file path.
if [ "$CI" = true ] && [ -n "$GITHUB_STEP_SUMMARY" ]; then
  echo "[INF] Generating step summary..."
  cat "$MARKDOWN_OUTPUT" >>"$GITHUB_STEP_SUMMARY"
  echo "[INF] Generated step summary successfully!"
fi

cd ..

echo "[INF] --- [DONE] ---"