liquidaty · liquidaty · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024 · Oct 4, 2024
diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml
@@ -0,0 +1,25 @@
+name: benchmarks
+# Started by hand only (workflow_dispatch); pushes and pull requests never trigger it.
+on: workflow_dispatch
+
+jobs:
+  benchmarks:
+    runs-on: [macos-13]
+
+    steps:
+    - name: Checkout
+      uses: actions/checkout@v4
+      with:
+        sparse-checkout: |
+          scripts/ci-run-benchmarks.sh
+    # Keyed on the script's hash so that changed download URLs get a fresh cache, not stale archives.
+    - name: Cache
+      uses: actions/cache@v4
+      with:
+        key: ${{ runner.os }}-benchmarks-${{ hashFiles('scripts/ci-run-benchmarks.sh') }}
+        path: |
+          **/*.csv
+          **/*.tar.gz
+
+    - name: Run benchmarks
+      run: ./scripts/ci-run-benchmarks.sh
diff --git a/.gitignore b/.gitignore
@@ -49,6 +49,7 @@ build
 app/external
 install.sh
 .artifacts
+.benchmarks
 .rpmbuild
 amd64-linux-*
 amd64-windows-*

diff --git a/app/benchmark/README.md b/app/benchmark/README.md
@@ -10,7 +10,7 @@
   by ~1.5x-27x, by similar or smaller margins (in each case we tested, by at
   least 20%) on other operating systems
 - Four utilities were tested: `zsv`, `xsv`, `tsv-utils` and `mlr`
-- The below figured were based on results from runs on OSX (Intel). Similar
+- The below figures were based on results from runs on OSX (Intel). Similar
   results were observed on other operating systems, but in some cases the
   difference was significantly smaller (for example, `zsv`)
 - On most platforms, `zsv` performed about 2x as fast as xsv, 1.5-2x as fast as
@@ -66,30 +66,30 @@ Below are reported from tests run on OSX (Intel). Similar results were achieved
 on Windows, Linux and FreeBSD. See above note for results on M1.
 
 In some cases, especially on Windows, compiler settings had a significant
-impact. If you observe results that materially differ, in terms of zsv vs other
-utility performance, from what shown below, please let us know.
+impact. If you observe results that materially differ, in terms of `zsv` vs
+other utility performance, from what is shown below, please let us know.
 
 ## Utilities compared
 
 The following utilities were compared:
 
-- `xsv`: version 0.13.0, installed via brew
+- `xsv` (0.13.0): installed via brew
 - `tsv-utils` (v2.2.1): installed via download of pre-built PGO-optimized binaries
-- `mlr` (5.10.2): installed via brew (not shown in graph-- very slow compared to others)
+- `mlr` (5.10.2): installed via brew (not shown in graph - very slow compared to others)
 - `zsv` (alpha): built from source using the default `configure` settings
-- `csvcut` (1.0.6) (not shown in graph-- very slow compared to others)
+- `csvcut` (1.0.6): (not shown in graph - very slow compared to others)
 
 ## Further notes
 
-- `tsv-util` using a comma delimiter does *not* handle quoted data, unlike `xsv`
-  (and `zsv`), and thus its output may be incorrect. For this reason, these
-  tests ran `tsv-utils` both using a custom delimiter, and also on TSV data that
-  had been converted from the original CSV data. The performance in either case
-  was effectively the same
+- `tsv-utils` using a comma delimiter does *not* handle quoted data, unlike
+  `xsv` (and `zsv`), and thus its output may be incorrect. For this reason,
+  these tests ran `tsv-utils` both using a custom delimiter, and also on TSV
+  data that had been converted from the original CSV data. The performance in
+  either case was effectively the same
 
 - `mlr` and `csvcut` are not shown in the graph since their performance was well
   over 10x slower than the others. `mlr` was included in the test was to compare
-  with another solution written in the same language (i.e. C) as zsv, since
+  with another solution written in the same language (i.e. C) as `zsv`, since
   `tsv-utils`, `xsv` and `zsv` are all written in different languages, and
   `csvcut` was included since `csvcut`/`csvkit` seem to be fairly commonly used
   for CSV processing.

diff --git a/scripts/ci-run-benchmarks.sh b/scripts/ci-run-benchmarks.sh
@@ -0,0 +1,149 @@
+#!/bin/sh
+
+set -e
+
+echo "[INF] Running $0"
+
+BENCHMARKS_DIR=".benchmarks"
+mkdir -p "$BENCHMARKS_DIR"
+cd "$BENCHMARKS_DIR"
+# Sample CSV input: downloaded once, then reused by later runs.
+CSV_URL="https://burntsushi.net/stuff/worldcitiespop_mil.csv"
+CSV="${CSV_URL##*/}" # file name = last path component of the URL
+echo "[INF] Downloading CSV file... [$CSV]"
+if [ ! -f "$CSV" ]; then
+  wget -q -O "$CSV.part" "$CSV_URL" # temporary name while the transfer runs
+  mv "$CSV.part" "$CSV"             # only a complete download gets the final name
+  echo "[INF] Downloaded successfully!"
+else
+  echo "[INF] Download skipped! CSV file already exists! [$CSV]"
+fi
+ls -Gghl "$CSV"
+# Prebuilt macOS x86_64 release archives of the tools being compared.
+ZSV_TAR_URL="https://github.com/liquidaty/zsv/releases/download/v0.3.9-alpha/zsv-0.3.9-alpha-amd64-macosx-gcc.tar.gz"
+TSV_TAR_URL="https://github.com/eBay/tsv-utils/releases/download/v2.2.1/tsv-utils-v2.2.1_osx-x86_64_ldc2.tar.gz"
+XSV_TAR_URL="https://github.com/BurntSushi/xsv/releases/download/0.13.0/xsv-0.13.0-x86_64-apple-darwin.tar.gz"
+
+for URL in "$ZSV_TAR_URL" "$TSV_TAR_URL" "$XSV_TAR_URL"; do
+  TAR="${URL##*/}"
+  echo "[INF] Downloading... [$TAR]"
+  if [ ! -f "$TAR" ]; then
+    wget -q -O "$TAR.part" "$URL" # temporary name while the transfer runs
+    mv "$TAR.part" "$TAR"         # only a complete download gets the final name
+    echo "[INF] Downloaded successfully! [$TAR]"
+  else
+    echo "[INF] Download skipped! Archive already exists! [$TAR]"
+  fi
+done
+ls -Gghl ./*.tar.gz
+# Unpack every archive found in the working directory.
+for TAR in *.tar.gz; do
+  echo "[INF] Extracting... [$TAR]"
+  tar xf "$TAR"
+done
+
+TOOLS_DIR="tools"
+rm -rf ./"$TOOLS_DIR"
+mkdir -p "$TOOLS_DIR"
+
+# Collect all extracted executables in $TOOLS_DIR. Paths are read line by line
+# (never word-split or globbed) and $TOOLS_DIR itself is pruned from the search.
+find . -path "./$TOOLS_DIR" -prune -o -type f -print | while IFS= read -r FILE; do
+  if [ -x "$FILE" ]; then
+    cp "$FILE" "$TOOLS_DIR"
+  fi
+done
+ls -Gghl "$TOOLS_DIR"
+
+COUNT_OUTPUT_FILE="count.out"
+SELECT_OUTPUT_FILE="select.out"
+# Drop logs of any earlier run; the benchmark loops only ever append.
+rm -f "$COUNT_OUTPUT_FILE" "$SELECT_OUTPUT_FILE"
+
+RUNS=6
+
+echo "[INF] Running count benchmarks..."
+for TOOL in zsv xsv tsv; do
+  # Command line timed for this tool label.
+  case "$TOOL" in
+    zsv) CMD="$TOOLS_DIR/zsv count" ;;
+    xsv) CMD="$TOOLS_DIR/xsv count" ;;
+    tsv) CMD="$TOOLS_DIR/number-lines -d," ;;
+    *) CMD= ;;
+  esac
+
+  # $CMD is deliberately unquoted: it must split into program + arguments.
+  I=1
+  while [ "$I" -le "$RUNS" ]; do
+    {
+      printf "%d | %s : " "$I" "$TOOL"
+      (time $CMD <"$CSV" >/dev/null) 2>&1 | xargs
+    } | tee -a "$COUNT_OUTPUT_FILE"
+    I=$((I + 1))
+  done
+done
+
+echo "[INF] Running select benchmarks..."
+for TOOL in zsv xsv tsv; do
+  # Command line timed for this tool label.
+  case "$TOOL" in
+    zsv) CMD="$TOOLS_DIR/zsv select -W -n -- 2 1 3-7" ;;
+    xsv) CMD="$TOOLS_DIR/xsv select 2,1,3-7" ;;
+    # NOTE(review): fields 1-7 here vs 2,1,3-7 for zsv/xsv - confirm intended.
+    tsv) CMD="$TOOLS_DIR/tsv-select -d, -f 1-7" ;;
+    *) CMD= ;;
+  esac
+  # $CMD is deliberately unquoted: it must split into program + arguments.
+  I=1
+  while [ "$I" -le "$RUNS" ]; do
+    {
+      printf "%d | %s : " "$I" "$TOOL"
+      (time $CMD <"$CSV" >/dev/null) 2>&1 | xargs
+    } | tee -a "$SELECT_OUTPUT_FILE"
+    I=$((I + 1))
+  done
+done
+
+MARKDOWN_OUTPUT="benchmarks.md"
+echo "[INF] Generating Markdown output... [$MARKDOWN_OUTPUT]"
+TIMESTAMP="$(date -u +"%Y-%m-%dT%H:%M:%SZ")"
+
+# Report layout: title, UTC timestamp, the three release URLs, then the raw
+# count and select timing logs, each inside a fenced code block.
+{
+  printf '%s\n' \
+    '# Benchmarks' \
+    '' \
+    "- Timestamp UTC: \`$TIMESTAMP\`" \
+    '' \
+    '## Releases Used' \
+    '' \
+    "- <$ZSV_TAR_URL>" \
+    "- <$TSV_TAR_URL>" \
+    "- <$XSV_TAR_URL>" \
+    '' \
+    '## Results' \
+    '' \
+    '### count' \
+    '' \
+    '```'
+  cat "$COUNT_OUTPUT_FILE"
+  printf '%s\n' '```' ''
+  printf '%s\n' '### select' '' '```'
+  cat "$SELECT_OUTPUT_FILE"
+  printf '%s\n' '```' ''
+} >"$MARKDOWN_OUTPUT"
+echo "[INF] Generated Markdown output successfully!"
+
+# GitHub Actions step summary. CI=true alone is not enough: other CI systems
+# set it too, and appending to an empty $GITHUB_STEP_SUMMARY would abort here.
+if [ "$CI" = true ] && [ -n "$GITHUB_STEP_SUMMARY" ]; then
+  echo "[INF] Generating step summary..."
+  cat "$MARKDOWN_OUTPUT" >>"$GITHUB_STEP_SUMMARY"
+  echo "[INF] Generated step summary successfully!"
+fi
+
+# Leave the benchmarks working directory again.
+cd ..
+
+echo "[INF] --- [DONE] ---"