From e6824ad9b9ebbd70cc9dd6c36f8138e671d148d2 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 30 Dec 2025 11:16:25 -0700 Subject: [PATCH 1/6] add microbenchmark for string functions --- microbenchmarks/requirements.txt | 3 + microbenchmarks/string_functions_benchmark.py | 337 ++++++++++++++++++ 2 files changed, 340 insertions(+) create mode 100644 microbenchmarks/requirements.txt create mode 100755 microbenchmarks/string_functions_benchmark.py diff --git a/microbenchmarks/requirements.txt b/microbenchmarks/requirements.txt new file mode 100644 index 0000000..e685421 --- /dev/null +++ b/microbenchmarks/requirements.txt @@ -0,0 +1,3 @@ +pyarrow>=14.0.0 +datafusion==50.0.0 +duckdb==1.4.3 diff --git a/microbenchmarks/string_functions_benchmark.py b/microbenchmarks/string_functions_benchmark.py new file mode 100755 index 0000000..36d9f56 --- /dev/null +++ b/microbenchmarks/string_functions_benchmark.py @@ -0,0 +1,337 @@ +#!/usr/bin/env python3 +""" +Microbenchmark comparing DataFusion and DuckDB performance +for SQL string functions on Parquet files. +""" + +import tempfile +import time +import os +from dataclasses import dataclass +from pathlib import Path + +import pyarrow as pa +import pyarrow.parquet as pq +import datafusion +import duckdb + + +@dataclass +class BenchmarkResult: + """Stores benchmark results for a single function.""" + function_name: str + datafusion_time_ms: float + duckdb_time_ms: float + rows: int + + @property + def speedup(self) -> float: + """DuckDB time / DataFusion time (>1 means DataFusion is faster).""" + if self.datafusion_time_ms == 0: + return float('inf') + return self.duckdb_time_ms / self.datafusion_time_ms + + +@dataclass +class StringFunction: + """Defines a string function with syntax for both engines.""" + name: str + datafusion_expr: str # Expression using {col} as placeholder for column name + duckdb_expr: str # Expression using {col} as placeholder for column name + + +# String functions to benchmark +# {col} will be replaced with the actual column name +STRING_FUNCTIONS = [ + StringFunction("trim", "trim({col})", "trim({col})"), + StringFunction("ltrim", "ltrim({col})", "ltrim({col})"), + StringFunction("rtrim", "rtrim({col})", "rtrim({col})"), + StringFunction("lower", "lower({col})", "lower({col})"), + StringFunction("upper", "upper({col})", "upper({col})"), + StringFunction("length", "length({col})", "length({col})"), + StringFunction("char_length", "char_length({col})", "length({col})"), + StringFunction("reverse", "reverse({col})", "reverse({col})"), + StringFunction("repeat_3", "repeat({col}, 3)", "repeat({col}, 3)"), + StringFunction("concat", "concat({col}, {col})", "concat({col}, {col})"), + StringFunction("concat_ws", "concat_ws('-', {col}, {col})", "concat_ws('-', {col}, {col})"), + StringFunction("substring_1_5", "substring({col}, 1, 5)", "substring({col}, 1, 5)"), + StringFunction("left_5", "left({col}, 5)", "left({col}, 5)"), + StringFunction("right_5", "right({col}, 5)", "right({col}, 5)"), + StringFunction("lpad_20", "lpad({col}, 20, '*')", "lpad({col}, 20, '*')"), + StringFunction("rpad_20", "rpad({col}, 20, '*')", "rpad({col}, 20, '*')"), + StringFunction("replace", "replace({col}, 'a', 'X')", "replace({col}, 'a', 'X')"), + StringFunction("translate", "translate({col}, 'aeiou', '12345')", "translate({col}, 'aeiou', '12345')"), + StringFunction("ascii", "ascii({col})", "ascii({col})"), + StringFunction("md5", "md5({col})", "md5({col})"), + StringFunction("sha256", "sha256({col})", "sha256({col})"), + StringFunction("btrim", 
"btrim({col}, ' ')", "trim({col}, ' ')"), + StringFunction("split_part", "split_part({col}, ' ', 1)", "split_part({col}, ' ', 1)"), + StringFunction("starts_with", "starts_with({col}, 'test')", "starts_with({col}, 'test')"), + StringFunction("ends_with", "ends_with({col}, 'data')", "ends_with({col}, 'data')"), + StringFunction("strpos", "strpos({col}, 'e')", "strpos({col}, 'e')"), + StringFunction("regexp_replace", "regexp_replace({col}, '[aeiou]', '*')", "regexp_replace({col}, '[aeiou]', '*', 'g')"), +] + + +def generate_test_data(num_rows: int = 1_000_000) -> pa.Table: + """Generate test data with various string patterns.""" + import random + import string + + random.seed(42) # For reproducibility + + # Generate diverse string data + strings = [] + for i in range(num_rows): + # Mix of different string patterns + pattern_type = i % 5 + if pattern_type == 0: + # Short strings with spaces + s = f" test_{i % 1000} " + elif pattern_type == 1: + # Longer strings + s = ''.join(random.choices(string.ascii_lowercase, k=20)) + elif pattern_type == 2: + # Mixed case with numbers + s = f"TestData_{i}_Value" + elif pattern_type == 3: + # Strings with special patterns + s = f"hello world {i % 100} data" + else: + # Random length strings + length = random.randint(5, 50) + s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) + strings.append(s) + + table = pa.table({ + 'str_col': pa.array(strings, type=pa.string()) + }) + + return table + + +def setup_datafusion(parquet_path: str) -> datafusion.SessionContext: + """Create and configure DataFusion context.""" + ctx = datafusion.SessionContext() + ctx.register_parquet('test_data', parquet_path) + return ctx + + +def setup_duckdb(parquet_path: str) -> duckdb.DuckDBPyConnection: + """Create and configure DuckDB connection.""" + conn = duckdb.connect(':memory:') + conn.execute(f"CREATE VIEW test_data AS SELECT * FROM read_parquet('{parquet_path}')") + return conn + + +def benchmark_datafusion(ctx: datafusion.SessionContext, expr: str, + warmup: int = 2, iterations: int = 5) -> float: + """Benchmark a query in DataFusion, return average time in ms.""" + query = f"SELECT {expr} FROM test_data" + + # Warmup runs + for _ in range(warmup): + ctx.sql(query).collect() + + # Timed runs + times = [] + for _ in range(iterations): + start = time.perf_counter() + ctx.sql(query).collect() + end = time.perf_counter() + times.append((end - start) * 1000) # Convert to ms + + return sum(times) / len(times) + + +def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str, + warmup: int = 2, iterations: int = 5) -> float: + """Benchmark a query in DuckDB, return average time in ms.""" + query = f"SELECT {expr} FROM test_data" + + # Warmup runs + for _ in range(warmup): + conn.execute(query).fetchall() + + # Timed runs + times = [] + for _ in range(iterations): + start = time.perf_counter() + conn.execute(query).fetchall() + end = time.perf_counter() + times.append((end - start) * 1000) # Convert to ms + + return sum(times) / len(times) + + +def run_benchmarks(num_rows: int = 1_000_000, + warmup: int = 2, + iterations: int = 5) -> list[BenchmarkResult]: + """Run all benchmarks and return results.""" + results = [] + + with tempfile.TemporaryDirectory() as tmpdir: + parquet_path = os.path.join(tmpdir, 'test_data.parquet') + + # Generate and save test data + print(f"Generating {num_rows:,} rows of test data...") + table = generate_test_data(num_rows) + pq.write_table(table, parquet_path) + print(f"Parquet file written to: {parquet_path}") + 
print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB") + + # Setup engines + print("\nSetting up DataFusion...") + df_ctx = setup_datafusion(parquet_path) + + print("Setting up DuckDB...") + duck_conn = setup_duckdb(parquet_path) + + # Run benchmarks + print(f"\nRunning benchmarks ({warmup} warmup, {iterations} iterations each)...\n") + + col = 'str_col' + for func in STRING_FUNCTIONS: + df_expr = func.datafusion_expr.format(col=col) + duck_expr = func.duckdb_expr.format(col=col) + + print(f" Benchmarking: {func.name}...", end=" ", flush=True) + + try: + df_time = benchmark_datafusion(df_ctx, df_expr, warmup, iterations) + except Exception as e: + print(f"DataFusion error: {e}") + df_time = float('nan') + + try: + duck_time = benchmark_duckdb(duck_conn, duck_expr, warmup, iterations) + except Exception as e: + print(f"DuckDB error: {e}") + duck_time = float('nan') + + result = BenchmarkResult( + function_name=func.name, + datafusion_time_ms=df_time, + duckdb_time_ms=duck_time, + rows=num_rows + ) + results.append(result) + + # Print progress + if df_time == df_time and duck_time == duck_time: # Check for NaN + faster = "DataFusion" if df_time < duck_time else "DuckDB" + ratio = max(df_time, duck_time) / min(df_time, duck_time) + print(f"done ({faster} {ratio:.2f}x faster)") + else: + print("done (with errors)") + + duck_conn.close() + + return results + + +def format_results_markdown(results: list[BenchmarkResult]) -> str: + """Format benchmark results as a markdown table.""" + lines = [ + "# String Function Microbenchmarks: DataFusion vs DuckDB", + "", + f"**Rows:** {results[0].rows:,}", + "", + "| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster |", + "|----------|----------------:|------------:|--------:|--------|", + ] + + for r in results: + if r.datafusion_time_ms != r.datafusion_time_ms or r.duckdb_time_ms != r.duckdb_time_ms: + # Handle NaN + lines.append(f"| {r.function_name} | ERROR | ERROR | N/A | N/A |") + else: + speedup = r.speedup + if speedup > 1: + faster = "DataFusion" + speedup_str = f"{speedup:.2f}x" + else: + faster = "DuckDB" + speedup_str = f"{1/speedup:.2f}x" + + lines.append( + f"| {r.function_name} | {r.datafusion_time_ms:.2f} | " + f"{r.duckdb_time_ms:.2f} | {speedup_str} | {faster} |" + ) + + # Summary statistics + valid_results = [r for r in results + if r.datafusion_time_ms == r.datafusion_time_ms + and r.duckdb_time_ms == r.duckdb_time_ms] + + if valid_results: + df_wins = sum(1 for r in valid_results if r.speedup > 1) + duck_wins = len(valid_results) - df_wins + + df_total = sum(r.datafusion_time_ms for r in valid_results) + duck_total = sum(r.duckdb_time_ms for r in valid_results) + + lines.extend([ + "", + "## Summary", + "", + f"- **Functions tested:** {len(valid_results)}", + f"- **DataFusion faster:** {df_wins} functions", + f"- **DuckDB faster:** {duck_wins} functions", + f"- **Total DataFusion time:** {df_total:.2f} ms", + f"- **Total DuckDB time:** {duck_total:.2f} ms", + ]) + + return "\n".join(lines) + + +def main(): + import argparse + + parser = argparse.ArgumentParser( + description="Benchmark string functions: DataFusion vs DuckDB" + ) + parser.add_argument( + "--rows", type=int, default=1_000_000, + help="Number of rows in test data (default: 1,000,000)" + ) + parser.add_argument( + "--warmup", type=int, default=2, + help="Number of warmup iterations (default: 2)" + ) + parser.add_argument( + "--iterations", type=int, default=5, + help="Number of timed iterations (default: 5)" + ) + parser.add_argument( + 
"--output", type=str, default=None, + help="Output file for markdown results (default: stdout)" + ) + + args = parser.parse_args() + + print("=" * 60) + print("String Function Microbenchmarks: DataFusion vs DuckDB") + print("=" * 60) + + results = run_benchmarks( + num_rows=args.rows, + warmup=args.warmup, + iterations=args.iterations + ) + + markdown = format_results_markdown(results) + + print("\n" + "=" * 60) + print("RESULTS") + print("=" * 60 + "\n") + print(markdown) + + if args.output: + with open(args.output, 'w') as f: + f.write(markdown) + print(f"\nResults saved to: {args.output}") + + +if __name__ == "__main__": + main() From 858111b9fd42692ab72bbcce1fac63e3ede3a5cb Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 30 Dec 2025 11:58:02 -0700 Subject: [PATCH 2/6] optimize duckdb --- microbenchmarks/conditional_results.md | 39 +++++++++++++++ microbenchmarks/string_functions_benchmark.py | 7 +-- microbenchmarks/string_results.md | 41 ++++++++++++++++ microbenchmarks/temporal_results.md | 47 +++++++++++++++++++ 4 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 microbenchmarks/conditional_results.md create mode 100644 microbenchmarks/string_results.md create mode 100644 microbenchmarks/temporal_results.md diff --git a/microbenchmarks/conditional_results.md b/microbenchmarks/conditional_results.md new file mode 100644 index 0000000..1727220 --- /dev/null +++ b/microbenchmarks/conditional_results.md @@ -0,0 +1,39 @@ +# Conditional Expressions Microbenchmarks: DataFusion vs DuckDB + +**Rows:** 1,000,000 + +| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | +|----------|----------------:|------------:|--------:|--------| +| case_2_branches | 17.67 | 16.57 | 1.07x | DuckDB | +| case_3_branches | 42.93 | 21.81 | 1.97x | DuckDB | +| case_5_branches | 59.96 | 18.96 | 3.16x | DuckDB | +| case_10_branches | 87.85 | 25.07 | 3.50x | DuckDB | +| case_simple_match | 41.32 | 7.72 | 5.35x | DuckDB | +| case_simple_match_10 | 100.42 | 16.69 | 6.02x | DuckDB | +| case_multi_condition | 51.24 | 32.91 | 1.56x | DuckDB | +| case_nested_2_levels | 55.03 | 31.26 | 1.76x | DuckDB | +| case_nested_3_levels | 66.14 | 24.20 | 2.73x | DuckDB | +| case_expr_result | 33.84 | 15.87 | 2.13x | DuckDB | +| case_string_concat | 74.08 | 24.31 | 3.05x | DuckDB | +| coalesce_2 | 17.79 | 7.55 | 2.35x | DuckDB | +| coalesce_3 | 20.66 | 10.73 | 1.93x | DuckDB | +| coalesce_5 | 22.41 | 11.48 | 1.95x | DuckDB | +| nullif_int | 4.99 | 6.30 | 1.26x | DataFusion | +| nullif_string | 9.62 | 12.77 | 1.33x | DataFusion | +| case_null_check | 42.70 | 17.38 | 2.46x | DuckDB | +| case_null_propagation | 29.92 | 16.20 | 1.85x | DuckDB | +| case_bucketing | 77.56 | 32.37 | 2.40x | DuckDB | +| case_range_lookup | 55.64 | 19.65 | 2.83x | DuckDB | +| case_complex_business | 84.08 | 32.88 | 2.56x | DuckDB | +| case_boolean_result | 3.98 | 9.74 | 2.45x | DataFusion | +| greatest_2 | 8.56 | 7.99 | 1.07x | DuckDB | +| least_2 | 8.46 | 7.62 | 1.11x | DuckDB | +| greatest_3 | 19.38 | 13.85 | 1.40x | DuckDB | + +## Summary + +- **Functions tested:** 25 +- **DataFusion faster:** 3 functions +- **DuckDB faster:** 22 functions +- **Total DataFusion time:** 1036.25 ms +- **Total DuckDB time:** 441.88 ms \ No newline at end of file diff --git a/microbenchmarks/string_functions_benchmark.py b/microbenchmarks/string_functions_benchmark.py index 36d9f56..f4b5f38 100755 --- a/microbenchmarks/string_functions_benchmark.py +++ b/microbenchmarks/string_functions_benchmark.py @@ -149,15 +149,16 @@ def 
benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str, """Benchmark a query in DuckDB, return average time in ms.""" query = f"SELECT {expr} FROM test_data" - # Warmup runs + # Use fetch_arrow_table() for fair comparison with DataFusion's collect() + # Both return Arrow data without Python object conversion overhead for _ in range(warmup): - conn.execute(query).fetchall() + conn.execute(query).fetch_arrow_table() # Timed runs times = [] for _ in range(iterations): start = time.perf_counter() - conn.execute(query).fetchall() + conn.execute(query).fetch_arrow_table() end = time.perf_counter() times.append((end - start) * 1000) # Convert to ms diff --git a/microbenchmarks/string_results.md b/microbenchmarks/string_results.md new file mode 100644 index 0000000..bae7430 --- /dev/null +++ b/microbenchmarks/string_results.md @@ -0,0 +1,41 @@ +# String Functions Microbenchmarks: DataFusion vs DuckDB + +**Rows:** 1,000,000 + +| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | +|----------|----------------:|------------:|--------:|--------| +| trim | 46.57 | 123.44 | 2.65x | DataFusion | +| ltrim | 45.19 | 58.57 | 1.30x | DataFusion | +| rtrim | 43.32 | 114.99 | 2.65x | DataFusion | +| lower | 44.93 | 63.10 | 1.40x | DataFusion | +| upper | 39.99 | 67.38 | 1.68x | DataFusion | +| length | 22.11 | 26.54 | 1.20x | DataFusion | +| char_length | 23.50 | 26.75 | 1.14x | DataFusion | +| reverse | 36.64 | 60.70 | 1.66x | DataFusion | +| repeat_3 | 46.49 | 75.45 | 1.62x | DataFusion | +| concat | 70.13 | 67.12 | 1.04x | DuckDB | +| concat_ws | 36.96 | 73.64 | 1.99x | DataFusion | +| substring_1_5 | 35.43 | 41.34 | 1.17x | DataFusion | +| left_5 | 37.96 | 47.62 | 1.25x | DataFusion | +| right_5 | 63.57 | 60.91 | 1.04x | DuckDB | +| lpad_20 | 341.24 | 94.19 | 3.62x | DuckDB | +| rpad_20 | 343.67 | 94.86 | 3.62x | DuckDB | +| replace | 51.88 | 106.83 | 2.06x | DataFusion | +| translate | 768.85 | 299.88 | 2.56x | DuckDB | +| ascii | 19.25 | 23.28 | 1.21x | DataFusion | +| md5 | 283.36 | 139.64 | 2.03x | DuckDB | +| sha256 | 61.61 | 265.46 | 4.31x | DataFusion | +| btrim | 38.98 | 128.57 | 3.30x | DataFusion | +| split_part | 77.90 | 57.19 | 1.36x | DuckDB | +| starts_with | 19.47 | 26.80 | 1.38x | DataFusion | +| ends_with | 27.12 | 22.07 | 1.23x | DuckDB | +| strpos | 43.98 | 29.50 | 1.49x | DuckDB | +| regexp_replace | 93.95 | 410.35 | 4.37x | DataFusion | + +## Summary + +- **Functions tested:** 27 +- **DataFusion faster:** 18 functions +- **DuckDB faster:** 9 functions +- **Total DataFusion time:** 2764.06 ms +- **Total DuckDB time:** 2606.18 ms \ No newline at end of file diff --git a/microbenchmarks/temporal_results.md b/microbenchmarks/temporal_results.md new file mode 100644 index 0000000..da9a3f0 --- /dev/null +++ b/microbenchmarks/temporal_results.md @@ -0,0 +1,47 @@ +# Temporal Functions Microbenchmarks: DataFusion vs DuckDB + +**Rows:** 1,000,000 + +| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | +|----------|----------------:|------------:|--------:|--------| +| extract_year | 18.87 | 10.97 | 1.72x | DuckDB | +| extract_month | 18.80 | 11.27 | 1.67x | DuckDB | +| extract_day | 18.05 | 11.29 | 1.60x | DuckDB | +| extract_hour | 18.40 | 12.94 | 1.42x | DuckDB | +| extract_minute | 18.44 | 14.02 | 1.32x | DuckDB | +| extract_second | 26.27 | 13.17 | 2.00x | DuckDB | +| extract_dow | 19.33 | 13.82 | 1.40x | DuckDB | +| extract_doy | 18.71 | 15.51 | 1.21x | DuckDB | +| extract_week | 20.53 | 29.80 | 1.45x | DataFusion | +| extract_quarter | 17.94 | 18.59 | 1.04x | 
DataFusion | +| extract_epoch | 11.89 | 11.81 | 1.01x | DuckDB | +| date_trunc_year | 30.51 | 20.60 | 1.48x | DuckDB | +| date_trunc_quarter | 32.21 | 23.21 | 1.39x | DuckDB | +| date_trunc_month | 27.34 | 25.80 | 1.06x | DuckDB | +| date_trunc_week | 29.38 | 12.78 | 2.30x | DuckDB | +| date_trunc_day | 10.58 | 9.93 | 1.07x | DuckDB | +| date_trunc_hour | 10.62 | 22.84 | 2.15x | DataFusion | +| date_trunc_minute | 9.70 | 23.40 | 2.41x | DataFusion | +| date_trunc_second | 10.19 | 21.94 | 2.15x | DataFusion | +| date_part_year | 14.56 | 11.32 | 1.29x | DuckDB | +| date_part_month | 14.82 | 10.99 | 1.35x | DuckDB | +| date_part_day | 15.29 | 11.00 | 1.39x | DuckDB | +| date_part_hour | 15.20 | 13.90 | 1.09x | DuckDB | +| date_part_dow | 16.21 | 13.85 | 1.17x | DuckDB | +| date_part_week | 17.74 | 27.30 | 1.54x | DataFusion | +| add_days | 53.87 | 22.94 | 2.35x | DuckDB | +| sub_days | 56.84 | 23.79 | 2.39x | DuckDB | +| add_months | 59.00 | 38.79 | 1.52x | DuckDB | +| add_hours | 37.52 | 25.44 | 1.47x | DuckDB | +| add_minutes | 39.40 | 23.77 | 1.66x | DuckDB | +| to_char_date | 150.98 | 52.48 | 2.88x | DuckDB | +| to_char_datetime | 216.22 | 95.88 | 2.26x | DuckDB | +| to_char_time | 129.67 | 48.38 | 2.68x | DuckDB | + +## Summary + +- **Functions tested:** 33 +- **DataFusion faster:** 6 functions +- **DuckDB faster:** 27 functions +- **Total DataFusion time:** 1205.07 ms +- **Total DuckDB time:** 743.49 ms \ No newline at end of file From fd9132108684774bc85c7ecf7befbe90b8bc795e Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Tue, 30 Dec 2025 11:58:16 -0700 Subject: [PATCH 3/6] optimize duckdb --- microbenchmarks/conditional_results.md | 39 --------------------- microbenchmarks/string_results.md | 41 ---------------------- microbenchmarks/temporal_results.md | 47 -------------------------- 3 files changed, 127 deletions(-) delete mode 100644 microbenchmarks/conditional_results.md delete mode 100644 microbenchmarks/string_results.md delete mode 100644 microbenchmarks/temporal_results.md diff --git a/microbenchmarks/conditional_results.md b/microbenchmarks/conditional_results.md deleted file mode 100644 index 1727220..0000000 --- a/microbenchmarks/conditional_results.md +++ /dev/null @@ -1,39 +0,0 @@ -# Conditional Expressions Microbenchmarks: DataFusion vs DuckDB - -**Rows:** 1,000,000 - -| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | -|----------|----------------:|------------:|--------:|--------| -| case_2_branches | 17.67 | 16.57 | 1.07x | DuckDB | -| case_3_branches | 42.93 | 21.81 | 1.97x | DuckDB | -| case_5_branches | 59.96 | 18.96 | 3.16x | DuckDB | -| case_10_branches | 87.85 | 25.07 | 3.50x | DuckDB | -| case_simple_match | 41.32 | 7.72 | 5.35x | DuckDB | -| case_simple_match_10 | 100.42 | 16.69 | 6.02x | DuckDB | -| case_multi_condition | 51.24 | 32.91 | 1.56x | DuckDB | -| case_nested_2_levels | 55.03 | 31.26 | 1.76x | DuckDB | -| case_nested_3_levels | 66.14 | 24.20 | 2.73x | DuckDB | -| case_expr_result | 33.84 | 15.87 | 2.13x | DuckDB | -| case_string_concat | 74.08 | 24.31 | 3.05x | DuckDB | -| coalesce_2 | 17.79 | 7.55 | 2.35x | DuckDB | -| coalesce_3 | 20.66 | 10.73 | 1.93x | DuckDB | -| coalesce_5 | 22.41 | 11.48 | 1.95x | DuckDB | -| nullif_int | 4.99 | 6.30 | 1.26x | DataFusion | -| nullif_string | 9.62 | 12.77 | 1.33x | DataFusion | -| case_null_check | 42.70 | 17.38 | 2.46x | DuckDB | -| case_null_propagation | 29.92 | 16.20 | 1.85x | DuckDB | -| case_bucketing | 77.56 | 32.37 | 2.40x | DuckDB | -| case_range_lookup | 55.64 | 19.65 | 2.83x | DuckDB | 
-| case_complex_business | 84.08 | 32.88 | 2.56x | DuckDB | -| case_boolean_result | 3.98 | 9.74 | 2.45x | DataFusion | -| greatest_2 | 8.56 | 7.99 | 1.07x | DuckDB | -| least_2 | 8.46 | 7.62 | 1.11x | DuckDB | -| greatest_3 | 19.38 | 13.85 | 1.40x | DuckDB | - -## Summary - -- **Functions tested:** 25 -- **DataFusion faster:** 3 functions -- **DuckDB faster:** 22 functions -- **Total DataFusion time:** 1036.25 ms -- **Total DuckDB time:** 441.88 ms \ No newline at end of file diff --git a/microbenchmarks/string_results.md b/microbenchmarks/string_results.md deleted file mode 100644 index bae7430..0000000 --- a/microbenchmarks/string_results.md +++ /dev/null @@ -1,41 +0,0 @@ -# String Functions Microbenchmarks: DataFusion vs DuckDB - -**Rows:** 1,000,000 - -| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | -|----------|----------------:|------------:|--------:|--------| -| trim | 46.57 | 123.44 | 2.65x | DataFusion | -| ltrim | 45.19 | 58.57 | 1.30x | DataFusion | -| rtrim | 43.32 | 114.99 | 2.65x | DataFusion | -| lower | 44.93 | 63.10 | 1.40x | DataFusion | -| upper | 39.99 | 67.38 | 1.68x | DataFusion | -| length | 22.11 | 26.54 | 1.20x | DataFusion | -| char_length | 23.50 | 26.75 | 1.14x | DataFusion | -| reverse | 36.64 | 60.70 | 1.66x | DataFusion | -| repeat_3 | 46.49 | 75.45 | 1.62x | DataFusion | -| concat | 70.13 | 67.12 | 1.04x | DuckDB | -| concat_ws | 36.96 | 73.64 | 1.99x | DataFusion | -| substring_1_5 | 35.43 | 41.34 | 1.17x | DataFusion | -| left_5 | 37.96 | 47.62 | 1.25x | DataFusion | -| right_5 | 63.57 | 60.91 | 1.04x | DuckDB | -| lpad_20 | 341.24 | 94.19 | 3.62x | DuckDB | -| rpad_20 | 343.67 | 94.86 | 3.62x | DuckDB | -| replace | 51.88 | 106.83 | 2.06x | DataFusion | -| translate | 768.85 | 299.88 | 2.56x | DuckDB | -| ascii | 19.25 | 23.28 | 1.21x | DataFusion | -| md5 | 283.36 | 139.64 | 2.03x | DuckDB | -| sha256 | 61.61 | 265.46 | 4.31x | DataFusion | -| btrim | 38.98 | 128.57 | 3.30x | DataFusion | -| split_part | 77.90 | 57.19 | 1.36x | DuckDB | -| starts_with | 19.47 | 26.80 | 1.38x | DataFusion | -| ends_with | 27.12 | 22.07 | 1.23x | DuckDB | -| strpos | 43.98 | 29.50 | 1.49x | DuckDB | -| regexp_replace | 93.95 | 410.35 | 4.37x | DataFusion | - -## Summary - -- **Functions tested:** 27 -- **DataFusion faster:** 18 functions -- **DuckDB faster:** 9 functions -- **Total DataFusion time:** 2764.06 ms -- **Total DuckDB time:** 2606.18 ms \ No newline at end of file diff --git a/microbenchmarks/temporal_results.md b/microbenchmarks/temporal_results.md deleted file mode 100644 index da9a3f0..0000000 --- a/microbenchmarks/temporal_results.md +++ /dev/null @@ -1,47 +0,0 @@ -# Temporal Functions Microbenchmarks: DataFusion vs DuckDB - -**Rows:** 1,000,000 - -| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster | -|----------|----------------:|------------:|--------:|--------| -| extract_year | 18.87 | 10.97 | 1.72x | DuckDB | -| extract_month | 18.80 | 11.27 | 1.67x | DuckDB | -| extract_day | 18.05 | 11.29 | 1.60x | DuckDB | -| extract_hour | 18.40 | 12.94 | 1.42x | DuckDB | -| extract_minute | 18.44 | 14.02 | 1.32x | DuckDB | -| extract_second | 26.27 | 13.17 | 2.00x | DuckDB | -| extract_dow | 19.33 | 13.82 | 1.40x | DuckDB | -| extract_doy | 18.71 | 15.51 | 1.21x | DuckDB | -| extract_week | 20.53 | 29.80 | 1.45x | DataFusion | -| extract_quarter | 17.94 | 18.59 | 1.04x | DataFusion | -| extract_epoch | 11.89 | 11.81 | 1.01x | DuckDB | -| date_trunc_year | 30.51 | 20.60 | 1.48x | DuckDB | -| date_trunc_quarter | 32.21 | 23.21 | 1.39x | 
DuckDB |
-| date_trunc_month | 27.34 | 25.80 | 1.06x | DuckDB |
-| date_trunc_week | 29.38 | 12.78 | 2.30x | DuckDB |
-| date_trunc_day | 10.58 | 9.93 | 1.07x | DuckDB |
-| date_trunc_hour | 10.62 | 22.84 | 2.15x | DataFusion |
-| date_trunc_minute | 9.70 | 23.40 | 2.41x | DataFusion |
-| date_trunc_second | 10.19 | 21.94 | 2.15x | DataFusion |
-| date_part_year | 14.56 | 11.32 | 1.29x | DuckDB |
-| date_part_month | 14.82 | 10.99 | 1.35x | DuckDB |
-| date_part_day | 15.29 | 11.00 | 1.39x | DuckDB |
-| date_part_hour | 15.20 | 13.90 | 1.09x | DuckDB |
-| date_part_dow | 16.21 | 13.85 | 1.17x | DuckDB |
-| date_part_week | 17.74 | 27.30 | 1.54x | DataFusion |
-| add_days | 53.87 | 22.94 | 2.35x | DuckDB |
-| sub_days | 56.84 | 23.79 | 2.39x | DuckDB |
-| add_months | 59.00 | 38.79 | 1.52x | DuckDB |
-| add_hours | 37.52 | 25.44 | 1.47x | DuckDB |
-| add_minutes | 39.40 | 23.77 | 1.66x | DuckDB |
-| to_char_date | 150.98 | 52.48 | 2.88x | DuckDB |
-| to_char_datetime | 216.22 | 95.88 | 2.26x | DuckDB |
-| to_char_time | 129.67 | 48.38 | 2.68x | DuckDB |
-
-## Summary
-
-- **Functions tested:** 33
-- **DataFusion faster:** 6 functions
-- **DuckDB faster:** 27 functions
-- **Total DataFusion time:** 1205.07 ms
-- **Total DuckDB time:** 743.49 ms
\ No newline at end of file

From 5a22f47cad4fe73948b59694733414a53c58d577 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 30 Dec 2025 12:12:33 -0700
Subject: [PATCH 4/6] add README

---
 microbenchmarks/README.md | 109 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 109 insertions(+)
 create mode 100644 microbenchmarks/README.md

diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
new file mode 100644
index 0000000..3b1d79f
--- /dev/null
+++ b/microbenchmarks/README.md
@@ -0,0 +1,109 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+# Microbenchmarks
+
+This directory contains microbenchmarks for comparing DataFusion and DuckDB performance on individual SQL functions. Unlike the TPC-H and TPC-DS benchmarks, which test full query execution, these microbenchmarks focus on the performance of specific SQL functions and expressions.
+
+## Overview
+
+The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics.
+
+## Benchmark Suites
+
+Three benchmark suites are available:
+
+- **string**: String manipulation functions (trim, lower, upper, concat, substring, regex, etc.)
+- **temporal**: Date/time functions (extract, date_trunc, date_part, interval arithmetic, formatting)
+- **conditional**: Conditional expressions (CASE WHEN, COALESCE, NULLIF, GREATEST/LEAST)
+
+## Setup
+
+Create a virtual environment and install dependencies:
+
+```shell
+cd microbenchmarks
+python3 -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+```
+
+## Usage
+
+Run a benchmark suite:
+
+```shell
+python string_functions_benchmark.py --suite <suite>
+```
+
+### Options
+
+| Option | Default | Description |
+|--------|---------|-------------|
+| `--suite` | `string` | Benchmark suite to run: `string`, `temporal`, or `conditional` |
+| `--rows` | `1000000` | Number of rows in the generated test data |
+| `--warmup` | `2` | Number of warmup iterations before timing |
+| `--iterations` | `5` | Number of timed iterations (results are averaged) |
+| `--output` | stdout | Output file path for markdown results |
+
+### Examples
+
+Run the string functions benchmark with default settings:
+
+```shell
+python string_functions_benchmark.py
+```
+
+Run the temporal functions benchmark with 10 million rows:
+
+```shell
+python string_functions_benchmark.py --suite temporal --rows 10000000
+```
+
+Run the conditional expressions benchmark and save results to a file:
+
+```shell
+python string_functions_benchmark.py --suite conditional --output results.md
+```
+
+## Output
+
+The benchmark outputs a markdown table comparing execution times:
+
+| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster |
+|----------|----------------:|------------:|--------:|--------|
+| trim | 12.34 | 15.67 | 1.27x | DataFusion |
+| lower | 8.90 | 7.50 | 1.19x | DuckDB |
+| ... | ... | ... | ... | ... |
+
+A summary section shows overall statistics including how many functions each engine won and total execution times.
+
+## Adding New Benchmarks
+
+To add new functions to an existing suite, add a `StringFunction` entry to the appropriate list in `string_functions_benchmark.py`:
+
+```python
+StringFunction(
+    "function_name",
+    "datafusion_sql_expression({col})",
+    "duckdb_sql_expression({col})"
+)
+```
+
+The placeholders (e.g., `{col}`, `{str_col}`, `{ts_col}`) are replaced with actual column names at runtime.

From 5027a2911ebdf48e6b5a0cbbd946932bf209ff66 Mon Sep 17 00:00:00 2001
From: Andy Grove
Date: Tue, 30 Dec 2025 12:27:13 -0700
Subject: [PATCH 5/6] rename script, update README

---
 microbenchmarks/README.md                     | 41 ++++---------
 ...ctions_benchmark.py => microbenchmarks.py} |  4 +-
 2 files changed, 12 insertions(+), 33 deletions(-)
 rename microbenchmarks/{string_functions_benchmark.py => microbenchmarks.py} (98%)

diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
index 3b1d79f..d4ddc1f 100644
--- a/microbenchmarks/README.md
+++ b/microbenchmarks/README.md
@@ -25,14 +25,6 @@ This directory contains microbenchmarks for comparing DataFusion and DuckDB perf
 
 The benchmarks generate synthetic data, write it to Parquet format, and then measure the execution time of various SQL functions across both DataFusion and DuckDB. Results include per-function timing comparisons and summary statistics.
 
-## Benchmark Suites
-
-Three benchmark suites are available:
-
-- **string**: String manipulation functions (trim, lower, upper, concat, substring, regex, etc.)
-- **temporal**: Date/time functions (extract, date_trunc, date_part, interval arithmetic, formatting)
-- **conditional**: Conditional expressions (CASE WHEN, COALESCE, NULLIF, GREATEST/LEAST)
-
 ## Setup
 
 Create a virtual environment and install dependencies:
 
 ```shell
 cd microbenchmarks
 python3 -m venv venv
 source venv/bin/activate
 pip install -r requirements.txt
 ```
 
 ## Usage
 
-Run a benchmark suite:
+Run a benchmark:
 
 ```shell
-python string_functions_benchmark.py --suite <suite>
+python microbenchmarks.py
 ```
 
 ### Options
 
 | Option | Default | Description |
 |--------|---------|-------------|
-| `--suite` | `string` | Benchmark suite to run: `string`, `temporal`, or `conditional` |
 | `--rows` | `1000000` | Number of rows in the generated test data |
 | `--warmup` | `2` | Number of warmup iterations before timing |
 | `--iterations` | `5` | Number of timed iterations (results are averaged) |
 | `--output` | stdout | Output file path for markdown results |
 
 ### Examples
 
-Run the string functions benchmark with default settings:
+Run the benchmark with default settings:
 
 ```shell
-python string_functions_benchmark.py
+python microbenchmarks.py
 ```
 
-Run the temporal functions benchmark with 10 million rows:
+Run the benchmark with 10 million rows:
 
 ```shell
-python string_functions_benchmark.py --suite temporal --rows 10000000
+python microbenchmarks.py --rows 10000000
 ```
 
-Run the conditional expressions benchmark and save results to a file:
+Run the benchmark and save results to a file:
 
 ```shell
-python string_functions_benchmark.py --suite conditional --output results.md
+python microbenchmarks.py --output results.md
 ```
 
 ## Output
 
 The benchmark outputs a markdown table comparing execution times:
 
 | Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster |
 |----------|----------------:|------------:|--------:|--------|
 | trim | 12.34 | 15.67 | 1.27x | DataFusion |
 | lower | 8.90 | 7.50 | 1.19x | DuckDB |
 | ... | ... | ... | ... | ... |
 
-A summary section shows overall statistics including how many functions each engine won and total execution times.
-
-## Adding New Benchmarks
-
-To add new functions to an existing suite, add a `StringFunction` entry to the appropriate list in `string_functions_benchmark.py`:
-
-```python
-StringFunction(
-    "function_name",
-    "datafusion_sql_expression({col})",
-    "duckdb_sql_expression({col})"
-)
-```
-
-The placeholders (e.g., `{col}`, `{str_col}`, `{ts_col}`) are replaced with actual column names at runtime.
+A summary section shows overall statistics including how many functions each engine won and total execution times. 
\ No newline at end of file diff --git a/microbenchmarks/string_functions_benchmark.py b/microbenchmarks/microbenchmarks.py similarity index 98% rename from microbenchmarks/string_functions_benchmark.py rename to microbenchmarks/microbenchmarks.py index f4b5f38..c6aaa12 100755 --- a/microbenchmarks/string_functions_benchmark.py +++ b/microbenchmarks/microbenchmarks.py @@ -236,9 +236,11 @@ def format_results_markdown(results: list[BenchmarkResult]) -> str: lines = [ "# String Function Microbenchmarks: DataFusion vs DuckDB", "", + f"**DataFusion version:** {datafusion.__version__} ", + f"**DuckDB version:** {duckdb.__version__} ", f"**Rows:** {results[0].rows:,}", "", - "| Function | DataFusion (ms) | DuckDB (ms) | Speedup | Faster |", + f"| Function | DataFusion {datafusion.__version__} (ms) | DuckDB {duckdb.__version__} (ms) | Speedup | Faster |", "|----------|----------------:|------------:|--------:|--------|", ] From 6ab4e5aaabf500cfa93b09be7076c0dd1f930bb8 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 31 Dec 2025 15:57:36 -0700 Subject: [PATCH 6/6] address feedback --- microbenchmarks/microbenchmarks.py | 38 ++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/microbenchmarks/microbenchmarks.py b/microbenchmarks/microbenchmarks.py index c6aaa12..c57483d 100755 --- a/microbenchmarks/microbenchmarks.py +++ b/microbenchmarks/microbenchmarks.py @@ -73,7 +73,7 @@ class StringFunction: ] -def generate_test_data(num_rows: int = 1_000_000) -> pa.Table: +def generate_test_data(num_rows: int = 1_000_000, use_string_view: bool = False) -> pa.Table: """Generate test data with various string patterns.""" import random import string @@ -103,23 +103,25 @@ def generate_test_data(num_rows: int = 1_000_000) -> pa.Table: s = ''.join(random.choices(string.ascii_letters + string.digits + ' ', k=length)) strings.append(s) + str_type = pa.string_view() if use_string_view else pa.string() table = pa.table({ - 'str_col': pa.array(strings, type=pa.string()) + 'str_col': pa.array(strings, type=str_type) }) return table def setup_datafusion(parquet_path: str) -> datafusion.SessionContext: - """Create and configure DataFusion context.""" - ctx = datafusion.SessionContext() + """Create and configure DataFusion context with single thread/partition.""" + config = datafusion.SessionConfig().with_target_partitions(1) + ctx = datafusion.SessionContext(config) ctx.register_parquet('test_data', parquet_path) return ctx def setup_duckdb(parquet_path: str) -> duckdb.DuckDBPyConnection: - """Create and configure DuckDB connection.""" - conn = duckdb.connect(':memory:') + """Create and configure DuckDB connection with single thread.""" + conn = duckdb.connect(':memory:', config={'threads': 1}) conn.execute(f"CREATE VIEW test_data AS SELECT * FROM read_parquet('{parquet_path}')") return conn @@ -167,7 +169,8 @@ def benchmark_duckdb(conn: duckdb.DuckDBPyConnection, expr: str, def run_benchmarks(num_rows: int = 1_000_000, warmup: int = 2, - iterations: int = 5) -> list[BenchmarkResult]: + iterations: int = 5, + use_string_view: bool = False) -> list[BenchmarkResult]: """Run all benchmarks and return results.""" results = [] @@ -175,8 +178,9 @@ def run_benchmarks(num_rows: int = 1_000_000, parquet_path = os.path.join(tmpdir, 'test_data.parquet') # Generate and save test data - print(f"Generating {num_rows:,} rows of test data...") - table = generate_test_data(num_rows) + str_type = "StringView" if use_string_view else "String" + print(f"Generating {num_rows:,} rows of test data 
(type: {str_type})...") + table = generate_test_data(num_rows, use_string_view) pq.write_table(table, parquet_path) print(f"Parquet file written to: {parquet_path}") print(f"File size: {os.path.getsize(parquet_path) / 1024 / 1024:.2f} MB") @@ -231,14 +235,17 @@ def run_benchmarks(num_rows: int = 1_000_000, return results -def format_results_markdown(results: list[BenchmarkResult]) -> str: +def format_results_markdown(results: list[BenchmarkResult], use_string_view: bool = False) -> str: """Format benchmark results as a markdown table.""" + str_type = "StringView" if use_string_view else "String" lines = [ "# String Function Microbenchmarks: DataFusion vs DuckDB", "", f"**DataFusion version:** {datafusion.__version__} ", f"**DuckDB version:** {duckdb.__version__} ", - f"**Rows:** {results[0].rows:,}", + f"**Rows:** {results[0].rows:,} ", + f"**String type:** {str_type} ", + "**Configuration:** Single thread, single partition", "", f"| Function | DataFusion {datafusion.__version__} (ms) | DuckDB {duckdb.__version__} (ms) | Speedup | Faster |", "|----------|----------------:|------------:|--------:|--------|", @@ -310,6 +317,10 @@ def main(): "--output", type=str, default=None, help="Output file for markdown results (default: stdout)" ) + parser.add_argument( + "--string-view", action="store_true", + help="Use StringView type instead of String (default: False)" + ) args = parser.parse_args() @@ -320,10 +331,11 @@ def main(): results = run_benchmarks( num_rows=args.rows, warmup=args.warmup, - iterations=args.iterations + iterations=args.iterations, + use_string_view=args.string_view ) - markdown = format_results_markdown(results) + markdown = format_results_markdown(results, use_string_view=args.string_view) print("\n" + "=" * 60) print("RESULTS")
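
Postscript on the measurement methodology that patches 2 and 6 converge on: both engines are pinned to a single thread, and both return Arrow data rather than Python row objects, so the timed region isolates the function under test instead of parallelism or result-conversion overhead. The pattern can be checked standalone. Below is a minimal sketch using only APIs the patched script itself calls; the trivial query and printed timings are illustrative, not part of the patch:

```python
import time

import datafusion
import duckdb

# Pin both engines to one thread / one partition (mirrors patch 6),
# so neither side gains a parallelism advantage on a single expression.
ctx = datafusion.SessionContext(
    datafusion.SessionConfig().with_target_partitions(1)
)
conn = duckdb.connect(":memory:", config={"threads": 1})

query = "SELECT upper('hello') AS s"

# Both calls below return Arrow data, so neither side pays
# Python-object conversion costs (mirrors the fetchall() ->
# fetch_arrow_table() change in patch 2).
start = time.perf_counter()
ctx.sql(query).collect()  # list of Arrow RecordBatches
df_ms = (time.perf_counter() - start) * 1000

start = time.perf_counter()
conn.execute(query).fetch_arrow_table()  # pyarrow.Table
duck_ms = (time.perf_counter() - start) * 1000

print(f"DataFusion: {df_ms:.2f} ms, DuckDB: {duck_ms:.2f} ms")
```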