apache · eric-maynard · Jul 10, 2025 · Apr 30, 2025 · Apr 30, 2025 · Apr 30, 2025
diff --git a/benchmarks/README.md b/benchmarks/README.md
@@ -27,6 +27,7 @@ Benchmarks for the Polaris service using Gatling.
 - `org.apache.polaris.benchmarks.simulations.ReadTreeDataset`: Performs read-only operations to fetch namespaces, tables, and views.  Some attributes of the objects are also fetched.  This benchmark is intended to be used against a Polaris instance with a pre-existing tree dataset.  It has no side effects on the dataset and can be executed multiple times without any issues.
 - `org.apache.polaris.benchmarks.simulations.ReadUpdateTreeDataset`: Performs read and update operations against a Polaris instance populated with a test dataset.  It is a read/write workload that can be used to test the system's ability to handle concurrent read and update operations.  It is not destructive and does not prevent subsequent executions of `ReadTreeDataset` or `ReadUpdateTreeDataset`.
 - `org.apache.polaris.benchmarks.simulations.CreateCommits`: Creates table and view commits at configurable rates.  This benchmark is useful for testing the system's ability to handle table and view commits and can be used to generate a history of thousands of commits for both tables and views.
+- `org.apache.polaris.benchmarks.simulations.WeightedWorkloadOnTreeDataset`: Performs reads and writes against tables in accordance with the configured distributions. This is useful for testing performance when clients conflict.
 
 ## Parameters
 

diff --git a/benchmarks/src/gatling/resources/benchmark-defaults.conf b/benchmarks/src/gatling/resources/benchmark-defaults.conf
@@ -180,4 +180,32 @@ workload {
     # Default: 5
     duration-in-minutes = 5
   }
+
+  # Configuration for the WeightedWorkloadOnTreeDataset simulation
+  weighted-workload-on-tree-dataset {
+    # Seed used for RNG during the test
+    seed = 42
+
+    # Distributions for readers
+    # Each distribution will have `count` threads assigned to it
+    # mean / variance describe the properties of the normal distribution
+    # Readers will read a random table in the table space based on sampling
+    # Default: [{ count = 8, mean = 0.3, variance = 0.0278 }]
+    readers = [
+      { count = 8, mean = 0.3, variance = 0.0278 }
+    ]
+
+    # Distributions for writers
+    # Each distribution will have `count` threads assigned to it
+    # mean / variance describe the properties of the normal distribution
+    # Writers will write to a random table in the table space based on sampling
+    # Default: [{ count = 2, mean = 0.7, variance = 0.0278 }]
+    writers = [
+      { count = 2, mean = 0.7, variance = 0.0278 }
+    ]
+
+    # Duration of the simulation in minutes
+    # Default: 5
+    duration-in-minutes = 5
+  }
 }
diff --git a/...marks/src/gatling/scala/org/apache/polaris/benchmarks/actions/AuthenticationActions.scala b/...marks/src/gatling/scala/org/apache/polaris/benchmarks/actions/AuthenticationActions.scala
@@ -85,8 +85,9 @@ case class AuthenticationActions(
         .check(jsonPath("$.access_token").saveAs("accessToken"))
     )
       .exec { session =>
-        if (session.contains("accessToken"))
+        if (session.contains("accessToken")) {
           accessToken.set(session("accessToken").as[String])
+        }
         session
       }
 

diff --git a/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala b/benchmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/BenchmarkConfig.scala
@@ -43,6 +43,7 @@ object BenchmarkConfig {
       val rtdConfig = workload.getConfig("read-tree-dataset")
       val ctdConfig = workload.getConfig("create-tree-dataset")
       val rutdConfig = workload.getConfig("read-update-tree-dataset")
+      val wwotdConfig = workload.getConfig("weighted-workload-on-tree-dataset")
 
       WorkloadParameters(
         CreateCommitsParameters(
@@ -62,6 +63,12 @@ object BenchmarkConfig {
           rutdConfig.getDouble("read-write-ratio"),
           rutdConfig.getInt("throughput"),
           rutdConfig.getInt("duration-in-minutes")
+        ),
+        WeightedWorkloadOnTreeDatasetParameters(
+          wwotdConfig.getInt("seed"),
+          WeightedWorkloadOnTreeDatasetParameters.loadDistributionsList(wwotdConfig, "readers"),
+          WeightedWorkloadOnTreeDatasetParameters.loadDistributionsList(wwotdConfig, "writers"),
+          wwotdConfig.getInt("duration-in-minutes")
         )
       )
     }

diff --git a/...hmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala b/...hmarks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/DatasetParameters.scala
@@ -56,7 +56,7 @@ case class DatasetParameters(
     storageConfigInfo: String
 ) {
   val nAryTree: NAryTreeBuilder = NAryTreeBuilder(nsWidth, nsDepth)
-  private val maxPossibleTables = nAryTree.numberOfLastLevelElements * numTablesPerNs
+  val maxPossibleTables: Int = nAryTree.numberOfLastLevelElements * numTablesPerNs
   private val maxPossibleViews = nAryTree.numberOfLastLevelElements * numViewsPerNs
   val numTables: Int = if (numTablesMax <= 0) maxPossibleTables else numTablesMax
   val numViews: Int = if (numViewsMax <= 0) maxPossibleViews else numViewsMax

diff --git a/...la/org/apache/polaris/benchmarks/parameters/WeightedWorkloadOnTreeDatasetParameters.scala b/...la/org/apache/polaris/benchmarks/parameters/WeightedWorkloadOnTreeDatasetParameters.scala
@@ -0,0 +1,168 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.polaris.benchmarks.parameters
+
+import com.typesafe.config.Config
+import com.typesafe.scalalogging.Logger
+import org.slf4j.LoggerFactory
+
+import scala.jdk.CollectionConverters._
+import scala.collection.immutable.LazyList
+import scala.collection.mutable
+import scala.collection.mutable.ArrayBuffer
+import scala.util.Random
+
+/**
+ * Case class to hold the parameters for the WeightedWorkloadOnTreeDataset simulation.
+ *
+ * @param seed The RNG seed to use
+ * @param readers A seq of distrbutions to use for reading tables
+ * @param writers A seq of distrbutions to use for writing to tables
+ */
+case class WeightedWorkloadOnTreeDatasetParameters(
+    seed: Int,
+    readers: Seq[Distribution],
+    writers: Seq[Distribution],
+    durationInMinutes: Int
+) {
+  require(readers.nonEmpty || writers.nonEmpty, "At least one reader or writer is required")
+  require(durationInMinutes > 0, "Duration in minutes must be positive")
+}
+
+object WeightedWorkloadOnTreeDatasetParameters {
+  def loadDistributionsList(config: Config, key: String): List[Distribution] =
+    config.getConfigList(key).asScala.toList.map { conf =>
+      Distribution(
+        count = conf.getInt("count"),
+        mean = conf.getDouble("mean"),
+        variance = conf.getDouble("variance")
+      )
+    }
+}
+
+case class Distribution(count: Int, mean: Double, variance: Double) {
+  private val logger = LoggerFactory.getLogger(getClass)
+
+  def printDescription(dataset: DatasetParameters): Unit = {
+    println(s"Summary for ${this}:")
+
+    // Visualize distributions
+    printVisualization(dataset.maxPossibleTables)
+
+    // Warn if a large amount of resampling will be needed. We use a unique, but fixed,
+    // seed here as it would be impossible to represent all the different reader & writer
+    // seeds in one RandomNumberProvider here. The resulting samples, therefore, are
+    // just an approximation of what will happen in the scenario.
+    val debugRandomNumberProvider = RandomNumberProvider("debug".hashCode, -1)
+    def resampleStream: LazyList[Double] =
+      LazyList.continually(sample(dataset.maxPossibleTables, debugRandomNumberProvider))
+
+    val (_, resamples) = resampleStream.zipWithIndex
+      .take(100000)
+      .find { case (value, _) => value >= 0 && value < dataset.maxPossibleTables }
+      .getOrElse((-1, 100000))
+
+    if (resamples > 100) {
+      logger.warn(
+        s"A distribution appears to require aggressive resampling: ${this} took ${resamples + 1} samples!"
+      )
+    }
+  }
+
+  /**
+   * Return a value in [0, items) based on this distribution using truncated normal resampling.
+   */
+  def sample(items: Int, randomNumberProvider: RandomNumberProvider): Int = {
+    val stddev = math.sqrt(variance)
+    // Resample until the value is in [0, 1]
+    val maxSamples = 100000
+    val value = Iterator
+      .continually(randomNumberProvider.next() * stddev + mean)
+      .take(maxSamples)
+      .find(x => x >= 0.0 && x <= 1.0)
+      .getOrElse(
+        throw new RuntimeException(
+          s"Failed to sample a value in [0, 1] after ${maxSamples} attempts"
+        )
+      )
+
+    (value * items).toInt.min(items - 1)
+  }
+
+  def printVisualization(tables: Int, samples: Int = 100000, bins: Int = 10): Unit = {
+    val binCounts = Array.fill(bins)(0)
+    val hits = new mutable.HashMap[Int, Int]()
+
+    // We use a unique, but fixed, seed here as it would be impossible to represent all
+    // the different reader & writer seeds in one RandomNumberProvider here. The resulting
+    // samples, therefore, are just an approximation of what will happen in the scenario.
+    val rng = RandomNumberProvider("visualization".hashCode, -1)
+
+    (1 to samples).foreach { _ =>
+      val value = sample(tables, rng)
+      val bin = ((value.toDouble / tables) * bins).toInt.min(bins - 1)
+      hits.put(value, hits.getOrElse(value, 0) + 1)
+      binCounts(bin) += 1
+    }
+
+    val maxBarWidth = 50
+    val total = binCounts.sum.toDouble
+    println("  Range         | % of Samples | Visualization")
+    println("  --------------|--------------|------------------")
+
+    (0 until bins).foreach { i =>
+      val low = i.toDouble / bins
+      val high = (i + 1).toDouble / bins
+      val percent = binCounts(i) / total * 100
+      val bar = "█" * ((percent / 100 * maxBarWidth).round.toInt)
+      println(f"  [$low%.1f - $high%.1f) | $percent%6.2f%%      | $bar")
+    }
+    println()
+
+    val mode = hits.maxBy(_._2)
+    val modePercentage: Int = Math.round(mode._2.toFloat / samples * 100)
+    println(s"  The most frequently selected table was chosen in ~${modePercentage}% of samples")
+
+    println()
+  }
+}
+
+object Distribution {
+
+  // Map an index back to a table path
+  def tableIndexToIdentifier(index: Int, dp: DatasetParameters): (String, List[String], String) = {
+    require(
+      dp.numTablesMax == -1,
+      "Sampling is incompatible with numTablesMax settings other than -1"
+    )
+
+    val namespaceIndex = index / dp.numTablesPerNs
+    val namespaceOrdinal = dp.nAryTree.lastLevelOrdinals.toList.apply(namespaceIndex)
+    val namespacePath = dp.nAryTree.pathToRoot(namespaceOrdinal)
+
+    // TODO Refactor this line once entity names are configurable
+    (s"C_0", namespacePath.map(n => s"NS_${n}"), s"T_${index}")
+  }
+}
+
+case class RandomNumberProvider(seed: Int, threadId: Int) {
+  private[this] val random = new Random(seed + threadId)
+  def next(): Double = random.nextGaussian()
+}
diff --git a/...marks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala b/...marks/src/gatling/scala/org/apache/polaris/benchmarks/parameters/WorkloadParameters.scala
@@ -23,5 +23,6 @@ case class WorkloadParameters(
     createCommits: CreateCommitsParameters,
     readTreeDataset: ReadTreeDatasetParameters,
     createTreeDataset: CreateTreeDatasetParameters,
-    readUpdateTreeDataset: ReadUpdateTreeDatasetParameters
+    readUpdateTreeDataset: ReadUpdateTreeDatasetParameters,
+    weightedWorkloadOnTreeDataset: WeightedWorkloadOnTreeDatasetParameters
 ) {}