rapidsai · simoneves · Oct 22, 2025 · Sep 16, 2025 · Sep 16, 2025 · Sep 16, 2025
@@ -15,3 +15,6 @@ __pycache__/
 
 # Default benchmark output directory
 benchmark_output
+
+# Generated Presto Config
+presto/docker/config/generated/
@@ -0,0 +1,15 @@
+{
+  "sys_reserved_mem_percent": 0.05,
+  "sys_reserved_mem_cap_gb": 2,
+  "heap_size_percent_of_container_mem": 0.9,
+  "headroom_percent_of_heap": 0.2,
+  "query_max_total_mem_per_node_percent_of_heap": 0.8,
+  "query_max_mem_per_node_percent_of_total": 0.9,
+  "proxygen_mem_per_worker_gb": 0.125,
+  "proxygen_mem_cap_gb": 2,
+  "native_buffer_mem_percent": 0.05,
+  "native_buffer_mem_cap_gb": 32,
+  "native_query_mem_percent_of_sys_mem": 0.95,
+  "join_max_bcast_size_percent_of_container_mem": 0.01,
+  "memory_push_back_start_below_limit_gb": 5
+}
@@ -0,0 +1,21 @@
+# Select the connector implementation. "hive-hadoop2" is the Hive connector
+# backed by Hadoop 2.x libraries, which is the default for Presto's Hive support.
+connector.name=hive-hadoop2
+
+# Configure the metastore implementation. "file" enables a simple file-based
+# metastore suitable for local testing without an external Hive Metastore (HMS).
+# See https://prestodb.io/docs/current/installation/deployment.html#configuring-a-file-based-metastore for more details.
+hive.metastore=file
+# Root directory where the file-based metastore stores table and partition
+# metadata. This path is inside the container volume so state persists across
+# server restarts during tests.
+hive.metastore.catalog.dir=file:/var/lib/presto/data/hive/metastore
+# Allow DROP TABLE statements. Enabled so that smoke/perf tests can reset
+# state and clean up artifacts without manual intervention.
+hive.allow-drop-table=true
+
+# Control whether Presto can split files for parallel reads. Disable when the
+# file compression/format isn't splittable to avoid read failures. TPCH Parquet
+# test data commonly uses SNAPPY compression that isn't splittable at the file
+# level here, hence this must be false.
+hive.file-splittable=false
@@ -0,0 +1,7 @@
+# Select the built-in TPCH connector that generates synthetic datasets on the
+# fly. Used for functional and performance testing without external storage.
+connector.name=tpch
+
+# Choose the column naming convention for generated tables. "STANDARD" matches
+# the canonical TPC-H schema so queries from benchmarks run unmodified.
+tpch.column-naming=STANDARD
@@ -1,12 +1,26 @@
+# Enable JVM server mode for better JIT optimization on long-running servers.
 -server
--Xmx24G
+# Maximum Java heap size; templated to match container memory.
+-Xmx{{ .HeapSizeGb }}G
+# Initial Java heap size; equal to max to avoid heap resizing pauses.
+-Xms{{ .HeapSizeGb }}G
+# Use the G1 garbage collector for predictable pause times.
 -XX:+UseG1GC
+# Tune G1 region size to balance GC throughput and fragmentation.
 -XX:G1HeapRegionSize=32M
+# Throw OutOfMemoryError when GC overhead becomes excessive, rather than letting the JVM thrash.
 -XX:+UseGCOverheadLimit
+# Make System.gc() invoke concurrent collections to reduce pauses.
 -XX:+ExplicitGCInvokesConcurrent
+# Create heap dumps on OOM for postmortem analysis.
 -XX:+HeapDumpOnOutOfMemoryError
+# Exit the JVM on OOM so orchestration can restart the process.
 -XX:+ExitOnOutOfMemoryError
+# Cap NIO direct buffer cache to limit retained off-heap memory.
+-Djdk.nio.maxCachedBufferSize=2000000
+# Allow self-attach for profilers (e.g., async-profiler) during debugging.
 -Djdk.attach.allowAttachSelf=true
+# Open JDK internals for reflection required by Presto and dependencies under Java 11+ modules.
 --add-opens=java.base/java.io=ALL-UNNAMED
 --add-opens=java.base/java.lang=ALL-UNNAMED
 --add-opens=java.base/java.lang.ref=ALL-UNNAMED

@@ -0,0 +1,4 @@
+# Log level for the com.facebook.presto logger, which covers the Presto server
+# components. INFO provides useful operational diagnostics while keeping logs
+# compact for tests. Increase to DEBUG when deep troubleshooting is needed.
+com.facebook.presto=INFO
@@ -0,0 +1,25 @@
+# Java coordinator configuration. Avoid adding Presto Native-only properties
+# here, as they are unsupported by the Java engine and may prevent startup.
+# Run this node as the cluster coordinator; it schedules and manages queries.
+coordinator=true
+# Do not schedule worker tasks on the coordinator to avoid resource contention.
+node-scheduler.include-coordinator=false
+# Coordinator REST/HTTP port for clients and workers.
+http-server.http.port=8080
+# Embedded service that provides node discovery for workers.
+discovery-server.enabled=true
+# Address workers use to register with the discovery service.
+discovery.uri=http://presto-coordinator:8080
+
+# Min workers before query starts; keep minimal for quick tests.
+query-manager.required-workers=1
+# Maximum wait for required workers to join.
+query-manager.required-workers-max-wait=10s
+
+# Memory auto-configuration is not wired for Java engine in this template.
+# Uncomment and set values if using Java workers/coordinator end-to-end.
+#query.max-total-memory-per-node={{ .JavaQueryMaxTotalMemPerNodeGb }}GB
+#query.max-total-memory={{ mul .JavaQueryMaxTotalMemPerNodeGb .NumberOfWorkers }}GB
+#query.max-memory-per-node={{ .JavaQueryMaxMemPerNodeGb }}GB
+#query.max-memory={{ mul .JavaQueryMaxMemPerNodeGb .NumberOfWorkers }}GB
+#memory.heap-headroom-per-node={{ .HeadroomGb }}GB
diff --git a/presto/docker/config/template/etc_coordinator/config_native_cpu.properties b/presto/docker/config/template/etc_coordinator/config_native_cpu.properties
@@ -0,0 +1,112 @@
+# Run this node as the cluster coordinator; it schedules and manages queries.
+coordinator=true
+# Do not schedule worker tasks on the coordinator to avoid resource contention.
+node-scheduler.include-coordinator=false
+# Coordinator REST/HTTP port for clients and workers.
+http-server.http.port=8080
+# Embedded service that provides node discovery for workers.
+discovery-server.enabled=true
+# Address workers use to register with the discovery service.
+discovery.uri=http://presto-coordinator:8080
+
+# Set Presto version string to match workers for compatibility in tests.
+presto.version=testversion
+
+# Keep up to 30 rolled log files to bound disk usage.
+log.max-history=30
+# Rotate logs at ~100MB per file for manageable artifacts.
+log.max-size=104857600B
+# Reserve heap headroom per node to reduce full GC and OOM risk.
+memory.heap-headroom-per-node={{ .HeadroomGb }}GB
+
+# Limit pending splits per task to avoid excessive memory usage.
+node-scheduler.max-pending-splits-per-task=2000
+# Cap concurrent splits per node for balanced scheduling.
+node-scheduler.max-splits-per-node=2000
+
+# Optimizer flags
+#optimizer.joins-not-null-inference-strategy=USE_FUNCTION_METADATA
+#optimizer.default-filter-factor-enabled=true
+# Use known constraints to simplify plan and filters.
+optimizer.exploit-constraints=true
+# Rewrite large IN lists as joins for performance in some cases.
+optimizer.in-predicates-as-inner-joins-enabled=true
+# Allow partial aggregations to reduce data shuffled across stages.
+optimizer.partial-aggregation-strategy=automatic
+# Prefer partial aggregations when beneficial.
+optimizer.prefer-partial-aggregation=true
+# Default selectivity heuristic for joins when stats are missing.
+optimizer.default-join-selectivity-coefficient=0.1
+# Infer additional range predicates to improve filtering.
+optimizer.infer-inequality-predicates=true
+# Support complex equi-join patterns in the optimizer.
+optimizer.handle-complex-equi-joins=true
+# Add dynamic domain filters to reduce scanned data.
+optimizer.generate-domain-filters=true
+# Upper limit for broadcasted table size to avoid memory blowups.
+# See: https://github.com/prestodb/presto/issues/22161#issuecomment-1994128619
+join-max-broadcast-table-size={{ .JoinMaxBroadcastTableSizeMb }}MB
+
+# Client request timeout to avoid hung queries.
+query.client.timeout=30m
+# Use phased execution policy for improved large query scheduling.
+query.execution-policy=phased
+# Kill queries based on total reservation on blocked nodes to recover memory.
+query.low-memory-killer.policy=total-reservation-on-blocked-nodes
+# Upper limit on query wall time to keep tests bounded.
+query.max-execution-time=30m
+# Keep metadata of up to 1000 queries for UI and debugging.
+query.max-history=1000
+# Memory quotas per node and cluster to protect stability.
+query.max-total-memory-per-node={{ .JavaQueryMaxTotalMemPerNodeGb }}GB
+query.max-total-memory={{ mul .JavaQueryMaxTotalMemPerNodeGb .NumberOfWorkers }}GB
+query.max-memory-per-node={{ .JavaQueryMaxMemPerNodeGb }}GB
+query.max-memory={{ mul .JavaQueryMaxMemPerNodeGb .NumberOfWorkers }}GB
+# Allow deep stage DAGs required by certain benchmark queries.
+query.max-stage-count=1300
+# Retain query info at least this long for diagnostics.
+query.min-expire-age=120.00m
+# Larger scheduling batches for better throughput in benchmarks.
+query.min-schedule-split-batch-size=2000
+# Raise warning threshold to align with higher max stage count.
+query.stage-count-warning-threshold=150
+# Increase the maximum SQL query text length (in characters) for complex benchmark queries.
+query.max-length=2000000
+
+# Disable dynamic filtering for deterministic benchmarking.
+experimental.enable-dynamic-filtering=false
+# Cap revocable memory per node to avoid overcommit.
+experimental.max-revocable-memory-per-node=50GB
+# Limit disk spill usage per node to bound IO and disk usage.
+experimental.max-spill-per-node=50GB
+# Enable repartitioning improvements for shuffle efficiency.
+experimental.optimized-repartitioning=true
+# Enable dereference and subfield pushdown to reduce scanned data.
+experimental.pushdown-dereference-enabled=true
+experimental.pushdown-subfields-enabled=true
+# Cap the spill space a single query may use on each node.
+experimental.query-max-spill-per-node=50GB
+# Disable reserved memory pool to simplify test behavior.
+experimental.reserved-pool-enabled=false
+# Stop spilling when disk usage exceeds this fraction.
+experimental.spiller-max-used-space-threshold=0.7
+# Directory for spill files during execution.
+experimental.spiller-spill-path=/tmp
+
+
+# Min workers before query starts; keep minimal for quick tests.
+query-manager.required-workers=1
+# Maximum wait for required workers to join.
+query-manager.required-workers-max-wait=10s
+
+# Set required configuration for Presto C++ workers as indicated in https://prestodb.io/docs/current/presto_cpp/properties.html#coordinator-properties
+native-execution-enabled=true
+# Disable Java-side hash generation optimizations not used by native workers.
+optimizer.optimize-hash-generation=false
+# Use RE2J regex engine for performance and determinism.
+regex-library=RE2J
+# Enable alternative function signatures for native compatibility.
+use-alternative-function-signatures=true
+
+# Optimize for queries that can run entirely on a single worker.
+single-node-execution-enabled=true