Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
196 changes: 115 additions & 81 deletions src/step/batching.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,114 +3,148 @@
//! When passing large file lists to shell commands, the total argument length can exceed
//! the operating system's `ARG_MAX` limit, causing "Argument list too long" errors.
//!
//! This module provides automatic batching that:
//! 1. Estimates the shell-quoted size of file lists
//! 2. Uses binary search to find optimal batch sizes
//! 3. Splits jobs to stay within safe limits
//! This module renders the actual run command for each job and only splits jobs whose
//! rendered command exceeds the safe limit. Steps whose commands don't reference
//! `{{files}}` (or render to a small string for any other reason) are left as a
//! single job, even when the underlying file list is large.

use crate::env;
use crate::step_job::StepJob;
use crate::tera;
use std::path::PathBuf;
use std::sync::Arc;

use super::types::Step;

impl Step {
/// Estimates the size of the `{{files}}` template variable expansion.
///
/// This includes shell quoting overhead and spaces between files.
/// Uses a conservative estimate (2x + 2 for quotes, +1 for space) to ensure
/// we don't underestimate.
///
/// # Arguments
///
/// * `files` - The list of files to estimate
///
/// # Returns
///
/// Estimated byte size of the quoted file list string
/// Used as a fallback when rendering the run command fails (e.g. the
/// template references a variable not yet present in the context).
pub(crate) fn estimate_files_string_size(&self, files: &[PathBuf]) -> usize {
files
.iter()
.map(|f| {
let path_str = f.to_str().unwrap_or("");
// Estimate quoted size: conservative estimate assuming worst-case quoting
// For shell quoting, worst case is roughly 2x + 2 (quotes)
path_str.len() * 2 + 2 + 1 // +1 for space separator
// Worst-case quoted size: 2x + 2 (quotes), +1 for space separator
path_str.len() * 2 + 2 + 1
})
.sum()
}

/// Automatically batch jobs if the file list would exceed safe ARG_MAX limits.
///
/// This prevents "Argument list too long" errors when passing large file lists
/// to commands. Uses binary search to find the largest batch size that fits
/// within a safe limit (50% of ARG_MAX to account for environment variables
/// and the command itself).
///
/// # Arguments
///
/// * `jobs` - The jobs to potentially batch
/// Render the run command for a hypothetical job containing `files` and return
/// its byte length. Returns `None` if rendering fails (e.g. the template
/// references a variable not in the context).
fn render_run_command_size(
&self,
original_job: &StepJob,
files: &[PathBuf],
base_tctx: &tera::Context,
) -> Option<usize> {
let run_cmd = if original_job.check_first {
self.check_first_cmd()
} else {
self.run_cmd(original_job.run_type)
}?;
let run = run_cmd.to_string();
if run.trim().is_empty() {
return None;
}
Comment on lines +50 to +52

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

medium

Returning None for empty or whitespace-only commands triggers the fallback to estimate_files_string_size. If the file list is large, this will cause the job to be unnecessarily batched, which is exactly what this PR aims to avoid for commands that don't reference {{files}}. Returning Some(0) would correctly indicate that the command fits within the limit.

Suggested change
if run.trim().is_empty() {
return None;
}
if run.trim().is_empty() {
return Some(0);
}

let run = if let Some(prefix) = &self.prefix {
format!("{prefix} {run}")
} else {
run
};

let mut temp = StepJob::new(
Arc::clone(&original_job.step),
files.to_vec(),
original_job.run_type,
);
temp.check_first = original_job.check_first;
if let Some(wi) = original_job.workspace_indicator() {
temp = temp.with_workspace_indicator(wi.clone());
}
let tctx = temp.tctx(base_tctx);
tera::render(&run, &tctx).ok().map(|s| s.len())
}

/// Automatically batch jobs whose rendered run command would exceed the safe ARG_MAX limit.
///
/// # Returns
/// Uses 50% of `ARG_MAX` as the safety margin (accounts for env vars and the command itself).
/// Renders the actual run command with each candidate file subset; if the rendered command
/// fits, no batching is performed. Otherwise binary-searches the largest batch size whose
/// rendered command still fits.
///
/// A new list of jobs, potentially with large jobs split into multiple smaller ones
pub(crate) fn auto_batch_jobs_if_needed(&self, jobs: Vec<StepJob>) -> Vec<StepJob> {
// Use 50% of ARG_MAX as a safety margin, accounting for environment variables
// and the command itself
let safe_limit = *env::ARG_MAX / 2;
/// If rendering fails for any reason, falls back to estimating the size of the quoted
/// file-list expansion — preserves the previous (purely size-based) behavior as a safety net.
pub(crate) fn auto_batch_jobs(
&self,
jobs: Vec<StepJob>,
base_tctx: &tera::Context,
) -> Vec<StepJob> {
if self.stdin.is_some() {
// stdin path doesn't pass files via argv; never auto-batch
return jobs;
}

let mut batched_jobs = Vec::new();
let safe_limit = *env::ARG_MAX / 2;
let mut batched_jobs = Vec::with_capacity(jobs.len());

for job in jobs {
let estimated_size = self.estimate_files_string_size(&job.files);

if estimated_size > safe_limit && job.files.len() > 1 {
// Need to batch this job
debug!(
"{}: auto-batching {} files (estimated size: {} bytes, limit: {} bytes)",
self.name,
job.files.len(),
estimated_size,
safe_limit
);

// Binary search to find the largest batch_size where files fit within safe_limit
let mut low = 1;
let mut high = job.files.len();

while low < high {
let mid = (low + high).div_ceil(2);
let test_size = self.estimate_files_string_size(&job.files[..mid]);

if test_size <= safe_limit {
// mid files fit, try larger
low = mid;
} else {
// mid files don't fit, try smaller
high = mid - 1;
}
if job.skip_reason.is_some() || job.files.len() <= 1 {
batched_jobs.push(job);
continue;
}

// Try render-based sizing first; fall back to byte estimation on render failure.
let full_size = self
.render_run_command_size(&job, &job.files, base_tctx)
.unwrap_or_else(|| self.estimate_files_string_size(&job.files));

if full_size <= safe_limit {
batched_jobs.push(job);
continue;
}

debug!(
"{}: auto-batching {} files (rendered size: {} bytes, limit: {} bytes)",
self.name,
job.files.len(),
full_size,
safe_limit
);

// Binary search the largest batch size whose rendered command fits.
let mut low = 1;
let mut high = job.files.len();
while low < high {
let mid = (low + high).div_ceil(2);
let test_size = self
.render_run_command_size(&job, &job.files[..mid], base_tctx)
.unwrap_or_else(|| self.estimate_files_string_size(&job.files[..mid]));
if test_size <= safe_limit {
low = mid;
} else {
high = mid - 1;
}
}
let batch_size = low.max(1);

debug!(
"{}: using batch size of {} files per batch",
self.name, batch_size
);

// After binary search, low contains the largest batch size that fits
let batch_size = low.max(1); // Ensure at least 1 file per batch

debug!(
"{}: using batch size of {} files per batch",
self.name, batch_size
);

// Create batched jobs - use the StepJob constructor to properly handle private fields
for chunk in job.files.chunks(batch_size) {
let new_job = StepJob::new(job.step.clone(), chunk.to_vec(), job.run_type);
// Note: we can't preserve workspace_indicator or other private fields
// without adding public methods to StepJob. For now, batching will
// break workspace_indicator jobs, but that's acceptable since those
// are typically small workspaces.
batched_jobs.push(new_job);
for chunk in job.files.chunks(batch_size) {
let mut new_job = StepJob::new(Arc::clone(&job.step), chunk.to_vec(), job.run_type);
// Preserve job-level state that isn't reconstructed by StepJob::new.
new_job.check_first = job.check_first;
new_job.skip_reason = job.skip_reason.clone();
if let Some(wi) = job.workspace_indicator() {
new_job = new_job.with_workspace_indicator(wi.clone());
}
} else {
// No batching needed
batched_jobs.push(job);
batched_jobs.push(new_job);
}
}

Expand Down
6 changes: 5 additions & 1 deletion src/step/execution.rs
Original file line number Diff line number Diff line change
Expand Up @@ -66,12 +66,16 @@ impl Step {
}

let files = ctx.hook_ctx.files();
let mut jobs = self.build_step_jobs(
let jobs = self.build_step_jobs(
&files,
ctx.hook_ctx.run_type,
&ctx.hook_ctx.files_in_contention.lock().unwrap(),
&ctx.hook_ctx.skip_steps,
)?;
// Apply ARG_MAX-safe auto-batching now that the full tera context is
// available — only split jobs whose rendered run command would actually
// exceed the limit.
let mut jobs = self.auto_batch_jobs(jobs, &ctx.hook_ctx.tctx);
if let Some(job) = jobs.first_mut() {
job.semaphore = Some(semaphore);
}
Expand Down
15 changes: 9 additions & 6 deletions src/step/job_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,11 @@ impl Step {
/// 2. Check if step has a command for the run type
/// 3. Filter files based on step configuration
/// 4. Split into workspace jobs (if workspace_indicator set) or batch jobs
/// 5. Apply auto-batching for ARG_MAX safety
/// 6. Configure check_first based on file contention
/// 5. Configure check_first based on file contention
///
/// Auto-batching for ARG_MAX safety is applied later by [`Step::auto_batch_jobs`]
/// (called from execution time) so the full tera context is available to render
/// the actual command.
///
/// # Arguments
///
Expand Down Expand Up @@ -143,10 +146,10 @@ impl Step {
)]
};

if self.stdin.is_none() {
// Auto-batch any jobs where the file list would exceed safe limits
jobs = self.auto_batch_jobs_if_needed(jobs);
}
// Note: auto-batching for ARG_MAX safety happens at execution time
// (see `Step::auto_batch_jobs`) where the full tera context is available
// to render the actual run command — so steps whose commands don't
// reference `{{files}}` are not split into many jobs unnecessarily.

// Apply profile skip only after determining files/no-files, so NoFilesToProcess wins
// Also, if a condition is present, defer profile checks to run() so ConditionFalse wins
Expand Down
4 changes: 4 additions & 0 deletions src/step_job.rs
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,10 @@ impl StepJob {
self
}

pub fn workspace_indicator(&self) -> Option<&PathBuf> {
self.workspace_indicator.as_ref()
}

pub fn tctx(&self, base: &tera::Context) -> tera::Context {
let mut tctx = base.clone();

Expand Down
34 changes: 34 additions & 0 deletions test/auto_batch_large_files.bats
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,40 @@ EOF
assert_output --partial "file10.txt"
}

@test "auto-batch does not split steps that don't reference {{files}}" {
# Regression test: previously hk would auto-batch any step on a large file
# list based on the *file list* size, even if the run command never
# interpolated `{{files}}`. With render-based sizing, a static command
# should run as a single job regardless of how many files match.
cat <<EOF > hk.pkl
amends "$PKL_PATH/Config.pkl"
hooks {
["check"] {
steps {
["static-message"] {
check = "echo hello world"
}
}
}
}
EOF

# Create enough files (with long paths) that the *file-list* expansion
# would blow past ARG_MAX/2 and force batching under the old logic.
for i in {1..1000}; do
dir="very/long/directory/path/number/$i/with/many/levels/to/increase/path/length"
mkdir -p "$dir"
echo "test" > "$dir/file_with_very_long_name_to_increase_arg_size_$i.txt"
done

run hk check --all
assert_success
# Each batch produces its own "N files – ... – <command>" progress line;
# one such line means the step ran as a single job.
run bash -c "hk check --all 2>&1 | grep -Ec 'files –.*– echo hello world' || true"
assert_output "1"
}

@test "auto-batch with fix command" {
cat <<EOF > hk.pkl
amends "$PKL_PATH/Config.pkl"
Expand Down
Loading