diff --git a/scripts/gantry_run_benchmark.sh b/scripts/gantry_run_benchmark.sh
new file mode 100755
index 0000000000..80c3cf669e
--- /dev/null
+++ b/scripts/gantry_run_benchmark.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+# Runs the benchmark on gantry. Takes one argument which is the response length.
+# Usage: ./gantry_run_benchmark.sh [response_length]
+# E.g. $ ./gantry_run_benchmark.sh 64000
+set -e
+
+# Set default value for response_length
+response_length=64000
+
+# If first argument exists and is a number, use it as response_length
+if [[ "$1" =~ ^[0-9]+$ ]]; then
+  response_length="$1"
+  shift
+fi
+
+gantry run \
+       --name open_instruct-benchmark_generators \
+       --workspace ai2/oe-eval \
+       --weka=oe-eval-default:/weka \
+       --gpus 1 \
+       --beaker-image nathanl/open_instruct_auto \
+       --cluster ai2/jupiter-cirrascale-2 \
+       --budget ai2/oe-eval \
+       --install 'pip install --upgrade pip "setuptools<70.0.0" wheel 
+# TODO, unpin setuptools when this issue in flash attention is resolved
+pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/cu128
+pip install packaging
+pip install flash-attn==2.8.0.post2 --no-build-isolation
+pip install -r requirements.txt
+pip install -e .
+python -m nltk.downloader punkt' \
+       -- python -m open_instruct.benchmark_generators \
+    --model_name_or_path "hamishivi/qwen2_5_openthoughts2" \
+    --tokenizer_name_or_path "hamishivi/qwen2_5_openthoughts2" \
+    --dataset_mixer_list "hamishivi/hamishivi_rlvr_orz_math_57k_collected_all_filtered_hamishivi_qwen2_5_openthoughts2" "1.0" \
+    --dataset_mixer_list_splits "train" \
+    --max_token_length 10240 \
+    --max_prompt_token_length 2048 \
+    --temperature 1.0 \
+    --response_length "$response_length" \
+    --vllm_top_p 0.9 \
+    --num_unique_prompts_rollout 8 \
+    --num_samples_per_prompt_rollout 16 \
+    --vllm_num_engines 1 \
+    --vllm_tensor_parallel_size 1 \
+    --vllm_gpu_memory_utilization 0.9 \
+    --pack_length 20480 \
+    --chat_template_name "tulu_thinker" \
+    --trust_remote_code \
+    --seed 42 \
+    --dataset_local_cache_dir "benchmark_cache" \
+    --dataset_cache_mode "local" \
+    --dataset_transform_fn "rlvr_tokenize_v1" "rlvr_filter_v1" \
+    "$@"
diff --git a/scripts/run_benchmark.sh b/scripts/run_benchmark.sh
new file mode 100755
index 0000000000..0a00fc3caf
--- /dev/null
+++ b/scripts/run_benchmark.sh
@@ -0,0 +1,38 @@
+#!/bin/bash
+# Runs the benchmark on gantry. Takes one argument which is the response length.
+# Usage: ./gantry_run_benchmark.sh [response_length]
+# E.g. $ ./gantry_run_benchmark.sh 64000
+set -e
+
+# Set default value for response_length
+response_length=64000
+
+# If first argument exists and is a number, use it as response_length
+if [[ "$1" =~ ^[0-9]+$ ]]; then
+  response_length="$1"
+  shift
+fi
+
+uv run python -m open_instruct.benchmark_generators \
+    --model_name_or_path "hamishivi/qwen2_5_openthoughts2" \
+    --tokenizer_name_or_path "hamishivi/qwen2_5_openthoughts2" \
+    --dataset_mixer_list "hamishivi/hamishivi_rlvr_orz_math_57k_collected_all_filtered_hamishivi_qwen2_5_openthoughts2" "1.0" \
+    --dataset_mixer_list_splits "train" \
+    --max_token_length 10240 \
+    --max_prompt_token_length 2048 \
+    --temperature 1.0 \
+    --response_length 64 \
+    --vllm_top_p 0.9 \
+    --num_unique_prompts_rollout 8 \
+    --num_samples_per_prompt_rollout 1 \
+    --vllm_num_engines 1 \
+    --vllm_tensor_parallel_size 1 \
+    --vllm_gpu_memory_utilization 0.9 \
+    --pack_length 20480 \
+    --chat_template_name "tulu_thinker" \
+    --trust_remote_code \
+    --seed 42 \
+    --dataset_local_cache_dir "benchmark_cache" \
+    --dataset_cache_mode "local" \
+    --dataset_transform_fn "rlvr_tokenize_v1" "rlvr_filter_v1" \
+    "$@"