54 changes: 54 additions & 0 deletions scripts/gantry_run_benchmark.sh
@@ -0,0 +1,54 @@
#!/bin/bash
# Runs the benchmark on gantry. Takes one argument, which is the response length.
# Usage: ./gantry_run_benchmark.sh [response_length]
# E.g. $ ./gantry_run_benchmark.sh 64000
set -e

# Set default value for response_length
response_length=64000

# If first argument exists and is a number, use it as response_length
if [[ "$1" =~ ^[0-9]+$ ]]; then
response_length="$1"
shift
fi

gantry run \
--name open_instruct-benchmark_generators \
--workspace ai2/oe-eval \
--weka=oe-eval-default:/weka \
--gpus 1 \
--beaker-image nathanl/open_instruct_auto \
--cluster ai2/jupiter-cirrascale-2 \
--budget ai2/oe-eval \
--install 'pip install --upgrade pip "setuptools<70.0.0" wheel
Collaborator:

do we need to do this install even when we are using the open-instruct image?

@finbarrtimbers (Collaborator, Author) Jul 15, 2025:

You mean the nathanl/open_instruct_auto image? I will check again but I believe it was failing last time I did so.

Launched a run.

@finbarrtimbers (Collaborator, Author):

Fails without it:
[Screenshot 2025-07-15 at 11:58:02 AM]
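
One hedged way to check whether the base image already ships these packages (package names taken from the install block above; this is a sketch, not part of the PR's diff):

# Inside the nathanl/open_instruct_auto image: pip show exits nonzero if any
# listed package is missing, in which case the --install step is still needed.
pip show torch flash-attn nltk > /dev/null || echo "missing packages; keep the --install step"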

# TODO: unpin setuptools when this issue in flash-attention is resolved
pip install torch==2.7.0 torchvision==0.22.0 --index-url https://download.pytorch.org/whl/cu128
pip install packaging
pip install flash-attn==2.8.0.post2 --no-build-isolation
pip install -r requirements.txt
pip install -e .
python -m nltk.downloader punkt' \
-- python -m open_instruct.benchmark_generators \
--model_name_or_path "hamishivi/qwen2_5_openthoughts2" \
--tokenizer_name_or_path "hamishivi/qwen2_5_openthoughts2" \
--dataset_mixer_list "hamishivi/hamishivi_rlvr_orz_math_57k_collected_all_filtered_hamishivi_qwen2_5_openthoughts2" "1.0" \
--dataset_mixer_list_splits "train" \
--max_token_length 10240 \
--max_prompt_token_length 2048 \
--temperature 1.0 \
--response_length "$response_length" \
--vllm_top_p 0.9 \
--num_unique_prompts_rollout 8 \
--num_samples_per_prompt_rollout 16 \
--vllm_num_engines 1 \
--vllm_tensor_parallel_size 1 \
--vllm_gpu_memory_utilization 0.9 \
--pack_length 20480 \
--chat_template_name "tulu_thinker" \
--trust_remote_code \
--seed 42 \
--dataset_local_cache_dir "benchmark_cache" \
--dataset_cache_mode "local" \
--dataset_transform_fn "rlvr_tokenize_v1" "rlvr_filter_v1" \
"$@"
38 changes: 38 additions & 0 deletions scripts/run_benchmark.sh
@@ -0,0 +1,38 @@
#!/bin/bash
# Runs the benchmark locally via uv. Takes one argument, which is the response length.
# Usage: ./run_benchmark.sh [response_length]
# E.g. $ ./run_benchmark.sh 64000
set -e

# Set default value for response_length
response_length=64000

# If first argument exists and is a number, use it as response_length
if [[ "$1" =~ ^[0-9]+$ ]]; then
response_length="$1"
shift
fi

uv run python -m open_instruct.benchmark_generators \
--model_name_or_path "hamishivi/qwen2_5_openthoughts2" \
--tokenizer_name_or_path "hamishivi/qwen2_5_openthoughts2" \
--dataset_mixer_list "hamishivi/hamishivi_rlvr_orz_math_57k_collected_all_filtered_hamishivi_qwen2_5_openthoughts2" "1.0" \
--dataset_mixer_list_splits "train" \
--max_token_length 10240 \
--max_prompt_token_length 2048 \
--temperature 1.0 \
--response_length "$response_length" \
--vllm_top_p 0.9 \
--num_unique_prompts_rollout 8 \
--num_samples_per_prompt_rollout 1 \
--vllm_num_engines 1 \
--vllm_tensor_parallel_size 1 \
--vllm_gpu_memory_utilization 0.9 \
--pack_length 20480 \
--chat_template_name "tulu_thinker" \
--trust_remote_code \
--seed 42 \
--dataset_local_cache_dir "benchmark_cache" \
--dataset_cache_mode "local" \
--dataset_transform_fn "rlvr_tokenize_v1" "rlvr_filter_v1" \
"$@"