diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml index e99282490..f12b586f4 100644 --- a/.github/configs/nvidia-master.yaml +++ b/.github/configs/nvidia-master.yaml @@ -1724,9 +1724,8 @@ gptoss-fp4-h200-vllm: - { tp: 8, conc-start: 4, conc-end: 32 } dsr1-fp4-gb200-dynamo-trt: - image: nvcr.io#nvidia/ai-dynamo/tensorrtllm-runtime:0.5.1-rc0.pre3 - # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 + image: nvcr.io/nvidia/ai-dynamo/tensorrtllm-runtime:0.8.1.post2 + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 model-prefix: dsr1 runner: gb200 precision: fp4 @@ -1737,440 +1736,563 @@ dsr1-fp4-gb200-dynamo-trt: - isl: 1024 osl: 1024 search-space: - # MTP configurations - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # NOTE: Prefill tp and ep are always 4 because each GB200 node has 4 GPUs and - # ctx_tp_size is hardcoded to 4 in launch_gb200-nv.sh. Decode tp/ep matches gen_tp_size. 
- # For 1k/1k: prefill batch-size=4, max-num-tokens=4608 + # MTP configurations (spec_decoding="mtp") - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8, 16, 36 ] + conc-list: [ 180 ] prefill: num-worker: 1 tp: 4 ep: 4 - dp-attn: false + dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 4, 8, 12, 24, 48 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: num-worker: 4 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=3" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - spec-decoding: "mtp" - conc-list: [ 512, 1075 ] + conc-list: [ 4301 ] prefill: - num-worker: 1 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx2_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 2150 ] + conc-list: [ 2253 ] 
prefill: - num-worker: 2 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen1_dep32_batch64_eplb288_mtp1.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 16130 ] + prefill: + num-worker: 3 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/mtp/ctx3_gen5_dep4_batch768_eplb0_mtp1.yaml" + decode: + num-worker: 5 + tp: 4 + ep: 4 + dp-attn: true - - spec-decoding: "mtp" - conc-list: [ 512 ] + + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 4301 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep8_batch512_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 8 + ep: 8 + dp-attn: true + - conc-list: [ 666 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true + - conc-list: [ 6144 ] + 
prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.6" - - "DECODE_MTP_SIZE=3" - - - spec-decoding: "mtp" - conc-list: [ 2252 ] + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen2_dep4_batch768_eplb0_mtp0.yaml" + decode: + num-worker: 2 + tp: 4 + ep: 4 + dp-attn: true + - conc-list: [ 12, 24, 48, 96, 192 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch32_eplb0_mtp0.yaml" decode: + num-worker: 4 + tp: 8 + ep: 8 + dp-attn: false + - conc-list: [ 5 ] + prefill: num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" + decode: + num-worker: 4 tp: 8 ep: 8 + dp-attn: false + - conc-list: [ 4301 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep16_batch256_eplb256_mtp0.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true + - conc-list: [ 2253 ] + prefill: + num-worker: 2 + tp: 4 + ep: 4 
+ dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k1k/stp/ctx2_gen1_dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 32, 64, 141 ] + - isl: 1024 + osl: 8192 + search-space: + # MTP configurations (spec_decoding="mtp") + - spec-decoding: "mtp" + conc-list: [ 4, 8, 12, 24, 48 ] prefill: num-worker: 1 tp: 4 ep: 4 - dp-attn: false + dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch4_eplb0_mtp3.yaml" decode: - num-worker: 4 + num-worker: 7 tp: 8 ep: 8 dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 7 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 1075 ] + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen7_tep8_batch1_eplb0_mtp3.yaml" + decode: + num-worker: 7 + tp: 8 + ep: 8 + dp-attn: false + - spec-decoding: "mtp" + conc-list: [ 128 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 1075 ] + - spec-decoding: "mtp" + conc-list: [ 512 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 3072 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch64_eplb0_mtp3.yaml" + decode: + num-worker: 3 tp: 16 ep: 16 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 2048, 4300 ] + - spec-decoding: "mtp" + conc-list: [ 6144 ] prefill: - num-worker: 2 + num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen3_dep16_batch128_eplb0_mtp1.yaml" decode: - num-worker: 1 + 
num-worker: 3 tp: 16 ep: 16 dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 8192 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/mtp/ctx1_gen1_dep32_batch256_eplb288_mtp1.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true - - conc-list: [ 4300 ] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 5 ] prefill: num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=4608" - - "PREFILL_MAX_BATCH_SIZE=4" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen7_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 1 + num-worker: 7 tp: 8 ep: 8 + dp-attn: false + - conc-list: [ 60 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=512" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch4_eplb0_mtp0.yaml" + decode: + num-worker: 15 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 135 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen15_tep4_batch8_eplb0_mtp0.yaml" 
+ decode: + num-worker: 15 + tp: 4 + ep: 4 + dp-attn: false + - conc-list: [ 563 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch16_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 2048 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch64_eplb0_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 4096 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch128_eplb288_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true + - conc-list: [ 8192 ] + prefill: + num-worker: 1 + tp: 4 + ep: 4 + dp-attn: true + additional-settings: + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/1k8k/stp/ctx1_gen1_dep32_batch256_eplb288_mtp0.yaml" + decode: + num-worker: 1 + tp: 32 + ep: 32 + dp-attn: true - isl: 8192 osl: 1024 search-space: # MTP configurations (spec_decoding="mtp") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - # For 8k/1k: prefill batch-size=1, max-num-tokens=8448 - spec-decoding: "mtp" - conc-list: [ 1, 2, 4, 8, 18 ] + conc-list: [ 4, 8, 12, 24, 48 ] prefill: num-worker: 
1 tp: 4 ep: 4 - dp-attn: false + dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx1_gen4_tep8_batch8_eplb0_mtp3.yaml" decode: - num-worker: 3 + num-worker: 4 tp: 8 ep: 8 dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=3" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - spec-decoding: "mtp" - conc-list: [ 128, 269 ] + conc-list: [ 180 ] prefill: - num-worker: 5 + num-worker: 3 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx3_gen1_dep32_batch4_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true + - spec-decoding: "mtp" + conc-list: [ 1229 ] + prefill: + num-worker: 7 + tp: 4 + ep: 4 + dp-attn: true additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=8" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx7_gen1_dep16_batch64_eplb256_mtp1.yaml" + decode: + num-worker: 1 + tp: 16 + ep: 16 + dp-attn: true - spec-decoding: "mtp" - conc-list: [ 538 ] + conc-list: [ 666 ] prefill: num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # 
https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx8_gen1_dep32_batch16_eplb0_mtp3.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=3" - - spec-decoding: "mtp" - conc-list: [ 1075 ] + conc-list: [ 4301 ] prefill: - num-worker: 8 + num-worker: 11 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/mtp/ctx11_gen1_dep16_batch256_eplb256_mtp1.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=2" - - spec-decoding: "mtp" - conc-list: [ 2150 ] + # Non-MTP configurations (default spec_decoding="none") + - conc-list: [ 12, 44, 76 ] prefill: - num-worker: 6 + num-worker: 1 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch16_eplb0_mtp0.yaml" decode: - num-worker: 1 + num-worker: 4 tp: 8 ep: 8 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=512" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=1" - - # Non-MTP configurations (default spec_decoding="none") - # tep - Run Tensor-Expert Parallel mode (attention_dp=false) - - conc-list: [ 1, 2, 4, 8, 16, 34 ] + - 
conc-list: [ 5 ] prefill: num-worker: 1 tp: 4 ep: 4 - dp-attn: false + dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx1_gen4_tep8_batch1_eplb0_mtp0.yaml" decode: - num-worker: 3 + num-worker: 4 tp: 8 ep: 8 - dp-attn: false - additional-settings: - - "DECODE_MAX_NUM_TOKENS=32" - - "DECODE_MAX_BATCH_SIZE=32" - - "DECODE_GPU_MEM_FRACTION=0.9" - - "DECODE_MTP_SIZE=0" - - # dep - Run Data-Expert Parallel mode (attention_dp=true) - - conc-list: [ 256, 538 ] + dp-attn: true + - conc-list: [ 333 ] prefill: - num-worker: 4 + num-worker: 2 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx2_gen1_dep32_batch8_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 32 ep: 32 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=16" - - "DECODE_MAX_BATCH_SIZE=16" - - "DECODE_GPU_MEM_FRACTION=0.7" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 1075 ] + - conc-list: [ 1229 ] prefill: - num-worker: 6 + num-worker: 7 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx7_gen1_dep32_batch32_eplb0_mtp0.yaml" decode: num-worker: 1 - tp: 16 - ep: 16 + tp: 32 + ep: 32 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=64" - - "DECODE_MAX_BATCH_SIZE=64" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - - 
conc-list: [ 2150 ] + - conc-list: [ 2253 ] prefill: num-worker: 8 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx8_gen1_dep16_batch128_eplb0_mtp0.yaml" decode: num-worker: 1 tp: 16 ep: 16 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=128" - - "DECODE_MAX_BATCH_SIZE=128" - - "DECODE_GPU_MEM_FRACTION=0.75" - - "DECODE_MTP_SIZE=0" - - - conc-list: [ 2150 ] + - conc-list: [ 4096 ] prefill: - num-worker: 5 + num-worker: 10 tp: 4 ep: 4 dp-attn: true additional-settings: - - "PREFILL_MAX_NUM_TOKENS=8448" - - "PREFILL_MAX_BATCH_SIZE=1" + # https://github.com/ishandhanani/srt-slurm/blob/sa-submission-q1-2026/recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml + - "CONFIG_FILE=recipes/trtllm/gb200-fp4/8k1k/stp/ctx10_gen1_dep16_batch256_eplb256_mtp0.yaml" decode: num-worker: 1 - tp: 8 - ep: 8 + tp: 16 + ep: 16 dp-attn: true - additional-settings: - - "DECODE_MAX_NUM_TOKENS=256" - - "DECODE_MAX_BATCH_SIZE=256" - - "DECODE_GPU_MEM_FRACTION=0.8" - - "DECODE_MTP_SIZE=0" dsr1-fp8-gb200-dynamo-sglang: image: lmsysorg/sglang:v0.5.5.post2 - # model: deepseek-ai/DeepSeek-R1-0528 - # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model: /mnt/lustre01/models/deepseek-r1-0528 + model: deepseek-ai/DeepSeek-R1-0528 model-prefix: dsr1 runner: gb200 precision: fp8 @@ -2291,11 +2413,8 @@ dsr1-fp8-gb200-dynamo-sglang: - "DECODE_NODES=8" dsr1-fp4-gb200-dynamo-sglang: - image: lmsysorg/sglang:v0.5.5.post2 - # TODO: what is the right name? 
- # model: deepseek-ai/DeepSeek-R1-0528-fp4-v2 - # Models are pre-downloaded to this path on GB200 runner to avoid repeated downloading - model: /mnt/lustre01/models/deepseek-r1-0528-fp4-v2 + image: "lmsysorg/sglang:v0.5.5.post2" + model: nvidia/DeepSeek-R1-0528-NVFP4-v2 model-prefix: dsr1 runner: gb200 precision: fp4 @@ -2710,4 +2829,4 @@ gptoss-fp4-gb200-dynamo-trt: - "DECODE_MAX_NUM_TOKENS=20000" - "DECODE_MAX_BATCH_SIZE=512" - "DECODE_GPU_MEM_FRACTION=0.9" - \ No newline at end of file + diff --git a/benchmarks/dsr1_fp4_gb200_dynamo-trt.sh b/benchmarks/dsr1_fp4_gb200_dynamo-trt.sh deleted file mode 100644 index b7e4836ba..000000000 --- a/benchmarks/dsr1_fp4_gb200_dynamo-trt.sh +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/bash - -set -x - -source "$(dirname "$0")/benchmark_lib.sh" - -check_env_vars \ - CONC_LIST \ - ISL \ - OSL \ - IMAGE \ - SPEC_DECODING \ - PREFILL_NUM_WORKERS \ - PREFILL_TP \ - PREFILL_EP \ - PREFILL_DP_ATTN \ - DECODE_NUM_WORKERS \ - DECODE_TP \ - DECODE_EP \ - DECODE_DP_ATTN \ - PREFILL_MAX_NUM_TOKENS \ - PREFILL_MAX_BATCH_SIZE \ - DECODE_MAX_NUM_TOKENS \ - DECODE_MAX_BATCH_SIZE \ - DECODE_GPU_MEM_FRACTION \ - MODEL_PATH \ - SERVED_MODEL_NAME - -if [ "$SPEC_DECODING" == "mtp" ]; then - check_env_vars DECODE_MTP_SIZE -else - DECODE_MTP_SIZE="0" -fi - -PERFORMANCE_SWEEPS_PATH="components/backends/trtllm/performance_sweeps" - -echo "Cloning Dynamo repository..." 
-git clone https://github.com/ai-dynamo/dynamo.git -cd dynamo -git checkout release/0.5.1-rc0.20251105 -git submodule update --init --recursive - -cd "$PERFORMANCE_SWEEPS_PATH" - -# Set up environment variables based on ISL/OSL -if [ "$ISL" = "1024" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=4608 -elif [ "$ISL" = "8192" ] && [ "$OSL" = "1024" ]; then - export CACHE_TRANSCEIVER_MAX_NUM_TOKENS=8448 -else - echo "Unsupported ISL/OSL combination: $ISL/$OSL" - exit 1 -fi - -kind=dynamo_disagg -additional_slurm_args="--time=04:00:00" -ntasks_per_node=4 - -gen_nodes=$(((DECODE_TP + 3)/4 * DECODE_NUM_WORKERS)) -total_nodes=$((PREFILL_NUM_WORKERS + gen_nodes)) -total_tasks=$((total_nodes * ntasks_per_node)) - -decode_eplb_num_slots=0 - -sbatch --nodes=${total_nodes} \ - --ntasks=${total_tasks} \ - --ntasks-per-node=${ntasks_per_node} \ - --segment=${total_nodes} ${additional_slurm_args} \ - benchmark_disagg.slurm \ - ${PREFILL_NUM_WORKERS} ${PREFILL_TP} \ - ${PREFILL_MAX_BATCH_SIZE} ${PREFILL_MAX_NUM_TOKENS} \ - ${PREFILL_DP_ATTN} ${DECODE_NUM_WORKERS} \ - ${DECODE_TP} ${DECODE_MAX_BATCH_SIZE} \ - ${DECODE_MAX_NUM_TOKENS} ${DECODE_DP_ATTN} \ - ${DECODE_GPU_MEM_FRACTION} ${decode_eplb_num_slots} \ - ${DECODE_MTP_SIZE} "${CONC_LIST}" \ - ${gen_nodes} ${kind} \ - ${MODEL_PATH} ${SERVED_MODEL_NAME} \ - ${IMAGE} ${ISL} ${OSL} \ No newline at end of file diff --git a/perf-changelog.yaml b/perf-changelog.yaml index 168904456..6fbdb671c 100644 --- a/perf-changelog.yaml +++ b/perf-changelog.yaml @@ -309,3 +309,12 @@ - "Includes MTP and STP configurations for 1k1k and 8k1k sequence lengths" - "Concurrency levels: 4, 8, 16, 32, 64, 128, 256, 512" pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/570 + +- config-keys: + - dsr1-fp4-gb200-dynamo-trt + description: + - "Update Dynamo TRT image from 0.5.1-rc0.pre3 to 0.8.1.post2" + - "Update TRT configurations" + - "Refactor configurations to use CONFIG_FILE-based recipes instead of inline parameter 
settings" + - "Introduce srt-slurm workflow for launching Dynamo jobs" + pr-link: https://github.com/InferenceMAX/InferenceMAX/pull/510 diff --git a/runners/launch_gb200-nv.sh b/runners/launch_gb200-nv.sh index ed626e252..1944e04e0 100755 --- a/runners/launch_gb200-nv.sh +++ b/runners/launch_gb200-nv.sh @@ -4,116 +4,62 @@ set -x -# Set up environment variables for SLURM -export SLURM_PARTITION="batch" -export SLURM_ACCOUNT="benchmark" -export SLURM_JOB_NAME="benchmark-dynamo.job" - -# For SGLang - we are working on updating the 8k1k configs -# For now we add conditionals to this script to use newer code for the 1k1k configs - -### FRAMEWORK_DIFF_IF_STATEMENT #1 - difference in setting up envvars -SQUASH_FILE="/mnt/lustre01/users/sa-shared/images/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" -srun --partition=$SLURM_PARTITION --exclusive --time=180 bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" - -# Update the IMAGE variable to the squash file -export IMAGE=$SQUASH_FILE - -# MODEL_PATH is set in `nvidia-master.yaml` or any other yaml files -export MODEL_PATH=$MODEL - +# MODEL_PATH: Override with pre-downloaded paths on GB200 runner +# The yaml files specify HuggingFace model IDs for portability, but we use +# local paths to avoid repeated downloading on the shared GB200 cluster. 
if [[ $FRAMEWORK == "dynamo-sglang" ]]; then export CONFIG_DIR="/mnt/lustre01/artifacts/sglang-configs/1k1k" - export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" + if [[ $MODEL_PREFIX == "dsr1" ]]; then + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528" + else + export MODEL_PATH=$MODEL + fi elif [[ $FRAMEWORK == "dynamo-trt" ]]; then if [[ $MODEL_PREFIX == "gptoss" ]]; then export MODEL_PATH="/mnt/lustre01/models/gpt-oss-120b" export SERVED_MODEL_NAME="gpt-oss-120b" elif [[ $MODEL_PREFIX == "dsr1" ]]; then + export MODEL_PATH="/mnt/lustre01/models/deepseek-r1-0528-fp4-v2/" export SERVED_MODEL_NAME="deepseek-r1-fp4" else - echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss" + echo "Unsupported model prefix: $MODEL_PREFIX. Supported prefixes are: gptoss or dsr1" exit 1 fi +else + export MODEL_PATH=$MODEL fi -export ISL="$ISL" -export OSL="$OSL" - -bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}.sh" - -# Wait for all jobs to complete -echo "Waiting for all jobs to complete..." -while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do - echo "Jobs still running..." - squeue --steps -u $USER - sleep 30 -done - -# FIXME: The below is bad and is a result of the indirection of the ways in which -# Dynamo jobs are launched. In a follow-up PR, the location of the result file should not -# depend on the runner, it should always be in the same spot in the GH workspace. - -# Process results from all configurations -if [[ $FRAMEWORK == "dynamo-trt" ]]; then - - # Find the logs directory (should be only one for this ISL/OSL combination) - LOGS_DIR=$(find . 
-name "dynamo_disagg-bm-${ISL}-${OSL}" -type d | head -1) - if [ -z "$LOGS_DIR" ]; then - echo "No logs directory found for ISL=${ISL}, OSL=${OSL}" - exit 1 - fi - - echo "Found logs directory: $LOGS_DIR" - - # Find all result subdirectories in this logs directory - RESULT_SUBDIRS=$(find "$LOGS_DIR" -name "ctx*_gen*_*_batch*_eplb*_mtp*" -type d) - - if [ -z "$RESULT_SUBDIRS" ]; then - echo "No result subdirectories found in $LOGS_DIR" - exit 1 - fi - - echo "Found result subdirectories:" - echo "$RESULT_SUBDIRS" - - # Process results from all configurations - for result_subdir in $RESULT_SUBDIRS; do - echo "Processing result subdirectory: $result_subdir" +# Set up environment variables for SLURM +export SLURM_PARTITION="batch" +export SLURM_ACCOUNT="benchmark" +export SLURM_JOB_NAME="benchmark-dynamo.job" - # Extract configuration info from directory name - CONFIG_NAME=$(basename "$result_subdir") +NGINX_IMAGE="nginx:1.27.4" - # Process individual concurrency result files - RESULTS_SUBDIR="$result_subdir/results" +SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$IMAGE" | sed 's/[\/:@#]/_/g').sqsh" +NGINX_SQUASH_FILE="/mnt/lustre01/users-public/sa-shared/$(echo "$NGINX_IMAGE" | sed 's/[\/:@#]/_/g').sqsh" - if [ -d "$RESULTS_SUBDIR" ]; then - echo "Processing results from: $RESULTS_SUBDIR" +srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $SQUASH_FILE docker://$IMAGE" +srun -N 1 -A $SLURM_ACCOUNT -p $SLURM_PARTITION bash -c "enroot import -o $NGINX_SQUASH_FILE docker://$NGINX_IMAGE" - # Find all concurrency result files with new format - CONCURRENCY_FILES=$(find "$RESULTS_SUBDIR" -name "results_concurrency_*_gpus_*.json") - for result_file in $CONCURRENCY_FILES; do - if [ -f "$result_file" ]; then - # Extract concurrency and GPU count from filename - filename=$(basename "$result_file") - concurrency=$(echo "$filename" | sed 's/results_concurrency_\([0-9]*\)_gpus_.*\.json/\1/') - gpus=$(echo "$filename" | sed 
's/results_concurrency_.*_gpus_\([0-9]*\)\.json/\1/') - echo "Processing concurrency $concurrency with $gpus GPUs: $result_file" - # Copy the result file to workspace with a unique name - WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus${gpus}.json" - cp "$result_file" "$WORKSPACE_RESULT_FILE" +export ISL="$ISL" +export OSL="$OSL" - echo "Copied result file to: $WORKSPACE_RESULT_FILE" - fi - done - else - echo "Results subdirectory not found: $RESULTS_SUBDIR" - fi +if [[ $FRAMEWORK == "dynamo-sglang" ]]; then + export IMAGE=$SQUASH_FILE + export SGL_SLURM_JOBS_PATH="dynamo/examples/backends/sglang/slurm_jobs" + bash benchmarks/"${EXP_NAME%%_*}_${PRECISION}_gb200_${FRAMEWORK}.sh" + # Wait for all jobs to complete + echo "Waiting for all jobs to complete..." + while [ -n "$(squeue -u $USER --noheader --format='%i')" ]; do + echo "Jobs still running..." + squeue --steps -u $USER + sleep 30 done -else # search for "FRAMEWORK_DIFF_IF_STATEMENT #3" for this if-statement - # Find the latest log directory that contains the data + + # Find the latest log directory that contains the data cat > collect_latest_results.py <<'PY' import os, sys sgl_job_dir, isl, osl, nexp = sys.argv[1], int(sys.argv[2]), int(sys.argv[3]), int(sys.argv[4]) @@ -141,6 +87,162 @@ PY cp $result_file $WORKSPACE_RESULT_FILE fi done + + exit 0 +fi + +echo "Cloning srt-slurm repository..." +SRT_REPO_DIR="srt-slurm" +if [ -d "$SRT_REPO_DIR" ]; then + echo "Removing existing $SRT_REPO_DIR..." + rm -rf "$SRT_REPO_DIR" fi -echo "All result files processed" +git clone https://github.com/ishandhanani/srt-slurm.git "$SRT_REPO_DIR" +cd "$SRT_REPO_DIR" +git checkout sa-submission-q1-2026 + +echo "Installing srtctl..." +curl -LsSf https://astral.sh/uv/install.sh | sh +source $HOME/.local/bin/env + +uv venv +source .venv/bin/activate +uv pip install -e . + +if ! 
command -v srtctl &> /dev/null; then + echo "Error: Failed to install srtctl" + exit 1 +fi + +echo "Configs available at: $SRT_REPO_DIR/" + +# Create srtslurm.yaml for srtctl (used by both frameworks) +SRTCTL_ROOT="${GITHUB_WORKSPACE}/srt-slurm" +echo "Creating srtslurm.yaml configuration..." +cat > srtslurm.yaml <&1) +else + SRTCTL_OUTPUT=$(srtctl apply -f "$CONFIG_FILE" --tags "gb200,${MODEL_PREFIX},${PRECISION},${ISL}x${OSL},infmax-$(date +%Y%m%d)" 2>&1) +fi +echo "$SRTCTL_OUTPUT" + +JOB_ID=$(echo "$SRTCTL_OUTPUT" | grep -oP '✅ Job \K[0-9]+' || echo "$SRTCTL_OUTPUT" | grep -oP 'Job \K[0-9]+') + +if [ -z "$JOB_ID" ]; then + echo "Error: Failed to extract JOB_ID from srtctl output" + exit 1 +fi + +echo "Extracted JOB_ID: $JOB_ID" + +# Wait for this specific job to complete +echo "Waiting for job $JOB_ID to complete..." +while [ -n "$(squeue -j $JOB_ID --noheader 2>/dev/null)" ]; do + echo "Job $JOB_ID still running..." + squeue -j $JOB_ID + sleep 30 +done +echo "Job $JOB_ID completed!" + +echo "Collecting results..." + +# Use the JOB_ID to find the logs directory +# srtctl creates logs in outputs/JOB_ID/logs/ +LOGS_DIR="outputs/$JOB_ID/logs" + +if [ ! 
-d "$LOGS_DIR" ]; then + echo "Warning: Logs directory not found at $LOGS_DIR" + exit 1 +fi + +echo "Found logs directory: $LOGS_DIR" + +cat $LOGS_DIR/sweep_$JOB_ID.log + +for file in $LOGS_DIR/*; do + if [ -f "$file" ]; then + tail -n 500 $file + fi +done + +# Find all result subdirectories +RESULT_SUBDIRS=$(find "$LOGS_DIR" -maxdepth 1 -type d -name "*isl*osl*" 2>/dev/null) + +if [ -z "$RESULT_SUBDIRS" ]; then + echo "Warning: No result subdirectories found in $LOGS_DIR" +else + # Process results from all configurations + for result_subdir in $RESULT_SUBDIRS; do + echo "Processing result subdirectory: $result_subdir" + + # Extract configuration info from directory name + CONFIG_NAME=$(basename "$result_subdir") + + # Find all result JSON files + RESULT_FILES=$(find "$result_subdir" -name "results_concurrency_*.json" 2>/dev/null) + + for result_file in $RESULT_FILES; do + if [ -f "$result_file" ]; then + # Extract metadata from filename + # Files are of the format "results_concurrency_{concurrency}_gpus_{num gpus}_ctx_{num ctx}_gen_{num gen}.json" + filename=$(basename "$result_file") + concurrency=$(echo "$filename" | sed -n 's/results_concurrency_\([0-9]*\)_gpus_.*/\1/p') + gpus=$(echo "$filename" | sed -n 's/results_concurrency_[0-9]*_gpus_\([0-9]*\)_ctx_.*/\1/p') + ctx=$(echo "$filename" | sed -n 's/.*_ctx_\([0-9]*\)_gen_.*/\1/p') + gen=$(echo "$filename" | sed -n 's/.*_gen_\([0-9]*\)\.json/\1/p') + + echo "Processing concurrency $concurrency with $gpus GPUs (ctx: $ctx, gen: $gen): $result_file" + + WORKSPACE_RESULT_FILE="$GITHUB_WORKSPACE/${RESULT_FILENAME}_${CONFIG_NAME}_conc${concurrency}_gpus_${gpus}_ctx_${ctx}_gen_${gen}.json" + cp "$result_file" "$WORKSPACE_RESULT_FILE" + + echo "Copied result file to: $WORKSPACE_RESULT_FILE" + fi + done + done +fi + +# Cleanup +echo "Cleaning up..." +deactivate 2>/dev/null || true +rm -rf .venv +echo "Cleanup complete" + +echo "All result files processed" \ No newline at end of file