diff --git a/.github/configs/nvidia-master.yaml b/.github/configs/nvidia-master.yaml
index b9867c03a..9ea0644b4 100644
--- a/.github/configs/nvidia-master.yaml
+++ b/.github/configs/nvidia-master.yaml
@@ -2007,27 +2007,6 @@ kimik2.5-int4-b200-vllm:
       search-space:
         - { tp: 8, conc-start: 4, conc-end: 64 }
 
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this config reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-kimik2.5-int4-b300-vllm:
-  image: vllm/vllm-openai:v0.19.0-cu130
-  model: moonshotai/Kimi-K2.5
-  model-prefix: kimik2.5
-  runner: b300
-  precision: int4
-  framework: vllm
-  multinode: false
-  seq-len-configs:
-    - isl: 1024
-      osl: 1024
-      search-space:
-        - { tp: 8, conc-start: 4, conc-end: 64 }
-    - isl: 8192
-      osl: 1024
-      search-space:
-        - { tp: 8, conc-start: 4, conc-end: 64 }
-
 kimik2.5-int4-h200-vllm:
   image: vllm/vllm-openai:v0.16.0
   model: moonshotai/Kimi-K2.5
diff --git a/benchmarks/single_node/kimik2.5_int4_b300.sh b/benchmarks/single_node/kimik2.5_int4_b300.sh
deleted file mode 100755
index 9f428f9e9..000000000
--- a/benchmarks/single_node/kimik2.5_int4_b300.sh
+++ /dev/null
@@ -1,81 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE: At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html
-# does not have a B300-specific recipe, so this script reuses the existing
-# Kimi-K2.5 INT4 B200 vLLM recipe as-is until B300-specific tuning is available.
-
-source "$(dirname "$0")/../benchmark_lib.sh"
-
-check_env_vars \
-    MODEL \
-    TP \
-    CONC \
-    ISL \
-    OSL \
-    MAX_MODEL_LEN \
-    RANDOM_RANGE_RATIO \
-    RESULT_FILENAME
-
-if [[ -n "$SLURM_JOB_ID" ]]; then
-    echo "JOB $SLURM_JOB_ID running on $SLURMD_NODENAME"
-fi
-
-hf download "$MODEL"
-
-nvidia-smi
-
-export PYTHONNOUSERSITE=1
-export VLLM_USE_FLASHINFER_MOE_INT4=1
-
-SERVER_LOG=/workspace/server.log
-PORT=${PORT:-8888}
-
-if [ "${EVAL_ONLY}" = "true" ]; then
-    setup_eval_context
-    MAX_MODEL_LEN="$EVAL_MAX_MODEL_LEN"
-fi
-# Start GPU monitoring (power, temperature, clocks every second)
-start_gpu_monitor
-
-set -x
-vllm serve $MODEL --host 0.0.0.0 --port $PORT \
---gpu-memory-utilization 0.95 \
---tensor-parallel-size $TP \
---max-model-len $MAX_MODEL_LEN \
---max-num-seqs $CONC \
---reasoning-parser kimi_k2 \
---tool-call-parser kimi_k2 \
---compilation_config.pass_config.fuse_allreduce_rms true \
---trust-remote-code \
---disable-log-requests \
---no-enable-prefix-caching > $SERVER_LOG 2>&1 &
-
-SERVER_PID=$!
-
-# Wait for server to be ready
-wait_for_server_ready --port "$PORT" --server-log "$SERVER_LOG" --server-pid "$SERVER_PID"
-
-pip install -q datasets pandas
-
-run_benchmark_serving \
-    --model "$MODEL" \
-    --port "$PORT" \
-    --backend vllm \
-    --input-len "$ISL" \
-    --output-len "$OSL" \
-    --random-range-ratio "$RANDOM_RANGE_RATIO" \
-    --num-prompts $(( CONC * 10 )) \
-    --max-concurrency "$CONC" \
-    --result-filename "$RESULT_FILENAME" \
-    --result-dir /workspace/ \
-    --trust-remote-code
-
-# After throughput, run evaluation only if RUN_EVAL is true
-if [ "${RUN_EVAL}" = "true" ]; then
-    run_eval --framework lm-eval --port "$PORT"
-    append_lm_eval_summary
-fi
-
-# Stop GPU monitoring
-stop_gpu_monitor
-set +x
diff --git a/perf-changelog.yaml b/perf-changelog.yaml
index 82cbf1467..299540dbf 100644
--- a/perf-changelog.yaml
+++ b/perf-changelog.yaml
@@ -1469,14 +1469,6 @@
     - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 FP4 B200 vLLM recipe as-is"
   pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1056
 
-- config-keys:
-  - kimik2.5-int4-b300-vllm
-  description:
-    - "Add Kimi-K2.5 INT4 B300 vLLM benchmark"
-    - "Image: vllm/vllm-openai:v0.19.0-cu130"
-    - "At the time of submission, https://docs.vllm.ai/projects/recipes/en/latest/moonshotai/Kimi-K2.5.html does not have a B300-specific recipe, so this reuses the existing Kimi-K2.5 INT4 B200 vLLM recipe as-is"
-  pr-link: https://github.com/SemiAnalysisAI/InferenceX/pull/1057
-
 - config-keys:
   - gptoss-fp4-mi300x-vllm
   description: