diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
new file mode 100644
index 0000000000..75ad094fa1
--- /dev/null
+++ b/.buildkite/check-wheel-size.py
@@ -0,0 +1,36 @@
+import os
+import zipfile
+
+MAX_SIZE_MB = 200
+
+
+def print_top_10_largest_files(zip_file):
+    """Print the 10 largest entries (duplicate names included) in `zip_file`."""
+    with zipfile.ZipFile(zip_file, 'r') as z:
+        infos = sorted(z.infolist(), key=lambda i: i.file_size, reverse=True)
+    for zi in infos[:10]:
+        print(f"{zi.filename}: {zi.file_size/(1024*1024)} MBs uncompressed.")
+
+
+def check_wheel_size(directory):
+    """Return 1 at the first wheel under `directory` over MAX_SIZE_MB, else 0."""
+    for root, _, files in os.walk(directory):
+        for wheel_name in files:
+            if not wheel_name.endswith(".whl"):
+                continue
+            wheel_path = os.path.join(root, wheel_name)
+            wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
+            if wheel_size_mb <= MAX_SIZE_MB:
+                print(f"Wheel {wheel_path} is within the allowed size "
+                      f"({wheel_size_mb} MB).")
+                continue
+            print(f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) "
+                  f"compare to the allowed size ({MAX_SIZE_MB} MB).")
+            print_top_10_largest_files(wheel_path)
+            return 1
+    return 0
+
+
+if __name__ == "__main__":
+ import sys
+ sys.exit(check_wheel_size(sys.argv[1]))
diff --git a/.buildkite/download-images.sh b/.buildkite/download-images.sh
new file mode 100644
index 0000000000..389a12956c
--- /dev/null
+++ b/.buildkite/download-images.sh
@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+
+# aws s3 sync s3://air-example-data-2/vllm_opensource_llava/ images/
+mkdir -p images
+cd images
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_pixel_values.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom_image_features.pt
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/stop_sign.jpg
+wget https://air-example-data-2.s3.us-west-2.amazonaws.com/vllm_opensource_llava/cherry_blossom.jpg
+
+cd -
diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh
index 83a56e25ac..bde8ab6184 100644
--- a/.buildkite/run-amd-test.sh
+++ b/.buildkite/run-amd-test.sh
@@ -1,38 +1,73 @@
-# This script build the ROCm docker image and run the API server inside the container.
-# It serves a sanity check for compilation and basic model usage.
+# This script runs tests inside the corresponding ROCm docker container.
set -ex
# Print ROCm version
+echo "--- ROCm info"
rocminfo
-# Try building the docker image
-docker build -t rocm -f Dockerfile.rocm .
+# cleanup older docker images
+cleanup_docker() {
+  # Prune dangling images and unused volumes once the filesystem that holds
+  # Docker's root directory is more than $threshold percent full.
+  docker_root=$(docker info -f '{{.DockerRootDir}}')
+  if [ -z "$docker_root" ]; then
+    echo "Failed to determine Docker root directory."
+    exit 1
+  fi
+  echo "Docker root directory: $docker_root"
+  # -P selects the POSIX format (one line per filesystem), so a long device
+  # name cannot wrap and shift the Use% column read below.
+  disk_usage=$(df -P "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
+  threshold=70
+  if [ "$disk_usage" -gt "$threshold" ]; then
+    echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
+    # Dangling images (untagged and unused by any container), then unused volumes
+    docker image prune -f
+    docker volume prune -f
+    echo "Docker images and volumes cleanup completed."
+  else
+    echo "Disk usage is below $threshold%. No cleanup needed."
+  fi
+}
-# Setup cleanup
-remove_docker_container() { docker rm -f rocm || true; }
-trap remove_docker_container EXIT
-remove_docker_container
-
-# Run the image
-docker run --device /dev/kfd --device /dev/dri --network host --name rocm rocm python3 -m vllm.entrypoints.api_server &
-
-# Wait for the server to start
-wait_for_server_to_start() {
- timeout=300
- counter=0
-
- while [ "$(curl -s -o /dev/null -w ''%{http_code}'' localhost:8000/health)" != "200" ]; do
- sleep 1
- counter=$((counter + 1))
- if [ $counter -ge $timeout ]; then
- echo "Timeout after $timeout seconds"
- break
+# Call the cleanup docker function
+cleanup_docker
+
+echo "--- Resetting GPUs"
+
+echo "reset" > /opt/amdgpu/etc/gpu_state
+
+while true; do
+ sleep 3
+ if grep -q clean /opt/amdgpu/etc/gpu_state; then
+ echo "GPUs state is \"clean\""
+ break
fi
- done
+done
+
+echo "--- Building container"
+sha=$(git rev-parse --short HEAD)
+image_name=rocm_${sha}
+container_name=rocm_${sha}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)
+docker build \
+ -t ${image_name} \
+ -f Dockerfile.rocm \
+ --progress plain \
+ .
+
+remove_docker_container() {  # cleanup handler; registered with `trap ... EXIT` right after this definition
+ docker rm -f ${container_name} || docker image rm -f ${image_name} || true  # NOTE(review): the image is removed only when `docker rm` fails -- confirm this is intended
}
-wait_for_server_to_start
+trap remove_docker_container EXIT
+
+echo "--- Running container"
+
+docker run \
+ --device /dev/kfd --device /dev/dri \
+ --network host \
+ --rm \
+ -e HF_TOKEN \
+ --name ${container_name} \
+ ${image_name} \
+ /bin/bash -c "${@}"
-# Test a simple prompt
-curl -X POST -H "Content-Type: application/json" \
- localhost:8000/generate \
- -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index 865068628f..1efc963959 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -9,10 +9,10 @@ cd "$(dirname "${BASH_SOURCE[0]}")/.."
(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
# run python-based benchmarks and upload the result to buildkite
-python3 benchmarks/benchmark_latency.py 2>&1 | tee benchmark_latency.txt
+python3 benchmarks/benchmark_latency.py --output-json latency_results.json 2>&1 | tee benchmark_latency.txt
bench_latency_exit_code=$?
-python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 2>&1 | tee benchmark_throughput.txt
+python3 benchmarks/benchmark_throughput.py --input-len 256 --output-len 256 --output-json throughput_results.json 2>&1 | tee benchmark_throughput.txt
bench_throughput_exit_code=$?
# run server-based benchmarks and upload the result to buildkite
@@ -23,8 +23,9 @@ wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/r
# wait for server to start, timeout after 600 seconds
timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
python3 benchmarks/benchmark_serving.py \
- --backend openai \
- --dataset ./ShareGPT_V3_unfiltered_cleaned_split.json \
+ --backend vllm \
+ --dataset-name sharegpt \
+ --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json \
--model meta-llama/Llama-2-7b-chat-hf \
--num-prompts 20 \
--endpoint /v1/completions \
@@ -48,7 +49,14 @@ sed -n '$p' benchmark_throughput.txt >> benchmark_results.md # last line
echo "### Serving Benchmarks" >> benchmark_results.md
sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
echo "" >> benchmark_results.md
-tail -n 13 benchmark_serving.txt >> benchmark_results.md # last 13 lines
+echo '```' >> benchmark_results.md
+tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+echo '```' >> benchmark_results.md
+
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+ exit 0
+fi
# upload the results to buildkite
/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
@@ -66,4 +74,5 @@ if [ $bench_serving_exit_code -ne 0 ]; then
exit $bench_serving_exit_code
fi
-/workspace/buildkite-agent artifact upload openai-*.json
+rm ShareGPT_V3_unfiltered_cleaned_split.json
+/workspace/buildkite-agent artifact upload "*.json"
diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
new file mode 100644
index 0000000000..6a86bc0ebf
--- /dev/null
+++ b/.buildkite/run-cpu-test.sh
@@ -0,0 +1,24 @@
+# This script builds the CPU docker image and runs offline inference inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -ex
+
+# Try building the docker image
+docker build -t cpu-test -f Dockerfile.cpu .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f cpu-test || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+
+# offline inference
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+
+# Run basic model test
+docker exec cpu-test bash -c "cd tests;
+ pip install pytest Pillow protobuf
+ bash ../.buildkite/download-images.sh
+ cd ../
+ pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
diff --git a/.buildkite/run-neuron-test.sh b/.buildkite/run-neuron-test.sh
new file mode 100644
index 0000000000..252c0f7fec
--- /dev/null
+++ b/.buildkite/run-neuron-test.sh
@@ -0,0 +1,51 @@
+# This script builds the Neuron docker image and runs the API server inside the container.
+# It serves as a sanity check for compilation and basic model usage.
+set -e
+
+# Try building the docker image
+aws ecr get-login-password --region us-west-2 | docker login --username AWS --password-stdin 763104351884.dkr.ecr.us-west-2.amazonaws.com
+
+# Prune old images and containers to save disk space, at most once a day,
+# tracked with a timestamp file in /tmp.
+if [ -f /tmp/neuron-docker-build-timestamp ]; then
+ last_build=$(cat /tmp/neuron-docker-build-timestamp)
+ current_time=$(date +%s)
+ if [ $((current_time - last_build)) -gt 86400 ]; then
+ docker system prune -f
+ echo $current_time > /tmp/neuron-docker-build-timestamp
+ fi
+else
+ echo $(date +%s) > /tmp/neuron-docker-build-timestamp
+fi
+
+docker build -t neuron -f Dockerfile.neuron .
+
+# Setup cleanup
+remove_docker_container() { docker rm -f neuron || true; }
+trap remove_docker_container EXIT
+remove_docker_container
+
+# Run the image
+docker run --device=/dev/neuron0 --device=/dev/neuron1 --network host --name neuron neuron python3 -m vllm.entrypoints.api_server \
+ --model TinyLlama/TinyLlama-1.1B-Chat-v1.0 --max-num-seqs 8 --max-model-len 128 --block-size 128 --device neuron --tensor-parallel-size 2 &
+
+# Wait for the server to start
+wait_for_server_to_start() {
+    # Poll /health once per attempt (1 s apart); give up after $timeout attempts.
+    timeout=300
+    counter=0
+    while [ "$(curl -s -o /dev/null -w '%{http_code}' localhost:8000/health)" != "200" ]; do
+        sleep 1
+        counter=$((counter + 1))
+        if [ $counter -ge $timeout ]; then
+            echo "Timeout after $timeout seconds"
+            return 1  # non-zero so `set -e` aborts instead of testing an unhealthy server
+        fi
+    done
+}
+wait_for_server_to_start
+
+# Test a simple prompt
+curl -X POST -H "Content-Type: application/json" \
+ localhost:8000/generate \
+ -d '{"prompt": "San Francisco is a"}'
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 6ae351130f..4edd1cadfb 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -5,71 +5,167 @@
steps:
- label: Regression Test
+ mirror_hardwares: [amd]
command: pytest -v -s test_regression.py
working_dir: "/vllm-workspace/tests" # optional
- label: AsyncEngine Test
+ #mirror_hardwares: [amd]
command: pytest -v -s async_engine
- label: Basic Correctness Test
- command: pytest -v -s --forked basic_correctness
+ mirror_hardwares: [amd]
+ commands:
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_basic_correctness.py
+ - VLLM_ATTENTION_BACKEND=XFORMERS pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_ATTENTION_BACKEND=FLASH_ATTN pytest -v -s basic_correctness/test_chunked_prefill.py
+ - VLLM_TEST_ENABLE_ARTIFICIAL_PREEMPT=1 pytest -v -s basic_correctness/test_preemption.py
- label: Core Test
+ mirror_hardwares: [amd]
command: pytest -v -s core
- label: Distributed Comm Ops Test
- command: pytest -v -s --forked test_comm_ops.py
- working_dir: "/vllm-workspace/tests/distributed"
- num_gpus: 2 # only support 1 or 2 for now.
-
-- label: Distributed Correctness Test
- command: pytest -v -s --forked test_basic_distributed_correctness.py
- working_dir: "/vllm-workspace/tests/distributed"
- num_gpus: 2 # only support 1 or 2 for now.
+ #mirror_hardwares: [amd]
+ command: pytest -v -s distributed/test_comm_ops.py
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+
+- label: Distributed Tests
+ mirror_hardwares: [amd]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 2
+ commands:
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=ray pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
+ - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
+ - pytest -v -s spec_decode/e2e/test_integration_dist.py
+
+- label: Distributed Tests (Multiple Groups)
+ #mirror_hardwares: [amd]
+ working_dir: "/vllm-workspace/tests"
+ num_gpus: 4
+ commands:
+ - pytest -v -s distributed/test_pynccl.py
- label: Engine Test
- command: pytest -v -s engine tokenization test_sequence.py test_config.py
+ mirror_hardwares: [amd]
+ command: pytest -v -s engine tokenization test_sequence.py test_config.py test_logger.py
- label: Entrypoints Test
- command: pytest -v -s entrypoints
+ mirror_hardwares: [amd]
+
+ commands:
+ - pytest -v -s entrypoints -m llm
+ - pytest -v -s entrypoints -m openai
+
+- label: Examples Test
+ working_dir: "/vllm-workspace/examples"
+ mirror_hardwares: [amd]
+ commands:
+ # install aws cli for llava_example.py
+ # install tensorizer for tensorize_vllm_model.py
+ - pip install awscli tensorizer
+ - python3 offline_inference.py
+ - python3 offline_inference_with_prefix.py
+ - python3 llm_engine_example.py
+ - python3 llava_example.py
+ - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
+
+- label: Inputs Test
+ #mirror_hardwares: [amd]
+ commands:
+ - bash ../.buildkite/download-images.sh
+ - pytest -v -s test_inputs.py
+ - pytest -v -s multimodal
- label: Kernels Test %N
+ #mirror_hardwares: [amd]
command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
parallelism: 4
- label: Models Test
+ #mirror_hardwares: [amd]
+ commands:
+ - bash ../.buildkite/download-images.sh
+ - pytest -v -s models --ignore=models/test_llava.py
+
+- label: Llava Test
+ mirror_hardwares: [amd]
commands:
- - pytest -v -s models --forked
- soft_fail: true
+ - bash ../.buildkite/download-images.sh
+ - pytest -v -s models/test_llava.py
- label: Prefix Caching Test
+ mirror_hardwares: [amd]
commands:
- pytest -v -s prefix_caching
- label: Samplers Test
- command: pytest -v -s samplers --forked
+ #mirror_hardwares: [amd]
+ command: pytest -v -s samplers
+
+- label: LogitsProcessor Test
+ mirror_hardwares: [amd]
+ command: pytest -v -s test_logits_processor.py
+
+- label: Utils Test
+ command: pytest -v -s test_utils.py
- label: Worker Test
+ mirror_hardwares: [amd]
command: pytest -v -s worker
- label: Speculative decoding tests
+ #mirror_hardwares: [amd]
command: pytest -v -s spec_decode
- label: LoRA Test %N
- command: pytest -v -s lora --forked --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
+ #mirror_hardwares: [amd]
+ command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4
+- label: LoRA Long Context (Distributed)
+ #mirror_hardwares: [amd]
+ num_gpus: 4
+ # This test runs llama 13B, so it is required to run on 4 GPUs.
+ commands:
+ # Temporarily run this way because we cannot clean up GPU mem usage
+ # for multi GPU tests.
+ # TODO(sang): Fix it.
+ - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
+ - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
+ - pytest -v -s lora/test_long_context.py::test_self_consistency
+ - pytest -v -s lora/test_long_context.py::test_quality
+ - pytest -v -s lora/test_long_context.py::test_max_len
+
+- label: Tensorizer Test  # installs curl and libsodium23, then runs the tensorizer_loader tests
+  #mirror_hardwares: [amd]
+  command: apt-get install -y curl libsodium23 && pytest -v -s tensorizer_loader
+
- label: Metrics Test
+ mirror_hardwares: [amd]
command: pytest -v -s metrics
+- label: Quantization Test
+ #mirror_hardwares: [amd]
+ command: pytest -v -s quantization
+
- label: Benchmarks
working_dir: "/vllm-workspace/.buildkite"
+ mirror_hardwares: [amd]
commands:
- pip install aiohttp
- bash run-benchmarks.sh
- label: Documentation Build
- working_dir: "/vllm-workspace/docs"
+ working_dir: "/vllm-workspace/test_docs/docs"
no_gpu: True
commands:
- pip install -r requirements-docs.txt
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
new file mode 100644
index 0000000000..9f7d07acca
--- /dev/null
+++ b/.buildkite/test-template-aws.j2
@@ -0,0 +1,59 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+ - label: ":docker: build image"
+ agents:
+ queue: cpu_queue
+ commands:
+ - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+ - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+ - "docker push {{ docker_image }}"
+ env:
+ DOCKER_BUILDKIT: "1"
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
+ - wait
+
+ {% for step in steps %}
+ - label: "{{ step.label }}"
+ agents:
+ {% if step.no_gpu %}
+ queue: cpu_queue
+ {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+ queue: gpu_4_queue
+ {% else %}
+ queue: gpu_1_queue
+ {% endif %}
+ soft_fail: true
+ {% if step.parallelism %}
+ parallelism: {{ step.parallelism }}
+ {% endif %}
+ retry:
+ automatic:
+ - exit_status: -1 # Agent was lost
+ limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
+ plugins:
+ - docker#v5.2.0:
+ image: {{ docker_image }}
+ always-pull: true
+ propagate-environment: true
+ {% if not step.no_gpu %}
+ gpus: all
+ {% endif %}
+ command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(' && ')) | safe }}"]
+ environment:
+ - VLLM_USAGE_SOURCE=ci-test
+ - HF_TOKEN
+ {% if step.label == "Speculative decoding tests" %}
+ - VLLM_ATTENTION_BACKEND=XFORMERS
+ {% endif %}
+ volumes:
+ - /dev/shm:/dev/shm
+ {% endfor %}
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 2ff58cc2e0..4a20a462b9 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -3,11 +3,6 @@
{% set default_working_dir = "/vllm-workspace/tests" %}
steps:
- - label: "AMD Test"
- agents:
- queue: amd
- command: bash .buildkite/run-amd-test.sh
-
- label: ":docker: build image"
commands:
- "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
@@ -18,8 +13,38 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
- wait
+ - group: "AMD Tests"
+ depends_on: ~
+ steps:
+    {% for step in steps %}
+    {% if step.mirror_hardwares and "amd" in step.mirror_hardwares %}{# one AMD step per pipeline step that opts in; commands are chained with && so any failure fails the step #}
+      - label: "AMD: {{ step.label }}"
+        agents:
+          queue: amd
+        command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe }} && {{ step.command or (step.commands | join(" && ")) | safe }}"
+        env:
+          DOCKER_BUILDKIT: "1"
+        soft_fail: true
+    {% endif %}
+    {% endfor %}
+
+ - label: "Neuron Test"
+ depends_on: ~
+ agents:
+ queue: neuron
+ command: bash .buildkite/run-neuron-test.sh
+ soft_fail: false
+
+ - label: "Intel Test"
+ depends_on: ~
+ agents:
+ queue: intel
+ command: bash .buildkite/run-cpu-test.sh
+
{% for step in steps %}
- label: "{{ step.label }}"
agents:
@@ -32,9 +57,14 @@ steps:
automatic:
- exit_status: -1 # Agent was lost
limit: 5
+ - exit_status: -10 # Agent was lost
+ limit: 5
plugins:
- kubernetes:
podSpec:
+ {% if step.num_gpus %}
+ priorityClassName: gpu-priority-cls-{{ step.num_gpus }}
+ {% endif %}
volumes:
- name: dshm
emptyDir:
@@ -53,6 +83,8 @@ steps:
nvidia.com/gpu: "{{ step.num_gpus or default_num_gpu }}"
{% endif %}
env:
+ - name: VLLM_USAGE_SOURCE
+ value: ci-test
- name: HF_TOKEN
valueFrom:
secretKeyRef:
diff --git a/.clang-format b/.clang-format
new file mode 100644
index 0000000000..7f9e6d720f
--- /dev/null
+++ b/.clang-format
@@ -0,0 +1,26 @@
+BasedOnStyle: Google
+UseTab: Never
+IndentWidth: 2
+ColumnLimit: 80
+
+# Force pointers to the type for C++.
+DerivePointerAlignment: false
+PointerAlignment: Left
+
+# Reordering #include statements can (and currently will) introduce errors
+SortIncludes: false
+
+# Style choices
+AlignConsecutiveAssignments: false
+AlignConsecutiveDeclarations: false
+IndentPPDirectives: BeforeHash
+
+IncludeCategories:
+ - Regex: '^<'
+ Priority: 4
+ - Regex: '^"(llvm|llvm-c|clang|clang-c|mlir|mlir-c)/'
+ Priority: 3
+ - Regex: '^"(qoda|\.\.)/'
+ Priority: 2
+ - Regex: '.*'
+ Priority: 1
diff --git a/.github/ISSUE_TEMPLATE/200-installation.yml b/.github/ISSUE_TEMPLATE/200-installation.yml
index 4c6c96187c..df41ade8c3 100644
--- a/.github/ISSUE_TEMPLATE/200-installation.yml
+++ b/.github/ISSUE_TEMPLATE/200-installation.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/300-usage.yml b/.github/ISSUE_TEMPLATE/300-usage.yml
index 88227b4b2e..54763af105 100644
--- a/.github/ISSUE_TEMPLATE/300-usage.yml
+++ b/.github/ISSUE_TEMPLATE/300-usage.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/400-bug report.yml b/.github/ISSUE_TEMPLATE/400-bug report.yml
index f1124dfa78..ce980c3f4a 100644
--- a/.github/ISSUE_TEMPLATE/400-bug report.yml
+++ b/.github/ISSUE_TEMPLATE/400-bug report.yml
@@ -18,6 +18,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
@@ -57,6 +58,10 @@ body:
If the code is too long (hopefully, it isn't), feel free to put it in a public gist and link it in the issue: https://gist.github.com.
Please also paste or describe the results you observe instead of the expected results. If you observe an error, please paste the error message including the **full** traceback of the exception. It may be relevant to wrap error messages in ```` ```triple quotes blocks``` ````.
+
+ Please set the environment variable `export VLLM_LOGGING_LEVEL=DEBUG` to turn on more logging to help debugging potential issues.
+
+ If you experienced crashes or hangs, it would be helpful to run vllm with `export VLLM_TRACE_FUNCTION=1` . All the function calls in vllm will be recorded. Inspect these log files, and tell which function crashes or hangs.
placeholder: |
A clear and concise description of what the bug is.
diff --git a/.github/ISSUE_TEMPLATE/700-performance discussion.yml b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
index 9e8e7b4aa3..4f8843420a 100644
--- a/.github/ISSUE_TEMPLATE/700-performance discussion.yml
+++ b/.github/ISSUE_TEMPLATE/700-performance discussion.yml
@@ -39,6 +39,7 @@ body:
# For security purposes, please feel free to check the contents of collect_env.py before running it.
python collect_env.py
```
+ It is suggested to download and execute the latest script, as vllm might frequently update the diagnosis information needed for accurately and quickly responding to issues.
value: |
```text
The output of `python collect_env.py`
diff --git a/.github/ISSUE_TEMPLATE/750-RFC.yml b/.github/ISSUE_TEMPLATE/750-RFC.yml
new file mode 100644
index 0000000000..5382b124dc
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/750-RFC.yml
@@ -0,0 +1,49 @@
+name: 💬 Request for comments (RFC).
+description: Ask for feedback on major architectural changes or design choices.
+title: "[RFC]: "
+labels: ["RFC"]
+
+body:
+- type: markdown
+ attributes:
+ value: >
+ #### Please take a look at previous [RFCs](https://github.com/vllm-project/vllm/issues?q=label%3ARFC+sort%3Aupdated-desc) for reference.
+- type: textarea
+ attributes:
+ label: Motivation.
+ description: >
+ The motivation of the RFC.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Proposed Change.
+ description: >
+ The proposed change of the RFC.
+ validations:
+ required: true
+- type: textarea
+ attributes:
+ label: Feedback Period.
+ description: >
+ The feedback period of the RFC. Usually at least one week.
+ validations:
+ required: false
+- type: textarea
+ attributes:
+ label: CC List.
+ description: >
+ The list of people you want to CC.
+ validations:
+ required: false
+- type: textarea
+ attributes:
+ label: Any Other Things.
+ description: >
+ Any other things you would like to mention.
+ validations:
+ required: false
+- type: markdown
+ attributes:
+ value: >
+ Thanks for contributing 🎉!
diff --git a/.github/actions/free-up-disk-space/action.yml b/.github/actions/free-up-disk-space/action.yml
new file mode 100644
index 0000000000..b0c77401df
--- /dev/null
+++ b/.github/actions/free-up-disk-space/action.yml
@@ -0,0 +1,36 @@
+name: "Free up disk space"
+description: "Removes non-essential tools, libraries and cached files from GitHub action runner node"
+
+runs:
+ using: "composite"
+ steps:
+ - name: "Remove non-essential tools and libraries"
+ shell: bash
+ run: |
+ # https://github.com/actions/runner-images/issues/2840#issuecomment-790492173
+ echo "Disk usage before cleanup:"
+ df -h
+ echo "Removing non-essential tools and libraries ..."
+ sudo rm -rf /opt/ghc
+ sudo rm -rf /usr/local/.ghcup
+ sudo rm -rf /usr/share/dotnet
+ # sudo rm -rf /usr/local/share/boost
+ echo "Deleting libraries for Android (12G), CodeQL (5.3G), PowerShell (1.3G), Swift (1.7G) ..."
+ sudo rm -rf /usr/local/lib/android
+ sudo rm -rf "${AGENT_TOOLSDIRECTORY}/CodeQL"
+ sudo rm -rf /usr/local/share/powershell
+ sudo rm -rf /usr/share/swift
+ # ref: https://github.com/jlumbroso/free-disk-space/blob/main/action.yml
+ echo "Deleting some larger apt packages:"
+ sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri --fix-missing || echo "::warning::The command [sudo apt-get remove -y azure-cli google-chrome-stable firefox powershell mono-devel libgl1-mesa-dri --fix-missing] failed to complete successfully. Proceeding..."
+ echo "Disk usage after cleanup:"
+ df -h
+
+ - name: "Prune docker images"
+ shell: bash
+ run: |
+ echo "Pruning docker images ..."
+ docker image prune -a -f
+ docker system df
+ echo "Disk usage after pruning docker images:"
+ df -h
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 0000000000..f5051eb865
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,128 @@
+name: "Build"
+
+on:
+ workflow_dispatch:
+
+ push:
+ branches:
+ - release
+ paths-ignore:
+ - "**.md"
+ - "proto/**"
+
+ pull_request:
+ branches:
+ - main
+ paths-ignore:
+ - "**.md"
+ - "proto/**"
+
+defaults:
+ run:
+ shell: bash
+
+env:
+ SERVER_IMAGE: "quay.io/wxpe/tgis-vllm"
+ IMAGE_REGISTRY: "quay.io"
+
+jobs:
+ build:
+ runs-on: ubuntu-latest
+ permissions:
+ packages: write
+ contents: read
+ env:
+ CACHE_IMAGE: "ghcr.io/ibm/tgis-vllm:build-cache"
+ CACHE_REGISTRY: "ghcr.io"
+ CACHE_PACKAGE_NAME: "tgis-vllm"
+
+ steps:
+ - name: "Checkout"
+ uses: actions/checkout@v4
+
+ - name: "Free up disk space"
+ uses: ./.github/actions/free-up-disk-space
+
+ - name: "Set up QEMU"
+ uses: docker/setup-qemu-action@v3
+
+ - name: "Set up Docker Buildx"
+ uses: docker/setup-buildx-action@v3
+
+ - name: "Log in to container registry (server-release)"
+ uses: docker/login-action@v3
+ if: github.event_name != 'pull_request'
+ with:
+ registry: ${{ env.IMAGE_REGISTRY }}
+ username: ${{ secrets.WXPE_QUAY_USER }}
+ password: ${{ secrets.WXPE_QUAY_TOKEN }}
+
+ - name: "Log in to container registry (cache image)"
+ uses: docker/login-action@v3
+ if: github.event_name != 'pull_request'
+ with:
+ registry: ${{ env.CACHE_REGISTRY }}
+ username: ${{ github.actor }}
+ password: ${{ secrets.GITHUB_TOKEN }}
+
+ - name: "Set build cache target"
+ run: |
+ # For push to `main` (PR merged), push a new cache image with all layers (cache-mode=max).
+ # For PR builds, use GitHub action cache which isolates cached layers by PR/branch.
+ # to optimize builds for subsequent pushes to the same PR/branch.
+ # Do not set a cache-to image for PR builds to not overwrite the `main` cache image and
+ # to not ping-pong cache images for two or more different PRs.
+ # Do not push cache images for each PR or multiple branches to not exceed GitHub package
+ # usage and traffic limitations.
+ # UPDATE 2024/02/26: GHA cache appears to have issues, cannot use `cache-to: gha,mode=min`
+ # if `cache-from: reg...,mode=max` but `cache-to: gha,mode=max` takes longer than uncached
+ # build and exhausts GHA cache size limits, so use cache `type=inline` (no external cache).
+ if [ "${{ github.event_name }}" == "pull_request" ]
+ then
+ #CACHE_TO="type=gha,mode=min"
+ CACHE_TO="type=inline"
+ else
+ CACHE_TO="type=registry,ref=${{ env.CACHE_IMAGE }},mode=max"
+ fi
+ echo "CACHE_TO=$CACHE_TO" >> $GITHUB_ENV
+
+ - name: "Generate tags"
+ id: meta
+ uses: docker/metadata-action@v5
+ with:
+ images: |
+ ${{ env.SERVER_IMAGE }}
+ tags: |
+ type=ref,event=branch
+ type=semver,pattern={{version}}
+ type=semver,pattern={{major}}.{{minor}}
+ type=sha,enable=true,priority=100,prefix=,suffix=,format=short
+ type=sha,enable=true,priority=100,prefix=${{ github.ref_name }}.,suffix=,format=short
+
+ - name: "UBI Docker build"
+ uses: docker/build-push-action@v5
+ with:
+ context: .
+ target: vllm-openai
+ tags: ${{ steps.meta.outputs.tags }}
+ cache-from: type=registry,ref=${{ env.CACHE_IMAGE }}
+ cache-to: ${{ env.CACHE_TO }}
+ push: ${{ github.event_name != 'pull_request' }}
+ file: Dockerfile.ubi
+
+ - name: "List docker images"
+ run: docker images
+
+ - name: "Cleanup old cache images"
+ uses: actions/delete-package-versions@v5
+ if: ${{ github.event_name == 'push' }}
+ with:
+ package-name: ${{ env.CACHE_PACKAGE_NAME }}
+ package-type: container
+ delete-only-untagged-versions: true
+
+ - name: "Check disk usage"
+ shell: bash
+ run: |
+ docker system df
+ df -h
diff --git a/.github/workflows/clang-format.yml b/.github/workflows/clang-format.yml
new file mode 100644
index 0000000000..e9b6e28fa6
--- /dev/null
+++ b/.github/workflows/clang-format.yml
@@ -0,0 +1,42 @@
+name: clang-format
+
+on:
+ # Trigger the workflow on push or pull request,
+ # but only for the main branch
+ push:
+ branches:
+ - main
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ clang-format:
+ runs-on: ubuntu-latest
+ strategy:
+ matrix:
+ python-version: ["3.11"]
+ steps:
+ - uses: actions/checkout@v2
+ - name: Set up Python ${{ matrix.python-version }}
+ uses: actions/setup-python@v2
+ with:
+ python-version: ${{ matrix.python-version }}
+ - name: Install dependencies
+ run: |
+ python -m pip install --upgrade pip
+ pip install clang-format==18.1.5
+ - name: Running clang-format
+ run: |
+ EXCLUDES=(
+ 'csrc/moe/topk_softmax_kernels.cu'
+ 'csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu'
+ 'csrc/punica/bgmv/bgmv_config.h'
+ 'csrc/punica/bgmv/bgmv_impl.cuh'
+ 'csrc/punica/bgmv/vec_dtypes.cuh'
+ 'csrc/punica/punica_ops.cu'
+ 'csrc/punica/type_convert.h'
+ )
+ find csrc/ \( -name '*.h' -o -name '*.cpp' -o -name '*.cu' -o -name '*.cuh' \) -print \
+ | grep -vFf <(printf "%s\n" "${EXCLUDES[@]}") \
+ | xargs clang-format --dry-run --Werror
\ No newline at end of file
diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
new file mode 100644
index 0000000000..22e6c2ef01
--- /dev/null
+++ b/.github/workflows/mypy.yaml
@@ -0,0 +1,51 @@
+name: mypy
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  mypy:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+    steps:
+    - uses: actions/checkout@v2
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v2
+      with:
+        python-version: ${{ matrix.python-version }}
+    - name: Install dependencies
+      # mypy itself is pinned; the types-* stub packages are installed unpinned.
+      run: |
+        python -m pip install --upgrade pip
+        pip install mypy==1.9.0
+        pip install types-setuptools
+        pip install types-PyYAML
+        pip install types-requests
+    - name: Mypy
+      # Each listed path is checked in its own mypy invocation, all sharing pyproject.toml.
+      run: |
+        mypy vllm/attention --config-file pyproject.toml
+        mypy vllm/core --config-file pyproject.toml
+        mypy vllm/distributed --config-file pyproject.toml
+        mypy vllm/entrypoints --config-file pyproject.toml
+        mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
+        mypy vllm/usage --config-file pyproject.toml
+        mypy vllm/*.py --config-file pyproject.toml
+        mypy vllm/transformers_utils --config-file pyproject.toml
+        mypy vllm/engine --config-file pyproject.toml
+        mypy vllm/worker --config-file pyproject.toml
+        mypy vllm/spec_decode --config-file pyproject.toml
+        mypy vllm/model_executor --config-file pyproject.toml
+        mypy vllm/lora --config-file pyproject.toml
+        mypy vllm/logging --config-file pyproject.toml
+
diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml
deleted file mode 100644
index 5211dc1807..0000000000
--- a/.github/workflows/publish.yml
+++ /dev/null
@@ -1,102 +0,0 @@
-# This workflow will upload a Python Package to Release asset
-# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions
-
-name: Create Release
-
-on:
- push:
- tags:
- - v*
-
-# Needed to create release and upload assets
-permissions:
- contents: write
-
-jobs:
- release:
- # Retrieve tag and create release
- name: Create Release
- runs-on: ubuntu-latest
- outputs:
- upload_url: ${{ steps.create_release.outputs.upload_url }}
- steps:
- - name: Checkout
- uses: actions/checkout@v3
-
- - name: Extract branch info
- shell: bash
- run: |
- echo "release_tag=${GITHUB_REF#refs/*/}" >> $GITHUB_ENV
-
- - name: Create Release
- id: create_release
- uses: "actions/github-script@v6"
- env:
- RELEASE_TAG: ${{ env.release_tag }}
- with:
- github-token: "${{ secrets.GITHUB_TOKEN }}"
- script: |
- const script = require('.github/workflows/scripts/create_release.js')
- await script(github, context, core)
-
- wheel:
- name: Build Wheel
- runs-on: ${{ matrix.os }}
- needs: release
-
- strategy:
- fail-fast: false
- matrix:
- os: ['ubuntu-20.04']
- python-version: ['3.8', '3.9', '3.10', '3.11']
- pytorch-version: ['2.1.2'] # Must be the most recent version that meets requirements.txt.
- cuda-version: ['11.8', '12.1']
-
- steps:
- - name: Checkout
- uses: actions/checkout@v3
-
- - name: Set up Linux Env
- if: ${{ runner.os == 'Linux' }}
- run: |
- bash -x .github/workflows/scripts/env.sh
-
- - name: Set up Python
- uses: actions/setup-python@v4
- with:
- python-version: ${{ matrix.python-version }}
-
- - name: Install CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/cuda-install.sh ${{ matrix.cuda-version }} ${{ matrix.os }}
-
- - name: Install PyTorch ${{ matrix.pytorch-version }} with CUDA ${{ matrix.cuda-version }}
- run: |
- bash -x .github/workflows/scripts/pytorch-install.sh ${{ matrix.python-version }} ${{ matrix.pytorch-version }} ${{ matrix.cuda-version }}
-
- - name: Build wheel
- shell: bash
- run: |
- bash -x .github/workflows/scripts/build.sh ${{ matrix.python-version }} ${{ matrix.cuda-version }}
- wheel_name=$(ls dist/*whl | xargs -n 1 basename)
- asset_name=${wheel_name//"linux"/"manylinux1"}
- echo "wheel_name=${wheel_name}" >> $GITHUB_ENV
- echo "asset_name=${asset_name}" >> $GITHUB_ENV
-
- - name: Upload Release Asset
- uses: actions/upload-release-asset@v1
- env:
- GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
- with:
- upload_url: ${{ needs.release.outputs.upload_url }}
- asset_path: ./dist/${{ env.wheel_name }}
- asset_name: ${{ env.asset_name }}
- asset_content_type: application/*
-
- # (Danielkinz): This last step will publish the .whl to pypi. Warning: untested
- # - name: Publish package
- # uses: pypa/gh-action-pypi-publish@release/v1.8
- # with:
- # repository-url: https://test.pypi.org/legacy/
- # password: ${{ secrets.PYPI_API_TOKEN }}
- # skip-existing: true
diff --git a/.github/workflows/ruff.yml b/.github/workflows/ruff.yml
index cd16cecf21..e71033f828 100644
--- a/.github/workflows/ruff.yml
+++ b/.github/workflows/ruff.yml
@@ -15,7 +15,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
@@ -25,10 +25,13 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1
+ pip install ruff==0.1.5 codespell==2.2.6 tomli==2.0.1 isort==5.13.2
- name: Analysing the code with ruff
run: |
ruff .
- name: Spelling check with codespell
run: |
- codespell --toml pyproject.toml
\ No newline at end of file
+ codespell --toml pyproject.toml
+ - name: Run isort
+ run: |
+ isort . --check-only
diff --git a/.github/workflows/scripts/build.sh b/.github/workflows/scripts/build.sh
deleted file mode 100644
index 2578d44843..0000000000
--- a/.github/workflows/scripts/build.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/bin/bash
-
-python_executable=python$1
-cuda_home=/usr/local/cuda-$2
-
-# Update paths
-PATH=${cuda_home}/bin:$PATH
-LD_LIBRARY_PATH=${cuda_home}/lib64:$LD_LIBRARY_PATH
-
-# Install requirements
-$python_executable -m pip install wheel packaging
-$python_executable -m pip install -r requirements.txt
-
-# Limit the number of parallel jobs to avoid OOM
-export MAX_JOBS=1
-# Make sure punica is built for the release (for LoRA)
-export VLLM_INSTALL_PUNICA_KERNELS=1
-
-# Build
-$python_executable setup.py bdist_wheel --dist-dir=dist
diff --git a/.github/workflows/scripts/create_release.js b/.github/workflows/scripts/create_release.js
deleted file mode 100644
index 0f25624b4c..0000000000
--- a/.github/workflows/scripts/create_release.js
+++ /dev/null
@@ -1,20 +0,0 @@
-// Uses Github's API to create the release and wait for result.
-// We use a JS script since github CLI doesn't provide a way to wait for the release's creation and returns immediately.
-
-module.exports = async (github, context, core) => {
- try {
- const response = await github.rest.repos.createRelease({
- draft: false,
- generate_release_notes: true,
- name: process.env.RELEASE_TAG,
- owner: context.repo.owner,
- prerelease: false,
- repo: context.repo.repo,
- tag_name: process.env.RELEASE_TAG,
- });
-
- core.setOutput('upload_url', response.data.upload_url);
- } catch (error) {
- core.setFailed(error.message);
- }
-}
\ No newline at end of file
diff --git a/.github/workflows/scripts/cuda-install.sh b/.github/workflows/scripts/cuda-install.sh
deleted file mode 100644
index 312c6e82f3..0000000000
--- a/.github/workflows/scripts/cuda-install.sh
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/bin/bash
-
-# Replace '.' with '-' ex: 11.8 -> 11-8
-cuda_version=$(echo $1 | tr "." "-")
-# Removes '-' and '.' ex: ubuntu-20.04 -> ubuntu2004
-OS=$(echo $2 | tr -d ".\-")
-
-# Installs CUDA
-wget -nv https://developer.download.nvidia.com/compute/cuda/repos/${OS}/x86_64/cuda-keyring_1.1-1_all.deb
-sudo dpkg -i cuda-keyring_1.1-1_all.deb
-rm cuda-keyring_1.1-1_all.deb
-sudo apt -qq update
-sudo apt -y install cuda-${cuda_version} cuda-nvcc-${cuda_version} cuda-libraries-dev-${cuda_version}
-sudo apt clean
-
-# Test nvcc
-PATH=/usr/local/cuda-$1/bin:${PATH}
-nvcc --version
-
-# Log gcc, g++, c++ versions
-gcc --version
-g++ --version
-c++ --version
diff --git a/.github/workflows/scripts/env.sh b/.github/workflows/scripts/env.sh
deleted file mode 100644
index d7baaecbbc..0000000000
--- a/.github/workflows/scripts/env.sh
+++ /dev/null
@@ -1,56 +0,0 @@
-#!/bin/bash
-
-# This file installs common linux environment tools
-
-export LANG C.UTF-8
-
-# python_version=$1
-
-sudo apt-get update && \
-sudo apt-get install -y --no-install-recommends \
- software-properties-common \
-
-sudo apt-get install -y --no-install-recommends \
- build-essential \
- apt-utils \
- ca-certificates \
- wget \
- git \
- vim \
- libssl-dev \
- curl \
- unzip \
- unrar \
- cmake \
- net-tools \
- sudo \
- autotools-dev \
- rsync \
- jq \
- openssh-server \
- tmux \
- screen \
- htop \
- pdsh \
- openssh-client \
- lshw \
- dmidecode \
- util-linux \
- automake \
- autoconf \
- libtool \
- net-tools \
- pciutils \
- libpci-dev \
- libaio-dev \
- libcap2 \
- libtinfo5 \
- fakeroot \
- devscripts \
- debhelper \
- nfs-common
-
-# Remove github bloat files to free up disk space
-sudo rm -rf "/usr/local/share/boost"
-sudo rm -rf "$AGENT_TOOLSDIRECTORY"
-sudo rm -rf "/usr/share/dotnet"
diff --git a/.github/workflows/scripts/pytorch-install.sh b/.github/workflows/scripts/pytorch-install.sh
deleted file mode 100644
index dfc1851d76..0000000000
--- a/.github/workflows/scripts/pytorch-install.sh
+++ /dev/null
@@ -1,15 +0,0 @@
-#!/bin/bash
-
-python_executable=python$1
-pytorch_version=$2
-cuda_version=$3
-
-# Install torch
-$python_executable -m pip install numpy pyyaml scipy ipython mkl mkl-include ninja cython typing pandas typing-extensions dataclasses setuptools && conda clean -ya
-$python_executable -m pip install torch==${pytorch_version}+cu${cuda_version//./} --extra-index-url https://download.pytorch.org/whl/cu${cuda_version//./}
-
-# Print version information
-$python_executable --version
-$python_executable -c "import torch; print('PyTorch:', torch.__version__)"
-$python_executable -c "import torch; print('CUDA:', torch.version.cuda)"
-$python_executable -c "from torch.utils import cpp_extension; print (cpp_extension.CUDA_HOME)"
diff --git a/.github/workflows/yapf.yml b/.github/workflows/yapf.yml
index b163c960db..04f307bcf8 100644
--- a/.github/workflows/yapf.yml
+++ b/.github/workflows/yapf.yml
@@ -14,7 +14,7 @@ jobs:
runs-on: ubuntu-latest
strategy:
matrix:
- python-version: ["3.10"]
+ python-version: ["3.8", "3.9", "3.10", "3.11"]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
diff --git a/.gitignore b/.gitignore
index b5195629e5..e077366d1e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -70,6 +70,8 @@ instance/
# Sphinx documentation
docs/_build/
+docs/source/getting_started/examples/*.rst
+!**/*.template.rst
# PyBuilder
.pybuilder/
@@ -181,6 +183,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
+hip_compat.h
# Benchmark dataset
*.json
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 150fcebeb8..a197063f33 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -2,7 +2,10 @@ cmake_minimum_required(VERSION 3.21)
project(vllm_extensions LANGUAGES CXX)
+set(VLLM_TARGET_DEVICE "cuda" CACHE STRING "Target device backend for vLLM") # string-valued, so a cache STRING rather than a boolean option()
+
message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
+message(STATUS "Target device: ${VLLM_TARGET_DEVICE}")
include(${CMAKE_CURRENT_LIST_DIR}/cmake/utils.cmake)
@@ -16,7 +19,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
# Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
#
# Supported/expected torch versions for CUDA/ROCm.
@@ -28,7 +31,7 @@ set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
# requirements.txt files and should be kept consistent. The ROCm torch
# versions are derived from Dockerfile.rocm
#
-set(TORCH_SUPPORTED_VERSION_CUDA "2.1.2")
+set(TORCH_SUPPORTED_VERSION_CUDA "2.3.0")
set(TORCH_SUPPORTED_VERSION_ROCM_5X "2.0.1")
set(TORCH_SUPPORTED_VERSION_ROCM_6X "2.1.1")
@@ -51,7 +54,7 @@ append_cmake_prefix_path("torch" "torch.utils.cmake_prefix_path")
# Ensure the 'nvcc' command is in the PATH
find_program(NVCC_EXECUTABLE nvcc)
-if (NOT NVCC_EXECUTABLE)
+if (CUDA_FOUND AND NOT NVCC_EXECUTABLE)
message(FATAL_ERROR "nvcc not found")
endif()
@@ -70,10 +73,24 @@ find_package(Torch REQUIRED)
# config is used for standalone C++ binaries that link against torch).
# The `libtorch_python.so` library defines some of the glue code between
# torch/python via pybind and is required by VLLM extensions for this
-# reason. So, add it by manually using `append_torchlib_if_found` from
-# torch's cmake setup.
+# reason. So, add it manually with `find_library` using torch's
+# installed library path.
+#
+find_library(torch_python_LIBRARY torch_python PATHS
+ "${TORCH_INSTALL_PREFIX}/lib")
+
#
-append_torchlib_if_found(torch_python)
+# Forward the non-CUDA device extensions to external CMake scripts.
+#
+if (NOT VLLM_TARGET_DEVICE STREQUAL "cuda" AND
+ NOT VLLM_TARGET_DEVICE STREQUAL "rocm")
+ if (VLLM_TARGET_DEVICE STREQUAL "cpu")
+ include(${CMAKE_CURRENT_LIST_DIR}/cmake/cpu_extension.cmake)
+ else()
+ message(FATAL_ERROR "Unsupported vLLM target device: ${VLLM_TARGET_DEVICE}")
+ endif()
+ return()
+endif()
#
# Set up GPU language and check the torch version and warn if it isn't
@@ -150,15 +167,47 @@ set(VLLM_EXT_SRC
"csrc/layernorm_kernels.cu"
"csrc/quantization/squeezellm/quant_cuda_kernel.cu"
"csrc/quantization/gptq/q_gemm.cu"
+ "csrc/quantization/compressed_tensors/int8_quant_kernels.cu"
+ "csrc/quantization/fp8/common.cu"
"csrc/cuda_utils_kernels.cu"
"csrc/moe_align_block_size_kernels.cu"
"csrc/pybind.cpp")
if(VLLM_GPU_LANG STREQUAL "CUDA")
+ include(FetchContent)
+  set(CUTLASS_ENABLE_HEADERS_ONLY ON CACHE BOOL "Enable only the header library")
+ FetchContent_Declare(
+ cutlass
+ GIT_REPOSITORY https://github.com/nvidia/cutlass.git
+ # CUTLASS 3.5.0
+ GIT_TAG 7d49e6c7e2f8896c47f586706e67e1fb215529dc
+ )
+ FetchContent_MakeAvailable(cutlass)
+
list(APPEND VLLM_EXT_SRC
+ "csrc/quantization/aqlm/gemm_kernels.cu"
"csrc/quantization/awq/gemm_kernels.cu"
- "csrc/quantization/marlin/marlin_cuda_kernel.cu"
- "csrc/custom_all_reduce.cu")
+ "csrc/quantization/marlin/dense/marlin_cuda_kernel.cu"
+ "csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu"
+ "csrc/quantization/gptq_marlin/gptq_marlin.cu"
+ "csrc/quantization/gptq_marlin/gptq_marlin_repack.cu"
+ "csrc/custom_all_reduce.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu"
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu")
+
+ #
+ # The CUTLASS kernels for Hopper require sm90a to be enabled.
+ # This is done via the below gencode option, BUT that creates kernels for both sm90 and sm90a.
+ # That adds an extra 17MB to compiled binary, so instead we selectively enable it.
+ if(${CMAKE_CUDA_COMPILER_VERSION} VERSION_GREATER 12.0)
+ set_source_files_properties(
+ "csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu"
+ PROPERTIES
+ COMPILE_FLAGS
+ "-gencode arch=compute_90a,code=sm_90a")
+ endif()
+
endif()
define_gpu_extension_target(
@@ -168,6 +217,7 @@ define_gpu_extension_target(
SOURCES ${VLLM_EXT_SRC}
COMPILE_FLAGS ${VLLM_GPU_FLAGS}
ARCHITECTURES ${VLLM_GPU_ARCHES}
+ INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
WITH_SOABI)
#
@@ -193,24 +243,13 @@ define_gpu_extension_target(
set(VLLM_PUNICA_EXT_SRC
"csrc/punica/bgmv/bgmv_bf16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp16_fp16.cu"
"csrc/punica/bgmv/bgmv_bf16_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_bf16_fp32_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp16_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp16_fp32_bf16.cu"
"csrc/punica/bgmv/bgmv_fp16_fp32_fp16.cu"
"csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_bf16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp16_bf16.cu"
"csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_bf16.cu"
- "csrc/punica/bgmv/bgmv_fp32_fp32_fp16.cu"
- "csrc/punica/punica_ops.cc")
+ "csrc/punica/punica_ops.cu"
+ "csrc/punica/punica_pybind.cpp")
#
# Copy GPU compilation flags+update for punica
@@ -234,6 +273,9 @@ if (${VLLM_GPU_LANG} STREQUAL "CUDA")
endif()
endforeach()
message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
+elseif(${VLLM_GPU_LANG} STREQUAL "HIP")
+ set(VLLM_PUNICA_GPU_ARCHES ${VLLM_GPU_ARCHES})
+ message(STATUS "Punica target arches: ${VLLM_PUNICA_GPU_ARCHES}")
endif()
if (VLLM_PUNICA_GPU_ARCHES)
@@ -268,9 +310,7 @@ add_custom_target(default)
if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
message(STATUS "Enabling C extension.")
add_dependencies(default _C)
-endif()
-if(VLLM_GPU_LANG STREQUAL "CUDA")
message(STATUS "Enabling moe extension.")
add_dependencies(default _moe_C)
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 8db5e569b6..81a8db2b26 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -21,7 +21,6 @@ Express your support on Twitter if vLLM aids you, or simply offer your appreciat
### Build from source
```bash
-pip install -r requirements.txt
pip install -e . # This may take several minutes.
```
@@ -30,6 +29,8 @@ pip install -e . # This may take several minutes.
```bash
pip install -r requirements-dev.txt
+# linting and formatting
+bash format.sh
# Static type checking
mypy
# Unit tests
diff --git a/Dockerfile b/Dockerfile
index 1f254c76fe..eb96bf3c1d 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,8 +1,13 @@
# The vLLM Dockerfile is used to construct vLLM image that can be directly used
# to run the OpenAI compatible server.
+# Please reflect any changes made here in
+# docs/source/dev/dockerfile/dockerfile.rst and
+# docs/source/assets/dev/dockerfile-stages-dependency.png
+
#################### BASE BUILD IMAGE ####################
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+# prepare basic build environment
+FROM nvidia/cuda:12.4.1-devel-ubuntu22.04 AS dev
RUN apt-get update -y \
&& apt-get install -y python3-pip git
@@ -11,23 +16,31 @@ RUN apt-get update -y \
# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
# this won't be needed for future versions of this docker image
# or future versions of triton.
-RUN ldconfig /usr/local/cuda-12.1/compat/
+RUN ldconfig /usr/local/cuda-12.4/compat/
WORKDIR /workspace
# install build and runtime dependencies
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install -r requirements.txt
+ pip install -r requirements-cuda.txt
# install development dependencies
COPY requirements-dev.txt requirements-dev.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-dev.txt
+
+# cuda arch list used by torch
+# can be useful for both `dev` and `test`
+# explicitly set the list to avoid issues with torch 2.2
+# see https://github.com/pytorch/pytorch/pull/123243
+ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
+ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BASE BUILD IMAGE ####################
-#################### EXTENSION BUILD IMAGE ####################
+#################### WHEEL BUILD IMAGE ####################
FROM dev AS build
# install build dependencies
@@ -35,18 +48,19 @@ COPY requirements-build.txt requirements-build.txt
RUN --mount=type=cache,target=/root/.cache/pip \
pip install -r requirements-build.txt
-# copy input files
+# install compiler cache to speed up compilation leveraging local or remote caching
+RUN apt-get update -y && apt-get install -y ccache
+
+# files and directories related to build wheels
COPY csrc csrc
COPY setup.py setup.py
COPY cmake cmake
COPY CMakeLists.txt CMakeLists.txt
-COPY requirements.txt requirements.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
COPY pyproject.toml pyproject.toml
-COPY vllm/__init__.py vllm/__init__.py
+COPY vllm vllm
-# cuda arch list used by torch
-ARG torch_cuda_arch_list='7.0 7.5 8.0 8.6 8.9 9.0+PTX'
-ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
# max jobs used by Ninja to build extensions
ARG max_jobs=2
ENV MAX_JOBS=${max_jobs}
@@ -56,76 +70,67 @@ ENV NVCC_THREADS=$nvcc_threads
# make sure punica kernels are built (for LoRA)
ENV VLLM_INSTALL_PUNICA_KERNELS=1
-RUN python3 setup.py build_ext --inplace
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+ --mount=type=cache,target=/root/.cache/pip \
+ python3 setup.py bdist_wheel --dist-dir=dist
+
+# check the size of the wheel; oversized wheels cannot be uploaded, so fail the build above MAX_SIZE_MB (set in check-wheel-size.py)
+COPY .buildkite/check-wheel-size.py check-wheel-size.py
+RUN python3 check-wheel-size.py dist
+
#################### EXTENSION Build IMAGE ####################
-#################### FLASH_ATTENTION Build IMAGE ####################
-FROM dev as flash-attn-builder
-# max jobs used for build
-ARG max_jobs=2
-ENV MAX_JOBS=${max_jobs}
-# flash attention version
-ARG flash_attn_version=v2.5.6
-ENV FLASH_ATTN_VERSION=${flash_attn_version}
+#################### vLLM installation IMAGE ####################
+# image with vLLM installed
+FROM nvidia/cuda:12.4.1-base-ubuntu22.04 AS vllm-base
+WORKDIR /vllm-workspace
-WORKDIR /usr/src/flash-attention-v2
+RUN apt-get update -y \
+ && apt-get install -y python3-pip git vim
-# Download the wheel or build it if a pre-compiled release doesn't exist
-RUN pip --verbose wheel flash-attn==${FLASH_ATTN_VERSION} \
- --no-build-isolation --no-deps --no-cache-dir
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.4/compat/
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \
+ --mount=type=cache,target=/root/.cache/pip \
+ pip install dist/*.whl --verbose
+#################### vLLM installation IMAGE ####################
-#################### FLASH_ATTENTION Build IMAGE ####################
#################### TEST IMAGE ####################
# image to run unit testing suite
-FROM dev AS test
+# note that this uses vllm installed by `pip`
+FROM vllm-base AS test
-# copy pytorch extensions separately to avoid having to rebuild
-# when python code changes
-WORKDIR /vllm-workspace
-# ADD is used to preserve directory structure
ADD . /vllm-workspace/
-COPY --from=build /workspace/vllm/*.so /vllm-workspace/vllm/
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
-# ignore build dependencies installation because we are using pre-complied extensions
-RUN rm pyproject.toml
-RUN --mount=type=cache,target=/root/.cache/pip VLLM_USE_PRECOMPILED=1 pip install . --verbose
-#################### TEST IMAGE ####################
-
-
-#################### RUNTIME BASE IMAGE ####################
-# We used base cuda image because pytorch installs its own cuda libraries.
-# However cupy depends on cuda libraries so we had to switch to the runtime image
-# In the future it would be nice to get a container with pytorch and cuda without duplicating cuda
-FROM nvidia/cuda:12.1.0-runtime-ubuntu22.04 AS vllm-base
-# libnccl required for ray
-RUN apt-get update -y \
- && apt-get install -y python3-pip
-
-WORKDIR /workspace
-COPY requirements.txt requirements.txt
+# install development dependencies (for testing)
RUN --mount=type=cache,target=/root/.cache/pip \
- pip install -r requirements.txt
-
-# Install flash attention (from pre-built wheel)
-RUN --mount=type=bind,from=flash-attn-builder,src=/usr/src/flash-attention-v2,target=/usr/src/flash-attention-v2 \
- pip install /usr/src/flash-attention-v2/*.whl --no-cache-dir
+ pip install -r requirements-dev.txt
-#################### RUNTIME BASE IMAGE ####################
+# the docs require the source code;
+# we hide both inside `test_docs/`, so that this source code
+# will not be imported by other tests
+RUN mkdir test_docs
+RUN mv docs test_docs/
+RUN mv vllm test_docs/
+#################### TEST IMAGE ####################
#################### OPENAI API SERVER ####################
# openai api server alternative
FROM vllm-base AS vllm-openai
+
# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer modelscope
-COPY --from=build /workspace/vllm/*.so /workspace/vllm/
-COPY vllm vllm
+ENV VLLM_USAGE_SOURCE production-docker-image
ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
#################### OPENAI API SERVER ####################
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
new file mode 100644
index 0000000000..403a1cd039
--- /dev/null
+++ b/Dockerfile.cpu
@@ -0,0 +1,26 @@
+# This vLLM Dockerfile is used to construct an image that can build and run vLLM on the x86 CPU platform.
+
+FROM ubuntu:22.04 AS cpu-test-1
+
+RUN apt-get update -y \
+ && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
+ && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
+
+RUN pip install --upgrade pip \
+ && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
+
+FROM cpu-test-1 AS build
+
+COPY ./ /workspace/vllm
+
+WORKDIR /workspace/vllm
+
+RUN pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
+
+RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
+
+WORKDIR /workspace/
+
+RUN ln -s /workspace/vllm/tests && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
+CMD ["/bin/bash"]
diff --git a/Dockerfile.neuron b/Dockerfile.neuron
new file mode 100644
index 0000000000..fe42b4ef39
--- /dev/null
+++ b/Dockerfile.neuron
@@ -0,0 +1,36 @@
+# default base image
+ARG BASE_IMAGE="763104351884.dkr.ecr.us-west-2.amazonaws.com/pytorch-inference-neuronx:2.1.1-neuronx-py310-sdk2.17.0-ubuntu20.04"
+
+FROM $BASE_IMAGE
+
+RUN echo "Base image is $BASE_IMAGE"
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers pandas
+RUN python3 -m pip install sentencepiece transformers==4.36.2 -U
+RUN python3 -m pip install transformers-neuronx --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+RUN python3 -m pip install --pre neuronx-cc==2.12.* --extra-index-url=https://pip.repos.neuron.amazonaws.com -U
+
+COPY ./vllm /app/vllm/vllm
+COPY ./setup.py /app/vllm/setup.py
+COPY ./requirements-common.txt /app/vllm/requirements-common.txt
+COPY ./requirements-neuron.txt /app/vllm/requirements-neuron.txt
+
+RUN cd /app/vllm \
+ && python3 -m pip install -U -r requirements-neuron.txt
+
+ENV VLLM_BUILD_WITH_NEURON=1
+RUN cd /app/vllm \
+ && pip install -e . \
+ && cd ..
+
+CMD ["/bin/bash"]
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index a45265d79a..e30a2aaf30 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -14,7 +14,7 @@ RUN echo "Base image is $BASE_IMAGE"
ARG FA_GFX_ARCHS="gfx90a;gfx942"
RUN echo "FA_GFX_ARCHS is $FA_GFX_ARCHS"
-ARG FA_BRANCH="3d2b6f5"
+ARG FA_BRANCH="ae7928c"
RUN echo "FA_BRANCH is $FA_BRANCH"
# whether to build flash-attention
@@ -23,8 +23,8 @@ RUN echo "FA_BRANCH is $FA_BRANCH"
# In that case, we need to use the python reference attention implementation in vllm
ARG BUILD_FA="1"
-# whether to build cupy on rocm
-ARG BUILD_CUPY="1"
+# whether to build triton on rocm
+ARG BUILD_TRITON="1"
# Install some basic utilities
RUN apt-get update && apt-get install python3 python3-pip -y
@@ -46,7 +46,7 @@ RUN apt-get update && apt-get install -y \
### Mount Point ###
# When launching the container, mount the code directory to /app
-ARG APP_MOUNT=/app
+ARG APP_MOUNT=/vllm-workspace
VOLUME [ ${APP_MOUNT} ]
WORKDIR ${APP_MOUNT}
@@ -78,38 +78,38 @@ RUN if [ "$BUILD_FA" = "1" ]; then \
RUN if [ "$BASE_IMAGE" = "rocm/pytorch:rocm6.0_ubuntu20.04_py3.9_pytorch_2.1.1" ]; then \
rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/; fi
-# build cupy
-RUN if [ "$BUILD_CUPY" = "1" ]; then \
+# build triton
+RUN if [ "$BUILD_TRITON" = "1" ]; then \
mkdir -p libs \
&& cd libs \
- && git clone -b hipgraph_enablement --recursive https://github.com/ROCm/cupy.git \
- && cd cupy \
- && pip install mpi4py-mpich \
- && pip install scipy==1.9.3 \
- && pip install cython==0.29.* \
- && env CC=$MPI_HOME/bin/mpicc python -m pip install mpi4py \
- && export CUPY_INSTALL_USE_HIP=1 \
- && export ROCM_HOME=/opt/rocm \
- && export HCC_AMDGPU_TARGET="gfx90a,gfx942,gfx1100" \
- && pip install . \
- && cd ..; \
+ && pip uninstall -y triton \
+ && git clone https://github.com/ROCm/triton.git \
+ && cd triton/python \
+ && pip3 install . \
+ && cd ../..; \
fi
-COPY ./ /app/vllm
+WORKDIR /vllm-workspace
+COPY . .
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install xformers==0.0.23 --no-deps
-
-RUN cd /app \
- && cd vllm \
- && pip install -U -r requirements-rocm.txt \
- && if [ "$BUILD_FA" = "1" ]; then \
- bash patch_xformers.rocm.sh; fi \
- && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h /app/vllm/rocm_patch/rocm_bf16.patch \
+#RUN python3 -m pip install pynvml # to be removed eventually
+RUN python3 -m pip install --upgrade pip numba
+
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+# Workaround for ray >= 2.10.0
+ENV RAY_EXPERIMENTAL_NOSET_ROCR_VISIBLE_DEVICES=1
+
+ENV VLLM_NCCL_SO_PATH=/opt/rocm/lib/librccl.so
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip install -U -r requirements-rocm.txt \
+ && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
&& python3 setup.py install \
+ && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+ && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+ && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \
&& cd ..
-RUN python3 -m pip install --upgrade pip
-RUN python3 -m pip install --no-cache-dir ray[all]
CMD ["/bin/bash"]
diff --git a/Dockerfile.ubi b/Dockerfile.ubi
new file mode 100644
index 0000000000..d4989f726d
--- /dev/null
+++ b/Dockerfile.ubi
@@ -0,0 +1,294 @@
+# Please update any changes made here to
+# docs/source/dev/dockerfile-ubi/dockerfile-ubi.rst
+
+## Global Args #################################################################
+ARG BASE_UBI_IMAGE_TAG=9.4-949.1714662671
+ARG PYTHON_VERSION=3.11
+
+# NOTE: This setting only has an effect when not using prebuilt-wheel kernels
+ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.9 9.0+PTX"
+
+
+## Base Layer ##################################################################
+FROM registry.access.redhat.com/ubi9/ubi-minimal:${BASE_UBI_IMAGE_TAG} as base
+
+WORKDIR /workspace
+
+ENV LANG=C.UTF-8 \
+ LC_ALL=C.UTF-8
+
+# Some utils for dev purposes - tar required for kubectl cp
+RUN microdnf install -y \
+ which procps findutils tar vim \
+ && microdnf clean all
+
+
+## Python Installer ############################################################
+FROM base as python-install
+
+ARG PYTHON_VERSION
+ARG MINIFORGE_VERSION=24.3.0-0
+
+RUN curl -fsSL -o ~/miniforge3.sh -O "https://github.com/conda-forge/miniforge/releases/download/${MINIFORGE_VERSION}/Miniforge3-$(uname)-$(uname -m).sh" && \
+ chmod +x ~/miniforge3.sh && \
+ bash ~/miniforge3.sh -b -p /opt/conda && \
+ source "/opt/conda/etc/profile.d/conda.sh" && \
+ conda create -y -p /opt/vllm python=${PYTHON_VERSION} && \
+ conda activate /opt/vllm && \
+ rm ~/miniforge3.sh
+# use of the /opt/vllm env requires:
+# ENV PATH=/opt/vllm/bin/:$PATH
+
+## CUDA Base ###################################################################
+FROM base as cuda-base
+
+# The Nvidia operator won't allow deploying on CUDA 12.0 hosts if
+# this env var is set to 12.2.0, even though it's compatible
+#ENV CUDA_VERSION=12.2.0 \
+ENV CUDA_VERSION=12.0.0 \
+ NV_CUDA_LIB_VERSION=12.2.0-1 \
+ NVIDIA_VISIBLE_DEVICES=all \
+ NVIDIA_DRIVER_CAPABILITIES=compute,utility \
+ NV_CUDA_CUDART_VERSION=12.2.53-1 \
+ NV_CUDA_COMPAT_VERSION=535.104.12
+
+RUN curl -Lo /etc/yum.repos.d/cuda-rhel9.repo \
+ https://developer.download.nvidia.com/compute/cuda/repos/rhel9/x86_64/cuda-rhel9.repo
+
+RUN microdnf install -y \
+ cuda-cudart-12-2-${NV_CUDA_CUDART_VERSION} \
+ cuda-compat-12-2-${NV_CUDA_COMPAT_VERSION} \
+ && microdnf clean all
+
+ENV CUDA_HOME="/usr/local/cuda" \
+ PATH="/usr/local/nvidia/bin:${CUDA_HOME}/bin:${PATH}" \
+ LD_LIBRARY_PATH="/usr/local/nvidia/lib:/usr/local/nvidia/lib64:$CUDA_HOME/lib64:$CUDA_HOME/extras/CUPTI/lib64:${LD_LIBRARY_PATH}"
+
+
+## CUDA Runtime ################################################################
+FROM cuda-base as cuda-runtime
+
+ENV NV_NVTX_VERSION=12.2.53-1 \
+ NV_LIBNPP_VERSION=12.1.1.14-1 \
+ NV_LIBCUBLAS_VERSION=12.2.1.16-1 \
+ NV_LIBNCCL_PACKAGE_VERSION=2.18.5-1+cuda12.2
+
+RUN microdnf install -y \
+ cuda-libraries-12-2-${NV_CUDA_LIB_VERSION} \
+ cuda-nvtx-12-2-${NV_NVTX_VERSION} \
+ libnpp-12-2-${NV_LIBNPP_VERSION} \
+ libcublas-12-2-${NV_LIBCUBLAS_VERSION} \
+ libnccl-${NV_LIBNCCL_PACKAGE_VERSION} \
+ && microdnf clean all
+
+
+## CUDA Development ############################################################
+FROM cuda-base as cuda-devel
+
+ENV NV_CUDA_CUDART_DEV_VERSION=12.2.53-1 \
+ NV_NVML_DEV_VERSION=12.2.81-1 \
+ NV_LIBCUBLAS_DEV_VERSION=12.2.1.16-1 \
+ NV_LIBNPP_DEV_VERSION=12.1.1.14-1 \
+ NV_LIBNCCL_DEV_PACKAGE_VERSION=2.18.5-1+cuda12.2
+
+RUN microdnf install -y \
+ cuda-command-line-tools-12-2-${NV_CUDA_LIB_VERSION} \
+ cuda-libraries-devel-12-2-${NV_CUDA_LIB_VERSION} \
+ cuda-minimal-build-12-2-${NV_CUDA_LIB_VERSION} \
+ cuda-cudart-devel-12-2-${NV_CUDA_CUDART_DEV_VERSION} \
+ cuda-nvml-devel-12-2-${NV_NVML_DEV_VERSION} \
+ libcublas-devel-12-2-${NV_LIBCUBLAS_DEV_VERSION} \
+ libnpp-devel-12-2-${NV_LIBNPP_DEV_VERSION} \
+ libnccl-devel-${NV_LIBNCCL_DEV_PACKAGE_VERSION} \
+ && microdnf clean all
+
+ENV LIBRARY_PATH="$CUDA_HOME/lib64/stubs"
+
+# Workaround for https://github.com/openai/triton/issues/2507 and
+# https://github.com/pytorch/pytorch/issues/107960 -- hopefully
+# this won't be needed for future versions of this docker image
+# or future versions of triton.
+RUN ldconfig /usr/local/cuda-12.2/compat/
+
+## Python cuda base #################################################################
+FROM cuda-devel as python-cuda-base
+
+COPY --from=python-install --link /opt/vllm /opt/vllm
+ENV PATH=/opt/vllm/bin/:$PATH
+
+# install cuda and common dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+ pip3 install \
+ -r requirements-cuda.txt
+
+## Development #################################################################
+FROM python-cuda-base AS dev
+
+# install build and runtime dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+ --mount=type=bind,source=requirements-dev.txt,target=requirements-dev.txt \
+ pip3 install \
+ -r requirements-cuda.txt \
+ -r requirements-dev.txt
+
+## Proto Compilation ###########################################################
+FROM python-install AS gen-protos
+
+ENV PATH=/opt/vllm/bin/:$PATH
+
+RUN microdnf install -y \
+ make \
+ findutils \
+ && microdnf clean all
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,source=Makefile,target=Makefile \
+ --mount=type=bind,source=proto,target=proto \
+ make gen-protos
+
+## Extension Cache #############################################################
+# Instead of compiling the extensions on every build, just copy them from a pre-built wheel.
+# This might not work if the PyTorch and CUDA versions don't match!
+FROM base as prebuilt-wheel
+
+RUN microdnf install -y \
+ unzip \
+ && microdnf clean all
+
+ARG PYTHON_VERSION
+# NOTE: 0.4.2 was built for CUDA 12.1 and PyTorch 2.3.0; confirm this still holds for the version pinned below
+ARG VLLM_WHEEL_VERSION=0.4.3
+
+RUN curl -Lo vllm.whl https://github.com/vllm-project/vllm/releases/download/v${VLLM_WHEEL_VERSION}/vllm-${VLLM_WHEEL_VERSION}-cp${PYTHON_VERSION//.}-cp${PYTHON_VERSION//.}-manylinux1_x86_64.whl \
+ && unzip vllm.whl \
+ && rm vllm.whl
+# compiled extensions located at /workspace/vllm/*.so
+
+## Builder #####################################################################
+FROM dev AS build
+
+# install build dependencies
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,source=requirements-build.txt,target=requirements-build.txt \
+ pip install -r requirements-build.txt
+
+# copy input files
+COPY csrc csrc
+COPY setup.py setup.py
+COPY cmake cmake
+COPY CMakeLists.txt CMakeLists.txt
+COPY requirements-common.txt requirements-common.txt
+COPY requirements-cuda.txt requirements-cuda.txt
+COPY pyproject.toml pyproject.toml
+COPY vllm/__init__.py vllm/__init__.py
+
+ARG TORCH_CUDA_ARCH_LIST
+ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
+
+# max jobs used by Ninja to build extensions
+ARG max_jobs=2
+ENV MAX_JOBS=${max_jobs}
+# number of threads used by nvcc
+ARG nvcc_threads=2
+ENV NVCC_THREADS=$nvcc_threads
+# make sure punica kernels are built (for LoRA)
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# Put the CUDA binaries and libraries on PATH / LD_LIBRARY_PATH for the build. Ref: https://github.com/vllm-project/vllm/blob/main/.github/workflows/scripts/build.sh#L6-L8
+ENV PATH=/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+
+# Copy the entire directory before building wheel
+COPY --link vllm vllm
+
+# Comment out this section if building the *.so files from scratch
+##################################################
+# Copy the prebuilt *.so files
+COPY --from=prebuilt-wheel --link /workspace/vllm/*.so /workspace/vllm/
+ENV VLLM_USE_PRECOMPILED=1
+##################################################
+# Keep this section commented out unless building the *.so files from scratch
+#RUN microdnf install -y git \
+# && microdnf clean all
+##################################################
+
+# Copy over the generated *.pb2 files
+COPY --from=gen-protos --link /workspace/vllm/entrypoints/grpc/pb vllm/entrypoints/grpc/pb
+
+ENV CCACHE_DIR=/root/.cache/ccache
+RUN --mount=type=cache,target=/root/.cache/ccache \
+ --mount=type=cache,target=/root/.cache/pip \
+ CMAKE_BUILD_TYPE=Release python3 setup.py bdist_wheel --dist-dir=dist
+
+## Release #####################################################################
+# Note from the non-UBI Dockerfile:
+# We used the base CUDA image because PyTorch installs its own CUDA libraries.
+# However, pynccl depends on the CUDA libraries, so we had to switch to the runtime image.
+# In the future it would be nice to get a container with PyTorch and CUDA without duplicating CUDA.
+FROM cuda-runtime AS vllm-openai
+
+WORKDIR /workspace
+
+# Create release python environment
+COPY --from=python-cuda-base --link /opt/vllm /opt/vllm
+ENV PATH=/opt/vllm/bin/:$PATH
+
+# install vllm wheel first, so that torch etc will be installed
+RUN --mount=type=bind,from=build,src=/workspace/dist,target=/workspace/dist \
+ --mount=type=cache,target=/root/.cache/pip \
+ pip install $(echo dist/*.whl)'[tensorizer]' --verbose
+
+# Install the vllm_nccl package which is a bit quirky
+RUN --mount=type=cache,target=/root/.cache/pip \
+ --mount=type=bind,source=requirements-common.txt,target=requirements-common.txt \
+ --mount=type=bind,source=requirements-cuda.txt,target=requirements-cuda.txt \
+ # The "install" happens in `setup.py` so it happens when built...
+ # Remove the already installed package and the cached wheel
+ pip uninstall -y vllm-nccl-cu12 \
+ && pip cache remove vllm_nccl* \
+ # install the version depended on by vllm requirements
+ && pip install vllm-nccl-cu12 -r requirements-cuda.txt \
+ # The lib is downloaded to root's home directory... move it
+ && mv ~/.config/vllm/nccl/cu12/libnccl.so.2* /usr/local/lib/libnccl.so.2
+ENV VLLM_NCCL_SO_PATH=/usr/local/lib/libnccl.so.2
+
+RUN --mount=type=cache,target=/root/.cache/pip \
+ pip3 install \
+ # additional dependencies for the TGIS gRPC server
+ grpcio-tools==1.63.0 \
+ # additional dependencies for openai api_server
+ accelerate==0.30.0 \
+ # hf_transfer for faster HF hub downloads
+ hf_transfer==0.1.6
+
+# Triton needs a C compiler
+RUN microdnf install -y gcc \
+ && microdnf clean all
+
+# patch triton (fix for #720)
+COPY triton_patch/custom_cache_manager.py /opt/vllm/lib/python3.11/site-packages/triton/runtime/custom_cache_manager.py
+
+ENV HF_HUB_OFFLINE=1 \
+ PORT=8000 \
+ GRPC_PORT=8033 \
+ HOME=/home/vllm \
+ VLLM_USAGE_SOURCE=production-docker-image \
+ VLLM_WORKER_MULTIPROC_METHOD=fork \
+ TRITON_CACHE_MANAGER="triton.runtime.custom_cache_manager:CustomCacheManager"
+
+# setup non-root user for OpenShift
+RUN microdnf install -y shadow-utils \
+ && umask 002 \
+ && useradd --uid 2000 --gid 0 vllm \
+ && microdnf remove -y shadow-utils \
+ && microdnf clean all \
+ && chmod g+rwx $HOME /usr/src /workspace
+
+COPY LICENSE /licenses/vllm.md
+
+USER 2000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
diff --git a/MANIFEST.in b/MANIFEST.in
index aa16da6500..82be639ef4 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1,5 +1,9 @@
include LICENSE
-include requirements.txt
+include requirements-common.txt
+include requirements-cuda.txt
+include requirements-rocm.txt
+include requirements-neuron.txt
+include requirements-cpu.txt
include CMakeLists.txt
recursive-include cmake *
diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000000..8907934d90
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,10 @@
+
+target_path := "vllm/entrypoints/grpc/pb"
+gen-protos:
+ # Compile protos
+ pip install grpcio-tools==1.62.1 mypy-protobuf==3.5.0 'types-protobuf>=3.20.4'
+ mkdir -p $(target_path)
+ python -m grpc_tools.protoc -Iproto --python_out=$(target_path) \
+ --grpc_python_out=$(target_path) --mypy_out=$(target_path) proto/generation.proto
+ find $(target_path)/ -type f -name "*.py" -print0 -exec sed -i -e 's/^\(import.*pb2\)/from . \1/g' {} \;
+ touch $(target_path)/__init__.py
diff --git a/README.md b/README.md
index f57c3f7862..b740a13e3a 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,9 @@
+
+🌶️🌶️🌶️ TGIS README HERE 🌶️🌶️🌶️
+
+
+---
+
@@ -16,16 +22,17 @@ Easy, fast, and cheap LLM serving for everyone
---
-**The Third vLLM Bay Area Meetup (April 2nd 6pm-8:30pm PT)**
+**The Fourth vLLM Bay Area Meetup (June 11th 5:30pm-8pm PT)**
-We are thrilled to announce our third vLLM Meetup!
+We are thrilled to announce our fourth vLLM Meetup!
The vLLM team will share recent updates and roadmap.
-We will also have vLLM collaborators from Roblox coming up to the stage to discuss their experience in deploying LLMs with vLLM.
-Please register [here](https://robloxandvllmmeetup2024.splashthat.com/) and join us!
+We will also have vLLM collaborators from BentoML and Cloudflare coming up to the stage to discuss their experience in deploying LLMs with vLLM.
+Please register [here](https://lu.ma/agivllm) and join us!
---
*Latest News* 🔥
+- [2024/04] We hosted [the third vLLM meetup](https://robloxandvllmmeetup2024.splashthat.com/) with Roblox! Please find the meetup slides [here](https://docs.google.com/presentation/d/1A--47JAK4BJ39t954HyTkvtfwn0fkqtsL8NGFuslReM/edit?usp=sharing).
- [2024/01] We hosted [the second vLLM meetup](https://lu.ma/ygxbpzhl) in SF! Please find the meetup slides [here](https://docs.google.com/presentation/d/12mI2sKABnUw5RBWXDYY-HtHth4iMSNcEoQ10jDQbxgA/edit?usp=sharing).
- [2024/01] Added ROCm 6.0 support to vLLM.
- [2023/12] Added ROCm 5.7 support to vLLM.
@@ -61,34 +68,14 @@ vLLM is flexible and easy to use with:
- (Experimental) Prefix caching support
- (Experimental) Multi-lora support
-vLLM seamlessly supports many Hugging Face models, including the following architectures:
-
-- Aquila & Aquila2 (`BAAI/AquilaChat2-7B`, `BAAI/AquilaChat2-34B`, `BAAI/Aquila-7B`, `BAAI/AquilaChat-7B`, etc.)
-- Baichuan & Baichuan2 (`baichuan-inc/Baichuan2-13B-Chat`, `baichuan-inc/Baichuan-7B`, etc.)
-- BLOOM (`bigscience/bloom`, `bigscience/bloomz`, etc.)
-- ChatGLM (`THUDM/chatglm2-6b`, `THUDM/chatglm3-6b`, etc.)
-- DeciLM (`Deci/DeciLM-7B`, `Deci/DeciLM-7B-instruct`, etc.)
-- Falcon (`tiiuae/falcon-7b`, `tiiuae/falcon-40b`, `tiiuae/falcon-rw-7b`, etc.)
-- Gemma (`google/gemma-2b`, `google/gemma-7b`, etc.)
-- GPT-2 (`gpt2`, `gpt2-xl`, etc.)
-- GPT BigCode (`bigcode/starcoder`, `bigcode/gpt_bigcode-santacoder`, etc.)
-- GPT-J (`EleutherAI/gpt-j-6b`, `nomic-ai/gpt4all-j`, etc.)
-- GPT-NeoX (`EleutherAI/gpt-neox-20b`, `databricks/dolly-v2-12b`, `stabilityai/stablelm-tuned-alpha-7b`, etc.)
-- InternLM (`internlm/internlm-7b`, `internlm/internlm-chat-7b`, etc.)
-- InternLM2 (`internlm/internlm2-7b`, `internlm/internlm2-chat-7b`, etc.)
-- LLaMA & LLaMA-2 (`meta-llama/Llama-2-70b-hf`, `lmsys/vicuna-13b-v1.3`, `young-geng/koala`, `openlm-research/open_llama_13b`, etc.)
-- Mistral (`mistralai/Mistral-7B-v0.1`, `mistralai/Mistral-7B-Instruct-v0.1`, etc.)
-- Mixtral (`mistralai/Mixtral-8x7B-v0.1`, `mistralai/Mixtral-8x7B-Instruct-v0.1`, etc.)
-- MPT (`mosaicml/mpt-7b`, `mosaicml/mpt-30b`, etc.)
-- OLMo (`allenai/OLMo-1B`, `allenai/OLMo-7B`, etc.)
-- OPT (`facebook/opt-66b`, `facebook/opt-iml-max-30b`, etc.)
-- Orion (`OrionStarAI/Orion-14B-Base`, `OrionStarAI/Orion-14B-Chat`, etc.)
-- Phi (`microsoft/phi-1_5`, `microsoft/phi-2`, etc.)
-- Qwen (`Qwen/Qwen-7B`, `Qwen/Qwen-7B-Chat`, etc.)
-- Qwen2 (`Qwen/Qwen2-7B-beta`, `Qwen/Qwen-7B-Chat-beta`, etc.)
-- StableLM(`stabilityai/stablelm-3b-4e1t`, `stabilityai/stablelm-base-alpha-7b-v2`, etc.)
-- Starcoder2(`bigcode/starcoder2-3b`, `bigcode/starcoder2-7b`, `bigcode/starcoder2-15b`, etc.)
-- Yi (`01-ai/Yi-6B`, `01-ai/Yi-34B`, etc.)
+vLLM seamlessly supports most popular open-source models on HuggingFace, including:
+- Transformer-like LLMs (e.g., Llama)
+- Mixture-of-Experts LLMs (e.g., Mixtral)
+- Multi-modal LLMs (e.g., LLaVA)
+
+Find the full list of supported models [here](https://docs.vllm.ai/en/latest/models/supported_models.html).
+
+## Getting Started
Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/getting_started/installation.html#build-from-source):
@@ -96,9 +83,7 @@ Install vLLM with pip or [from source](https://vllm.readthedocs.io/en/latest/get
pip install vllm
```
-## Getting Started
-
-Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started.
+Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to learn more.
- [Installation](https://vllm.readthedocs.io/en/latest/getting_started/installation.html)
- [Quickstart](https://vllm.readthedocs.io/en/latest/getting_started/quickstart.html)
- [Supported Models](https://vllm.readthedocs.io/en/latest/models/supported_models.html)
@@ -108,6 +93,32 @@ Visit our [documentation](https://vllm.readthedocs.io/en/latest/) to get started
We welcome and value any contributions and collaborations.
Please check out [CONTRIBUTING.md](./CONTRIBUTING.md) for how to get involved.
+## Sponsors
+
+vLLM is a community project. Our compute resources for development and testing are supported by the following organizations. Thank you for your support!
+
+
+
+
+- a16z
+- AMD
+- Anyscale
+- AWS
+- Crusoe Cloud
+- Databricks
+- DeepInfra
+- Dropbox
+- Lambda Lab
+- NVIDIA
+- Replicate
+- Roblox
+- RunPod
+- Trainy
+- UC Berkeley
+- UC San Diego
+
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
+
## Citation
If you use vLLM for your research, please cite our [paper](https://arxiv.org/abs/2309.06180):
diff --git a/TGIS_README.md b/TGIS_README.md
new file mode 100644
index 0000000000..00dfe5b29d
--- /dev/null
+++ b/TGIS_README.md
@@ -0,0 +1,101 @@
+# Repo organization and building the TGIS-vLLM image
+
+This fork attempts to remain aligned with the vLLM repo as much as possible,
+while also containing a set of permanent changes to add:
+- A TGIS api adapter layer (see [TGIS](https://github.com/IBM/text-generation-inference))
+- A Red Hat UBI-based Docker image delivery
+
+Given the fast pace of vLLM development, we also provide builds that include yet-to-be-merged
+vLLM PRs. These are made by squash-merging open vLLM PRs onto a `release` branch on top of `main`;
+the `release` branch is continually reconstructed as we make more contributions.
+
+See a sketch of the commit graph
+
+
+## Contributing changes for vLLM
+
+To contribute improvements to vLLM that would have utility for the whole community, _don't_ base them on the `main` branch in this repo.
+Either contribute to your own fork of vLLM, or create a branch in this repo at the latest `vllm-project/vllm:main` commit.
+
+Open all PRs into [vLLM](https://github.com/vllm-project/vllm)
+
+Once you have opened a PR, follow the steps for [Reconstructing the release branch](#release-branch-building-procedure)
+
+## Contributing changes specific to the TGIS adapter or IBM delivery
+
+Contributing changes to the TGIS adapter or IBM delivery process is business as usual:
+make changes on a branch or fork directly from this repo's `main` branch and PR it back in.
+
+NB: When there are pending PRs to vLLM that are squashed onto the `release` branch, you will likely want to also apply your
+changes on a branch off of the latest `release.xxx` tag to build and test.
+
+Most of the IBM-specific changes in this repo are located in these files/packages:
+- `vllm/tgis_utils` contains many custom implementations of classes required for feature parity with TGIS
+- `vllm/entrypoints/openai/api_server.py` contains changes to parse TGIS specific command line args
+- `vllm/entrypoints/openai/cli_args.py` contains changes to inject TGIS specific command line args
+- `vllm/entrypoints/grpc` is entirely our own package for implementing a grpc server with the TGIS api
+- `proto` contains the TGIS api defs
+- `Dockerfile.ubi` is the UBI-based dockerfile that we build and ship
+- `.github/workflows/build.yml` contains our workflow for building and pushing the UBI-based image
+
+## Main branch rebasing procedure
+
+Rebasing vllm:main onto ibm:main is pretty straightforward. Assuming you have vllm-project/vllm as the
+`upstream` remote and ibm/vllm as the `origin` remote, one way to do this is:
+```shell
+# fetch latest ibm main
+git fetch origin main
+# fetch latest vllm main
+git fetch upstream main
+
+# Check out IBM main and cherry pick all new vllm commits here
+# NB: Cherry-picking works because vllm main uses squash commits for a linear history.
+# Running `git rebase` of vllm main onto ibm main instead creates small issues: manually resolved conflicts cause commits to differ, so they must be skipped or re-applied with every rebase
+git checkout origin/main
+git cherry-pick $(cat vllm_main_commit.txt)..upstream/main
+# Store the new latest vllm main commit
+echo "$(git rev-parse --short upstream/main)" > vllm_main_commit.txt
+git add vllm_main_commit.txt
+git commit -s -m "Update vLLM to $(git rev-parse --short upstream/main)"
+# Push to origin/main
+git push origin HEAD:main
+```
+
+## Release branch building procedure
+
+To rebuild the release branch, we want to squash each pending vLLM PR into a commit on top of main.
+
+Assuming this repo is the `origin` remote and vllm-project/vllm is `upstream`, this looks something like:
+```shell
+# Fetch latest mains
+git fetch upstream main
+git fetch origin main
+git checkout origin/main
+
+# Start a new release branch here at ibm:main
+git branch -f release HEAD
+git checkout release
+
+# for each ${PR_NUMBER} to squash in:
+# We first fetch the PR head from vLLM
+git branch -D ${PR_NUMBER}
+git fetch upstream pull/${PR_NUMBER}/head:${PR_NUMBER}
+# Then we want to squash-merge on top of vLLM:main
+git checkout upstream/main
+git merge --squash ${PR_NUMBER}
+# (Resolve any conflicts here)
+git commit -m "Squash ${PR_NUMBER}"
+# Then we want to apply that squash commit with only that PR's changes to `release`
+export SQUASH_HEAD=$(git rev-parse --short HEAD)
+git checkout release
+git cherry-pick $SQUASH_HEAD
+# Merge conflicts should be minimal if all PRs are in a mergeable state with vllm:main
+# But pending PRs may create conflicting changes with each other
+
+# force-push: we're overwriting the `release` branch
+git push -f origin HEAD:release
+
+# Create a tag to save this build
+git tag release.$(git rev-parse --short HEAD)
+git push origin release.$(git rev-parse --short HEAD)
+```
diff --git a/benchmarks/backend_request_func.py b/benchmarks/backend_request_func.py
index 7e6f3c3ed4..58dcc6167e 100644
--- a/benchmarks/backend_request_func.py
+++ b/benchmarks/backend_request_func.py
@@ -1,8 +1,10 @@
import json
import os
+import sys
import time
-from dataclasses import dataclass
-from typing import Optional
+import traceback
+from dataclasses import dataclass, field
+from typing import List, Optional
import aiohttp
from tqdm.asyncio import tqdm
@@ -25,9 +27,12 @@ class RequestFuncInput:
class RequestFuncOutput:
generated_text: str = ""
success: bool = False
- latency: float = 0
- ttft: float = 0
+ latency: float = 0.0
+ ttft: float = 0.0 # Time to first token
+ itl: List[float] = field(
+ default_factory=list) # List of inter-token latencies
prompt_len: int = 0
+ error: str = ""
async def async_request_tgi(
@@ -53,73 +58,44 @@ async def async_request_tgi(
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
+ most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload) as response:
if response.status == 200:
- async for data in response.content.iter_any():
- if ttft == 0:
- ttft = time.perf_counter() - st
- output.ttft = ttft
- output.latency = time.perf_counter() - st
-
- body = remove_prefix(data.decode("utf-8"), "data:")
- output.generated_text = json.loads(body)["generated_text"]
- output.success = True
- else:
- output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
- output.success = False
-
- if pbar:
- pbar.update(1)
- return output
-
-
-async def async_request_vllm(
- request_func_input: RequestFuncInput,
- pbar: Optional[tqdm] = None,
-) -> RequestFuncOutput:
- api_url = request_func_input.api_url
- assert api_url.endswith("generate")
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
+ continue
- async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
- payload = {
- "prompt": request_func_input.prompt,
- "n": 1,
- "best_of": request_func_input.best_of,
- "use_beam_search": request_func_input.use_beam_search,
- "temperature": 0.0 if request_func_input.use_beam_search else 1.0,
- "top_p": 1.0,
- "max_tokens": request_func_input.output_len,
- "ignore_eos": True,
- "stream": True,
- }
- output = RequestFuncOutput()
- output.prompt_len = request_func_input.prompt_len
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data:")
- ttft = 0
- st = time.perf_counter()
- try:
- async with session.post(url=api_url, json=payload) as response:
- if response.status == 200:
- async for data in response.content.iter_any():
- if ttft == 0:
+ data = json.loads(chunk)
+ timestamp = time.perf_counter()
+ # First token
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
- output.latency = time.perf_counter() - st
- # When streaming, '\0' is appended to the end of response.
- body = data.decode("utf-8").strip("\0")
- output.generated_text = json.loads(
- body)["text"][0][len(request_func_input.prompt):]
- output.success = True
+ # Decoding phase
+ else:
+ output.itl.append(timestamp -
+ most_recent_timestamp)
+ most_recent_timestamp = timestamp
+
+ output.latency = most_recent_timestamp - st
+ output.success = True
+ output.generated_text = data["generated_text"]
else:
+ output.error = response.reason or ""
output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+ except Exception:
output.success = False
+ exc_info = sys.exc_info()
+ output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
@@ -146,26 +122,46 @@ async def async_request_trt_llm(
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
+ most_recent_timestamp = st
try:
- async with session.post(url=api_url, json=payload) as resp:
- if resp.status == 200:
- async for data in resp.content.iter_any():
- if ttft == 0:
+ async with session.post(url=api_url, json=payload) as response:
+ if response.status == 200:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
+ continue
+
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data:")
+
+ data = json.loads(chunk)
+ output.generated_text += data["text_output"]
+ timestamp = time.perf_counter()
+ # First token
+ if ttft == 0.0:
ttft = time.perf_counter() - st
output.ttft = ttft
- output.latency = time.perf_counter() - st
- body = remove_prefix(data.decode("utf-8"), "data:")
- output.generated_text = json.loads(body)["text_output"]
+ # Decoding phase
+ else:
+ output.itl.append(timestamp -
+ most_recent_timestamp)
+
+ most_recent_timestamp = timestamp
+
+ output.latency = most_recent_timestamp - st
output.success = True
else:
+ output.error = response.reason or ""
output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+ except Exception:
output.success = False
+ exc_info = sys.exc_info()
+ output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
@@ -181,35 +177,35 @@ async def async_request_deepspeed_mii(
assert not request_func_input.use_beam_search
payload = {
- "prompts": request_func_input.prompt,
- "max_new_tokens": request_func_input.output_len,
- "ignore_eos": True,
- "do_sample": True,
- "temperature":
- 0.01, # deepspeed-mii does not accept 0.0 temperature.
+ "prompt": request_func_input.prompt,
+ "max_tokens": request_func_input.output_len,
+ "temperature": 0.01, # deepspeed-mii does not accept 0.0 temp.
"top_p": 1.0,
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
- # DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
+ # NOTE: DeepSpeed-MII doesn't support streaming as of Jan 28 2024,
# will use 0 as placeholder.
- # https://github.com/microsoft/DeepSpeed-MII/pull/311
+ # See https://github.com/microsoft/DeepSpeed-MII/pull/311
output.ttft = 0
st = time.perf_counter()
try:
async with session.post(url=request_func_input.api_url,
- json=payload) as resp:
- if resp.status == 200:
- parsed_resp = await resp.json()
+ json=payload) as response:
+ if response.status == 200:
+ parsed_resp = await response.json()
output.latency = time.perf_counter() - st
- output.generated_text = parsed_resp[0]["generated_text"]
+ output.generated_text = parsed_resp["text"][0]
output.success = True
else:
+ output.error = response.reason or ""
output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+ except Exception:
output.success = False
+ exc_info = sys.exc_info()
+ output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
@@ -221,7 +217,9 @@ async def async_request_openai_completions(
pbar: Optional[tqdm] = None,
) -> RequestFuncOutput:
api_url = request_func_input.api_url
- assert api_url.endswith("v1/completions")
+ assert api_url.endswith(
+ "v1/completions"
+ ), "OpenAI Completions API URL must end with 'v1/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@@ -241,35 +239,53 @@ async def async_request_openai_completions(
output.prompt_len = request_func_input.prompt_len
generated_text = ""
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
+ most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
- async for chunk in response.content:
- if ttft == 0:
- ttft = time.perf_counter() - st
- output.ttft = ttft
-
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
- body = json.loads(chunk)
- generated_text += body["choices"][0]["text"]
+ data = json.loads(chunk)
+
+ if data["choices"][0]["text"]:
+ timestamp = time.perf_counter()
+ # First token
+ if ttft == 0.0:
+ ttft = time.perf_counter() - st
+ output.ttft = ttft
+
+ # Decoding phase
+ # NOTE: Some completion APIs send a final usage-summary
+ # response that carries no token, so we do not count
+ # it as an inter-token latency
+ elif data.get("usage", None) is None:
+ output.itl.append(timestamp -
+ most_recent_timestamp)
+
+ most_recent_timestamp = timestamp
+ generated_text += data["choices"][0]["text"]
output.generated_text = generated_text
output.success = True
output.latency = latency
else:
+ output.error = response.reason or ""
output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+ except Exception:
output.success = False
+ exc_info = sys.exc_info()
+ output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
@@ -283,7 +299,7 @@ async def async_request_openai_chat_completions(
api_url = request_func_input.api_url
assert api_url.endswith(
"v1/chat/completions"
- ), "OpenAI Chat API URL must end with 'v1/chat/completions'."
+ ), "OpenAI Chat Completions API URL must end with 'v1/chat/completions'."
async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session:
assert not request_func_input.use_beam_search
@@ -301,44 +317,59 @@ async def async_request_openai_chat_completions(
}
headers = {
"Content-Type": "application/json",
- "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"
+ "Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}",
}
output = RequestFuncOutput()
output.prompt_len = request_func_input.prompt_len
generated_text = ""
- ttft = 0
+ ttft = 0.0
st = time.perf_counter()
+ most_recent_timestamp = st
try:
async with session.post(url=api_url, json=payload,
headers=headers) as response:
if response.status == 200:
- async for chunk in response.content:
- if ttft == 0:
- ttft = time.perf_counter() - st
- output.ttft = ttft
-
- chunk = chunk.strip()
- if not chunk:
+ async for chunk_bytes in response.content:
+ chunk_bytes = chunk_bytes.strip()
+ if not chunk_bytes:
continue
- chunk = remove_prefix(chunk.decode("utf-8"), "data: ")
+ chunk = remove_prefix(chunk_bytes.decode("utf-8"),
+ "data: ")
if chunk == "[DONE]":
latency = time.perf_counter() - st
else:
- body = json.loads(chunk)
- if "content" in body["choices"][0]["delta"]:
- generated_text += body["choices"][0]["delta"][
- "content"]
+ timestamp = time.perf_counter()
+ data = json.loads(chunk)
+
+ delta = data["choices"][0]["delta"]
+ if delta.get("content", None):
+ # First token
+ if ttft == 0.0:
+ ttft = time.perf_counter() - st
+ output.ttft = ttft
+
+ # Decoding phase
+ else:
+ output.itl.append(timestamp -
+ most_recent_timestamp)
+
+ generated_text += delta["content"]
+
+ most_recent_timestamp = timestamp
output.generated_text = generated_text
output.success = True
output.latency = latency
else:
+ output.error = response.reason or ""
output.success = False
- except (aiohttp.ClientOSError, aiohttp.ServerDisconnectedError):
+ except Exception:
output.success = False
+ exc_info = sys.exc_info()
+ output.error = "".join(traceback.format_exception(*exc_info))
if pbar:
pbar.update(1)
@@ -355,7 +386,8 @@ def remove_prefix(text: str, prefix: str) -> str:
ASYNC_REQUEST_FUNCS = {
"tgi": async_request_tgi,
- "vllm": async_request_vllm,
+ "vllm": async_request_openai_completions,
+ "lmdeploy": async_request_openai_completions,
"deepspeed-mii": async_request_deepspeed_mii,
"openai": async_request_openai_completions,
"openai-chat": async_request_openai_chat_completions,
diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index 2fdc08c5c2..f69d91a086 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -1,14 +1,17 @@
"""Benchmark the latency of processing a single batch of requests."""
import argparse
+import json
import time
from pathlib import Path
-from typing import Optional
+from typing import List, Optional
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams
+from vllm.inputs import PromptStrictInputs
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def main(args: argparse.Namespace):
@@ -16,18 +19,24 @@ def main(args: argparse.Namespace):
# NOTE(woosuk): If the request cannot be processed in a single batch,
# the engine will automatically process the request in multiple batches.
- llm = LLM(
- model=args.model,
- tokenizer=args.tokenizer,
- quantization=args.quantization,
- tensor_parallel_size=args.tensor_parallel_size,
- trust_remote_code=args.trust_remote_code,
- dtype=args.dtype,
- enforce_eager=args.enforce_eager,
- kv_cache_dtype=args.kv_cache_dtype,
- device=args.device,
- ray_workers_use_nsight=args.ray_workers_use_nsight,
- )
+ llm = LLM(model=args.model,
+ speculative_model=args.speculative_model,
+ num_speculative_tokens=args.num_speculative_tokens,
+ tokenizer=args.tokenizer,
+ quantization=args.quantization,
+ tensor_parallel_size=args.tensor_parallel_size,
+ trust_remote_code=args.trust_remote_code,
+ dtype=args.dtype,
+ enforce_eager=args.enforce_eager,
+ kv_cache_dtype=args.kv_cache_dtype,
+ quantization_param_path=args.quantization_param_path,
+ device=args.device,
+ ray_workers_use_nsight=args.ray_workers_use_nsight,
+ use_v2_block_manager=args.use_v2_block_manager,
+ enable_chunked_prefill=args.enable_chunked_prefill,
+ download_dir=args.download_dir,
+ block_size=args.block_size,
+ gpu_memory_utilization=args.gpu_memory_utilization)
sampling_params = SamplingParams(
n=args.n,
@@ -41,7 +50,9 @@ def main(args: argparse.Namespace):
dummy_prompt_token_ids = np.random.randint(10000,
size=(args.batch_size,
args.input_len))
- dummy_prompt_token_ids = dummy_prompt_token_ids.tolist()
+ dummy_inputs: List[PromptStrictInputs] = [{
+ "prompt_token_ids": batch
+ } for batch in dummy_prompt_token_ids.tolist()]
def run_to_completion(profile_dir: Optional[str] = None):
if profile_dir:
@@ -52,13 +63,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
],
on_trace_ready=torch.profiler.tensorboard_trace_handler(
str(profile_dir))) as p:
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
print(p.key_averages())
else:
start_time = time.perf_counter()
- llm.generate(prompt_token_ids=dummy_prompt_token_ids,
+ llm.generate(dummy_inputs,
sampling_params=sampling_params,
use_tqdm=False)
end_time = time.perf_counter()
@@ -66,7 +77,8 @@ def run_to_completion(profile_dir: Optional[str] = None):
return latency
print("Warming up...")
- run_to_completion(profile_dir=None)
+ for _ in tqdm(range(args.num_iters_warmup), desc="Warmup iterations"):
+ run_to_completion(profile_dir=None)
if args.profile:
profile_dir = args.profile_result_dir
@@ -82,7 +94,22 @@ def run_to_completion(profile_dir: Optional[str] = None):
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile_dir=None))
+ latencies = np.array(latencies)
+ percentages = [10, 25, 50, 75, 90]
+ percentiles = np.percentile(latencies, percentages)
print(f'Avg latency: {np.mean(latencies)} seconds')
+ for percentage, percentile in zip(percentages, percentiles):
+ print(f'{percentage}% percentile latency: {percentile} seconds')
+
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "avg_latency": np.mean(latencies),
+ "latencies": latencies.tolist(),
+ "percentiles": dict(zip(percentages, percentiles.tolist())),
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
if __name__ == '__main__':
@@ -90,10 +117,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
description='Benchmark the latency of processing a single batch of '
'requests till completion.')
parser.add_argument('--model', type=str, default='facebook/opt-125m')
+ parser.add_argument('--speculative-model', type=str, default=None)
+ parser.add_argument('--num-speculative-tokens', type=int, default=None)
parser.add_argument('--tokenizer', type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
parser.add_argument('--input-len', type=int, default=32)
@@ -104,9 +133,13 @@ def run_to_completion(profile_dir: Optional[str] = None):
default=1,
help='Number of generated sequences per prompt.')
parser.add_argument('--use-beam-search', action='store_true')
+ parser.add_argument('--num-iters-warmup',
+ type=int,
+ default=10,
+ help='Number of iterations to run for warmup.')
parser.add_argument('--num-iters',
type=int,
- default=3,
+ default=30,
help='Number of iterations to run.')
parser.add_argument('--trust-remote-code',
action='store_true',
@@ -124,12 +157,23 @@ def run_to_completion(profile_dir: Optional[str] = None):
action='store_true',
help='enforce eager mode and disable CUDA graph')
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=['auto', 'fp8_e5m2'],
- default='auto',
- help=
- 'Data type for kv cache storage. If "auto", will use model data type.')
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
+ default="auto",
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+ parser.add_argument(
+ '--quantization-param-path',
+ type=str,
+ default=None,
+ help='Path to the JSON file containing the KV cache scaling factors. '
+ 'This should generally be supplied, when KV cache dtype is FP8. '
+ 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+ 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+ 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+ 'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
@@ -144,12 +188,38 @@ def run_to_completion(profile_dir: Optional[str] = None):
"--device",
type=str,
default="cuda",
- choices=["cuda"],
- help='device type for vLLM execution, supporting CUDA only currently.')
+ choices=["cuda", "cpu"],
+ help='device type for vLLM execution, supporting CUDA and CPU.')
+ parser.add_argument('--block-size',
+ type=int,
+ default=16,
+ help='block size of key/value cache')
+ parser.add_argument(
+ '--enable-chunked-prefill',
+ action='store_true',
+ help='If True, the prefill requests can be chunked based on the '
+ 'max_num_batched_tokens')
+ parser.add_argument('--use-v2-block-manager', action='store_true')
parser.add_argument(
"--ray-workers-use-nsight",
action='store_true',
help="If specified, use nsight to profile ray workers",
)
+ parser.add_argument('--download-dir',
+ type=str,
+ default=None,
+ help='directory to download and load the weights, '
+ 'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the latency results in JSON format.')
+ parser.add_argument('--gpu-memory-utilization',
+ type=float,
+ default=0.9,
+ help='the fraction of GPU memory to be used for '
+ 'the model executor, which can range from 0 to 1.'
+ 'If unspecified, will use the default value of 0.9.')
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_prefix_caching.py b/benchmarks/benchmark_prefix_caching.py
index 546c61e847..0899669869 100644
--- a/benchmarks/benchmark_prefix_caching.py
+++ b/benchmarks/benchmark_prefix_caching.py
@@ -1,8 +1,7 @@
import argparse
import time
-from vllm import LLM
-from vllm import SamplingParams
+from vllm import LLM, SamplingParams
PROMPT = "You are a helpful assistant in recognizes the content of tables in markdown format. Here is a table as fellows. You need to answer my question about the table.\n# Table\n|Opening|Opening|Sl. No.|Film|Cast|Director|Music Director|Notes|\n|----|----|----|----|----|----|----|----|\n|J A N|9|1|Agni Pushpam|Jayabharathi, Kamalahasan|Jeassy|M. K. Arjunan||\n|J A N|16|2|Priyamvada|Mohan Sharma, Lakshmi, KPAC Lalitha|K. S. Sethumadhavan|V. Dakshinamoorthy||\n|J A N|23|3|Yakshagaanam|Madhu, Sheela|Sheela|M. S. Viswanathan||\n|J A N|30|4|Paalkkadal|Sheela, Sharada|T. K. Prasad|A. T. Ummer||\n|F E B|5|5|Amma|Madhu, Srividya|M. Krishnan Nair|M. K. Arjunan||\n|F E B|13|6|Appooppan|Thikkurissi Sukumaran Nair, Kamal Haasan|P. Bhaskaran|M. S. Baburaj||\n|F E B|20|7|Srishti|Chowalloor Krishnankutty, Ravi Alummoodu|K. T. Muhammad|M. S. Baburaj||\n|F E B|20|8|Vanadevatha|Prem Nazir, Madhubala|Yusufali Kechery|G. Devarajan||\n|F E B|27|9|Samasya|Madhu, Kamalahaasan|K. Thankappan|Shyam||\n|F E B|27|10|Yudhabhoomi|K. P. Ummer, Vidhubala|Crossbelt Mani|R. K. Shekhar||\n|M A R|5|11|Seemantha Puthran|Prem Nazir, Jayabharathi|A. B. Raj|M. K. Arjunan||\n|M A R|12|12|Swapnadanam|Rani Chandra, Dr. Mohandas|K. G. George|Bhaskar Chandavarkar||\n|M A R|19|13|Thulavarsham|Prem Nazir, sreedevi, Sudheer|N. Sankaran Nair|V. Dakshinamoorthy||\n|M A R|20|14|Aruthu|Kaviyoor Ponnamma, Kamalahasan|Ravi|G. Devarajan||\n|M A R|26|15|Swimming Pool|Kamal Haasan, M. G. Soman|J. Sasikumar|M. K. Arjunan||\n\n# Question\nWhat' s the content in the (1,1) cells\n" # noqa: E501
@@ -17,20 +16,22 @@ def test_prefix(llm=None, sampling_params=None, prompts=None):
def main(args):
- llm = LLM(model="baichuan-inc/Baichuan2-13B-Chat",
+ llm = LLM(model=args.model,
tokenizer_mode='auto',
trust_remote_code=True,
enforce_eager=True,
+ use_v2_block_manager=args.use_v2_block_manager,
+ tensor_parallel_size=args.tensor_parallel_size,
enable_prefix_caching=args.enable_prefix_caching)
num_prompts = 100
prompts = [PROMPT] * num_prompts
- sampling_params = SamplingParams(temperature=0, max_tokens=100)
+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
print("------warm up------")
test_prefix(
llm=llm,
- prompts=prompts[:1],
+ prompts=prompts,
sampling_params=sampling_params,
)
@@ -46,8 +47,16 @@ def main(args):
parser = argparse.ArgumentParser(
description='Benchmark the performance with or without automatic '
'prefix caching.')
+ parser.add_argument('--model',
+ type=str,
+ default='baichuan-inc/Baichuan2-13B-Chat')
+ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+ parser.add_argument('--output-len', type=int, default=10)
parser.add_argument('--enable-prefix-caching',
action='store_true',
help='enable prefix caching')
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ help='Use BlockSpaceMangerV2')
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 9404608b55..f3d71de775 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -1,8 +1,8 @@
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
- (vLLM backend)
- python -m vllm.entrypoints.api_server \
+ vLLM OpenAI API server
+ python -m vllm.entrypoints.openai.api_server \
--model --swap-space 16 \
--disable-log-requests
@@ -12,28 +12,34 @@
On the client side, run:
python benchmarks/benchmark_serving.py \
--backend \
- --model --dataset \
- --request-rate
+ --model <your_model> \
+ --dataset-name sharegpt \
+ --dataset-path <path to dataset> \
+ --request-rate <request_rate> \ # By default <request_rate> is inf
+ --num-prompts <num_prompts> # By default <num_prompts> is 1000
+
+ When using the TGI backend, add
+ --endpoint /generate_stream
+ to the end of the command above.
"""
import argparse
import asyncio
import json
+import os
import random
import time
+import warnings
from dataclasses import dataclass
from datetime import datetime
-from typing import AsyncGenerator, List, Tuple
+from typing import AsyncGenerator, List, Optional, Tuple
import numpy as np
+from backend_request_func import (ASYNC_REQUEST_FUNCS, RequestFuncInput,
+ RequestFuncOutput)
from tqdm.asyncio import tqdm
from transformers import PreTrainedTokenizerBase
-from vllm.transformers_utils.tokenizer import get_tokenizer
-from backend_request_func import (
- ASYNC_REQUEST_FUNCS,
- RequestFuncInput,
- RequestFuncOutput,
-)
+from vllm.transformers_utils.tokenizer import get_tokenizer
@dataclass
@@ -52,11 +58,15 @@ class BenchmarkMetrics:
p99_tpot_ms: float
-def sample_requests(
+def sample_sharegpt_requests(
dataset_path: str,
num_requests: int,
tokenizer: PreTrainedTokenizerBase,
+ fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int]]:
+ if fixed_output_len is not None and fixed_output_len < 4:
+ raise ValueError("output_len too small")
+
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -66,37 +76,101 @@ def sample_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
- # some of these will be filtered out, so sample more than we need
- sampled_indices = random.sample(range(len(dataset)),
- int(num_requests * 1.2))
- dataset = [dataset[i] for i in sampled_indices]
-
- # Tokenize the prompts and completions.
- prompts = [prompt for prompt, _ in dataset]
- prompt_token_ids = tokenizer(prompts).input_ids
- completions = [completion for _, completion in dataset]
- completion_token_ids = tokenizer(completions).input_ids
- tokenized_dataset = []
- for i in range(len(dataset)):
- output_len = len(completion_token_ids[i])
- tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+ # Shuffle the dataset.
+ random.shuffle(dataset)
- # Filter out too long sequences.
+ # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
- for prompt, prompt_token_ids, output_len in tokenized_dataset:
+ for i in range(len(dataset)):
+ if len(filtered_dataset) == num_requests:
+ break
+
+ # Tokenize the prompts and completions.
+ prompt = dataset[i][0]
+ prompt_token_ids = tokenizer(prompt).input_ids
+ completion = dataset[i][1]
+ completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
+ output_len = len(completion_token_ids
+ ) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
- # This is because TGI causes errors when the input or output length
- # is too short.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
# Prune too long sequences.
continue
filtered_dataset.append((prompt, prompt_len, output_len))
- # Sample the requests.
- sampled_requests = random.sample(filtered_dataset, num_requests)
+ return filtered_dataset
+
+
+def sample_sonnet_requests(
+ dataset_path: str,
+ num_requests: int,
+ input_len: int,
+ output_len: int,
+ prefix_len: int,
+ tokenizer: PreTrainedTokenizerBase,
+) -> List[Tuple[str, str, int, int]]:
+ assert (
+ input_len > prefix_len
+ ), "'args.sonnet-input-len' must be greater than 'args.prefix-input-len'."
+
+ # Load the dataset.
+ with open(dataset_path) as f:
+ poem_lines = f.readlines()
+
+ # Tokenize the poem lines.
+ poem_token_ids = tokenizer(poem_lines).input_ids
+ average_poem_len = sum(
+ len(token_ids) for token_ids in poem_token_ids) / len(poem_token_ids)
+
+ # Base prefix for all requests.
+ base_prompt = "Pick as many lines as you can from these poem lines:\n"
+ base_message = [{
+ "role": "user",
+ "content": base_prompt,
+ }]
+ base_prompt_formatted = tokenizer.apply_chat_template(
+ base_message, add_generation_prompt=True, tokenize=False)
+ base_prompt_offset = len(tokenizer(base_prompt_formatted).input_ids)
+
+ assert (
+ input_len > base_prompt_offset
+ ), f"Please set 'args.sonnet-input-len' higher than {base_prompt_offset}."
+ num_input_lines = round(
+ (input_len - base_prompt_offset) / average_poem_len)
+
+ # The first `prefix_len` tokens (approximately) of the
+ # prompt are fixed poem lines shared by every request.
+ assert (
+ prefix_len > base_prompt_offset
+ ), f"Please set 'args.sonnet-prefix-len' higher than {base_prompt_offset}."
+
+ num_prefix_lines = round(
+ (prefix_len - base_prompt_offset) / average_poem_len)
+ prefix_lines = poem_lines[:num_prefix_lines]
+
+ # Sample the rest of lines per request.
+ sampled_requests: List[Tuple[str, int, int]] = []
+ for _ in range(num_requests):
+ sampled_lines = "".join(
+ prefix_lines +
+ random.sample(poem_lines, num_input_lines - num_prefix_lines))
+
+ prompt = f"{base_prompt}{sampled_lines}"
+ message = [
+ {
+ "role": "user",
+ "content": prompt,
+ },
+ ]
+ prompt_formatted = tokenizer.apply_chat_template(
+ message, add_generation_prompt=True, tokenize=False)
+ prompt_len = len(tokenizer(prompt_formatted).input_ids)
+ sampled_requests.append(
+ (prompt, prompt_formatted, prompt_len, output_len))
+
return sampled_requests
@@ -122,37 +196,47 @@ def calculate_metrics(
outputs: List[RequestFuncOutput],
dur_s: float,
tokenizer: PreTrainedTokenizerBase,
-) -> BenchmarkMetrics:
- total_output = 0
+) -> Tuple[BenchmarkMetrics, List[int]]:
+ actual_output_lens = []
total_input = 0
completed = 0
- per_token_latencies = []
+ tpots = []
ttfts = []
for i in range(len(outputs)):
if outputs[i].success:
- output_len = len(tokenizer.encode(outputs[i].generated_text))
- total_output += output_len
+ output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+ actual_output_lens.append(output_len)
total_input += input_requests[i][1]
- per_token_latencies.append(outputs[i].latency / output_len)
+ if output_len > 1:
+ tpots.append(
+ (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
ttfts.append(outputs[i].ttft)
completed += 1
-
+ else:
+ actual_output_lens.append(0)
+
+ if completed == 0:
+ warnings.warn(
+ "All requests failed. This is likely due to a misconfiguration "
+ "on the benchmark arguments.",
+ stacklevel=2)
metrics = BenchmarkMetrics(
completed=completed,
total_input=total_input,
- total_output=total_output,
+ total_output=sum(actual_output_lens),
request_throughput=completed / dur_s,
input_throughput=total_input / dur_s,
- output_throughput=total_output / dur_s,
- mean_ttft_ms=np.mean(ttfts) * 1000,
- median_ttft_ms=np.median(ttfts) * 1000,
- p99_ttft_ms=np.percentile(ttfts, 99) * 1000,
- mean_tpot_ms=np.mean(per_token_latencies) * 1000,
- median_tpot_ms=np.median(per_token_latencies) * 1000,
- p99_tpot_ms=np.percentile(per_token_latencies, 99) * 1000,
+ output_throughput=sum(actual_output_lens) / dur_s,
+ mean_ttft_ms=np.mean(ttfts or 0) *
+ 1000, # ttfts is empty if streaming is not supported by backend
+ median_ttft_ms=np.median(ttfts or 0) * 1000,
+ p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
+ mean_tpot_ms=np.mean(tpots or 0) * 1000,
+ median_tpot_ms=np.median(tpots or 0) * 1000,
+ p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
)
- return metrics
+ return metrics, actual_output_lens
async def benchmark(
@@ -171,6 +255,24 @@ async def benchmark(
else:
raise ValueError(f"Unknown backend: {backend}")
+ print("Starting initial single prompt test run...")
+ test_prompt, test_prompt_len, test_output_len = input_requests[0]
+ test_input = RequestFuncInput(
+ model=model_id,
+ prompt=test_prompt,
+ api_url=api_url,
+ prompt_len=test_prompt_len,
+ output_len=test_output_len,
+ best_of=best_of,
+ use_beam_search=use_beam_search,
+ )
+ test_output = await request_func(request_func_input=test_input)
+ if not test_output.success:
+ raise ValueError(
+ "Initial test run failed - Please make sure benchmark arguments "
+ f"are correctly specified. Error: {test_output.error}")
+ else:
+ print("Initial test run completed. Starting main benchmark run...")
print(f"Traffic request rate: {request_rate}")
pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -192,40 +294,53 @@ async def benchmark(
asyncio.create_task(
request_func(request_func_input=request_func_input,
pbar=pbar)))
- outputs = await asyncio.gather(*tasks)
+ outputs: List[RequestFuncOutput] = await asyncio.gather(*tasks)
if not disable_tqdm:
pbar.close()
benchmark_duration = time.perf_counter() - benchmark_start_time
- metrics = calculate_metrics(
+ metrics, actual_output_lens = calculate_metrics(
input_requests=input_requests,
outputs=outputs,
dur_s=benchmark_duration,
tokenizer=tokenizer,
)
- print(f"Successful requests: {metrics.completed}")
- print(f"Benchmark duration: {benchmark_duration:2f} s")
- print(f"Total input tokens: {metrics.total_input}")
- print(f"Total generated tokens: {metrics.total_output}")
- print(f"Request throughput: {metrics.request_throughput:.2f} requests/s")
- print(f"Input token throughput: {metrics.input_throughput:.2f} tokens/s")
- print(f"Output token throughput: {metrics.output_throughput:.2f} tokens/s")
- print(f"Mean TTFT: {metrics.mean_ttft_ms:.2f} ms")
- print(f"Median TTFT: {metrics.median_ttft_ms:.2f} ms")
- print(f"P99 TTFT: {metrics.p99_ttft_ms:.2f} ms")
- print(f"Mean TPOT: {metrics.mean_tpot_ms:.2f} ms")
- print(f"Median TPOT: {metrics.median_tpot_ms:.2f} ms")
- print(f"P99 TPOT: {metrics.p99_tpot_ms:.2f} ms")
+ print("{s:{c}^{n}}".format(s=' Serving Benchmark Result ', n=50, c='='))
+ print("{:<40} {:<10}".format("Successful requests:", metrics.completed))
+ print("{:<40} {:<10.2f}".format("Benchmark duration (s):",
+ benchmark_duration))
+ print("{:<40} {:<10}".format("Total input tokens:", metrics.total_input))
+ print("{:<40} {:<10}".format("Total generated tokens:",
+ metrics.total_output))
+ print("{:<40} {:<10.2f}".format("Request throughput (req/s):",
+ metrics.request_throughput))
+ print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):",
+ metrics.input_throughput))
+ print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):",
+ metrics.output_throughput))
+ print("{s:{c}^{n}}".format(s='Time to First Token', n=50, c='-'))
+ print("{:<40} {:<10.2f}".format("Mean TTFT (ms):", metrics.mean_ttft_ms))
+ print("{:<40} {:<10.2f}".format("Median TTFT (ms):",
+ metrics.median_ttft_ms))
+ print("{:<40} {:<10.2f}".format("P99 TTFT (ms):", metrics.p99_ttft_ms))
+ print("{s:{c}^{n}}".format(s='Time per Output Token (excl. 1st token)',
+ n=50,
+ c='-'))
+ print("{:<40} {:<10.2f}".format("Mean TPOT (ms):", metrics.mean_tpot_ms))
+ print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
+ metrics.median_tpot_ms))
+ print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+ print("=" * 50)
result = {
"duration": benchmark_duration,
"completed": metrics.completed,
"total_input_tokens": metrics.total_input,
"total_output_tokens": metrics.total_output,
- "request_inthroughput": metrics.request_throughput,
+ "request_throughput": metrics.request_throughput,
"input_throughput": metrics.input_throughput,
"output_throughput": metrics.output_throughput,
"mean_ttft_ms": metrics.mean_ttft_ms,
@@ -233,7 +348,13 @@ async def benchmark(
"p99_ttft_ms": metrics.p99_ttft_ms,
"mean_tpot_ms": metrics.mean_tpot_ms,
"median_tpot_ms": metrics.median_tpot_ms,
- "p99_tpot_ms": metrics.p99_tpot_ms
+ "p99_tpot_ms": metrics.p99_tpot_ms,
+ "input_lens": [output.prompt_len for output in outputs],
+ "output_lens": actual_output_lens,
+ "ttfts": [output.ttft for output in outputs],
+ "itls": [output.itl for output in outputs],
+ "generated_texts": [output.generated_text for output in outputs],
+ "errors": [output.error for output in outputs],
}
return result
@@ -254,7 +375,60 @@ def main(args: argparse.Namespace):
tokenizer = get_tokenizer(tokenizer_id,
trust_remote_code=args.trust_remote_code)
- input_requests = sample_requests(args.dataset, args.num_prompts, tokenizer)
+
+ if args.dataset is not None:
+ warnings.warn(
+ "The '--dataset' argument will be deprecated in the next "
+ "release. Please use '--dataset-name' and "
+ "'--dataset-path' in the future runs.",
+ stacklevel=2)
+ input_requests = sample_sharegpt_requests(
+ dataset_path=args.dataset,
+ num_requests=args.num_prompts,
+ tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
+ )
+
+ elif args.dataset_name == "sharegpt":
+ input_requests = sample_sharegpt_requests(
+ dataset_path=args.dataset_path,
+ num_requests=args.num_prompts,
+ tokenizer=tokenizer,
+ fixed_output_len=args.sharegpt_output_len,
+ )
+
+ elif args.dataset_name == "sonnet":
+ # Do not format the prompt, pass to message directly
+ if args.backend == "openai-chat":
+ input_requests = sample_sonnet_requests(
+ dataset_path=args.dataset_path,
+ num_requests=args.num_prompts,
+ input_len=args.sonnet_input_len,
+ output_len=args.sonnet_output_len,
+ prefix_len=args.sonnet_prefix_len,
+ tokenizer=tokenizer,
+ )
+ input_requests = [(prompt, prompt_len, output_len)
+ for prompt, prompt_formatted, prompt_len,
+ output_len in input_requests]
+ else:
+ assert (
+ tokenizer.chat_template or tokenizer.default_chat_template
+ ), "Tokenizer/model must have chat template for sonnet dataset."
+ input_requests = sample_sonnet_requests(
+ dataset_path=args.dataset_path,
+ num_requests=args.num_prompts,
+ input_len=args.sonnet_input_len,
+ output_len=args.sonnet_output_len,
+ prefix_len=args.sonnet_prefix_len,
+ tokenizer=tokenizer,
+ )
+ input_requests = [(prompt_formatted, prompt_len, output_len)
+ for prompt, prompt_formatted, prompt_len,
+ output_len in input_requests]
+
+ else:
+ raise ValueError(f"Unknown dataset: {args.dataset_name}")
benchmark_result = asyncio.run(
benchmark(
@@ -277,13 +451,23 @@ def main(args: argparse.Namespace):
current_dt = datetime.now().strftime("%Y%m%d-%H%M%S")
result_json["date"] = current_dt
result_json["backend"] = backend
- result_json["version"] = args.version
result_json["model_id"] = model_id
result_json["tokenizer_id"] = tokenizer_id
result_json["best_of"] = args.best_of
result_json["use_beam_search"] = args.use_beam_search
result_json["num_prompts"] = args.num_prompts
+ # Metadata
+ if args.metadata:
+ for item in args.metadata:
+ if "=" in item:
+ kvstring = item.split("=")
+ result_json[kvstring[0].strip()] = kvstring[1].strip()
+ else:
+ raise ValueError(
+ "Invalid metadata format. Please use KEY=VALUE format."
+ )
+
# Traffic
result_json["request_rate"] = (
args.request_rate if args.request_rate < float("inf") else "inf")
@@ -293,9 +477,9 @@ def main(args: argparse.Namespace):
# Save to file
base_model_id = model_id.split("/")[-1]
- file_name = (
- f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json"
- )
+ file_name = f"{backend}-{args.request_rate}qps-{base_model_id}-{current_dt}.json" #noqa
+ if args.result_dir:
+ file_name = os.path.join(args.result_dir, file_name)
with open(file_name, "w") as outfile:
json.dump(result_json, outfile)
@@ -309,12 +493,6 @@ def main(args: argparse.Namespace):
default="vllm",
choices=list(ASYNC_REQUEST_FUNCS.keys()),
)
- parser.add_argument(
- "--version",
- type=str,
- default="N/A",
- help="Version of the serving backend/engine.",
- )
parser.add_argument(
"--base-url",
type=str,
@@ -326,12 +504,26 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--endpoint",
type=str,
- default="/generate",
+ default="/v1/completions",
help="API endpoint.",
)
- parser.add_argument("--dataset",
+ parser.add_argument(
+ "--dataset",
+ type=str,
+ default=None,
+ help="Path to the ShareGPT dataset, will be deprecated in the "
+ "next release.",
+ )
+ parser.add_argument(
+ "--dataset-name",
+ type=str,
+ default="sharegpt",
+ choices=["sharegpt", "sonnet"],
+ help="Name of the dataset to benchmark on.",
+ )
+ parser.add_argument("--dataset-path",
type=str,
- required=True,
+ default=None,
help="Path to the dataset.")
parser.add_argument(
"--model",
@@ -359,6 +551,33 @@ def main(args: argparse.Namespace):
default=1000,
help="Number of prompts to process.",
)
+ parser.add_argument(
+ "--sharegpt-output-len",
+ type=int,
+ default=None,
+ help="Output length for each request. Overrides the output length "
+ "from the ShareGPT dataset.")
+ parser.add_argument(
+ "--sonnet-input-len",
+ type=int,
+ default=550,
+ help=
+ "Number of input tokens per request, used only for sonnet dataset.",
+ )
+ parser.add_argument(
+ "--sonnet-output-len",
+ type=int,
+ default=150,
+ help=
+ "Number of output tokens per request, used only for sonnet dataset.",
+ )
+ parser.add_argument(
+ "--sonnet-prefix-len",
+ type=int,
+ default=200,
+ help=
+ "Number of prefix tokens per request, used only for sonnet dataset.",
+ )
parser.add_argument(
"--request-rate",
type=float,
@@ -384,6 +603,21 @@ def main(args: argparse.Namespace):
action="store_true",
help="Specify to save benchmark results to a json file",
)
+ parser.add_argument(
+ "--metadata",
+ metavar="KEY=VALUE",
+ nargs="*",
+ help="Key-value pairs (e.g, --metadata version=0.3.3 tp=1) "
+ "for metadata of this run to be saved in the result JSON file "
+ "for record keeping purposes.",
+ )
+ parser.add_argument(
+ "--result-dir",
+ type=str,
+ default=None,
+ help="Specify directory to save benchmark json results."
+ "If not specified, results are saved in the current directory.",
+ )
args = parser.parse_args()
main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 72bdc4b3b4..7c8cb5ee8c 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -6,9 +6,11 @@
from typing import List, Optional, Tuple
import torch
+from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
PreTrainedTokenizerBase)
-from tqdm import tqdm
+
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
def sample_requests(
@@ -29,22 +31,23 @@ def sample_requests(
dataset = [(data["conversations"][0]["value"],
data["conversations"][1]["value"]) for data in dataset]
- # Tokenize the prompts and completions.
- prompts = [prompt for prompt, _ in dataset]
- prompt_token_ids = tokenizer(prompts).input_ids
- completions = [completion for _, completion in dataset]
- completion_token_ids = tokenizer(completions).input_ids
- tokenized_dataset = []
- for i in range(len(dataset)):
- output_len = len(completion_token_ids[i])
- if fixed_output_len is not None:
- output_len = fixed_output_len
- tokenized_dataset.append((prompts[i], prompt_token_ids[i], output_len))
+ # Shuffle the dataset.
+ random.shuffle(dataset)
- # Filter out too long sequences.
+ # Filter out sequences that are too long or too short
filtered_dataset: List[Tuple[str, int, int]] = []
- for prompt, prompt_token_ids, output_len in tokenized_dataset:
+ for i in range(len(dataset)):
+ if len(filtered_dataset) == num_requests:
+ break
+
+ # Tokenize the prompts and completions.
+ prompt = dataset[i][0]
+ prompt_token_ids = tokenizer(prompt).input_ids
+ completion = dataset[i][1]
+ completion_token_ids = tokenizer(completion).input_ids
prompt_len = len(prompt_token_ids)
+ output_len = len(completion_token_ids
+ ) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
# Prune too short sequences.
continue
@@ -53,9 +56,7 @@ def sample_requests(
continue
filtered_dataset.append((prompt, prompt_len, output_len))
- # Sample the requests.
- sampled_requests = random.sample(filtered_dataset, num_requests)
- return sampled_requests
+ return filtered_dataset
def run_vllm(
@@ -72,45 +73,52 @@ def run_vllm(
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
+ quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
+ enable_chunked_prefill: bool,
+ max_num_batched_tokens: int,
gpu_memory_utilization: float = 0.9,
+ download_dir: Optional[str] = None,
) -> float:
from vllm import LLM, SamplingParams
- llm = LLM(model=model,
- tokenizer=tokenizer,
- quantization=quantization,
- tensor_parallel_size=tensor_parallel_size,
- seed=seed,
- trust_remote_code=trust_remote_code,
- dtype=dtype,
- max_model_len=max_model_len,
- gpu_memory_utilization=gpu_memory_utilization,
- enforce_eager=enforce_eager,
- kv_cache_dtype=kv_cache_dtype,
- device=device,
- enable_prefix_caching=enable_prefix_caching)
+ llm = LLM(
+ model=model,
+ tokenizer=tokenizer,
+ quantization=quantization,
+ tensor_parallel_size=tensor_parallel_size,
+ seed=seed,
+ trust_remote_code=trust_remote_code,
+ dtype=dtype,
+ max_model_len=max_model_len,
+ gpu_memory_utilization=gpu_memory_utilization,
+ enforce_eager=enforce_eager,
+ kv_cache_dtype=kv_cache_dtype,
+ quantization_param_path=quantization_param_path,
+ device=device,
+ enable_prefix_caching=enable_prefix_caching,
+ download_dir=download_dir,
+ enable_chunked_prefill=enable_chunked_prefill,
+ max_num_batched_tokens=max_num_batched_tokens,
+ )
# Add the requests to the engine.
+ prompts = []
+ sampling_params = []
for prompt, _, output_len in requests:
- sampling_params = SamplingParams(
- n=n,
- temperature=0.0 if use_beam_search else 1.0,
- top_p=1.0,
- use_beam_search=use_beam_search,
- ignore_eos=True,
- max_tokens=output_len,
- )
- # FIXME(woosuk): Do not use internal method.
- llm._add_request(
- prompt=prompt,
- prompt_token_ids=None,
- sampling_params=sampling_params,
- )
+ prompts.append(prompt)
+ sampling_params.append(
+ SamplingParams(
+ n=n,
+ temperature=0.0 if use_beam_search else 1.0,
+ top_p=1.0,
+ use_beam_search=use_beam_search,
+ ignore_eos=True,
+ max_tokens=output_len,
+ ))
start = time.perf_counter()
- # FIXME(woosuk): Do not use internal method.
- llm._run_engine(use_tqdm=True)
+ llm.generate(prompts, sampling_params, use_tqdm=True)
end = time.perf_counter()
return end - start
@@ -181,13 +189,15 @@ def run_mii(
tensor_parallel_size: int,
output_len: int,
) -> float:
- from mii import pipeline
- llm = pipeline(model, tensor_parallel=tensor_parallel_size)
+ from mii import client, serve
+ llm = serve(model, tensor_parallel=tensor_parallel_size)
prompts = [prompt for prompt, _, _ in requests]
start = time.perf_counter()
- llm(prompts, max_new_tokens=output_len)
+ llm.generate(prompts, max_new_tokens=output_len)
end = time.perf_counter()
+ client = client(model)
+ client.terminate_server()
return end - start
@@ -212,8 +222,11 @@ def main(args: argparse.Namespace):
requests, args.model, args.tokenizer, args.quantization,
args.tensor_parallel_size, args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype, args.max_model_len,
- args.enforce_eager, args.kv_cache_dtype, args.device,
- args.enable_prefix_caching, args.gpu_memory_utilization)
+ args.enforce_eager, args.kv_cache_dtype,
+ args.quantization_param_path, args.device,
+ args.enable_prefix_caching, args.enable_chunked_prefill,
+ args.max_num_batched_tokens, args.gpu_memory_utilization,
+ args.download_dir)
elif args.backend == "hf":
assert args.tensor_parallel_size == 1
elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -229,6 +242,18 @@ def main(args: argparse.Namespace):
print(f"Throughput: {len(requests) / elapsed_time:.2f} requests/s, "
f"{total_num_tokens / elapsed_time:.2f} tokens/s")
+ # Output JSON results if specified
+ if args.output_json:
+ results = {
+ "elapsed_time": elapsed_time,
+ "num_requests": len(requests),
+ "total_num_tokens": total_num_tokens,
+ "requests_per_second": len(requests) / elapsed_time,
+ "tokens_per_second": total_num_tokens / elapsed_time,
+ }
+ with open(args.output_json, "w") as f:
+ json.dump(results, f, indent=4)
+
if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Benchmark the throughput.")
@@ -253,7 +278,7 @@ def main(args: argparse.Namespace):
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument('--quantization',
'-q',
- choices=['awq', 'gptq', 'squeezellm', None],
+ choices=[*QUANTIZATION_METHODS, None],
default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--n",
@@ -298,22 +323,51 @@ def main(args: argparse.Namespace):
action="store_true",
help="enforce eager execution")
parser.add_argument(
- "--kv-cache-dtype",
+ '--kv-cache-dtype',
type=str,
- choices=["auto", "fp8_e5m2"],
+ choices=['auto', 'fp8', 'fp8_e5m2', 'fp8_e4m3'],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type.')
+ help='Data type for kv cache storage. If "auto", will use model '
+ 'data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. '
+ 'ROCm (AMD GPU) supports fp8 (=fp8_e4m3)')
+ parser.add_argument(
+ '--quantization-param-path',
+ type=str,
+ default=None,
+ help='Path to the JSON file containing the KV cache scaling factors. '
+ 'This should generally be supplied, when KV cache dtype is FP8. '
+ 'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+ 'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+ 'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+ 'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
default="cuda",
- choices=["cuda"],
- help='device type for vLLM execution, supporting CUDA only currently.')
+ choices=["cuda", "cpu"],
+ help='device type for vLLM execution, supporting CUDA and CPU.')
parser.add_argument(
"--enable-prefix-caching",
action='store_true',
help="enable automatic prefix caching for vLLM backend.")
+ parser.add_argument("--enable-chunked-prefill",
+ action='store_true',
+ help="enable chunked prefill for vLLM backend.")
+ parser.add_argument('--max-num-batched-tokens',
+ type=int,
+ default=None,
+ help='maximum number of batched tokens per '
+ 'iteration')
+ parser.add_argument('--download-dir',
+ type=str,
+ default=None,
+ help='directory to download and load the weights, '
+ 'default to the default cache dir of huggingface')
+ parser.add_argument(
+ '--output-json',
+ type=str,
+ default=None,
+ help='Path to save the throughput results in JSON format.')
args = parser.parse_args()
if args.tokenizer is None:
args.tokenizer = args.model
diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
new file mode 100644
index 0000000000..6de56f6187
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -0,0 +1,352 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+
+# Default for model_bench --models: every WEIGHT_SHAPES key except the first
+# one (the [1:] slice drops it).
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
+# Default for model_bench --batch-sizes; each value is used as the GEMM M.
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+# Default for model_bench --tp-sizes; weight shapes are divided by this.
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.tensor) -> torch.tensor:
+    """Saturate to the float8_e4m3fn range, round, and cast to that dtype."""
+    fp8_dtype = torch.float8_e4m3fn
+    limits = torch.finfo(fp8_dtype)
+    saturated = torch.clamp(tensor, min=limits.min, max=limits.max)
+    return saturated.round().to(dtype=fp8_dtype)
+
+
+def to_int8(tensor: torch.tensor) -> torch.tensor:
+    """Saturate to [-128, 127], round, and cast to ``torch.int8``."""
+    saturated = torch.clamp(tensor, min=-128, max=127)
+    return saturated.round().to(dtype=torch.int8)
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+    """Create a random quantized GEMM operand pair on the CUDA device.
+
+    The first operand is drawn with shape ``(m, k)``; the second is the
+    transpose of an ``(n, k)`` draw.  Both are ``randn`` samples scaled by 5
+    and then converted with ``to_int8`` or ``to_fp8`` depending on ``dtype``.
+
+    Raises:
+        ValueError: if ``dtype`` is neither ``torch.int8`` nor
+            ``torch.float8_e4m3fn``.
+    """
+    lhs = torch.randn((m, k), device='cuda') * 5
+    rhs = torch.randn((n, k), device='cuda').t() * 5
+
+    if dtype == torch.int8:
+        quantize = to_int8
+    elif dtype == torch.float8_e4m3fn:
+        quantize = to_fp8
+    else:
+        raise ValueError("unsupported dtype")
+
+    return quantize(lhs), quantize(rhs)
+
+
+# impl
+
+
+def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                    scale_b: torch.tensor,
+                    out_dtype: torch.dtype) -> torch.tensor:
+    """Baseline matmul via ``torch.mm``.
+
+    ``scale_a``, ``scale_b`` and ``out_dtype`` are accepted only so the
+    signature matches the other implementations; they are not used.
+    """
+    product = torch.mm(a, b)
+    return product
+
+
+def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                     scale_b: torch.tensor,
+                     out_dtype: torch.dtype) -> torch.tensor:
+    """Call ``torch._scaled_mm`` with the given scales and output dtype."""
+    scaled_mm_kwargs = {
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
+    }
+    return torch._scaled_mm(a, b, **scaled_mm_kwargs)
+
+
+def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
+                                scale_a: torch.tensor, scale_b: torch.tensor,
+                                out_dtype: torch.dtype) -> torch.tensor:
+    """Call ``torch._scaled_mm`` with ``use_fast_accum=True``."""
+    scaled_mm_kwargs = {
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
+        "use_fast_accum": True,
+    }
+    return torch._scaled_mm(a, b, **scaled_mm_kwargs)
+
+
+def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                 scale_b: torch.tensor,
+                 out_dtype: torch.dtype) -> torch.tensor:
+    """Forward the operands and scales to ``ops.cutlass_scaled_mm_dq``."""
+    result = ops.cutlass_scaled_mm_dq(a,
+                                      b,
+                                      scale_a,
+                                      scale_b,
+                                      out_dtype=out_dtype)
+    return result
+
+
+# bench
+def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+             sub_label: str, fn: Callable, description: str) -> TMeasurement:
+    """Time ``fn(a, b, scale_a, scale_b, out_dtype)`` with a benchmark Timer.
+
+    ``label``, ``sub_label`` and ``description`` are passed through to the
+    Timer unchanged.  The measurement comes from ``blocked_autorange`` with a
+    minimum run time of 1.
+    """
+    # Names made visible to the timed statement.
+    timer_env = dict(a=a,
+                     b=b,
+                     scale_a=scale_a,
+                     scale_b=scale_b,
+                     out_dtype=out_dtype,
+                     fn=fn)
+    timer = TBenchmark.Timer(
+        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
+        globals=timer_env,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    )
+    return timer.blocked_autorange(min_run_time=1)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    """Benchmark the int8 GEMM: a bf16 ``torch.mm`` baseline vs. cutlass.
+
+    Returns the two measurements in that order.
+    """
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    # pytorch impl: operands converted to bf16, scales ignored by the impl.
+    baseline = bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                        b.to(dtype=torch.bfloat16, device="cuda"), scale_a,
+                        scale_b, torch.bfloat16, label, sub_label,
+                        pytorch_i8_impl,
+                        "pytorch_bf16_bf16_bf16_matmul-no-scales")
+
+    # cutlass impl: int8 operands, scales handed over as CPU tensors.
+    cutlass = bench_fn(a, b, scale_a.to(device="cpu"),
+                       scale_b.to(device="cpu"), torch.bfloat16, label,
+                       sub_label, cutlass_impl,
+                       "cutlass_i8_i8_bf16_scaled_mm")
+
+    return [baseline, cutlass]
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    """Benchmark the fp8 GEMM across the pytorch and cutlass variants.
+
+    Six measurements are returned, in this order: pytorch bf16, pytorch bf16
+    fast-accum, pytorch fp16, pytorch fp16 fast-accum, cutlass bf16,
+    cutlass fp16.
+    """
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    # (implementation, output dtype, description, pass scales as CPU tensors)
+    variants = [
+        (pytorch_fp8_impl, torch.bfloat16, "pytorch_fp8_fp8_bf16_scaled_mm",
+         False),
+        (pytorch_fp8_impl_fast_accum, torch.bfloat16,
+         "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum", False),
+        (pytorch_fp8_impl, torch.float16, "pytorch_fp8_fp8_fp16_scaled_mm",
+         False),
+        (pytorch_fp8_impl_fast_accum, torch.float16,
+         "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum", False),
+        (cutlass_impl, torch.bfloat16, "cutlass_fp8_fp8_bf16_scaled_mm",
+         True),
+        (cutlass_impl, torch.float16, "cutlass_fp8_fp8_fp16_scaled_mm", True),
+    ]
+
+    timers = []
+    for fn, out_dtype, description, scales_on_cpu in variants:
+        if scales_on_cpu:
+            sa = scale_a.to(device="cpu")
+            sb = scale_b.to(device="cpu")
+        else:
+            sa, sb = scale_a, scale_b
+        timers.append(
+            bench_fn(a, b, sa, sb, out_dtype, label, sub_label, fn,
+                     description))
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    """Dispatch to ``bench_int8`` or ``bench_fp8`` according to ``dtype``.
+
+    Raises:
+        ValueError: for any other dtype.
+    """
+    if dtype == torch.int8:
+        runner = bench_int8
+    elif dtype == torch.float8_e4m3fn:
+        runner = bench_fp8
+    else:
+        raise ValueError("unsupported type")
+    return runner(dtype, m, k, n, label, sub_label)
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    """Print the measurements as a ``TBenchmark.Compare`` table."""
+    TBenchmark.Compare(timers).print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    """Benchmark every ``(m, k, n)`` shape and collect all measurements.
+
+    The timers of each shape are printed as soon as that shape finishes; the
+    returned list is the concatenation over all shapes, in input order.
+    """
+    label = f"scaled-{dtype}-gemm"
+    collected = []
+    for m, k, n in MKNs:
+        shape_timers = bench(dtype, m, k, n, label, f"MKN=({m}x{k}x{n})")
+        print_timers(shape_timers)
+        collected.extend(shape_timers)
+
+    return collected
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    """Print all measurements and pickle them to the working directory.
+
+    The file is named ``{base_description}-{timestamp}.pkl``; when
+    ``timestamp`` is None the current ``int(time.time())`` is used.  ``MKNs``
+    is accepted but not used.
+    """
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+
+    # pickle all the results
+    if timestamp is None:
+        timestamp = int(time.time())
+    pickle_path = f"{base_description}-{timestamp}.pkl"
+    with open(pickle_path, "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    """Benchmark square GEMMs (M == K == N) over an inclusive dim range.
+
+    Sizes run from ``args.dim_start`` to ``args.dim_end`` (inclusive) in
+    steps of ``args.dim_increment``.
+    """
+    sizes = range(args.dim_start, args.dim_end + 1, args.dim_increment)
+    MKNs = [(dim, dim, dim) for dim in sizes]
+    results = run(args.dtype, MKNs)
+
+    make_output(results, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    """Sweep GEMM dimensions over a range, optionally pinning M, K and/or N.
+
+    Sizes run from ``args.dim_start`` to ``args.dim_end`` in steps of
+    ``args.dim_increment``.  The end is inclusive so that ``--dim-end`` means
+    the same thing here as in ``run_square_bench``, whose subparser declares
+    the identical option.  Any of ``args.m_constant`` / ``args.k_constant`` /
+    ``args.n_constant`` that is not None replaces the swept value for that
+    dimension.
+    """
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    n = len(dim_sizes)
+    Ms = [args.m_constant] * n if args.m_constant is not None else dim_sizes
+    Ks = [args.k_constant] * n if args.k_constant is not None else dim_sizes
+    Ns = [args.n_constant] * n if args.n_constant is not None else dim_sizes
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    """Benchmark the weight shapes of each model at each tensor-parallel size.
+
+    For every (model, tp_size) pair the GEMM shapes are the cross product of
+    ``args.batch_sizes`` (as M) with the model's TP-adjusted (K, N) shapes.
+    Results are printed per pair and all measurements are pickled to
+    ``model_bench-{dtype}-{timestamp}.pkl``.
+    """
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        # Deep-copy so the shared WEIGHT_SHAPES table is never mutated.
+        shapes = []
+        for shape, split_dim in copy.deepcopy(WEIGHT_SHAPES[model_name]):
+            shape[split_dim] //= tp_size
+            shapes.append(shape)
+        return shapes
+
+    print("Benchmarking models:")
+    for idx, name in enumerate(args.models):
+        print(f"[{idx}] {name}")
+
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    model_bench_data = []
+    for model, tp_size in models_tps:
+        KNs = model_shapes(model, tp_size)
+        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
+        model_bench_data.append(run(args.dtype, MKNs))
+
+    # Print all results
+    for data, (model, tp_size) in zip(model_bench_data, models_tps):
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    # pickle all data
+    all_data = list(itertools.chain.from_iterable(model_bench_data))
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+
+    def to_torch_dtype(dt):
+        # argparse ``type=`` converter: map the CLI spelling to a torch dtype.
+        # A ValueError here is reported by argparse as an invalid argument.
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = argparse.ArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+ To run square GEMMs:
+ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+
+ To run constant N and K and sweep M:
+ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+
+ To run dimensions from a model:
+ python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+
+ Output:
+ - a .pkl file, that is a list of raw torch.benchmark.utils.Measurements for the pytorch and cutlass implementations for the various GEMMs.
+ """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument("--dtype",
+                        type=to_torch_dtype,
+                        required=True,
+                        help="Available options are ['int8', 'fp8']")
+    # A subcommand is mandatory: each one installs the ``func`` default that
+    # is called below, so without ``required=True`` a missing subcommand
+    # would surface as an AttributeError instead of a usage message.
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+
+    # square_bench: M == K == N swept over a dim range.
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+
+    # range_bench: sweep a dim range, optionally pinning M, N and/or K.
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--dim-start", type=int, required=True)
+    range_parser.add_argument("--dim-end", type=int, required=True)
+    range_parser.add_argument("--dim-increment", type=int, required=True)
+    range_parser.add_argument("--m-constant", type=int, default=None)
+    range_parser.add_argument("--n-constant", type=int, default=None)
+    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.set_defaults(func=run_range_bench)
+
+    # model_bench: shapes taken from the WEIGHT_SHAPES table.
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--batch-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_BATCH_SIZES)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
new file mode 100644
index 0000000000..7ad4a53d37
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -0,0 +1,37 @@
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+# A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+# - TP1 : K = 14336, N = 4096
+# - TP2 : K = 7168, N = 4096
+# A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+# - TP1 : K = 4096, N = 6144
+# - TP4 : K = 4096, N = 1536
+
+# TP1 shapes, keyed by model name. Each entry is ([K, N], TP_SPLIT_DIM) as
+# described above: TP_SPLIT_DIM is the index into [K, N] of the dimension
+# that gets divided by the tensor-parallel size.
+WEIGHT_SHAPES = {
+ "mistralai/Mistral-7B-v0.1": [
+ ([4096, 6144], 1),
+ ([4096, 4096], 0),
+ ([4096, 28672], 1),
+ ([14336, 4096], 0),
+ ],
+ "meta-llama/Llama-2-7b-hf": [
+ ([4096, 12288], 1),
+ ([4096, 4096], 0),
+ ([4096, 22016], 1),
+ ([11008, 4096], 0),
+ ],
+ "meta-llama/Llama-2-13b-hf": [
+ ([5120, 15360], 1),
+ ([5120, 5120], 0),
+ ([5120, 27648], 1),
+ ([13824, 5120], 0),
+ ],
+ "meta-llama/Llama-2-70b-hf": [
+ ([8192, 10240], 1),
+ ([8192, 8192], 0),
+ ([8192, 57344], 1),
+ ([28672, 8192], 0),
+ ],
+}
diff --git a/benchmarks/kernels/benchmark_aqlm.py b/benchmarks/kernels/benchmark_aqlm.py
new file mode 100644
index 0000000000..59392947b1
--- /dev/null
+++ b/benchmarks/kernels/benchmark_aqlm.py
@@ -0,0 +1,302 @@
+import argparse
+import os
+import sys
+from typing import Optional
+
+import torch
+import torch.nn.functional as F
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.aqlm import (
+ dequantize_weight, generic_dequantize_gemm, get_int_dtype,
+ optimized_dequantize_gemm)
+
+# Expose only CUDA device 0 to this process; the functions below allocate
+# their tensors on 'cuda:0'.
+os.environ['CUDA_VISIBLE_DEVICES'] = '0'
+
+
+def torch_mult(
+    input: torch.Tensor,  # [..., in_features]
+    weights: torch.Tensor,
+    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
+) -> torch.Tensor:
+    """Plain ``F.linear(input, weights)``; ``scales`` is accepted but unused."""
+    return F.linear(input, weights)
+
+
+def dequant_out_scale(
+    input: torch.Tensor,  # [..., in_features]
+    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize with ``ops.aqlm_dequant``, then apply a scaled linear layer.
+
+    Without a bias the scales are multiplied into the matmul output; with a
+    bias they are multiplied into the dequantized weights before the matmul.
+    """
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    if bias is not None:
+        # Scale the weights in place, then run the biased linear layer.
+        weight_scales = scales.view(scales.shape[:-3] + (-1, )).expand(
+            -1, weights.shape[1])
+        weights *= weight_scales
+        return F.linear(input, weights, bias)
+
+    # No bias: scale the flattened output in place and restore its shape.
+    output = F.linear(input, weights, bias)
+    out_2d = output.view(-1, output.size(-1))
+    out_scales = scales.view(-1, scales.shape[0]).expand(out_2d.shape[0], -1)
+    out_2d *= out_scales
+    return out_2d.view(output.shape)
+
+
+def dequant_weight_scale(
+    input: torch.Tensor,  # [..., in_features]
+    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize, scale the weights in place, then apply ``F.linear``."""
+    weights = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+
+    flat_scales = scales.view(scales.shape[:-3] + (-1, ))
+    weights *= flat_scales.expand(-1, weights.shape[1])
+    return F.linear(input, weights, bias)
+
+
+def dequant_no_scale(
+    input: torch.Tensor,  # [..., in_features]
+    codes: torch.IntTensor,  # [num_out_groups, num_in_groups, num_codebooks]
+    codebooks: torch.
+    Tensor,  # [num_codebooks, codebook_size, out_group_size, in_group_size]
+    scales: torch.Tensor,  # [num_out_groups, 1, 1, 1]
+    output_partition_sizes: torch.IntTensor,
+    bias: Optional[torch.Tensor],
+) -> torch.Tensor:
+    """Dequantize and apply ``F.linear``; ``scales`` is accepted but unused."""
+    dequantized = ops.aqlm_dequant(codes, codebooks, output_partition_sizes)
+    return F.linear(input, dequantized, bias)
+
+
+# Compare the optimized 1x16 and 2x8 cuda decompression/dequant kernels against
+# the generic pytorch version.
+# Just visual comparison.
+def dequant_test(k: int, parts: torch.tensor, nbooks: int, bits: int) -> None:
+ # Runs dequantize_weight and ops.aqlm_dequant on the same codes/codebooks
+ # and prints both results; nothing is asserted or returned.
+
+ # Sum of the partition sizes; used as the first dimension of `codes`.
+ n = parts.sum().item()
+
+ device = torch.device('cuda:0')
+
+ # Codes are drawn from [-code_range, code_range).
+ code_range = (1 << bits) // 2
+ ingroups = 8
+
+ codes = torch.randint(-code_range,
+ code_range,
+ size=(n, k // ingroups, nbooks),
+ dtype=get_int_dtype(bits),
+ device=device)
+
+ codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+ dtype=torch.float16,
+ device=device)
+
+ # Overwrite the first 16 entries of each codebook with count-derived values
+ # (scaled by 10**book) in place of the random ones.
+ count = 0
+ for index in range(16):
+ for i in range(8):
+ for book in range(nbooks):
+ codebooks[book, index, 0, i] = count * (10**book)
+ count += 1
+
+ print("codes shape", codes.shape)
+
+ # Give row 0 known codes at positions i and -i for i in 0..15.
+ for i in range(16):
+ for book in range(nbooks):
+ codes[0, i, book] = i
+ codes[0, -i, book] = i
+
+ # Two dequantization paths over the same inputs.
+ weights = dequantize_weight(codes, codebooks, None)
+ weights2 = ops.aqlm_dequant(codes, codebooks, parts)
+
+ print("weights shape:", weights.shape)
+ print("weights2 shape:", weights2.shape)
+
+ print("weights are:", weights)
+ print("weights2 are:", weights2)
+
+ print("first 128 weights are", weights[0, 0:128].to(torch.int32))
+ print("first 128 weights2 are:", weights2[0, 0:128].to(torch.int32))
+
+ print("last 128 weights are", weights[0, -128:])
+ print("last 128 weights2 are:", weights2[0, -128:])
+
+
+def _str_to_bool(value: str) -> bool:
+    """Parse a boolean written as text on the command line.
+
+    Raises:
+        argparse.ArgumentTypeError: if ``value`` is not a recognized spelling.
+    """
+    normalized = value.strip().lower()
+    if normalized in ("1", "true", "t", "yes", "y", "on"):
+        return True
+    if normalized in ("", "0", "false", "f", "no", "n", "off"):
+        return False
+    raise argparse.ArgumentTypeError(f"expected a boolean, got {value!r}")
+
+
+def main():
+    """Parse the CLI, then run the dequant tester or write the benchmark CSV.
+
+    With ``--test`` the decompression/dequant tester runs once and the
+    function returns.  Otherwise every method is timed over a fixed grid of
+    shapes and the table is written to ``./aqlm_benchmark_{nbooks}x{bits}.csv``
+    while the current ``m`` is echoed to the real stdout as progress.
+    """
+    import contextlib
+
+    parser = argparse.ArgumentParser(description="Benchmark aqlm performance.")
+
+    # Add arguments
+    parser.add_argument("--nbooks",
+                        type=int,
+                        default=1,
+                        help="Number of codebooks (default: 1)")
+    parser.add_argument("--bits",
+                        type=int,
+                        default=16,
+                        help="Number of bits per code element (default: 16)")
+    # ``type=bool`` would turn any non-empty string, including "False", into
+    # True.  Parse the text explicitly instead; a bare ``--test`` also works.
+    parser.add_argument(
+        "--test",
+        type=_str_to_bool,
+        nargs="?",
+        const=True,
+        default=False,
+        help="Run the decompression/dequant tester rather than benchmarking "
+        "(default: False)")
+
+    # Parse the arguments
+    args = parser.parse_args()
+
+    # Extract values
+    nbooks = args.nbooks
+    bits = args.bits
+
+    if args.test:
+        dequant_test(4096, torch.tensor((4096, )), nbooks, bits)
+        return
+
+    # Otherwise, benchmark.
+    methods = [
+        ops.aqlm_gemm,
+        dequant_out_scale,
+        generic_dequantize_gemm,
+        optimized_dequantize_gemm,
+        dequant_weight_scale,
+        torch_mult,
+        dequant_no_scale,
+    ]
+
+    filename = f"./aqlm_benchmark_{nbooks}x{bits}.csv"
+    print(f"writing benchmarks to file {filename}")
+    # redirect_stdout restores the previous stdout even if a benchmark raises,
+    # so a failure cannot leave sys.stdout pointing at a closed file.
+    with open(filename, "w") as f, contextlib.redirect_stdout(f):
+        print('m | k | n | n parts', end='')
+        for method in methods:
+            print(f" | {method.__name__.replace('_', ' ')} (µs)", end='')
+        print('')
+
+        # These are reasonable prefill sizes.
+        ksandpartions = ((4096, (4096, 4096, 4096)), (4096, (4096, )),
+                         (4096, (11008, 11008)), (11008, (4096, )))
+
+        # reasonable ranges for m.
+        for m in [
+                1, 2, 4, 8, 10, 12, 14, 16, 24, 32, 48, 52, 56, 64, 96, 112,
+                128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]:
+            # Progress goes to the real stdout, not into the CSV.
+            print(f'{m}', file=sys.__stdout__)
+            for ksp in ksandpartions:
+                run_grid(m, ksp[0], torch.tensor(ksp[1]), nbooks, bits,
+                         methods)
+
+
+def run_grid(m: int, k: int, parts: torch.tensor, nbooks: int, bits: int,
+             methods):
+    """Time every method for one (m, k, parts) shape and print one table row.
+
+    The row starts with ``m | k | n | parts`` (``n`` is the sum of ``parts``)
+    followed by one column per method holding the best per-call time in
+    microseconds across the trials.
+    """
+
+    # I didn't see visible improvements from increasing these, but feel free :)
+    num_warmup_trials = 1
+    num_trials = 1
+
+    num_calls = 100
+
+    # warmup.
+    for method in methods:
+        for _ in range(num_warmup_trials):
+            run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+    n = parts.sum().item()
+    print(f'{m} | {k} | {n} | {parts.tolist()}', end='')
+
+    for method in methods:
+        best_time_us = 1e20
+        for _ in range(num_trials):
+            kernel_dur_ms = run_timing(
+                num_calls=num_calls,
+                m=m,
+                k=k,
+                parts=parts,
+                nbooks=nbooks,
+                bits=bits,
+                method=method,
+            )
+
+            # run_timing reports milliseconds; the table is in microseconds.
+            kernel_dur_us = 1000 * kernel_dur_ms
+
+            if kernel_dur_us < best_time_us:
+                best_time_us = kernel_dur_us
+
+        # Report the best trial, not whichever trial happened to run last.
+        print(f' | {best_time_us:.0f}', end='')
+
+    print('')
+
+
+def run_timing(num_calls: int, m: int, k: int, parts: torch.tensor,
+               nbooks: int, bits: int, method) -> float:
+    """Return the mean per-call time of ``method`` in milliseconds.
+
+    Random inputs are created on ``cuda:0``; ``method`` is then called
+    ``num_calls`` times between two CUDA events and the elapsed time is
+    divided by ``num_calls``.  ``torch_mult`` receives dense random weights,
+    every other method receives the codes/codebooks/scales/parts arguments
+    with a ``None`` bias.
+    """
+    n = parts.sum().item()
+    device = torch.device('cuda:0')
+
+    activations = torch.randn((1, m, k), dtype=torch.float16, device=device)
+
+    code_range = (1 << bits) // 2
+    ingroups = 8
+    codes = torch.randint(-code_range,
+                          code_range,
+                          size=(n, k // ingroups, nbooks),
+                          dtype=get_int_dtype(bits),
+                          device=device)
+    codebooks = torch.randn(size=(parts.shape[0] * nbooks, 1 << bits, 1, 8),
+                            dtype=torch.float16,
+                            device=device)
+    scales = torch.randn(size=(n, 1, 1, 1), dtype=torch.float16, device=device)
+
+    # for comparison to just a pytorch mult.
+    weights = torch.randn((n, k), dtype=torch.float16, device=device)
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    if method is torch_mult:
+        call_args = (activations, weights, scales)
+    else:
+        call_args = (activations, codes, codebooks, scales, parts, None)
+
+    start_event.record()
+    for _ in range(num_calls):
+        method(*call_args)
+    end_event.record()
+    end_event.synchronize()
+
+    return start_event.elapsed_time(end_event) / num_calls
+
+
+# Script entry point: main() has no return value, so sys.exit() reports
+# status 0 unless main() raises.
+if __name__ == "__main__":
+ sys.exit(main())
diff --git a/benchmarks/kernels/benchmark_marlin.py b/benchmarks/kernels/benchmark_marlin.py
new file mode 100644
index 0000000000..b771911781
--- /dev/null
+++ b/benchmarks/kernels/benchmark_marlin.py
@@ -0,0 +1,233 @@
+import argparse
+
+import torch
+import torch.utils.benchmark as benchmark
+from benchmark_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+from vllm.model_executor.layers.quantization.gptq_marlin import (
+ GPTQ_MARLIN_MAX_PARALLEL, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.gptq_marlin_24 import (
+ GPTQ_MARLIN_24_MAX_PARALLEL, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES, GPTQ_MARLIN_24_SUPPORTED_NUM_BITS)
+from vllm.model_executor.layers.quantization.utils.marlin_utils import (
+ MarlinWorkspace, marlin_24_quantize, marlin_quantize)
+from vllm.model_executor.layers.quantization.utils.quant_utils import (
+ gptq_pack, quantize_weights, sort_weights)
+
+# Default for --models; each entry must be a key of WEIGHT_SHAPES.
+DEFAULT_MODELS = ["meta-llama/Llama-2-7b-hf/TP1"]
+# Default for --batch-sizes; main() passes each value to bench_run as size_m.
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+
+# Option grids that main() iterates for the act_order and is_k_full flags.
+ACT_ORDER_OPTS = [False, True]
+K_FULL_OPTS = [False, True]
+
+
+# Benchmark one (model, act_order, is_k_full, num_bits, group_size, M, K, N)
+# combination. Random half-precision operands are created on the GPU, the
+# weight matrix is quantized with each scheme, and one torch.utils.benchmark
+# measurement per kernel is appended to `results`. Returns nothing.
+def bench_run(results, model, act_order, is_k_full, num_bits, group_size,
+ size_m, size_k, size_n):
+ label = "Quant Matmul"
+
+ # sub_label identifies this parameter combination in the printed output and
+ # is attached to every measurement below.
+ sub_label = ("{}, act={} k_full={}, b={}, g={}, "
+ "MKN=({}x{}x{})".format(model, act_order, is_k_full, num_bits,
+ group_size, size_m, size_k, size_n))
+
+ print(f"Testing: {sub_label}")
+
+ # a is the (size_m x size_k) activation, b the (size_k x size_n) weight.
+ a = torch.randn(size_m, size_k).to(torch.half).cuda()
+ b = torch.rand(size_k, size_n).to(torch.half).cuda()
+
+ # NOTE(review): a_tmp is placed in the Timer globals but none of the timed
+ # statements below reference it.
+ a_tmp = (torch.zeros(size_m, size_k).to(torch.half).cuda())
+
+ # Marlin quant
+ (
+ marlin_w_ref,
+ marlin_q_w,
+ marlin_s,
+ marlin_g_idx,
+ marlin_sort_indices,
+ marlin_rand_perm,
+ ) = marlin_quantize(b, num_bits, group_size, act_order)
+
+ # Marlin_24 quant
+ (marlin_24_w_ref, marlin_24_q_w_comp, marlin_24_meta,
+ marlin_24_s) = marlin_24_quantize(b, num_bits, group_size)
+
+ # GPTQ quant
+ (w_ref, q_w, s, g_idx,
+ rand_perm) = quantize_weights(b, num_bits, group_size, act_order)
+ q_w_gptq = gptq_pack(q_w, num_bits, size_k, size_n)
+
+ # For act_order, sort the "weights" and "g_idx"
+ # so that group ids are increasing
+ repack_sort_indices = torch.empty(0, dtype=torch.int, device=b.device)
+ if act_order:
+ (q_w, g_idx, repack_sort_indices) = sort_weights(q_w, g_idx)
+
+ # Prepare
+ marlin_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_MIN_THREAD_N,
+ GPTQ_MARLIN_MAX_PARALLEL)
+
+ marlin_24_workspace = MarlinWorkspace(size_n, GPTQ_MARLIN_24_MIN_THREAD_N,
+ GPTQ_MARLIN_24_MAX_PARALLEL)
+
+ # Namespace in which the Timer `stmt` strings below are evaluated; every name
+ # used inside a stmt must appear here.
+ # NOTE(review): this local shadows the builtin globals() within bench_run.
+ globals = {
+ # Gen params
+ "num_bits": num_bits,
+ "group_size": group_size,
+ "size_m": size_m,
+ "size_n": size_n,
+ "size_k": size_k,
+ "a": a,
+ "a_tmp": a_tmp,
+ # Marlin params
+ "marlin_w_ref": marlin_w_ref,
+ "marlin_q_w": marlin_q_w,
+ "marlin_s": marlin_s,
+ "marlin_g_idx": marlin_g_idx,
+ "marlin_sort_indices": marlin_sort_indices,
+ "marlin_rand_perm": marlin_rand_perm,
+ "marlin_workspace": marlin_workspace,
+ "is_k_full": is_k_full,
+ # Marlin_24 params
+ "marlin_24_w_ref": marlin_24_w_ref,
+ "marlin_24_q_w_comp": marlin_24_q_w_comp,
+ "marlin_24_meta": marlin_24_meta,
+ "marlin_24_s": marlin_24_s,
+ "marlin_24_workspace": marlin_24_workspace,
+ # GPTQ params
+ "q_w_gptq": q_w_gptq,
+ "repack_sort_indices": repack_sort_indices,
+ # Kernels
+ "gptq_marlin_gemm": ops.gptq_marlin_gemm,
+ "gptq_marlin_24_gemm": ops.gptq_marlin_24_gemm,
+ "gptq_marlin_repack": ops.gptq_marlin_repack,
+ }
+
+ # Passed to blocked_autorange() as the minimum measurement time per Timer.
+ min_run_time = 1
+
+ # Warmup pytorch
+ for i in range(5):
+ torch.matmul(a, marlin_w_ref)
+
+ # Baseline: dense matmul against the dequantized Marlin reference weights.
+ results.append(
+ benchmark.Timer(
+ stmt="torch.matmul(a, marlin_w_ref)",
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="pytorch_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ # Quantized GEMM using the GPTQ-Marlin kernel.
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_gemm(a, marlin_q_w, marlin_s, marlin_g_idx, marlin_sort_indices, marlin_workspace.scratch, num_bits, size_m, size_n, size_k, is_k_full)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ # The Marlin 2:4 kernel is timed only for the bit widths and group sizes it
+ # declares as supported.
+ if (num_bits in GPTQ_MARLIN_24_SUPPORTED_NUM_BITS
+ and group_size in GPTQ_MARLIN_24_SUPPORTED_GROUP_SIZES):
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "output = gptq_marlin_24_gemm(a, marlin_24_q_w_comp, marlin_24_meta, marlin_24_s, marlin_24_workspace.scratch, num_bits, size_m, size_n, size_k)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_24_gemm",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+ # Cost of repacking GPTQ-packed weights into the Marlin layout.
+ results.append(
+ benchmark.Timer(
+ stmt=
+ "q_res = gptq_marlin_repack(q_w_gptq, repack_sort_indices, size_k, size_n, num_bits)", # noqa: E501
+ globals=globals,
+ label=label,
+ sub_label=sub_label,
+ description="gptq_marlin_repack",
+ ).blocked_autorange(min_run_time=min_run_time))
+
+
+# Sweep every combination of model layer shape, act_order, is_k_full, bit
+# width, group size and batch size, call bench_run for each, and print a
+# comparison table of all collected measurements.
+# Each args.limit_* list acts as a filter only when it is non-empty.
+def main(args):
+ print("Benchmarking models:")
+ for i, model in enumerate(args.models):
+ print(f"[{i}] {model}")
+
+ # Measurements accumulated by bench_run across the whole sweep.
+ results = []
+
+ for model in args.models:
+ for layer in WEIGHT_SHAPES[model]:
+ # Each WEIGHT_SHAPES entry is read as [size_k, size_n].
+ size_k = layer[0]
+ size_n = layer[1]
+
+ if len(args.limit_k) > 0 and size_k not in args.limit_k:
+ continue
+
+ if len(args.limit_n) > 0 and size_n not in args.limit_n:
+ continue
+
+ for act_order in ACT_ORDER_OPTS:
+ if len(args.limit_act_order
+ ) > 0 and act_order not in args.limit_act_order:
+ continue
+
+ for is_k_full in K_FULL_OPTS:
+ if len(args.limit_k_full
+ ) > 0 and is_k_full not in args.limit_k_full:
+ continue
+
+ for num_bits in GPTQ_MARLIN_SUPPORTED_NUM_BITS:
+ if len(args.limit_num_bits
+ ) > 0 and num_bits not in args.limit_num_bits:
+ continue
+
+ for group_size in GPTQ_MARLIN_SUPPORTED_GROUP_SIZES:
+ if len(
+ args.limit_group_size
+ ) > 0 and group_size not in args.limit_group_size:
+ continue
+
+ # For act_order, the group_size must be less than
+ # size_k
+ if act_order and (group_size == size_k
+ or group_size == -1):
+ continue
+
+ for size_m in args.batch_sizes:
+ bench_run(results, model, act_order, is_k_full,
+ num_bits, group_size, size_m, size_k,
+ size_n)
+
+ # Render all measurements as one comparison table.
+ compare = benchmark.Compare(results)
+ compare.print()
+
+
+# For quick benchmarking use:
+# python benchmark_marlin.py --batch-sizes 1 16 32 --limit-k 4096 --limit-n 4096 --limit-group-size 128 --limit-num-bits 4 --limit-act-order 0 --limit-k-full 1 # noqa E501
+#
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(
+ description="Benchmark Marlin across specified models/shapes/batches")
+ # Model names are restricted to the keys of WEIGHT_SHAPES.
+ parser.add_argument(
+ "--models",
+ nargs="+",
+ type=str,
+ default=DEFAULT_MODELS,
+ choices=WEIGHT_SHAPES.keys(),
+ )
+ parser.add_argument("--batch-sizes",
+ nargs="+",
+ type=int,
+ default=DEFAULT_BATCH_SIZES)
+ # Each --limit-* option defaults to an empty list, which main() treats as
+ # "no filtering" for that dimension.
+ parser.add_argument("--limit-k", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-n", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-group-size", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-num-bits", nargs="+", type=int, default=[])
+ # These two are parsed as ints (0/1) and compared in main() against the
+ # booleans in ACT_ORDER_OPTS / K_FULL_OPTS.
+ parser.add_argument("--limit-act-order", nargs="+", type=int, default=[])
+ parser.add_argument("--limit-k-full", nargs="+", type=int, default=[])
+
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
deleted file mode 100644
index 964eca5aaf..0000000000
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ /dev/null
@@ -1,180 +0,0 @@
-import json
-import os
-import sys
-
-from vllm.model_executor.layers.fused_moe import fused_moe, get_config_file_name
-import torch
-import torch.nn.functional as F
-import triton
-
-os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-
-
-def main():
- method = fused_moe
- for bs in [
- 1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
- 2048, 3072, 4096
- ]:
- run_grid(bs, method=method)
-
-
-def run_grid(bs, method):
- d_model = 4096
- num_total_experts = 8
- top_k = 2
- tp_size = 2
- model_intermediate_size = 14336
- num_layers = 32
- num_calls = 100
-
- num_warmup_trials = 1
- num_trials = 1
-
- configs = []
- if bs <= 16:
- BLOCK_SIZES_M = [16]
- elif bs <= 32:
- BLOCK_SIZES_M = [16, 32]
- elif bs <= 64:
- BLOCK_SIZES_M = [16, 32, 64]
- elif bs <= 128:
- BLOCK_SIZES_M = [16, 32, 64, 128]
- else:
- BLOCK_SIZES_M = [16, 32, 64, 128, 256]
-
- for block_size_n in [32, 64, 128, 256]:
- for block_size_m in BLOCK_SIZES_M:
- for block_size_k in [64, 128, 256]:
- for group_size_m in [1, 16, 32, 64]:
- for num_warps in [4, 8]:
- configs.append({
- "BLOCK_SIZE_M": block_size_m,
- "BLOCK_SIZE_N": block_size_n,
- "BLOCK_SIZE_K": block_size_k,
- "GROUP_SIZE_M": group_size_m,
- "num_warps": num_warps,
- "num_stages": 4,
- })
-
- best_config = None
- best_time_us = 1e20
-
- for config in configs:
- print(f'{tp_size=} {bs=}')
- print(f'{config}')
- # warmup
- print('warming up')
- try:
- for _ in range(num_warmup_trials):
- run_timing(
- num_calls=num_calls,
- bs=bs,
- d_model=d_model,
- num_total_experts=num_total_experts,
- top_k=top_k,
- tp_size=tp_size,
- model_intermediate_size=model_intermediate_size,
- method=method,
- config=config,
- )
- except triton.runtime.autotuner.OutOfResources:
- continue
-
- # trial
- print('benchmarking')
- for _ in range(num_trials):
- kernel_dur_ms = run_timing(
- num_calls=num_calls,
- bs=bs,
- d_model=d_model,
- num_total_experts=num_total_experts,
- top_k=top_k,
- tp_size=tp_size,
- model_intermediate_size=model_intermediate_size,
- method=method,
- config=config,
- )
-
- kernel_dur_us = 1000 * kernel_dur_ms
- model_dur_ms = kernel_dur_ms * num_layers
-
- if kernel_dur_us < best_time_us:
- best_config = config
- best_time_us = kernel_dur_us
-
- print(f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
- f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
- f'{d_model=} {model_intermediate_size=} {num_layers=}')
-
- print("best_time_us", best_time_us)
- print("best_config", best_config)
-
- # holds Dict[str, Dict[str, int]]
- filename = get_config_file_name(num_total_experts,
- model_intermediate_size // tp_size)
- print(f"writing config to file {filename}")
- existing_content = {}
- if os.path.exists(filename):
- with open(filename, "r") as f:
- existing_content = json.load(f)
- existing_content[str(bs)] = best_config
- with open(filename, "w") as f:
- json.dump(existing_content, f, indent=4)
- f.write("\n")
-
-
-def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
- top_k: int, tp_size: int, model_intermediate_size: int, method,
- config) -> float:
- shard_intermediate_size = model_intermediate_size // tp_size
-
- hidden_states = torch.rand(
- (bs, d_model),
- device="cuda:0",
- dtype=torch.bfloat16,
- )
-
- ws = torch.rand(
- (num_total_experts, 2 * shard_intermediate_size, d_model),
- device=hidden_states.device,
- dtype=hidden_states.dtype,
- )
-
- w2s = torch.rand(
- (num_total_experts, d_model, shard_intermediate_size),
- device=hidden_states.device,
- dtype=hidden_states.dtype,
- )
-
- gating_output = F.softmax(torch.rand(
- (num_calls, bs, num_total_experts),
- device=hidden_states.device,
- dtype=torch.float32,
- ),
- dim=-1)
-
- start_event = torch.cuda.Event(enable_timing=True)
- end_event = torch.cuda.Event(enable_timing=True)
-
- start_event.record()
- for i in range(num_calls):
- hidden_states = method(
- hidden_states=hidden_states,
- w1=ws,
- w2=w2s,
- gating_output=gating_output[i],
- topk=2,
- renormalize=True,
- inplace=True,
- override_config=config,
- )
- end_event.record()
- end_event.synchronize()
-
- dur_ms = start_event.elapsed_time(end_event) / num_calls
- return dur_ms
-
-
-if __name__ == "__main__":
- sys.exit(main())
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
new file mode 100644
index 0000000000..2edc63142d
--- /dev/null
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -0,0 +1,321 @@
+import argparse
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Tuple
+
+import ray
+import torch
+import triton
+from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig
+
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+
+
+# Measure fused_moe under one kernel `config` and return the average latency
+# of a single invocation in microseconds. Ten invocations are captured into a
+# CUDA graph, and the graph is replayed `num_iters` times with a different
+# gating tensor each time.
+# No device is passed to the tensor constructors here; BenchmarkWorker sets
+# the default device to "cuda" before calling this function.
+def benchmark_config(
+ config: Dict[str, int],
+ num_tokens: int,
+ num_experts: int,
+ shard_intermediate_size: int,
+ hidden_size: int,
+ topk: int,
+ dtype: torch.dtype,
+ use_fp8: bool,
+ num_iters: int = 100,
+) -> float:
+ # For fp8 the weights are first created in float16 and cast further below.
+ init_dtype = torch.float16 if use_fp8 else dtype
+ x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+ w1 = torch.randn(num_experts,
+ shard_intermediate_size,
+ hidden_size,
+ dtype=init_dtype)
+ # w2's last dimension is half of w1's middle dimension.
+ w2 = torch.randn(num_experts,
+ hidden_size,
+ shard_intermediate_size // 2,
+ dtype=init_dtype)
+ # One gating tensor per timed iteration.
+ gating_output = torch.randn(num_iters,
+ num_tokens,
+ num_experts,
+ dtype=torch.float32)
+
+ # Scales stay None unless fp8 is requested.
+ w1_scale = None
+ w2_scale = None
+ a1_scale = None
+ a2_scale = None
+ if use_fp8:
+ w1_scale = torch.randn(num_experts, dtype=torch.float32)
+ w2_scale = torch.randn(num_experts, dtype=torch.float32)
+ a1_scale = torch.randn(1, dtype=torch.float32)
+ a2_scale = torch.randn(1, dtype=torch.float32)
+
+ w1 = w1.to(torch.float8_e4m3fn)
+ w2 = w2.to(torch.float8_e4m3fn)
+
+ # Fixed buffer that run() reads; prepare(i) overwrites its contents in place.
+ # NOTE(review): presumably this lets the captured graph see new gating values
+ # without being re-captured -- confirm.
+ input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+
+ def prepare(i: int):
+ input_gating.copy_(gating_output[i])
+
+ def run():
+ fused_moe(
+ x,
+ w1,
+ w2,
+ input_gating,
+ topk,
+ renormalize=True,
+ inplace=True,
+ override_config=config,
+ use_fp8=use_fp8,
+ w1_scale=w1_scale,
+ w2_scale=w2_scale,
+ a1_scale=a1_scale,
+ a2_scale=a2_scale,
+ )
+
+ # JIT compilation & warmup
+ run()
+ torch.cuda.synchronize()
+
+ # Capture 10 invocations with CUDA graph
+ graph = torch.cuda.CUDAGraph()
+ with torch.cuda.graph(graph):
+ for _ in range(10):
+ run()
+ torch.cuda.synchronize()
+
+ # Warmup
+ for _ in range(5):
+ graph.replay()
+ torch.cuda.synchronize()
+
+ start_event = torch.cuda.Event(enable_timing=True)
+ end_event = torch.cuda.Event(enable_timing=True)
+
+ # Each entry is the elapsed time of one graph replay (10 invocations).
+ latencies = []
+ for i in range(num_iters):
+ prepare(i)
+ torch.cuda.synchronize()
+
+ start_event.record()
+ graph.replay()
+ end_event.record()
+ end_event.synchronize()
+ latencies.append(start_event.elapsed_time(end_event))
+ # Divide by num_iters replays x 10 invocations per replay, then convert the
+ # millisecond event timings to microseconds.
+ avg = sum(latencies) / (num_iters * 10) * 1000 # us
+ graph.reset()
+ return avg
+
+
+def get_configs_compute_bound() -> List[Dict[str, int]]:
+ # Build the full cartesian product of kernel tuning parameters as a list of
+ # config dicts. The right-most `for` clause varies fastest, so GROUP_SIZE_M
+ # changes on every entry and num_stages changes slowest.
+ # Reduced search space for faster tuning.
+ # TODO(woosuk): Increase the search space and use a performance model to
+ # prune the search space.
+ return [{
+     "BLOCK_SIZE_M": m_tile,
+     "BLOCK_SIZE_N": n_tile,
+     "BLOCK_SIZE_K": k_tile,
+     "GROUP_SIZE_M": m_group,
+     "num_warps": warp_count,
+     "num_stages": stage_count,
+ } for stage_count in [2, 3, 4, 5]
+   for m_tile in [16, 32, 64, 128, 256]
+   for k_tile in [64, 128, 256]
+   for n_tile in [32, 64, 128, 256]
+   for warp_count in [4, 8]
+   for m_group in [1, 16, 32, 64]]
+
+
+# Ray actor that owns one GPU and runs MoE kernel benchmarks or tuning on it.
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+
+ def __init__(self, seed: int) -> None:
+ # Tensors created without an explicit device afterwards land on CUDA.
+ torch.set_default_device("cuda")
+ torch.cuda.manual_seed_all(seed)
+ self.seed = seed
+
+ # Benchmark the config that would be selected for `num_tokens` and return
+ # (config, kernel time as reported by benchmark_config).
+ def benchmark(
+ self,
+ num_tokens: int,
+ num_experts: int,
+ shard_intermediate_size: int,
+ hidden_size: int,
+ topk: int,
+ dtype: torch.dtype,
+ use_fp8: bool,
+ ) -> Tuple[Dict[str, int], float]:
+ # Re-seed so every call starts from the same RNG state.
+ torch.cuda.manual_seed_all(self.seed)
+
+ dtype_str = "float8" if use_fp8 else None
+ # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+ # is the intermediate size after silu_and_mul.
+ op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
+ dtype_str)
+ if op_config is None:
+ # No stored configs were found: fall back to the default config.
+ config = get_default_config(num_tokens, num_experts,
+ shard_intermediate_size, hidden_size,
+ topk, dtype_str)
+ else:
+ # Use the stored config whose batch-size key is closest to num_tokens.
+ config = op_config[min(op_config.keys(),
+ key=lambda x: abs(x - num_tokens))]
+ kernel_time = benchmark_config(config, num_tokens, num_experts,
+ shard_intermediate_size, hidden_size,
+ topk, dtype, use_fp8)
+ return config, kernel_time
+
+ # Time every config in `search_space` (10 iterations each) and return the
+ # fastest one. best_config starts as None, so None is returned when every
+ # config raises OutOfResources or the search space is empty.
+ def tune(
+ self,
+ num_tokens: int,
+ num_experts: int,
+ shard_intermediate_size: int,
+ hidden_size: int,
+ topk: int,
+ dtype: torch.dtype,
+ use_fp8: bool,
+ search_space: List[Dict[str, int]],
+ ) -> Dict[str, int]:
+ best_config = None
+ best_time = float("inf")
+ for config in tqdm(search_space):
+ try:
+ kernel_time = benchmark_config(config,
+ num_tokens,
+ num_experts,
+ shard_intermediate_size,
+ hidden_size,
+ topk,
+ dtype,
+ use_fp8,
+ num_iters=10)
+ except triton.runtime.autotuner.OutOfResources:
+ # Some configurations may be invalid and fail to compile.
+ continue
+
+ if kernel_time < best_time:
+ best_time = kernel_time
+ best_config = config
+ now = datetime.now()
+ # The timestamp prefix is bracketed on both sides: "[<time>] ...".
+ print(f"[{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+ return best_config
+
+
+def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+ # Return a new dict holding exactly the six tuning keys, inserted in a fixed
+ # canonical order. Other keys in `config` are dropped; a missing key raises
+ # KeyError.
+ canonical_order = (
+     "BLOCK_SIZE_M",
+     "BLOCK_SIZE_N",
+     "BLOCK_SIZE_K",
+     "GROUP_SIZE_M",
+     "num_warps",
+     "num_stages",
+ )
+ return {name: config[name] for name in canonical_order}
+
+
+# Write the tuned configs (batch size -> kernel config) as indented JSON to
+# the file name returned by get_config_file_name, replacing any existing file.
+# hidden_size, topk and dtype are accepted but not used by this function.
+def save_configs(
+ configs: Dict[int, Dict[str, int]],
+ num_experts: int,
+ shard_intermediate_size: int,
+ hidden_size: int,
+ topk: int,
+ dtype: torch.dtype,
+ use_fp8: bool,
+) -> None:
+ # Imported locally because this file has no explicit `import json`; the name
+ # was otherwise only available through the wildcard import at the top.
+ import json
+
+ dtype_str = "float8" if use_fp8 else None
+ # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+ # is the intermediate size after silu_and_mul.
+ filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
+ dtype_str)
+ print(f"Writing best config to {filename}...")
+ with open(filename, "w") as f:
+ json.dump(configs, f, indent=4)
+ # End the file with a trailing newline.
+ f.write("\n")
+
+
+# Read the MoE dimensions from the model's Hugging Face config, start one
+# BenchmarkWorker per available GPU, and then either tune over the search
+# space (--tune, results saved to disk) or benchmark the currently selected
+# config for each batch size and print the timings.
+def main(args: argparse.Namespace):
+ print(args)
+
+ config = AutoConfig.from_pretrained(args.model)
+ # DBRX keeps its MoE settings under ffn_config; any other architecture is
+ # read using the Mixtral attribute names.
+ if config.architectures[0] == "DbrxForCausalLM":
+ E = config.ffn_config.moe_num_experts
+ topk = config.ffn_config.moe_top_k
+ intermediate_size = config.ffn_config.ffn_hidden_size
+ shard_intermediate_size = 2 * intermediate_size // args.tp_size
+ else:
+ # Default: Mixtral.
+ E = config.num_local_experts
+ topk = config.num_experts_per_tok
+ intermediate_size = config.intermediate_size
+ shard_intermediate_size = 2 * intermediate_size // args.tp_size
+
+ hidden_size = config.hidden_size
+ dtype = config.torch_dtype
+ use_fp8 = args.dtype == "fp8"
+
+ # Without --batch-size, sweep a fixed list of batch sizes.
+ if args.batch_size is None:
+ batch_sizes = [
+ 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+ ]
+ else:
+ batch_sizes = [args.batch_size]
+
+ ray.init()
+ # One worker actor per GPU that Ray reports as available.
+ num_gpus = int(ray.available_resources()["GPU"])
+ workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+
+ # Call `method` on the workers round-robin, one call per entry of `inputs`,
+ # and wait for all results. Callers zip the result with batch_sizes, so the
+ # output order is relied on to match the input order.
+ def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+ outputs = []
+ worker_idx = 0
+ for input_args in inputs:
+ worker = workers[worker_idx]
+ worker_method = getattr(worker, method)
+ output = worker_method.remote(*input_args)
+ outputs.append(output)
+ worker_idx = (worker_idx + 1) % num_gpus
+ return ray.get(outputs)
+
+ if args.tune:
+ search_space = get_configs_compute_bound()
+ print(f"Start tuning over {len(search_space)} configurations...")
+
+ start = time.time()
+ configs = _distribute(
+ "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+ topk, dtype, use_fp8, search_space)
+ for batch_size in batch_sizes])
+ # Map each batch size to its best config with keys in canonical order.
+ best_configs = {
+ M: sort_config(config)
+ for M, config in zip(batch_sizes, configs)
+ }
+ save_configs(best_configs, E, shard_intermediate_size, hidden_size,
+ topk, dtype, use_fp8)
+ end = time.time()
+ print(f"Tuning took {end - start:.2f} seconds")
+ else:
+ outputs = _distribute("benchmark",
+ [(batch_size, E, shard_intermediate_size,
+ hidden_size, topk, dtype, use_fp8)
+ for batch_size in batch_sizes])
+
+ for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+ print(f"Batch size: {batch_size}, config: {config}")
+ print(f"Kernel time: {kernel_time:.2f} us")
+
+
+# Command-line entry point: parse the options and hand them to main().
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument("--model",
+ type=str,
+ default="mistralai/Mixtral-8x7B-Instruct-v0.1")
+ parser.add_argument("--tp-size", "-tp", type=int, default=2)
+ # "fp8" enables the fp8 path in main(); "auto" leaves it disabled.
+ parser.add_argument("--dtype",
+ type=str,
+ choices=["auto", "fp8"],
+ default="auto")
+ parser.add_argument("--seed", type=int, default=0)
+ # When omitted, main() sweeps a built-in list of batch sizes.
+ parser.add_argument("--batch-size", type=int, required=False)
+ # Tune over the search space instead of benchmarking the current config.
+ parser.add_argument("--tune", action="store_true")
+ args = parser.parse_args()
+
+ main(args)
diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
index d921dea122..e6f4e9e6b9 100644
--- a/benchmarks/kernels/benchmark_paged_attention.py
+++ b/benchmarks/kernels/benchmark_paged_attention.py
@@ -1,12 +1,12 @@
-from typing import Optional
import argparse
import random
import time
+from typing import Optional
import torch
+from vllm import _custom_ops as ops
from vllm.utils import STR_DTYPE_TO_TORCH_DTYPE, create_kv_caches_with_random
-from vllm._C import ops
NUM_BLOCKS = 1024
PARTITION_SIZE = 512
@@ -16,7 +16,7 @@
def main(
version: str,
num_seqs: int,
- context_len: int,
+ seq_len: int,
num_query_heads: int,
num_kv_heads: int,
head_size: int,
@@ -48,12 +48,12 @@ def main(
dtype=torch.float,
device=device)
- context_lens = [context_len for _ in range(num_seqs)]
- max_context_len = max(context_lens)
- context_lens = torch.tensor(context_lens, dtype=torch.int, device=device)
+ seq_lens = [seq_len for _ in range(num_seqs)]
+ max_seq_len = max(seq_lens)
+ seq_lens = torch.tensor(seq_lens, dtype=torch.int, device=device)
# Create the block tables.
- max_num_blocks_per_seq = (max_context_len + block_size - 1) // block_size
+ max_num_blocks_per_seq = (max_seq_len + block_size - 1) // block_size
block_tables = []
for _ in range(num_seqs):
block_table = [
@@ -77,8 +77,7 @@ def main(
# Prepare for the paged attention kernel.
output = torch.empty_like(query)
if version == "v2":
- num_partitions = ((max_context_len + PARTITION_SIZE - 1) //
- PARTITION_SIZE)
+ num_partitions = ((max_seq_len + PARTITION_SIZE - 1) // PARTITION_SIZE)
tmp_output = torch.empty(
size=(num_seqs, num_query_heads, num_partitions, head_size),
dtype=output.dtype,
@@ -97,6 +96,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()
+ # Using default kv_scale
+ kv_scale = 1.0
+
for _ in range(num_iters):
if version == "v1":
ops.paged_attention_v1(
@@ -107,11 +109,12 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
num_kv_heads,
scale,
block_tables,
- context_lens,
+ seq_lens,
block_size,
- max_context_len,
+ max_seq_len,
alibi_slopes,
kv_cache_dtype,
+ kv_scale,
)
elif version == "v2":
ops.paged_attention_v2(
@@ -125,11 +128,12 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
num_kv_heads,
scale,
block_tables,
- context_lens,
+ seq_lens,
block_size,
- max_context_len,
+ max_seq_len,
alibi_slopes,
kv_cache_dtype,
+ kv_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
@@ -161,12 +165,12 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
choices=["v1", "v2"],
default="v2")
parser.add_argument("--batch-size", type=int, default=8)
- parser.add_argument("--context-len", type=int, default=4096)
+ parser.add_argument("--seq_len", type=int, default=4096)
parser.add_argument("--num-query-heads", type=int, default=64)
parser.add_argument("--num-kv-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--block-size", type=int, choices=[16, 32], default=16)
parser.add_argument("--use-alibi", action="store_true")
@@ -179,11 +183,11 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
parser.add_argument(
"--kv-cache-dtype",
type=str,
- choices=["auto", "fp8_e5m2"],
+ choices=["auto", "fp8", "fp8_e5m2", "fp8_e4m3"],
default="auto",
- help=
- 'Data type for kv cache storage. If "auto", will use model data type.')
- parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
+ help="Data type for kv cache storage. If 'auto', will use model "
+ "data type. CUDA 11.8+ supports fp8 (=fp8_e4m3) and fp8_e5m2. "
+ "ROCm (AMD GPU) supports fp8 (=fp8_e4m3)")
args = parser.parse_args()
print(args)
@@ -192,7 +196,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
main(
version=args.version,
num_seqs=args.batch_size,
- context_len=args.context_len,
+ seq_len=args.seq_len,
num_query_heads=args.num_query_heads,
num_kv_heads=args.num_kv_heads,
head_size=args.head_size,
diff --git a/benchmarks/kernels/benchmark_rope.py b/benchmarks/kernels/benchmark_rope.py
index f9564dd958..00e55f6060 100644
--- a/benchmarks/kernels/benchmark_rope.py
+++ b/benchmarks/kernels/benchmark_rope.py
@@ -1,9 +1,10 @@
+import argparse
+from itertools import accumulate
from typing import Optional
-import argparse
-import torch
import nvtx
-from itertools import accumulate
+import torch
+
from vllm.model_executor.layers.rotary_embedding import get_rope
@@ -92,7 +93,7 @@ def benchmark_rope_kernels_multi_lora(
parser.add_argument("--num-heads", type=int, default=8)
parser.add_argument("--head-size",
type=int,
- choices=[64, 80, 96, 112, 128, 256],
+ choices=[64, 80, 96, 112, 128, 192, 256],
default=128)
parser.add_argument("--rotary-dim", type=int, choices=[16, 32], default=32)
parser.add_argument("--dtype",
diff --git a/benchmarks/kernels/benchmark_shapes.py b/benchmarks/kernels/benchmark_shapes.py
new file mode 100644
index 0000000000..4eeeca35a3
--- /dev/null
+++ b/benchmarks/kernels/benchmark_shapes.py
@@ -0,0 +1,75 @@
+# Weight-matrix shapes per model, keyed by "<model name>/TP<n>" plus one
+# synthetic "ideal" entry. benchmark_marlin.py reads each pair as
+# [size_k, size_n].
+# NOTE(review): the TP<n> suffix presumably denotes the tensor-parallel degree
+# the shapes were sharded for -- confirm.
+WEIGHT_SHAPES = {
+ "ideal": [[4 * 256 * 32, 256 * 32]],
+ "mistralai/Mistral-7B-v0.1/TP1": [
+ [4096, 6144],
+ [4096, 4096],
+ [4096, 28672],
+ [14336, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP2": [
+ [4096, 3072],
+ [2048, 4096],
+ [4096, 14336],
+ [7168, 4096],
+ ],
+ "mistralai/Mistral-7B-v0.1/TP4": [
+ [4096, 1536],
+ [1024, 4096],
+ [4096, 7168],
+ [3584, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP1": [
+ [4096, 12288],
+ [4096, 4096],
+ [4096, 22016],
+ [11008, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP2": [
+ [4096, 6144],
+ [2048, 4096],
+ [4096, 11008],
+ [5504, 4096],
+ ],
+ "meta-llama/Llama-2-7b-hf/TP4": [
+ [4096, 3072],
+ [1024, 4096],
+ [4096, 5504],
+ [2752, 4096],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP1": [
+ [5120, 15360],
+ [5120, 5120],
+ [5120, 27648],
+ [13824, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP2": [
+ [5120, 7680],
+ [2560, 5120],
+ [5120, 13824],
+ [6912, 5120],
+ ],
+ "meta-llama/Llama-2-13b-hf/TP4": [
+ [5120, 3840],
+ [1280, 5120],
+ [5120, 6912],
+ [3456, 5120],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP1": [
+ [8192, 10240],
+ [8192, 8192],
+ [8192, 57344],
+ [28672, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP2": [
+ [8192, 5120],
+ [4096, 8192],
+ [8192, 28672],
+ [14336, 8192],
+ ],
+ "meta-llama/Llama-2-70b-hf/TP4": [
+ [8192, 2560],
+ [2048, 8192],
+ [8192, 14336],
+ [7168, 8192],
+ ],
+}
diff --git a/benchmarks/launch_tgi_server.sh b/benchmarks/launch_tgi_server.sh
index 64d3c4f4b3..f491c90d06 100755
--- a/benchmarks/launch_tgi_server.sh
+++ b/benchmarks/launch_tgi_server.sh
@@ -4,7 +4,7 @@ PORT=8000
MODEL=$1
TOKENS=$2
-docker run --gpus all --shm-size 1g -p $PORT:80 \
+docker run -e HF_TOKEN=$HF_TOKEN --gpus all --shm-size 1g -p $PORT:80 \
-v $PWD/data:/data \
ghcr.io/huggingface/text-generation-inference:1.4.0 \
--model-id $MODEL \
diff --git a/benchmarks/overheads/benchmark_hashing.py b/benchmarks/overheads/benchmark_hashing.py
new file mode 100644
index 0000000000..c846e47de1
--- /dev/null
+++ b/benchmarks/overheads/benchmark_hashing.py
@@ -0,0 +1,63 @@
+import argparse
+import cProfile
+import pstats
+
+from vllm import LLM, SamplingParams
+
+# A very long prompt, total number of tokens is about 15k.
+# The name is bound first to a list of 1000 copies of one sentence and then
+# rebound to those copies joined with single spaces into one string.
+LONG_PROMPT = ["You are an expert in large language models, aren't you?"
+ ] * 1000
+LONG_PROMPT = ' '.join(LONG_PROMPT)
+
+
+# Profile llm.generate() on one very long prompt with cProfile and report how
+# much of the profiled runtime is attributed to a function whose name contains
+# 'hash_of_block'.
+# NOTE(review): enable_prefix_caching is hard-coded to True below, so the
+# --enable-prefix-caching command-line flag has no effect on this function.
+def main(args):
+ llm = LLM(
+ model=args.model,
+ enforce_eager=True,
+ enable_prefix_caching=True,
+ tensor_parallel_size=args.tensor_parallel_size,
+ use_v2_block_manager=args.use_v2_block_manager,
+ )
+
+ sampling_params = SamplingParams(temperature=0, max_tokens=args.output_len)
+ profiler = cProfile.Profile()
+
+ print("------warm up------")
+ for i in range(3):
+ output = llm.generate(LONG_PROMPT, sampling_params)
+ print(output[0].outputs[0].text)
+
+ print("------start generating------")
+ for i in range(3):
+ # runctx evaluates the statement string, so `llm` and `sampling_params` are
+ # resolved through the locals() mapping passed here.
+ profiler.runctx('llm.generate(LONG_PROMPT, sampling_params)',
+ globals(), locals())
+
+ # analyze the runtime of hashing function
+ stats = pstats.Stats(profiler)
+ stats.sort_stats('cumulative')
+ total_time = 0
+ # Keys of stats.stats are (file, line, function name); index 3 of the value
+ # is the cumulative time of that function.
+ # NOTE(review): total_time is overwritten rather than summed, so if several
+ # functions match, only the last one visited is reported.
+ for func in stats.stats:
+ if 'hash_of_block' in func[2]:
+ total_time = stats.stats[func][3]
+ percentage = (total_time / stats.total_tt) * 100
+ # A space follows the comma so the two message parts do not run together.
+ print(f"Hashing took {total_time:.2f} seconds, "
+ f"{percentage:.2f}% of the total runtime.")
+
+
+# Command-line entry point: parse the options and hand them to main().
+if __name__ == "__main__":
+ # The first description fragment ends with a space so the concatenated text
+ # reads "... function in automatic prefix caching."
+ parser = argparse.ArgumentParser(
+ description='Benchmark the performance of hashing function in '
+ 'automatic prefix caching.')
+ parser.add_argument('--model', type=str, default='lmsys/longchat-7b-16k')
+ parser.add_argument('--tensor-parallel-size', '-tp', type=int, default=1)
+ parser.add_argument('--output-len', type=int, default=10)
+ # NOTE(review): this flag is parsed, but main() passes
+ # enable_prefix_caching=True regardless of its value.
+ parser.add_argument('--enable-prefix-caching',
+ action='store_true',
+ help='enable prefix caching')
+ parser.add_argument('--use-v2-block-manager',
+ action='store_true',
+ help='Use BlockSpaceManagerV2')
+ args = parser.parse_args()
+ main(args)
diff --git a/benchmarks/sonnet.txt b/benchmarks/sonnet.txt
new file mode 100644
index 0000000000..34c444e8ce
--- /dev/null
+++ b/benchmarks/sonnet.txt
@@ -0,0 +1,518 @@
+FROM fairest creatures we desire increase,
+That thereby beauty's rose might never die,
+But as the riper should by time decease,
+His tender heir might bear his memory:
+But thou, contracted to thine own bright eyes,
+Feed'st thy light'st flame with self-substantial fuel,
+Making a famine where abundance lies,
+Thyself thy foe, to thy sweet self too cruel.
+Thou that art now the world's fresh ornament
+And only herald to the gaudy spring,
+Within thine own bud buriest thy content
+And, tender churl, makest waste in niggarding.
+Pity the world, or else this glutton be,
+To eat the world's due, by the grave and thee.
+When forty winters shall beseige thy brow,
+And dig deep trenches in thy beauty's field,
+Thy youth's proud livery, so gazed on now,
+Will be a tatter'd weed, of small worth held:
+Then being ask'd where all thy beauty lies,
+Where all the treasure of thy lusty days,
+To say, within thine own deep-sunken eyes,
+Were an all-eating shame and thriftless praise.
+How much more praise deserved thy beauty's use,
+If thou couldst answer 'This fair child of mine
+Shall sum my count and make my old excuse,'
+Proving his beauty by succession thine!
+This were to be new made when thou art old,
+And see thy blood warm when thou feel'st it cold.
+Look in thy glass, and tell the face thou viewest
+Now is the time that face should form another;
+Whose fresh repair if now thou not renewest,
+Thou dost beguile the world, unbless some mother.
+For where is she so fair whose unear'd womb
+Disdains the tillage of thy husbandry?
+Or who is he so fond will be the tomb
+Of his self-love, to stop posterity?
+Thou art thy mother's glass, and she in thee
+Calls back the lovely April of her prime:
+So thou through windows of thine age shall see
+Despite of wrinkles this thy golden time.
+But if thou live, remember'd not to be,
+Die single, and thine image dies with thee.
+Unthrifty loveliness, why dost thou spend
+Upon thyself thy beauty's legacy?
+Nature's bequest gives nothing but doth lend,
+And being frank she lends to those are free.
+Then, beauteous niggard, why dost thou abuse
+The bounteous largess given thee to give?
+Profitless usurer, why dost thou use
+So great a sum of sums, yet canst not live?
+For having traffic with thyself alone,
+Thou of thyself thy sweet self dost deceive.
+Then how, when nature calls thee to be gone,
+What acceptable audit canst thou leave?
+Thy unused beauty must be tomb'd with thee,
+Which, used, lives th' executor to be.
+Those hours, that with gentle work did frame
+The lovely gaze where every eye doth dwell,
+Will play the tyrants to the very same
+And that unfair which fairly doth excel:
+For never-resting time leads summer on
+To hideous winter and confounds him there;
+Sap cheque'd with frost and lusty leaves quite gone,
+Beauty o'ersnow'd and bareness every where:
+Then, were not summer's distillation left,
+A liquid prisoner pent in walls of glass,
+Beauty's effect with beauty were bereft,
+Nor it nor no remembrance what it was:
+But flowers distill'd though they with winter meet,
+Leese but their show; their substance still lives sweet.
+Then let not winter's ragged hand deface
+In thee thy summer, ere thou be distill'd:
+Make sweet some vial; treasure thou some place
+With beauty's treasure, ere it be self-kill'd.
+That use is not forbidden usury,
+Which happies those that pay the willing loan;
+That's for thyself to breed another thee,
+Or ten times happier, be it ten for one;
+Ten times thyself were happier than thou art,
+If ten of thine ten times refigured thee:
+Then what could death do, if thou shouldst depart,
+Leaving thee living in posterity?
+Be not self-will'd, for thou art much too fair
+To be death's conquest and make worms thine heir.
+Lo! in the orient when the gracious light
+Lifts up his burning head, each under eye
+Doth homage to his new-appearing sight,
+Serving with looks his sacred majesty;
+And having climb'd the steep-up heavenly hill,
+Resembling strong youth in his middle age,
+yet mortal looks adore his beauty still,
+Attending on his golden pilgrimage;
+But when from highmost pitch, with weary car,
+Like feeble age, he reeleth from the day,
+The eyes, 'fore duteous, now converted are
+From his low tract and look another way:
+So thou, thyself out-going in thy noon,
+Unlook'd on diest, unless thou get a son.
+Music to hear, why hear'st thou music sadly?
+Sweets with sweets war not, joy delights in joy.
+Why lovest thou that which thou receivest not gladly,
+Or else receivest with pleasure thine annoy?
+If the true concord of well-tuned sounds,
+By unions married, do offend thine ear,
+They do but sweetly chide thee, who confounds
+In singleness the parts that thou shouldst bear.
+Mark how one string, sweet husband to another,
+Strikes each in each by mutual ordering,
+Resembling sire and child and happy mother
+Who all in one, one pleasing note do sing:
+Whose speechless song, being many, seeming one,
+Sings this to thee: 'thou single wilt prove none.'
+Is it for fear to wet a widow's eye
+That thou consumest thyself in single life?
+Ah! if thou issueless shalt hap to die.
+The world will wail thee, like a makeless wife;
+The world will be thy widow and still weep
+That thou no form of thee hast left behind,
+When every private widow well may keep
+By children's eyes her husband's shape in mind.
+Look, what an unthrift in the world doth spend
+Shifts but his place, for still the world enjoys it;
+But beauty's waste hath in the world an end,
+And kept unused, the user so destroys it.
+No love toward others in that bosom sits
+That on himself such murderous shame commits.
+For shame! deny that thou bear'st love to any,
+Who for thyself art so unprovident.
+Grant, if thou wilt, thou art beloved of many,
+But that thou none lovest is most evident;
+For thou art so possess'd with murderous hate
+That 'gainst thyself thou stick'st not to conspire.
+Seeking that beauteous roof to ruinate
+Which to repair should be thy chief desire.
+O, change thy thought, that I may change my mind!
+Shall hate be fairer lodged than gentle love?
+Be, as thy presence is, gracious and kind,
+Or to thyself at least kind-hearted prove:
+Make thee another self, for love of me,
+That beauty still may live in thine or thee.
+As fast as thou shalt wane, so fast thou growest
+In one of thine, from that which thou departest;
+And that fresh blood which youngly thou bestowest
+Thou mayst call thine when thou from youth convertest.
+Herein lives wisdom, beauty and increase:
+Without this, folly, age and cold decay:
+If all were minded so, the times should cease
+And threescore year would make the world away.
+Let those whom Nature hath not made for store,
+Harsh featureless and rude, barrenly perish:
+Look, whom she best endow'd she gave the more;
+Which bounteous gift thou shouldst in bounty cherish:
+She carved thee for her seal, and meant thereby
+Thou shouldst print more, not let that copy die.
+When I do count the clock that tells the time,
+And see the brave day sunk in hideous night;
+When I behold the violet past prime,
+And sable curls all silver'd o'er with white;
+When lofty trees I see barren of leaves
+Which erst from heat did canopy the herd,
+And summer's green all girded up in sheaves
+Borne on the bier with white and bristly beard,
+Then of thy beauty do I question make,
+That thou among the wastes of time must go,
+Since sweets and beauties do themselves forsake
+And die as fast as they see others grow;
+And nothing 'gainst Time's scythe can make defence
+Save breed, to brave him when he takes thee hence.
+O, that you were yourself! but, love, you are
+No longer yours than you yourself here live:
+Against this coming end you should prepare,
+And your sweet semblance to some other give.
+So should that beauty which you hold in lease
+Find no determination: then you were
+Yourself again after yourself's decease,
+When your sweet issue your sweet form should bear.
+Who lets so fair a house fall to decay,
+Which husbandry in honour might uphold
+Against the stormy gusts of winter's day
+And barren rage of death's eternal cold?
+O, none but unthrifts! Dear my love, you know
+You had a father: let your son say so.
+Not from the stars do I my judgment pluck;
+And yet methinks I have astronomy,
+But not to tell of good or evil luck,
+Of plagues, of dearths, or seasons' quality;
+Nor can I fortune to brief minutes tell,
+Pointing to each his thunder, rain and wind,
+Or say with princes if it shall go well,
+By oft predict that I in heaven find:
+But from thine eyes my knowledge I derive,
+And, constant stars, in them I read such art
+As truth and beauty shall together thrive,
+If from thyself to store thou wouldst convert;
+Or else of thee this I prognosticate:
+Thy end is truth's and beauty's doom and date.
+When I consider every thing that grows
+Holds in perfection but a little moment,
+That this huge stage presenteth nought but shows
+Whereon the stars in secret influence comment;
+When I perceive that men as plants increase,
+Cheered and cheque'd even by the self-same sky,
+Vaunt in their youthful sap, at height decrease,
+And wear their brave state out of memory;
+Then the conceit of this inconstant stay
+Sets you most rich in youth before my sight,
+Where wasteful Time debateth with Decay,
+To change your day of youth to sullied night;
+And all in war with Time for love of you,
+As he takes from you, I engraft you new.
+But wherefore do not you a mightier way
+Make war upon this bloody tyrant, Time?
+And fortify yourself in your decay
+With means more blessed than my barren rhyme?
+Now stand you on the top of happy hours,
+And many maiden gardens yet unset
+With virtuous wish would bear your living flowers,
+Much liker than your painted counterfeit:
+So should the lines of life that life repair,
+Which this, Time's pencil, or my pupil pen,
+Neither in inward worth nor outward fair,
+Can make you live yourself in eyes of men.
+To give away yourself keeps yourself still,
+And you must live, drawn by your own sweet skill.
+Who will believe my verse in time to come,
+If it were fill'd with your most high deserts?
+Though yet, heaven knows, it is but as a tomb
+Which hides your life and shows not half your parts.
+If I could write the beauty of your eyes
+And in fresh numbers number all your graces,
+The age to come would say 'This poet lies:
+Such heavenly touches ne'er touch'd earthly faces.'
+So should my papers yellow'd with their age
+Be scorn'd like old men of less truth than tongue,
+And your true rights be term'd a poet's rage
+And stretched metre of an antique song:
+But were some child of yours alive that time,
+You should live twice; in it and in my rhyme.
+Shall I compare thee to a summer's day?
+Thou art more lovely and more temperate:
+Rough winds do shake the darling buds of May,
+And summer's lease hath all too short a date:
+Sometime too hot the eye of heaven shines,
+And often is his gold complexion dimm'd;
+And every fair from fair sometime declines,
+By chance or nature's changing course untrimm'd;
+But thy eternal summer shall not fade
+Nor lose possession of that fair thou owest;
+Nor shall Death brag thou wander'st in his shade,
+When in eternal lines to time thou growest:
+So long as men can breathe or eyes can see,
+So long lives this and this gives life to thee.
+Devouring Time, blunt thou the lion's paws,
+And make the earth devour her own sweet brood;
+Pluck the keen teeth from the fierce tiger's jaws,
+And burn the long-lived phoenix in her blood;
+Make glad and sorry seasons as thou fleets,
+And do whate'er thou wilt, swift-footed Time,
+To the wide world and all her fading sweets;
+But I forbid thee one most heinous crime:
+O, carve not with thy hours my love's fair brow,
+Nor draw no lines there with thine antique pen;
+Him in thy course untainted do allow
+For beauty's pattern to succeeding men.
+Yet, do thy worst, old Time: despite thy wrong,
+My love shall in my verse ever live young.
+A woman's face with Nature's own hand painted
+Hast thou, the master-mistress of my passion;
+A woman's gentle heart, but not acquainted
+With shifting change, as is false women's fashion;
+An eye more bright than theirs, less false in rolling,
+Gilding the object whereupon it gazeth;
+A man in hue, all 'hues' in his controlling,
+Much steals men's eyes and women's souls amazeth.
+And for a woman wert thou first created;
+Till Nature, as she wrought thee, fell a-doting,
+And by addition me of thee defeated,
+By adding one thing to my purpose nothing.
+But since she prick'd thee out for women's pleasure,
+Mine be thy love and thy love's use their treasure.
+So is it not with me as with that Muse
+Stirr'd by a painted beauty to his verse,
+Who heaven itself for ornament doth use
+And every fair with his fair doth rehearse
+Making a couplement of proud compare,
+With sun and moon, with earth and sea's rich gems,
+With April's first-born flowers, and all things rare
+That heaven's air in this huge rondure hems.
+O' let me, true in love, but truly write,
+And then believe me, my love is as fair
+As any mother's child, though not so bright
+As those gold candles fix'd in heaven's air:
+Let them say more than like of hearsay well;
+I will not praise that purpose not to sell.
+My glass shall not persuade me I am old,
+So long as youth and thou are of one date;
+But when in thee time's furrows I behold,
+Then look I death my days should expiate.
+For all that beauty that doth cover thee
+Is but the seemly raiment of my heart,
+Which in thy breast doth live, as thine in me:
+How can I then be elder than thou art?
+O, therefore, love, be of thyself so wary
+As I, not for myself, but for thee will;
+Bearing thy heart, which I will keep so chary
+As tender nurse her babe from faring ill.
+Presume not on thy heart when mine is slain;
+Thou gavest me thine, not to give back again.
+As an unperfect actor on the stage
+Who with his fear is put besides his part,
+Or some fierce thing replete with too much rage,
+Whose strength's abundance weakens his own heart.
+So I, for fear of trust, forget to say
+The perfect ceremony of love's rite,
+And in mine own love's strength seem to decay,
+O'ercharged with burden of mine own love's might.
+O, let my books be then the eloquence
+And dumb presagers of my speaking breast,
+Who plead for love and look for recompense
+More than that tongue that more hath more express'd.
+O, learn to read what silent love hath writ:
+To hear with eyes belongs to love's fine wit.
+Mine eye hath play'd the painter and hath stell'd
+Thy beauty's form in table of my heart;
+My body is the frame wherein 'tis held,
+And perspective it is the painter's art.
+For through the painter must you see his skill,
+To find where your true image pictured lies;
+Which in my bosom's shop is hanging still,
+That hath his windows glazed with thine eyes.
+Now see what good turns eyes for eyes have done:
+Mine eyes have drawn thy shape, and thine for me
+Are windows to my breast, where-through the sun
+Delights to peep, to gaze therein on thee;
+Yet eyes this cunning want to grace their art;
+They draw but what they see, know not the heart.
+Let those who are in favour with their stars
+Of public honour and proud titles boast,
+Whilst I, whom fortune of such triumph bars,
+Unlook'd for joy in that I honour most.
+Great princes' favourites their fair leaves spread
+But as the marigold at the sun's eye,
+And in themselves their pride lies buried,
+For at a frown they in their glory die.
+The painful warrior famoused for fight,
+After a thousand victories once foil'd,
+Is from the book of honour razed quite,
+And all the rest forgot for which he toil'd:
+Then happy I, that love and am beloved
+Where I may not remove nor be removed.
+Lord of my love, to whom in vassalage
+Thy merit hath my duty strongly knit,
+To thee I send this written embassage,
+To witness duty, not to show my wit:
+Duty so great, which wit so poor as mine
+May make seem bare, in wanting words to show it,
+But that I hope some good conceit of thine
+In thy soul's thought, all naked, will bestow it;
+Till whatsoever star that guides my moving
+Points on me graciously with fair aspect
+And puts apparel on my tatter'd loving,
+To show me worthy of thy sweet respect:
+Then may I dare to boast how I do love thee;
+Till then not show my head where thou mayst prove me.
+Weary with toil, I haste me to my bed,
+The dear repose for limbs with travel tired;
+But then begins a journey in my head,
+To work my mind, when body's work's expired:
+For then my thoughts, from far where I abide,
+Intend a zealous pilgrimage to thee,
+And keep my drooping eyelids open wide,
+Looking on darkness which the blind do see
+Save that my soul's imaginary sight
+Presents thy shadow to my sightless view,
+Which, like a jewel hung in ghastly night,
+Makes black night beauteous and her old face new.
+Lo! thus, by day my limbs, by night my mind,
+For thee and for myself no quiet find.
+How can I then return in happy plight,
+That am debarr'd the benefit of rest?
+When day's oppression is not eased by night,
+But day by night, and night by day, oppress'd?
+And each, though enemies to either's reign,
+Do in consent shake hands to torture me;
+The one by toil, the other to complain
+How far I toil, still farther off from thee.
+I tell the day, to please them thou art bright
+And dost him grace when clouds do blot the heaven:
+So flatter I the swart-complexion'd night,
+When sparkling stars twire not thou gild'st the even.
+But day doth daily draw my sorrows longer
+And night doth nightly make grief's strength seem stronger.
+When, in disgrace with fortune and men's eyes,
+I all alone beweep my outcast state
+And trouble deal heaven with my bootless cries
+And look upon myself and curse my fate,
+Wishing me like to one more rich in hope,
+Featured like him, like him with friends possess'd,
+Desiring this man's art and that man's scope,
+With what I most enjoy contented least;
+Yet in these thoughts myself almost despising,
+Haply I think on thee, and then my state,
+Like to the lark at break of day arising
+From sullen earth, sings hymns at heaven's gate;
+For thy sweet love remember'd such wealth brings
+That then I scorn to change my state with kings.
+When to the sessions of sweet silent thought
+I summon up remembrance of things past,
+I sigh the lack of many a thing I sought,
+And with old woes new wail my dear time's waste:
+Then can I drown an eye, unused to flow,
+For precious friends hid in death's dateless night,
+And weep afresh love's long since cancell'd woe,
+And moan the expense of many a vanish'd sight:
+Then can I grieve at grievances foregone,
+And heavily from woe to woe tell o'er
+The sad account of fore-bemoaned moan,
+Which I new pay as if not paid before.
+But if the while I think on thee, dear friend,
+All losses are restored and sorrows end.
+Thy bosom is endeared with all hearts,
+Which I by lacking have supposed dead,
+And there reigns love and all love's loving parts,
+And all those friends which I thought buried.
+How many a holy and obsequious tear
+Hath dear religious love stol'n from mine eye
+As interest of the dead, which now appear
+But things removed that hidden in thee lie!
+Thou art the grave where buried love doth live,
+Hung with the trophies of my lovers gone,
+Who all their parts of me to thee did give;
+That due of many now is thine alone:
+Their images I loved I view in thee,
+And thou, all they, hast all the all of me.
+If thou survive my well-contented day,
+When that churl Death my bones with dust shall cover,
+And shalt by fortune once more re-survey
+These poor rude lines of thy deceased lover,
+Compare them with the bettering of the time,
+And though they be outstripp'd by every pen,
+Reserve them for my love, not for their rhyme,
+Exceeded by the height of happier men.
+O, then vouchsafe me but this loving thought:
+'Had my friend's Muse grown with this growing age,
+A dearer birth than this his love had brought,
+To march in ranks of better equipage:
+But since he died and poets better prove,
+Theirs for their style I'll read, his for his love.'
+Full many a glorious morning have I seen
+Flatter the mountain-tops with sovereign eye,
+Kissing with golden face the meadows green,
+Gilding pale streams with heavenly alchemy;
+Anon permit the basest clouds to ride
+With ugly rack on his celestial face,
+And from the forlorn world his visage hide,
+Stealing unseen to west with this disgrace:
+Even so my sun one early morn did shine
+With all triumphant splendor on my brow;
+But out, alack! he was but one hour mine;
+The region cloud hath mask'd him from me now.
+Yet him for this my love no whit disdaineth;
+Suns of the world may stain when heaven's sun staineth.
+Why didst thou promise such a beauteous day,
+And make me travel forth without my cloak,
+To let base clouds o'ertake me in my way,
+Hiding thy bravery in their rotten smoke?
+'Tis not enough that through the cloud thou break,
+To dry the rain on my storm-beaten face,
+For no man well of such a salve can speak
+That heals the wound and cures not the disgrace:
+Nor can thy shame give physic to my grief;
+Though thou repent, yet I have still the loss:
+The offender's sorrow lends but weak relief
+To him that bears the strong offence's cross.
+Ah! but those tears are pearl which thy love sheds,
+And they are rich and ransom all ill deeds.
+No more be grieved at that which thou hast done:
+Roses have thorns, and silver fountains mud;
+Clouds and eclipses stain both moon and sun,
+And loathsome canker lives in sweetest bud.
+All men make faults, and even I in this,
+Authorizing thy trespass with compare,
+Myself corrupting, salving thy amiss,
+Excusing thy sins more than thy sins are;
+For to thy sensual fault I bring in sense--
+Thy adverse party is thy advocate--
+And 'gainst myself a lawful plea commence:
+Such civil war is in my love and hate
+That I an accessary needs must be
+To that sweet thief which sourly robs from me.
+Let me confess that we two must be twain,
+Although our undivided loves are one:
+So shall those blots that do with me remain
+Without thy help by me be borne alone.
+In our two loves there is but one respect,
+Though in our lives a separable spite,
+Which though it alter not love's sole effect,
+Yet doth it steal sweet hours from love's delight.
+I may not evermore acknowledge thee,
+Lest my bewailed guilt should do thee shame,
+Nor thou with public kindness honour me,
+Unless thou take that honour from thy name:
+But do not so; I love thee in such sort
+As, thou being mine, mine is thy good report.
+As a decrepit father takes delight
+To see his active child do deeds of youth,
+So I, made lame by fortune's dearest spite,
+Take all my comfort of thy worth and truth.
+For whether beauty, birth, or wealth, or wit,
+Or any of these all, or all, or more,
+Entitled in thy parts do crowned sit,
+I make my love engrafted to this store:
+So then I am not lame, poor, nor despised,
+Whilst that this shadow doth such substance give
+That I in thy abundance am sufficed
+And by a part of all thy glory live.
+Look, what is best, that best I wish in thee:
+This wish I have; then ten times happy me!
\ No newline at end of file
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
new file mode 100644
index 0000000000..0cf37769a6
--- /dev/null
+++ b/cmake/cpu_extension.cmake
@@ -0,0 +1,90 @@
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+#
+# Read environment variables that enable special configurations
+#
+if(DEFINED ENV{VLLM_CPU_AVX512BF16})
+ set(ENABLE_AVX512BF16 ON)
+endif()
+
+include_directories("${CMAKE_SOURCE_DIR}/csrc")
+
+#
+# Check the compile flags
+#
+list(APPEND CXX_COMPILE_FLAGS
+ "-fopenmp"
+ "-DVLLM_CPU_EXTENSION")
+
+execute_process(COMMAND cat /proc/cpuinfo
+ RESULT_VARIABLE CPUINFO_RET
+ OUTPUT_VARIABLE CPUINFO)
+
+if (NOT CPUINFO_RET EQUAL 0)
+ message(FATAL_ERROR "Failed to check CPU features via /proc/cpuinfo")
+endif()
+
+function (find_isa CPUINFO TARGET OUT)  # OUT := ON if TARGET occurs in CPUINFO, else OFF
+  string(FIND "${CPUINFO}" "${TARGET}" ISA_FOUND)  # quoted: ';' or empty text stays one arg
+  if(NOT ISA_FOUND EQUAL -1)
+    set(${OUT} ON PARENT_SCOPE)
+  else()
+    set(${OUT} OFF PARENT_SCOPE)
+  endif()
+endfunction()
+
+find_isa("${CPUINFO}" "avx512f" AVX512_FOUND)
+
+if (AVX512_FOUND)
+  list(APPEND CXX_COMPILE_FLAGS
+    "-mavx512f"
+    "-mavx512vl"
+    "-mavx512bw"
+    "-mavx512dq")
+
+  find_isa("${CPUINFO}" "avx512_bf16" AVX512BF16_FOUND)
+  if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
+    if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+        CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
+      list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
+    else()
+      message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
+    endif()
+  else()
+    message(WARNING "Disable AVX512-BF16 ISA support, no avx512_bf16 found in local CPU flags." " If cross-compilation is required, please set env VLLM_CPU_AVX512BF16=1.")
+  endif()
+else()
+  message(FATAL_ERROR "vLLM CPU backend requires AVX512 ISA support.")
+endif()
+
+message(STATUS "CPU extension compile flags: ${CXX_COMPILE_FLAGS}")
+
+
+#
+# Define extension targets
+#
+
+#
+# _C extension
+#
+set(VLLM_EXT_SRC
+ "csrc/cpu/activation.cpp"
+ "csrc/cpu/attention.cpp"
+ "csrc/cpu/cache.cpp"
+ "csrc/cpu/layernorm.cpp"
+ "csrc/cpu/pos_encoding.cpp"
+ "csrc/cpu/pybind.cpp")
+
+define_gpu_extension_target(
+ _C
+ DESTINATION vllm
+ LANGUAGE CXX
+ SOURCES ${VLLM_EXT_SRC}
+ COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
+ WITH_SOABI
+)
+
+add_custom_target(default)
+message(STATUS "Enabling C extension.")
+add_dependencies(default _C)
+
diff --git a/cmake/hipify.py b/cmake/hipify.py
index c4d8450630..340e41c817 100755
--- a/cmake/hipify.py
+++ b/cmake/hipify.py
@@ -9,8 +9,8 @@
#
import argparse
-import shutil
import os
+import shutil
from torch.utils.hipify.hipify_python import hipify
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index bb222bb437..00c81e4d00 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -99,7 +99,14 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
"Failed to determine torch nvcc compiler flags")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.8)
- list(APPEND GPU_FLAGS "-DENABLE_FP8_E5M2")
+ list(APPEND GPU_FLAGS "-DENABLE_FP8")
+ endif()
+ if (CUDA_VERSION VERSION_GREATER_EQUAL 12.0)
+ list(REMOVE_ITEM GPU_FLAGS
+ "-D__CUDA_NO_HALF_OPERATORS__"
+ "-D__CUDA_NO_HALF_CONVERSIONS__"
+ "-D__CUDA_NO_BFLOAT16_CONVERSIONS__"
+ "-D__CUDA_NO_HALF2_OPERATORS__")
endif()
elseif(${GPU_LANG} STREQUAL "HIP")
@@ -112,6 +119,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
list(APPEND GPU_FLAGS
"-DUSE_ROCM"
+ "-DENABLE_FP8"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
@@ -240,9 +248,12 @@ macro(override_gpu_arches GPU_ARCHES GPU_LANG GPU_SUPPORTED_ARCHES)
endif()
if (_SM)
- set(_VIRT "")
+ # The -real suffix tells CMake to generate only ELF code for the kernels.
+ # We want this; otherwise the PTX added by default would increase the binary size.
+ set(_VIRT "-real")
set(_CODE_ARCH ${_SM})
else()
+ # The -virtual suffix tells CMake to generate PTX code for the kernels.
set(_VIRT "-virtual")
set(_CODE_ARCH ${_CODE})
endif()
@@ -281,7 +292,7 @@ endmacro()
# not provided.
# COMPILE_FLAGS - Extra compiler flags passed to NVCC/hip.
# INCLUDE_DIRECTORIES - Extra include directories.
-# LINK_LIBRARIES - Extra link libraries.
+# LIBRARIES - Extra link libraries.
# WITH_SOABI - Generate library with python SOABI suffix name.
#
# Note: optimization level/debug info is set via cmake build type.
@@ -327,8 +338,17 @@ function (define_gpu_extension_target GPU_MOD_NAME)
target_include_directories(${GPU_MOD_NAME} PRIVATE csrc
${GPU_INCLUDE_DIRECTORIES})
- target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES}
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE torch ${torch_python_LIBRARY}
${GPU_LIBRARIES})
+ # Don't use `TORCH_LIBRARIES` for CUDA since it pulls in a bunch of
+ # dependencies that are not necessary and may not be installed.
+ if (GPU_LANGUAGE STREQUAL "CUDA")
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE ${CUDA_CUDA_LIB}
+ ${CUDA_LIBRARIES})
+ else()
+ target_link_libraries(${GPU_MOD_NAME} PRIVATE ${TORCH_LIBRARIES})
+ endif()
+
install(TARGETS ${GPU_MOD_NAME} LIBRARY DESTINATION ${GPU_DESTINATION})
endfunction()
diff --git a/collect_env.py b/collect_env.py
index edcbfe73b3..5a015c3c8f 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -6,10 +6,10 @@
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
import datetime
import locale
+import os
import re
import subprocess
import sys
-import os
from collections import namedtuple
try:
@@ -63,6 +63,8 @@
"magma",
"triton",
"optree",
+ "nccl",
+ "transformers",
}
DEFAULT_PIP_PATTERNS = {
@@ -73,6 +75,8 @@
"triton",
"optree",
"onnx",
+ "nccl",
+ "transformers",
}
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 24d972702c..867f63f12d 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -10,11 +10,11 @@
namespace vllm {
// Activation and gating kernel template.
-template
+template
__global__ void act_and_mul_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., 2, d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., 2, d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * 2 * d + idx]);
@@ -23,72 +23,66 @@ __global__ void act_and_mul_kernel(
}
}
-template
+template
__device__ __forceinline__ T silu_kernel(const T& x) {
// x * sigmoid(x)
- return (T) (((float) x) / (1.0f + expf((float) -x)));
+ return (T)(((float)x) / (1.0f + expf((float)-x)));
}
-template
+template
__device__ __forceinline__ T gelu_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'none' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L36-L38
- const float f = (float) x;
+ const float f = (float)x;
constexpr float ALPHA = M_SQRT1_2;
- return (T) (f * 0.5f * (1.0f + ::erf(f * ALPHA)));
+ return (T)(f * 0.5f * (1.0f + ::erf(f * ALPHA)));
}
-template
+template
__device__ __forceinline__ T gelu_tanh_kernel(const T& x) {
// Equivalent to PyTorch GELU with 'tanh' approximation.
// Refer to:
// https://github.com/pytorch/pytorch/blob/8ac9b20d4b090c213799e81acf48a55ea8d437d6/aten/src/ATen/native/cuda/ActivationGeluKernel.cu#L25-L30
- const float f = (float) x;
+ const float f = (float)x;
constexpr float BETA = M_SQRT2 * M_2_SQRTPI * 0.5f;
constexpr float KAPPA = 0.044715;
float x_cube = f * f * f;
float inner = BETA * (f + KAPPA * x_cube);
- return (T) (0.5f * f * (1.0f + ::tanhf(inner)));
+ return (T)(0.5f * f * (1.0f + ::tanhf(inner)));
}
-} // namespace vllm
+} // namespace vllm
// Launch activation and gating kernel.
-#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
- int d = input.size(-1) / 2; \
- int64_t num_tokens = input.numel() / input.size(-1); \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "act_and_mul_kernel", \
- [&] { \
- vllm::act_and_mul_kernel><<>>( \
- out.data_ptr(), \
- input.data_ptr(), \
- d); \
- });
-
-void silu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+#define LAUNCH_ACTIVATION_GATE_KERNEL(KERNEL) \
+ int d = input.size(-1) / 2; \
+ int64_t num_tokens = input.numel() / input.size(-1); \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES( \
+ input.scalar_type(), "act_and_mul_kernel", [&] { \
+ vllm::act_and_mul_kernel> \
+ <<>>(out.data_ptr(), \
+ input.data_ptr(), d); \
+ });
+
+void silu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::silu_kernel);
}
-void gelu_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_kernel);
}
-void gelu_tanh_and_mul(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., 2 * d]
+void gelu_tanh_and_mul(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., 2 * d]
{
LAUNCH_ACTIVATION_GATE_KERNEL(vllm::gelu_tanh_kernel);
}
@@ -96,11 +90,11 @@ void gelu_tanh_and_mul(
namespace vllm {
// Element-wise activation kernel template.
-template
+template
__global__ void activation_kernel(
- scalar_t* __restrict__ out, // [..., d]
- const scalar_t* __restrict__ input, // [..., d]
- const int d) {
+ scalar_t* __restrict__ out, // [..., d]
+ const scalar_t* __restrict__ input, // [..., d]
+ const int d) {
const int64_t token_idx = blockIdx.x;
for (int64_t idx = threadIdx.x; idx < d; idx += blockDim.x) {
const scalar_t x = VLLM_LDG(&input[token_idx * d + idx]);
@@ -108,54 +102,49 @@ __global__ void activation_kernel(
}
}
-} // namespace vllm
+} // namespace vllm
// Launch element-wise activation kernel.
-#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
- int d = input.size(-1); \
- int64_t num_tokens = input.numel() / d; \
- dim3 grid(num_tokens); \
- dim3 block(std::min(d, 1024)); \
- const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
- const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
- VLLM_DISPATCH_FLOATING_TYPES( \
- input.scalar_type(), \
- "activation_kernel", \
- [&] { \
- vllm::activation_kernel><<>>( \
- out.data_ptr(), \
- input.data_ptr(), \
- d); \
- });
+#define LAUNCH_ACTIVATION_KERNEL(KERNEL) \
+ int d = input.size(-1); \
+ int64_t num_tokens = input.numel() / d; \
+ dim3 grid(num_tokens); \
+ dim3 block(std::min(d, 1024)); \
+ const at::cuda::OptionalCUDAGuard device_guard(device_of(input)); \
+ const cudaStream_t stream = at::cuda::getCurrentCUDAStream(); \
+ VLLM_DISPATCH_FLOATING_TYPES(input.scalar_type(), "activation_kernel", [&] { \
+ vllm::activation_kernel> \
+ <<>>(out.data_ptr(), \
+ input.data_ptr(), d); \
+ });
namespace vllm {
-template
+template
__device__ __forceinline__ T gelu_new_kernel(const T& x) {
- const float x3 = (float) (x * x * x);
- const T t = (T) tanhf((T) (0.79788456f * (float) (x + (T) (0.044715f * x3))));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float x3 = (float)(x * x * x);
+ const T t = (T)tanhf((T)(0.79788456f * (float)(x + (T)(0.044715f * x3))));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-template
+template
__device__ __forceinline__ T gelu_fast_kernel(const T& x) {
- const float f = (float) x;
- const T t = (T) tanhf(((T) (f * 0.79788456f)) * (((T) 1.0) + (T) (0.044715f * f) * x));
- return ((T) 0.5) * x * (((T) 1.0) + t);
+ const float f = (float)x;
+ const T t =
+ (T)tanhf(((T)(f * 0.79788456f)) * (((T)1.0) + (T)(0.044715f * f) * x));
+ return ((T)0.5) * x * (((T)1.0) + t);
}
-} // namespace vllm
+} // namespace vllm
-void gelu_new(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_new(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_new_kernel);
}
-void gelu_fast(
- torch::Tensor& out, // [..., d]
- torch::Tensor& input) // [..., d]
+void gelu_fast(torch::Tensor& out, // [..., d]
+ torch::Tensor& input) // [..., d]
{
LAUNCH_ACTIVATION_KERNEL(vllm::gelu_fast_kernel);
}
diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h
index 61748e6b1e..64f86381d9 100644
--- a/csrc/attention/attention_dtypes.h
+++ b/csrc/attention/attention_dtypes.h
@@ -4,4 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
-#include "dtype_fp8_e5m2.cuh"
+#include "dtype_fp8.cuh"
diff --git a/csrc/attention/attention_generic.cuh b/csrc/attention/attention_generic.cuh
index 31fb401cbe..62409c0cce 100644
--- a/csrc/attention/attention_generic.cuh
+++ b/csrc/attention/attention_generic.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -22,31 +23,31 @@
namespace vllm {
// A vector type to store Q, K, V elements.
-template
+template
struct Vec {};
// A vector type to store FP32 accumulators.
-template
+template
struct FloatVec {};
// Template vector operations.
-template
+template
inline __device__ Acc mul(A a, B b);
-template
+template
inline __device__ float sum(T v);
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ float dot(T a, T b) {
return sum(mul(a, b));
}
-template
+template
inline __device__ void zero(T& dst) {
constexpr int WORDS = sizeof(T) / 4;
union {
@@ -61,4 +62,4 @@ inline __device__ void zero(T& dst) {
dst = tmp.raw;
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 5e61668d5c..8f89f89786 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -19,14 +20,24 @@
#include
#include
#include
+#include
#include "attention_dtypes.h"
#include "attention_utils.cuh"
-#ifdef ENABLE_FP8_E5M2
-#include "../quantization/fp8_e5m2_kvcache/quant_utils.cuh"
+
+#ifdef USE_ROCM
+ #include
+ #include "../quantization/fp8/amd/quant_utils.cuh"
+typedef __hip_bfloat16 __nv_bfloat16;
+#else
+ #include "../quantization/fp8/nvidia/quant_utils.cuh"
#endif
-#include
+#ifndef USE_ROCM
+ #define WARP_SIZE 32
+#else
+ #define WARP_SIZE warpSize
+#endif
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MIN(a, b) ((a) < (b) ? (a) : (b))
@@ -35,7 +46,7 @@
namespace vllm {
// Utility function for attention softmax.
-template
+template
inline __device__ float block_sum(float* red_smem, float sum) {
// Decompose the thread index into warp / lane.
int warp = threadIdx.x / WARP_SIZE;
@@ -72,57 +83,65 @@ inline __device__ float block_sum(float* red_smem, float sum) {
// TODO(woosuk): Merge the last two dimensions of the grid.
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_E5M2_KV_CACHE,
- int PARTITION_SIZE = 0> // Zero means no partitioning.
+template // Zero means no partitioning.
__device__ void paged_attention_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride) {
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, max_num_partitions,
+ // head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
const int seq_idx = blockIdx.y;
const int partition_idx = blockIdx.z;
const int max_num_partitions = gridDim.z;
constexpr bool USE_PARTITIONING = PARTITION_SIZE > 0;
- const int context_len = context_lens[seq_idx];
- if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= context_len) {
+ const int seq_len = seq_lens[seq_idx];
+ if (USE_PARTITIONING && partition_idx * PARTITION_SIZE >= seq_len) {
// No work to do. Terminate the thread block.
return;
}
- const int num_context_blocks = DIVIDE_ROUND_UP(context_len, BLOCK_SIZE);
- const int num_blocks_per_partition = USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_context_blocks;
+ const int num_seq_blocks = DIVIDE_ROUND_UP(seq_len, BLOCK_SIZE);
+ const int num_blocks_per_partition =
+ USE_PARTITIONING ? PARTITION_SIZE / BLOCK_SIZE : num_seq_blocks;
// [start_block_idx, end_block_idx) is the range of blocks to process.
- const int start_block_idx = USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
- const int end_block_idx = MIN(start_block_idx + num_blocks_per_partition, num_context_blocks);
+ const int start_block_idx =
+ USE_PARTITIONING ? partition_idx * num_blocks_per_partition : 0;
+ const int end_block_idx =
+ MIN(start_block_idx + num_blocks_per_partition, num_seq_blocks);
const int num_blocks = end_block_idx - start_block_idx;
// [start_token_idx, end_token_idx) is the range of tokens to process.
const int start_token_idx = start_block_idx * BLOCK_SIZE;
- const int end_token_idx = MIN(start_token_idx + num_blocks * BLOCK_SIZE, context_len);
+ const int end_token_idx =
+ MIN(start_token_idx + num_blocks * BLOCK_SIZE, seq_len);
const int num_tokens = end_token_idx - start_token_idx;
constexpr int THREAD_GROUP_SIZE = MAX(WARP_SIZE / BLOCK_SIZE, 1);
- constexpr int NUM_THREAD_GROUPS = NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE divides NUM_THREADS
+ constexpr int NUM_THREAD_GROUPS =
+ NUM_THREADS / THREAD_GROUP_SIZE; // Note: This assumes THREAD_GROUP_SIZE
+ // divides NUM_THREADS
assert(NUM_THREADS % THREAD_GROUP_SIZE == 0);
- constexpr int NUM_TOKENS_PER_THREAD_GROUP = DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
+ constexpr int NUM_TOKENS_PER_THREAD_GROUP =
+ DIVIDE_ROUND_UP(BLOCK_SIZE, WARP_SIZE);
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
const int thread_idx = threadIdx.x;
const int warp_idx = thread_idx / WARP_SIZE;
@@ -132,19 +151,18 @@ __device__ void paged_attention_kernel(
const int num_heads = gridDim.x;
const int num_queries_per_kv = num_heads / num_kv_heads;
const int kv_head_idx = head_idx / num_queries_per_kv;
- const float alibi_slope = alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
+ const float alibi_slope =
+ alibi_slopes == nullptr ? 0.f : alibi_slopes[head_idx];
// A vector type to store a part of a key or a query.
- // The vector size is configured in such a way that the threads in a thread group
- // fetch or compute 16 bytes at a time.
- // For example, if the size of a thread group is 4 and the data type is half,
- // then the vector size is 16 / (4 * sizeof(half)) == 2.
+ // The vector size is configured in such a way that the threads in a thread
+ // group fetch or compute 16 bytes at a time. For example, if the size of a
+ // thread group is 4 and the data type is half, then the vector size is 16 /
+ // (4 * sizeof(half)) == 2.
constexpr int VEC_SIZE = MAX(16 / (THREAD_GROUP_SIZE * sizeof(scalar_t)), 1);
using K_vec = typename Vec::Type;
using Q_vec = typename Vec::Type;
-#ifdef ENABLE_FP8_E5M2
using Quant_vec = typename Vec::Type;
-#endif
constexpr int NUM_ELEMS_PER_THREAD = HEAD_SIZE / THREAD_GROUP_SIZE;
constexpr int NUM_VECS_PER_THREAD = NUM_ELEMS_PER_THREAD / VEC_SIZE;
@@ -154,18 +172,21 @@ __device__ void paged_attention_kernel(
// Load the query to registers.
// Each thread in a thread group has a different part of the query.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the query, and the second thread has 1, 5, 9, ...
- // th vectors of the query, and so on.
- // NOTE(woosuk): Because q is split from a qkv tensor, it may not be contiguous.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the query, and the second thread
+ // has 1, 5, 9, ... th vectors of the query, and so on. NOTE(woosuk): Because
+ // q is split from a qkv tensor, it may not be contiguous.
const scalar_t* q_ptr = q + seq_idx * q_stride + head_idx * HEAD_SIZE;
__shared__ Q_vec q_vecs[THREAD_GROUP_SIZE][NUM_VECS_PER_THREAD];
#pragma unroll
- for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD; i += NUM_THREAD_GROUPS) {
+ for (int i = thread_group_idx; i < NUM_VECS_PER_THREAD;
+ i += NUM_THREAD_GROUPS) {
const int vec_idx = thread_group_offset + i * THREAD_GROUP_SIZE;
- q_vecs[thread_group_offset][i] = *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
+ q_vecs[thread_group_offset][i] =
+ *reinterpret_cast(q_ptr + vec_idx * VEC_SIZE);
}
- __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a memory wall right before we use q_vecs
+ __syncthreads(); // TODO(naed90): possible speedup if this is replaced with a
+ // memory wall right before we use q_vecs
// Memory planning.
extern __shared__ char shared_mem[];
@@ -184,53 +205,101 @@ __device__ void paged_attention_kernel(
// Each thread group in a warp fetches a key from the block, and computes
// dot product with the query.
const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
- const int64_t physical_block_number = static_cast(block_table[block_idx]);
+
+ // blocksparse specific vars
+ int bs_block_offset;
+ int q_bs_block_id;
+ if constexpr (IS_BLOCK_SPARSE) {
+ // const int num_blocksparse_blocks = DIVIDE_ROUND_UP(seq_len,
+ // blocksparse_block_size);
+ q_bs_block_id = (seq_len - 1) / blocksparse_block_size;
+ if (blocksparse_head_sliding_step >= 0)
+ // sliding on q heads
+ bs_block_offset =
+ (tp_rank * num_heads + head_idx) * blocksparse_head_sliding_step + 1;
+ else
+ // sliding on kv heads
+ bs_block_offset = (tp_rank * num_kv_heads + kv_head_idx) *
+ (-blocksparse_head_sliding_step) +
+ 1;
+ }
+
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ const int k_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ const bool is_remote =
+ ((k_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0);
+ const bool is_local =
+ (k_bs_block_id > q_bs_block_id - blocksparse_local_blocks);
+ if (!is_remote && !is_local) {
+ for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
+
+ if (thread_group_offset == 0) {
+ // NOTE(linxihui): assign very large number to skipped tokens to
+ // avoid contribution to the sumexp softmax normalizer. This will
+ // not be used at computing sum(softmax*v) as the blocks will be
+ // skipped.
+ logits[token_idx - start_token_idx] = -FLT_MAX;
+ }
+ }
+ continue;
+ }
+ }
+ const int64_t physical_block_number =
+ static_cast(block_table[block_idx]);
// Load a key to registers.
// Each thread in a thread group has a different part of the key.
- // For example, if the the thread group size is 4, then the first thread in the group
- // has 0, 4, 8, ... th vectors of the key, and the second thread has 1, 5, 9, ... th
- // vectors of the key, and so on.
+ // For example, if the the thread group size is 4, then the first thread in
+ // the group has 0, 4, 8, ... th vectors of the key, and the second thread
+ // has 1, 5, 9, ... th vectors of the key, and so on.
for (int i = 0; i < NUM_TOKENS_PER_THREAD_GROUP; i++) {
- const int physical_block_offset = (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
+ const int physical_block_offset =
+ (thread_group_idx + i * WARP_SIZE) % BLOCK_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
K_vec k_vecs[NUM_VECS_PER_THREAD];
#pragma unroll
for (int j = 0; j < NUM_VECS_PER_THREAD; j++) {
- const cache_t* k_ptr = k_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride
- + physical_block_offset * x;
+ const cache_t* k_ptr =
+ k_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride + physical_block_offset * x;
const int vec_idx = thread_group_offset + j * THREAD_GROUP_SIZE;
const int offset1 = (vec_idx * VEC_SIZE) / x;
const int offset2 = (vec_idx * VEC_SIZE) % x;
- if constexpr (IS_FP8_E5M2_KV_CACHE) {
-#ifdef ENABLE_FP8_E5M2
- Quant_vec k_vec_quant = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
- // Vector conversion from Quant_vec to K_vec.
- k_vecs[j] = fp8_e5m2_unscaled::vec_conversion(k_vec_quant);
-#else
- assert(false);
-#endif
+
+ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
+ k_vecs[j] = *reinterpret_cast(
+ k_ptr + offset1 * BLOCK_SIZE * x + offset2);
} else {
- k_vecs[j] = *reinterpret_cast(k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+ // Vector conversion from Quant_vec to K_vec.
+ Quant_vec k_vec_quant = *reinterpret_cast(
+ k_ptr + offset1 * BLOCK_SIZE * x + offset2);
+ k_vecs[j] = fp8::scaled_convert(
+ k_vec_quant, kv_scale);
}
}
// Compute dot product.
// This includes a reduction across the threads in the same thread group.
- float qk = scale * Qk_dot::dot(q_vecs[thread_group_offset], k_vecs);
+ float qk = scale * Qk_dot::dot(
+ q_vecs[thread_group_offset], k_vecs);
// Add the ALiBi bias if slopes are given.
- qk += (alibi_slope != 0) ? alibi_slope * (token_idx - context_len + 1) : 0;
+ qk += (alibi_slope != 0) ? alibi_slope * (token_idx - seq_len + 1) : 0;
if (thread_group_offset == 0) {
// Store the partial reductions to shared memory.
// NOTE(woosuk): It is required to zero out the masked logits.
- const bool mask = token_idx >= context_len;
+ const bool mask = token_idx >= seq_len;
logits[token_idx - start_token_idx] = mask ? 0.f : qk;
// Update the max value.
qk_max = mask ? qk_max : fmaxf(qk_max, qk);
@@ -278,13 +347,12 @@ __device__ void paged_attention_kernel(
// If partitioning is enabled, store the max logit and exp_sum.
if (USE_PARTITIONING && thread_idx == 0) {
- float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*max_logits_ptr = qk_max;
- float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions
- + partition_idx;
+ float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions + partition_idx;
*exp_sums_ptr = exp_sum;
}
@@ -292,14 +360,13 @@ __device__ void paged_attention_kernel(
constexpr int V_VEC_SIZE = MIN(16 / sizeof(scalar_t), BLOCK_SIZE);
using V_vec = typename Vec::Type;
using L_vec = typename Vec::Type;
-#ifdef ENABLE_FP8_E5M2
using V_quant_vec = typename Vec::Type;
-#endif
using Float_L_vec = typename FloatVec::Type;
constexpr int NUM_V_VECS_PER_ROW = BLOCK_SIZE / V_VEC_SIZE;
constexpr int NUM_ROWS_PER_ITER = WARP_SIZE / NUM_V_VECS_PER_ROW;
- constexpr int NUM_ROWS_PER_THREAD = DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
+ constexpr int NUM_ROWS_PER_THREAD =
+ DIVIDE_ROUND_UP(HEAD_SIZE, NUM_ROWS_PER_ITER);
// NOTE(woosuk): We use FP32 for the accumulator for better accuracy.
float accs[NUM_ROWS_PER_THREAD];
@@ -310,43 +377,55 @@ __device__ void paged_attention_kernel(
scalar_t zero_value;
zero(zero_value);
- for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx; block_idx += NUM_WARPS) {
- // NOTE(woosuk): The block number is stored in int32. However, we cast it to int64
- // because int32 can lead to overflow when this variable is multiplied by large numbers
- // (e.g., kv_block_stride).
- const int64_t physical_block_number = static_cast(block_table[block_idx]);
+ for (int block_idx = start_block_idx + warp_idx; block_idx < end_block_idx;
+ block_idx += NUM_WARPS) {
+ // NOTE(woosuk): The block number is stored in int32. However, we cast it to
+ // int64 because int32 can lead to overflow when this variable is multiplied
+ // by large numbers (e.g., kv_block_stride).
+ // For blocksparse attention: skip computation on blocks that are not
+ // attended
+ if constexpr (IS_BLOCK_SPARSE) {
+ int v_bs_block_id = block_idx * BLOCK_SIZE / blocksparse_block_size;
+ if (!((v_bs_block_id + bs_block_offset) % blocksparse_vert_stride == 0) &&
+ !((v_bs_block_id > q_bs_block_id - blocksparse_local_blocks))) {
+ continue;
+ }
+ }
+ const int64_t physical_block_number =
+ static_cast(block_table[block_idx]);
const int physical_block_offset = (lane % NUM_V_VECS_PER_ROW) * V_VEC_SIZE;
const int token_idx = block_idx * BLOCK_SIZE + physical_block_offset;
L_vec logits_vec;
- from_float(logits_vec, *reinterpret_cast(logits + token_idx - start_token_idx));
+ from_float(logits_vec, *reinterpret_cast(logits + token_idx -
+ start_token_idx));
- const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride
- + kv_head_idx * kv_head_stride;
+ const cache_t* v_ptr = v_cache + physical_block_number * kv_block_stride +
+ kv_head_idx * kv_head_stride;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
if (row_idx < HEAD_SIZE) {
const int offset = row_idx * BLOCK_SIZE + physical_block_offset;
V_vec v_vec;
- if constexpr (IS_FP8_E5M2_KV_CACHE) {
-#ifdef ENABLE_FP8_E5M2
- V_quant_vec v_quant_vec = *reinterpret_cast(v_ptr + offset);
- // Vector conversion from V_quant_vec to V_vec.
- v_vec = fp8_e5m2_unscaled::vec_conversion(v_quant_vec);
-#else
- assert(false);
-#endif
- } else {
+
+ if constexpr (KV_DTYPE == Fp8KVCacheDataType::kAuto) {
v_vec = *reinterpret_cast(v_ptr + offset);
+ } else {
+ V_quant_vec v_quant_vec =
+ *reinterpret_cast(v_ptr + offset);
+ // Vector conversion from V_quant_vec to V_vec.
+ v_vec = fp8::scaled_convert(v_quant_vec,
+ kv_scale);
}
- if (block_idx == num_context_blocks - 1) {
- // NOTE(woosuk): When v_vec contains the tokens that are out of the context,
- // we should explicitly zero out the values since they may contain NaNs.
- // See https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
+ if (block_idx == num_seq_blocks - 1) {
+ // NOTE(woosuk): When v_vec contains the tokens that are out of the
+ // context, we should explicitly zero out the values since they may
+ // contain NaNs. See
+ // https://github.com/vllm-project/vllm/issues/641#issuecomment-1682544472
scalar_t* v_vec_ptr = reinterpret_cast(&v_vec);
#pragma unroll
for (int j = 0; j < V_VEC_SIZE; j++) {
- v_vec_ptr[j] = token_idx + j < context_len ? v_vec_ptr[j] : zero_value;
+ v_vec_ptr[j] = token_idx + j < seq_len ? v_vec_ptr[j] : zero_value;
}
}
accs[i] += dot(logits_vec, v_vec);
@@ -365,8 +444,8 @@ __device__ void paged_attention_kernel(
accs[i] = acc;
}
- // NOTE(woosuk): A barrier is required because the shared memory space for logits
- // is reused for the output.
+ // NOTE(woosuk): A barrier is required because the shared memory space for
+ // logits is reused for the output.
__syncthreads();
// Perform reduction across warps.
@@ -403,9 +482,9 @@ __device__ void paged_attention_kernel(
// Write the final output.
if (warp_idx == 0) {
- scalar_t* out_ptr = out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE
- + partition_idx * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE + partition_idx * HEAD_SIZE;
#pragma unroll
for (int i = 0; i < NUM_ROWS_PER_THREAD; i++) {
const int row_idx = lane / NUM_V_VECS_PER_ROW + i * NUM_ROWS_PER_ITER;
@@ -417,87 +496,96 @@ __device__ void paged_attention_kernel(
}
// Grid: (num_heads, num_seqs, 1).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_E5M2_KV_CACHE>
+template
__global__ void paged_attention_v1_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride) {
- paged_attention_kernel(
- /* exp_sums */ nullptr, /* max_logits */ nullptr,
- out, q, k_cache, v_cache, num_kv_heads, scale, block_tables, context_lens,
- max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride, kv_head_stride);
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ paged_attention_kernel(
+ /* exp_sums */ nullptr, /* max_logits */ nullptr, out, q, k_cache,
+ v_cache, num_kv_heads, scale, block_tables, seq_lens,
+ max_num_blocks_per_seq, alibi_slopes, q_stride, kv_block_stride,
+ kv_head_stride, kv_scale, tp_rank, blocksparse_local_blocks,
+ blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs, max_num_partitions).
-template<
- typename scalar_t,
- typename cache_t,
- int HEAD_SIZE,
- int BLOCK_SIZE,
- int NUM_THREADS,
- bool IS_FP8_E5M2_KV_CACHE,
- int PARTITION_SIZE>
+template
__global__ void paged_attention_v2_kernel(
- float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
- const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads, head_size/x, block_size, x]
- const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads, head_size, block_size]
- const int num_kv_heads, // [num_heads]
- const float scale,
- const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_blocks_per_seq,
- const float* __restrict__ alibi_slopes, // [num_heads]
- const int q_stride,
- const int kv_block_stride,
- const int kv_head_stride) {
- paged_attention_kernel(
- exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
- block_tables, context_lens, max_num_blocks_per_seq, alibi_slopes,
- q_stride, kv_block_stride, kv_head_stride);
+ float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const scalar_t* __restrict__ q, // [num_seqs, num_heads, head_size]
+ const cache_t* __restrict__ k_cache, // [num_blocks, num_kv_heads,
+ // head_size/x, block_size, x]
+ const cache_t* __restrict__ v_cache, // [num_blocks, num_kv_heads,
+ // head_size, block_size]
+ const int num_kv_heads, // [num_heads]
+ const float scale,
+ const int* __restrict__ block_tables, // [num_seqs, max_num_blocks_per_seq]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_blocks_per_seq,
+ const float* __restrict__ alibi_slopes, // [num_heads]
+ const int q_stride, const int kv_block_stride, const int kv_head_stride,
+ const float kv_scale, const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
+ paged_attention_kernel(
+ exp_sums, max_logits, tmp_out, q, k_cache, v_cache, num_kv_heads, scale,
+ block_tables, seq_lens, max_num_blocks_per_seq, alibi_slopes, q_stride,
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank,
+ blocksparse_local_blocks, blocksparse_vert_stride, blocksparse_block_size,
+ blocksparse_head_sliding_step);
}
// Grid: (num_heads, num_seqs).
-template<
- typename scalar_t,
- int HEAD_SIZE,
- int NUM_THREADS,
- int PARTITION_SIZE>
+template
__global__ void paged_attention_v2_reduce_kernel(
- scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
- const float* __restrict__ exp_sums, // [num_seqs, num_heads, max_num_partitions]
- const float* __restrict__ max_logits, // [num_seqs, num_heads, max_num_partitions]
- const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- const int* __restrict__ context_lens, // [num_seqs]
- const int max_num_partitions) {
+ scalar_t* __restrict__ out, // [num_seqs, num_heads, head_size]
+ const float* __restrict__ exp_sums, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const float* __restrict__ max_logits, // [num_seqs, num_heads,
+ // max_num_partitions]
+ const scalar_t* __restrict__ tmp_out, // [num_seqs, num_heads,
+ // max_num_partitions, head_size]
+ const int* __restrict__ seq_lens, // [num_seqs]
+ const int max_num_partitions) {
const int num_heads = gridDim.x;
const int head_idx = blockIdx.x;
const int seq_idx = blockIdx.y;
- const int context_len = context_lens[seq_idx];
- const int num_partitions = DIVIDE_ROUND_UP(context_len, PARTITION_SIZE);
+ const int seq_len = seq_lens[seq_idx];
+ const int num_partitions = DIVIDE_ROUND_UP(seq_len, PARTITION_SIZE);
if (num_partitions == 1) {
// No need to reduce. Only copy tmp_out to out.
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
for (int i = threadIdx.x; i < HEAD_SIZE; i += blockDim.x) {
out_ptr[i] = tmp_out_ptr[i];
}
@@ -516,8 +604,9 @@ __global__ void paged_attention_v2_reduce_kernel(
// Load max logits to shared memory.
float* shared_max_logits = reinterpret_cast(shared_mem);
- const float* max_logits_ptr = max_logits + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+ const float* max_logits_ptr = max_logits +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float max_logit = -FLT_MAX;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
const float l = max_logits_ptr[i];
@@ -546,9 +635,11 @@ __global__ void paged_attention_v2_reduce_kernel(
max_logit = VLLM_SHFL_SYNC(max_logit, 0);
// Load rescaled exp sums to shared memory.
- float* shared_exp_sums = reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
- const float* exp_sums_ptr = exp_sums + seq_idx * num_heads * max_num_partitions
- + head_idx * max_num_partitions;
+ float* shared_exp_sums =
+ reinterpret_cast(shared_mem + sizeof(float) * num_partitions);
+ const float* exp_sums_ptr = exp_sums +
+ seq_idx * num_heads * max_num_partitions +
+ head_idx * max_num_partitions;
float global_exp_sum = 0.0f;
for (int i = threadIdx.x; i < num_partitions; i += blockDim.x) {
float l = shared_max_logits[i];
@@ -561,59 +652,52 @@ __global__ void paged_attention_v2_reduce_kernel(
const float inv_global_exp_sum = __fdividef(1.0f, global_exp_sum + 1e-6f);
// Aggregate tmp_out to out.
- const scalar_t* tmp_out_ptr = tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE
- + head_idx * max_num_partitions * HEAD_SIZE;
- scalar_t* out_ptr = out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
+ const scalar_t* tmp_out_ptr =
+ tmp_out + seq_idx * num_heads * max_num_partitions * HEAD_SIZE +
+ head_idx * max_num_partitions * HEAD_SIZE;
+ scalar_t* out_ptr =
+ out + seq_idx * num_heads * HEAD_SIZE + head_idx * HEAD_SIZE;
#pragma unroll
for (int i = threadIdx.x; i < HEAD_SIZE; i += NUM_THREADS) {
float acc = 0.0f;
for (int j = 0; j < num_partitions; ++j) {
- acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] * inv_global_exp_sum;
+ acc += to_float(tmp_out_ptr[j * HEAD_SIZE + i]) * shared_exp_sums[j] *
+ inv_global_exp_sum;
}
from_float(out_ptr[i], acc);
}
}
-} // namespace vllm
-
-#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
- VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
- ((void*)vllm::paged_attention_v1_kernel), shared_mem_size); \
- vllm::paged_attention_v1_kernel<<>>( \
- out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- context_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride);
+} // namespace vllm
+
+#define LAUNCH_PAGED_ATTENTION_V1(HEAD_SIZE) \
+ VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize( \
+ ((void*)vllm::paged_attention_v1_kernel), \
+ shared_mem_size); \
+ vllm::paged_attention_v1_kernel \
+ <<>>( \
+ out_ptr, query_ptr, key_cache_ptr, value_cache_ptr, num_kv_heads, \
+ scale, block_tables_ptr, seq_lens_ptr, max_num_blocks_per_seq, \
+ alibi_slopes_ptr, q_stride, kv_block_stride, kv_head_stride, \
+ kv_scale, tp_rank, blocksparse_local_blocks, \
+ blocksparse_vert_stride, blocksparse_block_size, \
+ blocksparse_head_sliding_step);
// TODO(woosuk): Tune NUM_THREADS.
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_E5M2_KV_CACHE,
- int NUM_THREADS = 128>
+template
void paged_attention_v1_launcher(
- torch::Tensor& out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& context_lens,
- int max_context_len,
- const c10::optional& alibi_slopes) {
+ torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -626,20 +710,22 @@ void paged_attention_v1_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
- reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
T* query_ptr = reinterpret_cast(query.data_ptr());
CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr();
- int* context_lens_ptr = context_lens.data_ptr();
+ int* seq_lens_ptr = seq_lens.data_ptr();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int padded_max_context_len = DIVIDE_ROUND_UP(max_context_len, BLOCK_SIZE) * BLOCK_SIZE;
- int logits_size = padded_max_context_len * sizeof(float);
+ int padded_max_seq_len =
+ DIVIDE_ROUND_UP(max_seq_len, BLOCK_SIZE) * BLOCK_SIZE;
+ int logits_size = padded_max_seq_len * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
// Python-side check in vllm.worker.worker._check_if_can_support_max_seq_len
// Keep that in sync with the logic here!
@@ -668,6 +754,9 @@ void paged_attention_v1_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V1(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V1(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V1(256);
break;
@@ -677,124 +766,93 @@ void paged_attention_v1_launcher(
}
}
-#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \
- paged_attention_v1_launcher( \
- out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- context_lens, \
- max_context_len, \
- alibi_slopes);
+#define CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v1_launcher( \
+ out, query, key_cache, value_cache, num_kv_heads, scale, block_tables, \
+ seq_lens, max_seq_len, alibi_slopes, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V1_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V1_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \
- break; \
- case 16: \
- CALL_V1_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \
- break; \
- case 32: \
- CALL_V1_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V1_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V1_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v1(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& context_lens, // [num_seqs]
- int block_size,
- int max_context_len,
- const c10::optional& alibi_slopes,
- const std::string& kv_cache_dtype) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8_e5m2") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V1_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+ const c10::optional& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V1_LAUNCHER_BLOCK_SIZE)
}
-#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
- vllm::paged_attention_v2_kernel \
- <<>>( \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- query_ptr, \
- key_cache_ptr, \
- value_cache_ptr, \
- num_kv_heads, \
- scale, \
- block_tables_ptr, \
- context_lens_ptr, \
- max_num_blocks_per_seq, \
- alibi_slopes_ptr, \
- q_stride, \
- kv_block_stride, \
- kv_head_stride); \
- vllm::paged_attention_v2_reduce_kernel \
- <<>>( \
- out_ptr, \
- exp_sums_ptr, \
- max_logits_ptr, \
- tmp_out_ptr, \
- context_lens_ptr, \
- max_num_partitions);
-
-template<
- typename T,
- typename CACHE_T,
- int BLOCK_SIZE,
- bool IS_FP8_E5M2_KV_CACHE,
- int NUM_THREADS = 128,
- int PARTITION_SIZE = 512>
+#define LAUNCH_PAGED_ATTENTION_V2(HEAD_SIZE) \
+ vllm::paged_attention_v2_kernel \
+ <<>>( \
+ exp_sums_ptr, max_logits_ptr, tmp_out_ptr, query_ptr, key_cache_ptr, \
+ value_cache_ptr, num_kv_heads, scale, block_tables_ptr, \
+ seq_lens_ptr, max_num_blocks_per_seq, alibi_slopes_ptr, q_stride, \
+ kv_block_stride, kv_head_stride, kv_scale, tp_rank, \
+ blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step); \
+ vllm::paged_attention_v2_reduce_kernel \
+ <<>>( \
+ out_ptr, exp_sums_ptr, max_logits_ptr, tmp_out_ptr, seq_lens_ptr, \
+ max_num_partitions);
+
+template
void paged_attention_v2_launcher(
- torch::Tensor& out,
- torch::Tensor& exp_sums,
- torch::Tensor& max_logits,
- torch::Tensor& tmp_out,
- torch::Tensor& query,
- torch::Tensor& key_cache,
- torch::Tensor& value_cache,
- int num_kv_heads,
- float scale,
- torch::Tensor& block_tables,
- torch::Tensor& context_lens,
- int max_context_len,
- const c10::optional& alibi_slopes) {
+ torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
+ torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
+ torch::Tensor& value_cache, int num_kv_heads, float scale,
+ torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len,
+ const c10::optional& alibi_slopes, float kv_scale,
+ const int tp_rank, const int blocksparse_local_blocks,
+ const int blocksparse_vert_stride, const int blocksparse_block_size,
+ const int blocksparse_head_sliding_step) {
int num_seqs = query.size(0);
int num_heads = query.size(1);
int head_size = query.size(2);
@@ -807,9 +865,10 @@ void paged_attention_v2_launcher(
assert(head_size % thread_group_size == 0);
// NOTE: alibi_slopes is optional.
- const float* alibi_slopes_ptr = alibi_slopes ?
- reinterpret_cast(alibi_slopes.value().data_ptr())
- : nullptr;
+ const float* alibi_slopes_ptr =
+ alibi_slopes
+ ? reinterpret_cast(alibi_slopes.value().data_ptr())
+ : nullptr;
T* out_ptr = reinterpret_cast(out.data_ptr());
float* exp_sums_ptr = reinterpret_cast(exp_sums.data_ptr());
@@ -819,10 +878,10 @@ void paged_attention_v2_launcher(
CACHE_T* key_cache_ptr = reinterpret_cast(key_cache.data_ptr());
CACHE_T* value_cache_ptr = reinterpret_cast(value_cache.data_ptr());
int* block_tables_ptr = block_tables.data_ptr();
- int* context_lens_ptr = context_lens.data_ptr();
+ int* seq_lens_ptr = seq_lens.data_ptr();
constexpr int NUM_WARPS = NUM_THREADS / WARP_SIZE;
- int max_num_partitions = DIVIDE_ROUND_UP(max_context_len, PARTITION_SIZE);
+ int max_num_partitions = DIVIDE_ROUND_UP(max_seq_len, PARTITION_SIZE);
int logits_size = PARTITION_SIZE * sizeof(float);
int outputs_size = (NUM_WARPS / 2) * head_size * sizeof(float);
@@ -855,6 +914,9 @@ void paged_attention_v2_launcher(
case 128:
LAUNCH_PAGED_ATTENTION_V2(128);
break;
+ case 192:
+ LAUNCH_PAGED_ATTENTION_V2(192);
+ break;
case 256:
LAUNCH_PAGED_ATTENTION_V2(256);
break;
@@ -864,82 +926,68 @@ void paged_attention_v2_launcher(
}
}
-#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_E5M2_KV_CACHE) \
- paged_attention_v2_launcher( \
- out, \
- exp_sums, \
- max_logits, \
- tmp_out, \
- query, \
- key_cache, \
- value_cache, \
- num_kv_heads, \
- scale, \
- block_tables, \
- context_lens, \
- max_context_len, \
- alibi_slopes);
+#define CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, KV_DTYPE, IS_BLOCK_SPARSE) \
+ paged_attention_v2_launcher( \
+ out, exp_sums, max_logits, tmp_out, query, key_cache, value_cache, \
+ num_kv_heads, scale, block_tables, seq_lens, max_seq_len, alibi_slopes, \
+ kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride, \
+ blocksparse_block_size, blocksparse_head_sliding_step);
+
+#define CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE) \
+ switch (is_block_sparse) { \
+ case true: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, true); \
+ break; \
+ case false: \
+ CALL_V2_LAUNCHER(T, CACHE_T, BLOCK_SIZE, IS_FP8_KV_CACHE, false); \
+ break; \
+ }
// NOTE(woosuk): To reduce the compilation time, we omitted block sizes
// 1, 2, 4, 64, 128, 256.
-#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, IS_FP8_E5M2_KV_CACHE) \
- switch (block_size) { \
- case 8: \
- CALL_V2_LAUNCHER(T, CACHE_T, 8, IS_FP8_E5M2_KV_CACHE); \
- break; \
- case 16: \
- CALL_V2_LAUNCHER(T, CACHE_T, 16, IS_FP8_E5M2_KV_CACHE); \
- break; \
- case 32: \
- CALL_V2_LAUNCHER(T, CACHE_T, 32, IS_FP8_E5M2_KV_CACHE); \
- break; \
- default: \
- TORCH_CHECK(false, "Unsupported block size: ", block_size); \
- break; \
+#define CALL_V2_LAUNCHER_BLOCK_SIZE(T, CACHE_T, KV_DTYPE) \
+ switch (block_size) { \
+ case 8: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 8, KV_DTYPE); \
+ break; \
+ case 16: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 16, KV_DTYPE); \
+ break; \
+ case 32: \
+ CALL_V2_LAUNCHER_SPARSITY(T, CACHE_T, 32, KV_DTYPE); \
+ break; \
+ default: \
+ TORCH_CHECK(false, "Unsupported block size: ", block_size); \
+ break; \
}
void paged_attention_v2(
- torch::Tensor& out, // [num_seqs, num_heads, head_size]
- torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
- torch::Tensor& tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
- torch::Tensor& query, // [num_seqs, num_heads, head_size]
- torch::Tensor& key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
- torch::Tensor& value_cache, // [num_blocks, num_heads, head_size, block_size]
- int num_kv_heads, // [num_heads]
- float scale,
- torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
- torch::Tensor& context_lens, // [num_seqs]
- int block_size,
- int max_context_len,
- const c10::optional& alibi_slopes,
- const std::string& kv_cache_dtype) {
- if (kv_cache_dtype == "auto") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, float, false);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint16_t, false);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, __nv_bfloat16, false);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else if (kv_cache_dtype == "fp8_e5m2") {
- if (query.dtype() == at::ScalarType::Float) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(float, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::Half) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(uint16_t, uint8_t, true);
- } else if (query.dtype() == at::ScalarType::BFloat16) {
- CALL_V2_LAUNCHER_BLOCK_SIZE(__nv_bfloat16, uint8_t, true);
- } else {
- TORCH_CHECK(false, "Unsupported data type: ", query.dtype());
- }
- } else {
- TORCH_CHECK(false, "Unsupported data type of kv cache: ", kv_cache_dtype);
- }
+ torch::Tensor& out, // [num_seqs, num_heads, head_size]
+ torch::Tensor& exp_sums, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor& max_logits, // [num_seqs, num_heads, max_num_partitions]
+ torch::Tensor&
+ tmp_out, // [num_seqs, num_heads, max_num_partitions, head_size]
+ torch::Tensor& query, // [num_seqs, num_heads, head_size]
+ torch::Tensor&
+ key_cache, // [num_blocks, num_heads, head_size/x, block_size, x]
+ torch::Tensor&
+ value_cache, // [num_blocks, num_heads, head_size, block_size]
+ int num_kv_heads, // [num_heads]
+ float scale,
+ torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq]
+ torch::Tensor& seq_lens, // [num_seqs]
+ int block_size, int max_seq_len,
+ const c10::optional& alibi_slopes,
+ const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
+ const int blocksparse_local_blocks, const int blocksparse_vert_stride,
+ const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+ const bool is_block_sparse = (blocksparse_vert_stride > 1);
+ DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
+ CALL_V2_LAUNCHER_BLOCK_SIZE)
}
#undef WARP_SIZE
#undef MAX
#undef MIN
-#undef DIVIDE_ROUND_UP
+#undef DIVIDE_ROUND_UP
\ No newline at end of file
diff --git a/csrc/attention/attention_utils.cuh b/csrc/attention/attention_utils.cuh
index ff64c4bd8f..cdcee42748 100644
--- a/csrc/attention/attention_utils.cuh
+++ b/csrc/attention/attention_utils.cuh
@@ -1,5 +1,6 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -26,7 +27,7 @@
namespace vllm {
// Q*K^T operation.
-template
+template
inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
using A_vec = typename FloatVec::Type;
// Compute the parallel products for Q*K^T (treat vector lanes separately).
@@ -45,12 +46,12 @@ inline __device__ float qk_dot_(const Vec (&q)[N], const Vec (&k)[N]) {
return qk;
}
-template
+template
struct Qk_dot {
- template
+ template
static inline __device__ float dot(const Vec (&q)[N], const Vec (&k)[N]) {
return qk_dot_(q, k);
}
};
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_bfloat16.cuh b/csrc/attention/dtype_bfloat16.cuh
index 31e0cee01d..3cdcb95e08 100644
--- a/csrc/attention/dtype_bfloat16.cuh
+++ b/csrc/attention/dtype_bfloat16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -28,8 +30,8 @@
#include
#include
- typedef __hip_bfloat162 __nv_bfloat162;
- typedef __hip_bfloat16 __nv_bfloat16;
+typedef __hip_bfloat162 __nv_bfloat162;
+typedef __hip_bfloat16 __nv_bfloat16;
#endif
#include
@@ -50,37 +52,37 @@ struct bf16_8_t {
};
// BF16 vector types for Q, K, V.
-template<>
+template <>
struct Vec<__nv_bfloat16, 1> {
using Type = __nv_bfloat16;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 2> {
using Type = __nv_bfloat162;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 4> {
using Type = bf16_4_t;
};
-template<>
+template <>
struct Vec<__nv_bfloat16, 8> {
using Type = bf16_8_t;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec<__nv_bfloat16> {
using Type = float;
};
-template<>
+template <>
struct FloatVec<__nv_bfloat162> {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec {
using Type = Float8_;
};
@@ -108,9 +110,9 @@ inline __device__ __nv_bfloat16 add(__nv_bfloat16 a, __nv_bfloat16 b) {
assert(false);
#else
#ifndef USE_ROCM
- return a + b;
+ return a + b;
#else
- return __hadd(a, b);
+ return __hadd(a, b);
#endif
#endif
}
@@ -161,7 +163,7 @@ inline __device__ Float8_ add(bf16_8_t a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -170,7 +172,7 @@ inline __device__ __nv_bfloat16 mul(__nv_bfloat16 a, __nv_bfloat16 b) {
#endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
@@ -179,12 +181,12 @@ inline __device__ __nv_bfloat162 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
#endif
}
-template<>
+template <>
inline __device__ __nv_bfloat162 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
bf16_4_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -192,7 +194,7 @@ inline __device__ bf16_4_t mul(bf16_4_t a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_4_t c;
@@ -201,7 +203,7 @@ inline __device__ bf16_4_t mul(__nv_bfloat16 a, bf16_4_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
bf16_8_t c;
c.x = mul<__nv_bfloat162, __nv_bfloat162, __nv_bfloat162>(a.x, b.x);
@@ -211,7 +213,7 @@ inline __device__ bf16_8_t mul(bf16_8_t a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
bf16_8_t c;
@@ -222,26 +224,26 @@ inline __device__ bf16_8_t mul(__nv_bfloat16 a, bf16_8_t b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(__nv_bfloat16 a, __nv_bfloat16 b) {
float fa = __bfloat162float(a);
float fb = __bfloat162float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat162 a, __nv_bfloat162 b) {
float2 fa = bf1622float2(a);
float2 fb = bf1622float2(b);
return mul(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(__nv_bfloat16 a, __nv_bfloat162 b) {
return mul(bf162bf162(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
Float4_ fc;
fc.x = mul(a.x, b.x);
@@ -249,7 +251,7 @@ inline __device__ Float4_ mul(bf16_4_t a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float4_ fc;
@@ -258,7 +260,7 @@ inline __device__ Float4_ mul(__nv_bfloat16 a, bf16_4_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
Float8_ fc;
fc.x = mul(a.x, b.x);
@@ -268,7 +270,7 @@ inline __device__ Float8_ mul(bf16_8_t a, bf16_8_t b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
__nv_bfloat162 s = bf162bf162(a);
Float8_ fc;
@@ -280,7 +282,8 @@ inline __device__ Float8_ mul(__nv_bfloat16 a, bf16_8_t b) {
}
// Vector fused multiply-add.
-inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -288,7 +291,8 @@ inline __device__ __nv_bfloat162 fma(__nv_bfloat162 a, __nv_bfloat162 b, __nv_bf
#endif
}
-inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b, __nv_bfloat162 c) {
+inline __device__ __nv_bfloat162 fma(__nv_bfloat16 a, __nv_bfloat162 b,
+ __nv_bfloat162 c) {
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800
assert(false);
#else
@@ -379,23 +383,23 @@ inline __device__ Float8_ fma(__nv_bfloat16 a, bf16_8_t b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(__nv_bfloat16 v) {
return __bfloat162float(v);
}
-template<>
+template <>
inline __device__ float sum(__nv_bfloat162 v) {
float2 vf = bf1622float2(v);
return vf.x + vf.y;
}
-template<>
+template <>
inline __device__ float sum(bf16_4_t v) {
return sum(v.x) + sum(v.y);
}
-template<>
+template <>
inline __device__ float sum(bf16_8_t v) {
return sum(v.x) + sum(v.y) + sum(v.z) + sum(v.w);
}
@@ -448,4 +452,4 @@ inline __device__ void zero(__nv_bfloat16& dst) {
#endif
}
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float16.cuh b/csrc/attention/dtype_float16.cuh
index d3271e69cd..3a1815f0ed 100644
--- a/csrc/attention/dtype_float16.cuh
+++ b/csrc/attention/dtype_float16.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -30,37 +32,37 @@
namespace vllm {
// FP16 vector types for Q, K, V.
-template<>
+template <>
struct Vec {
using Type = uint16_t;
};
-template<>
+template <>
struct Vec {
using Type = uint32_t;
};
-template<>
+template <>
struct Vec {
using Type = uint2;
};
-template<>
+template <>
struct Vec {
using Type = uint4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec {
using Type = float;
};
-template<>
+template <>
struct FloatVec {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = Float4_;
};
-template<>
+template <>
struct FloatVec {
using Type = Float8_;
};
@@ -73,8 +75,8 @@ inline __device__ uint32_t h0_h0(uint16_t a) {
return b;
#else
union {
- uint32_t u32;
- uint16_t u16[2];
+ uint32_t u32;
+ uint16_t u16[2];
} tmp;
tmp.u16[0] = a;
tmp.u16[1] = a;
@@ -130,10 +132,12 @@ inline __device__ uint32_t float2_to_half2(float2 f) {
} tmp;
#ifndef USE_ROCM
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800
- asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n" : "=r"(tmp.u32) : "f"(f.y), "f"(f.x));
+ asm volatile("cvt.rn.f16x2.f32 %0, %1, %2;\n"
+ : "=r"(tmp.u32)
+ : "f"(f.y), "f"(f.x));
#else
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
- asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[0]) : "f"(f.x));
+ asm volatile("cvt.rn.f16.f32 %0, %1;\n" : "=h"(tmp.u16[1]) : "f"(f.y));
#endif
#else
tmp.u16[0] = float_to_half(f.x);
@@ -201,7 +205,7 @@ inline __device__ Float8_ add(uint4 a, Float8_ fb) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
uint16_t c;
#ifndef USE_ROCM
@@ -212,7 +216,7 @@ inline __device__ uint16_t mul(uint16_t a, uint16_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
uint32_t c;
#ifndef USE_ROCM
@@ -223,12 +227,12 @@ inline __device__ uint32_t mul(uint32_t a, uint32_t b) {
return c;
}
-template<>
+template <>
inline __device__ uint32_t mul(uint16_t a, uint32_t b) {
return mul(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ uint2 mul(uint2 a, uint2 b) {
uint2 c;
c.x = mul(a.x, b.x);
@@ -236,7 +240,7 @@ inline __device__ uint2 mul(uint2 a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint2 mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
uint2 c;
@@ -245,7 +249,7 @@ inline __device__ uint2 mul(uint16_t a, uint2 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint4 a, uint4 b) {
uint4 c;
c.x = mul(a.x, b.x);
@@ -255,7 +259,7 @@ inline __device__ uint4 mul(uint4 a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ uint4 mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
uint4 c;
@@ -266,26 +270,26 @@ inline __device__ uint4 mul(uint16_t a, uint4 b) {
return c;
}
-template<>
+template <>
inline __device__ float mul(uint16_t a, uint16_t b) {
float fa = half_to_float(a);
float fb = half_to_float(b);
return fa * fb;
}
-template<>
+template <>
inline __device__ float2 mul(uint32_t a, uint32_t b) {
float2 fa = half2_to_float2(a);
float2 fb = half2_to_float2(b);
return mul(fa, fb);
}
-template<>
+template <>
inline __device__ float2 mul(uint16_t a, uint32_t b) {
return mul(h0_h0(a), b);
}
-template<>
+template <>
inline __device__ Float4_ mul(uint2 a, uint2 b) {
Float4_ fc;
fc.x = mul(a.x, b.x);
@@ -293,7 +297,7 @@ inline __device__ Float4_ mul(uint2 a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float4_ mul(uint16_t a, uint2 b) {
uint32_t s = h0_h0(a);
Float4_ fc;
@@ -302,7 +306,7 @@ inline __device__ Float4_ mul(uint16_t a, uint2 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint4 a, uint4 b) {
Float8_ fc;
fc.x = mul(a.x, b.x);
@@ -312,7 +316,7 @@ inline __device__ Float8_ mul(uint4 a, uint4 b) {
return fc;
}
-template<>
+template <>
inline __device__ Float8_ mul(uint16_t a, uint4 b) {
uint32_t s = h0_h0(a);
Float8_ fc;
@@ -327,9 +331,13 @@ inline __device__ Float8_ mul(uint16_t a, uint4 b) {
inline __device__ uint32_t fma(uint32_t a, uint32_t b, uint32_t c) {
uint32_t d;
#ifndef USE_ROCM
- asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n" : "=r"(d) : "r"(a), "r"(b), "r"(c));
+ asm volatile("fma.rn.f16x2 %0, %1, %2, %3;\n"
+ : "=r"(d)
+ : "r"(a), "r"(b), "r"(c));
#else
- asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n" : "=v"(d) : "v"(a), "v"(b), "v"(c));
+ asm volatile("v_pk_fma_f16 %0, %1, %2, %3;\n"
+ : "=v"(d)
+ : "v"(a), "v"(b), "v"(c));
#endif
return d;
}
@@ -423,24 +431,24 @@ inline __device__ Float8_ fma(uint16_t a, uint4 b, Float8_ fc) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(uint16_t v) {
return half_to_float(v);
}
-template<>
+template <>
inline __device__ float sum(uint32_t v) {
float2 tmp = half2_to_float2(v);
return tmp.x + tmp.y;
}
-template<>
+template <>
inline __device__ float sum(uint2 v) {
uint32_t c = add(v.x, v.y);
return sum(c);
}
-template<>
+template <>
inline __device__ float sum(uint4 v) {
uint32_t c = add(v.x, v.y);
c = add(c, v.z);
@@ -470,13 +478,9 @@ inline __device__ void from_float(uint4& dst, Float8_ src) {
}
// From float16 to float32.
-inline __device__ float to_float(uint16_t u) {
- return half_to_float(u);
-}
+inline __device__ float to_float(uint16_t u) { return half_to_float(u); }
-inline __device__ float2 to_float(uint32_t u) {
- return half2_to_float2(u);
-}
+inline __device__ float2 to_float(uint32_t u) { return half2_to_float2(u); }
inline __device__ Float4_ to_float(uint2 u) {
Float4_ tmp;
@@ -495,8 +499,6 @@ inline __device__ Float8_ to_float(uint4 u) {
}
// Zero-out a variable.
-inline __device__ void zero(uint16_t& dst) {
- dst = uint16_t(0);
-}
+inline __device__ void zero(uint16_t& dst) { dst = uint16_t(0); }
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_float32.cuh b/csrc/attention/dtype_float32.cuh
index b200d2d226..7c6a686db3 100644
--- a/csrc/attention/dtype_float32.cuh
+++ b/csrc/attention/dtype_float32.cuh
@@ -1,6 +1,8 @@
/*
- * Adapted from https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
- * and https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
+ * Adapted from
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention/decoder_masked_multihead_attention_template.hpp
+ * and
+ * https://github.com/NVIDIA/FasterTransformer/blob/release/v5.3_tag/src/fastertransformer/kernels/decoder_masked_multihead_attention_utils.h
* Copyright (c) 2023, The vLLM team.
* Copyright (c) 2020-2023, NVIDIA CORPORATION. All rights reserved.
*
@@ -38,37 +40,35 @@ struct Float8_ {
};
// FP32 vector types for Q, K, V.
-template<>
+template <>
struct Vec {
using Type = float;
};
-template<>
+template <>
struct Vec {
using Type = float2;
};
-template<>
+template <>
struct Vec {
using Type = float4;
};
// FP32 accumulator vector types corresponding to Vec.
-template<>
+template <>
struct FloatVec {
using Type = float;
};
-template<>
+template <>
struct FloatVec {
using Type = float2;
};
-template<>
+template <>
struct FloatVec {
using Type = float4;
};
// Vector addition.
-inline __device__ float add(float a, float b) {
- return a + b;
-}
+inline __device__ float add(float a, float b) { return a + b; }
inline __device__ float2 add(float2 a, float2 b) {
float2 c;
@@ -87,12 +87,12 @@ inline __device__ float4 add(float4 a, float4 b) {
}
// Vector multiplication.
-template<>
+template <>
inline __device__ float mul(float a, float b) {
return a * b;
}
-template<>
+template <>
inline __device__ float2 mul(float2 a, float2 b) {
float2 c;
c.x = a.x * b.x;
@@ -100,7 +100,7 @@ inline __device__ float2 mul(float2 a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float2 mul(float a, float2 b) {
float2 c;
c.x = a * b.x;
@@ -108,7 +108,7 @@ inline __device__ float2 mul(float a, float2 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float4 a, float4 b) {
float4 c;
c.x = a.x * b.x;
@@ -118,7 +118,7 @@ inline __device__ float4 mul(float4 a, float4 b) {
return c;
}
-template<>
+template <>
inline __device__ float4 mul(float a, float4 b) {
float4 c;
c.x = a * b.x;
@@ -129,9 +129,7 @@ inline __device__ float4 mul(float a, float4 b) {
}
// Vector fused multiply-add.
-inline __device__ float fma(float a, float b, float c) {
- return a * b + c;
-}
+inline __device__ float fma(float a, float b, float c) { return a * b + c; }
inline __device__ float2 fma(float2 a, float2 b, float2 c) {
float2 d;
@@ -182,35 +180,33 @@ inline __device__ Float8_ fma(float a, Float8_ b, Float8_ c) {
}
// Vector sum.
-template<>
+template <>
inline __device__ float sum(float v) {
return v;
}
-template<>
+template <>
inline __device__ float sum(float2 v) {
return v.x + v.y;
}
-template<>
+template <>
inline __device__ float sum(float4 v) {
return v.x + v.y + v.z + v.w;
}
-template<>
+template <>
inline __device__ float sum(Float4_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y;
}
-template<>
+template <>
inline __device__ float sum(Float8_ v) {
return v.x.x + v.x.y + v.y.x + v.y.y + v.z.x + v.z.y + v.w.x + v.w.y;
}
// Vector dot product.
-inline __device__ float dot(float a, float b) {
- return a * b;
-}
+inline __device__ float dot(float a, float b) { return a * b; }
inline __device__ float dot(float2 a, float2 b) {
float2 c = mul(a, b);
@@ -232,42 +228,24 @@ inline __device__ float dot(Float8_ a, Float8_ b) {
}
// From float to float.
-inline __device__ void from_float(float& dst, float src) {
- dst = src;
-}
+inline __device__ void from_float(float& dst, float src) { dst = src; }
-inline __device__ void from_float(float2& dst, float2 src) {
- dst = src;
-}
+inline __device__ void from_float(float2& dst, float2 src) { dst = src; }
-inline __device__ void from_float(float4& dst, float4 src) {
- dst = src;
-}
+inline __device__ void from_float(float4& dst, float4 src) { dst = src; }
// From float to float.
-inline __device__ float to_float(float u) {
- return u;
-}
+inline __device__ float to_float(float u) { return u; }
-inline __device__ float2 to_float(float2 u) {
- return u;
-}
+inline __device__ float2 to_float(float2 u) { return u; }
-inline __device__ float4 to_float(float4 u) {
- return u;
-}
+inline __device__ float4 to_float(float4 u) { return u; }
-inline __device__ Float4_ to_float(Float4_ u) {
- return u;
-}
+inline __device__ Float4_ to_float(Float4_ u) { return u; }
-inline __device__ Float8_ to_float(Float8_ u) {
- return u;
-}
+inline __device__ Float8_ to_float(Float8_ u) { return u; }
// Zero-out a variable.
-inline __device__ void zero(float& dst) {
- dst = 0.f;
-}
+inline __device__ void zero(float& dst) { dst = 0.f; }
-} // namespace vllm
+} // namespace vllm
diff --git a/csrc/attention/dtype_fp8.cuh b/csrc/attention/dtype_fp8.cuh
new file mode 100644
index 0000000000..e714e321b0
--- /dev/null
+++ b/csrc/attention/dtype_fp8.cuh
@@ -0,0 +1,41 @@
+#pragma once
+
+#include "attention_generic.cuh"
+
+#include
+#ifdef ENABLE_FP8
+ #ifndef USE_ROCM
+ #include
+ #endif // USE_ROCM
+#endif // ENABLE_FP8
+
+namespace vllm {
+
+enum class Fp8KVCacheDataType {
+ kAuto = 0,
+ kFp8E4M3 = 1,
+ kFp8E5M2 = 2,
+};
+
+// fp8 vector types for quantization of kv cache
+template <>
+struct Vec {
+ using Type = uint8_t;
+};
+
+template <>
+struct Vec {
+ using Type = uint16_t;
+};
+
+template <>
+struct Vec {
+ using Type = uint32_t;
+};
+
+template <>
+struct Vec {
+ using Type = uint2;
+};
+
+} // namespace vllm
diff --git a/csrc/attention/dtype_fp8_e5m2.cuh b/csrc/attention/dtype_fp8_e5m2.cuh
deleted file mode 100644
index 0580fbb8e8..0000000000
--- a/csrc/attention/dtype_fp8_e5m2.cuh
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include "attention_generic.cuh"
-
-#include
-#ifdef ENABLE_FP8_E5M2
-#include
-#endif
-
-namespace vllm {
-#ifdef ENABLE_FP8_E5M2
-// fp8 vector types for quantization of kv cache
-
-template<>
-struct Vec {
- using Type = uint8_t;
-};
-
-template<>
-struct Vec {
- using Type = uint16_t;
-};
-
-template<>
-struct Vec {
- using Type = uint32_t;
-};
-
-template<>
-struct Vec {
- using Type = uint2;
-};
-#endif // ENABLE_FP8_E5M2
-
-} // namespace vllm
diff --git a/csrc/cache.h b/csrc/cache.h
index 765e231abd..435ae3e57f 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -5,25 +5,24 @@
#include