Merge branch 'main' into test-image-hf-transfer
mgoin authored Nov 7, 2024
2 parents 2f0ed5e + ae62fd1 · commit 67bfc08
Showing 141 changed files with 1,659 additions and 707 deletions.
2 changes: 1 addition & 1 deletion .buildkite/run-cpu-test.sh

@@ -46,7 +46,7 @@
 docker exec cpu-test bash -c "
   export VLLM_CPU_KVCACHE_SPACE=10
   export VLLM_CPU_OMP_THREADS_BIND=48-92
-  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m &
+  python3 -m vllm.entrypoints.openai.api_server --model facebook/opt-125m --dtype half &
   timeout 600 bash -c 'until curl localhost:8000/v1/models; do sleep 1; done' || exit 1
   python3 benchmarks/benchmark_serving.py \
     --backend vllm \

2 changes: 1 addition & 1 deletion .buildkite/test-pipeline.yaml

@@ -120,6 +120,7 @@ steps:
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
   commands:
+  - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
@@ -431,7 +432,6 @@ steps:
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
-  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

 - label: Multi-step Tests (4 GPUs) # 36min
   working_dir: "/vllm-workspace/tests"

4 changes: 3 additions & 1 deletion .github/mergify.yml

@@ -46,7 +46,9 @@ pull_request_rules:
       comment:
         message: |
           This pull request has merge conflicts that must be resolved before it can be
-          merged. @{{author}} please rebase it. https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
+          merged. Please rebase the PR, @{{author}}.
+
+          https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/working-with-forks/syncing-a-fork
   - name: remove 'needs-rebase' label when conflict is resolved
     conditions:

45 changes: 45 additions & 0 deletions .github/workflows/codespell.yml

@@ -0,0 +1,45 @@
+name: codespell
+
+on:
+  # Trigger the workflow on push or pull request,
+  # but only for the main branch
+  push:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+  pull_request:
+    branches:
+      - main
+    paths:
+      - "**/*.py"
+      - "**/*.md"
+      - "**/*.rst"
+      - pyproject.toml
+      - requirements-lint.txt
+      - .github/workflows/codespell.yml
+
+jobs:
+  codespell:
+    runs-on: ubuntu-latest
+    strategy:
+      matrix:
+        python-version: ["3.12"]
+    steps:
+      - uses: actions/checkout@eef61447b9ff4aafe5dcd4e0bbf5d482be7e7871 # v4.2.1
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@f677139bbe7f9c59b41e40162b753c062f5d49a3 # v5.2.0
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install -r requirements-lint.txt
+      - name: Spelling check with codespell
+        run: |
+          codespell --toml pyproject.toml

2 changes: 1 addition & 1 deletion .github/workflows/mypy.yaml

@@ -43,4 +43,4 @@ jobs:
       - name: Mypy
         run: |
           echo "::add-matcher::.github/workflows/matchers/mypy.json"
-          tools/mypy.sh 1
+          tools/mypy.sh 1 ${{ matrix.python-version }}

17 changes: 11 additions & 6 deletions .github/workflows/ruff.yml

@@ -15,12 +15,17 @@ on:
   pull_request:
     branches:
       - main
-    paths:
-      - "**/*.py"
-      - pyproject.toml
-      - requirements-lint.txt
-      - .github/workflows/matchers/ruff.json
-      - .github/workflows/ruff.yml
+    # This workflow is only relevant when one of the following files changes.
+    # However, we have github configured to expect and require this workflow
+    # to run and pass before github will auto-merge a pull request. Until github
+    # allows more flexible auto-merge policy, we can just run this on every PR.
+    # It doesn't take that long to run, anyway.
+    #paths:
+    #  - "**/*.py"
+    #  - pyproject.toml
+    #  - requirements-lint.txt
+    #  - .github/workflows/matchers/ruff.json
+    #  - .github/workflows/ruff.yml

 jobs:
   ruff:

3 changes: 3 additions & 0 deletions Dockerfile

@@ -196,6 +196,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     python3 -m pip install hf_transfer
 ENV HF_HUB_ENABLE_HF_TRANSFER 1

+# Copy in the v1 package for testing (it isn't distributed yet)
+COPY vllm/v1 /usr/local/lib/python3.12/dist-packages/vllm/v1
+
 # doc requires source code
 # we hide them inside `test_docs/` , so that this source code
 # will not be imported by other tests

2 changes: 1 addition & 1 deletion Dockerfile.cpu

@@ -22,7 +22,7 @@ ENV LD_PRELOAD="/usr/lib/x86_64-linux-gnu/libtcmalloc_minimal.so.4:/usr/local/li

 RUN echo 'ulimit -c 0' >> ~/.bashrc

-RUN pip install intel_extension_for_pytorch==2.4.0
+RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace

12 changes: 11 additions & 1 deletion Dockerfile.xpu

@@ -30,9 +30,19 @@ COPY requirements-common.txt /workspace/vllm/requirements-common.txt

 RUN --mount=type=cache,target=/root/.cache/pip \
     pip install --no-cache-dir \
-    --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
     -r requirements-xpu.txt

+RUN git clone https://github.com/intel/pti-gpu && \
+    cd pti-gpu/sdk && \
+    git checkout 6c491f07a777ed872c2654ca9942f1d0dde0a082 && \
+    mkdir build && \
+    cd build && \
+    cmake -DCMAKE_BUILD_TYPE=Release -DCMAKE_TOOLCHAIN_FILE=../cmake/toolchains/icpx_toolchain.cmake -DBUILD_TESTING=OFF .. && \
+    make -j && \
+    cmake --install . --config Release --prefix "/usr/local"
+
+ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
+
 COPY . .
 ARG GIT_REPO_CHECK
 RUN --mount=type=bind,source=.git,target=.git \

57 changes: 52 additions & 5 deletions benchmarks/benchmark_serving.py

@@ -297,17 +297,43 @@ def sample_random_requests(
 async def get_request(
     input_requests: List[Tuple[str, int, int]],
     request_rate: float,
+    burstiness: float = 1.0,
 ) -> AsyncGenerator[Tuple[str, int, int], None]:
+    """
+    Asynchronously generates requests at a specified rate
+    with OPTIONAL burstiness.
+
+    Args:
+        input_requests:
+            A list of input requests, each represented as a tuple.
+        request_rate:
+            The rate at which requests are generated (requests/s).
+        burstiness (optional):
+            The burstiness factor of the request generation.
+            Only takes effect when request_rate is not inf.
+            Default value is 1, which follows a Poisson process.
+            Otherwise, the request intervals follow a gamma distribution.
+            A lower burstiness value (0 < burstiness < 1) results
+            in more bursty requests, while a higher burstiness value
+            (burstiness > 1) results in a more uniform arrival of requests.
+    """
     input_requests = iter(input_requests)

+    # Calculate scale parameter theta to maintain the desired request_rate.
+    assert burstiness > 0, (
+        f"A positive burstiness factor is expected, but given {burstiness}.")
+    theta = 1.0 / (request_rate * burstiness)
+
     for request in input_requests:
         yield request

         if request_rate == float("inf"):
             # If the request rate is infinity, then we don't need to wait.
             continue

-        # Sample the request interval from the exponential distribution.
-        interval = np.random.exponential(1.0 / request_rate)
+        # Sample the request interval from the gamma distribution.
+        # If burstiness is 1, it follows exponential distribution.
+        interval = np.random.gamma(shape=burstiness, scale=theta)
         # The next request will be sent after the interval.
         await asyncio.sleep(interval)
@@ -426,6 +452,7 @@ async def benchmark(
     logprobs: Optional[int],
     best_of: int,
     request_rate: float,
+    burstiness: float,
     disable_tqdm: bool,
     profile: bool,
     selected_percentile_metrics: List[str],
@@ -480,7 +507,13 @@
     if profile_output.success:
         print("Profiler started")

+    if burstiness == 1.0:
+        distribution = "Poisson process"
+    else:
+        distribution = "Gamma distribution"
+
     print(f"Traffic request rate: {request_rate}")
+    print(f"Burstiness factor: {burstiness} ({distribution})")
     print(f"Maximum request concurrency: {max_concurrency}")

     pbar = None if disable_tqdm else tqdm(total=len(input_requests))
@@ -502,7 +535,7 @@ async def limited_request_func(request_func_input, pbar):

     benchmark_start_time = time.perf_counter()
     tasks: List[asyncio.Task] = []
-    async for request in get_request(input_requests, request_rate):
+    async for request in get_request(input_requests, request_rate, burstiness):
         prompt, prompt_len, output_len, mm_content = request
         request_func_input = RequestFuncInput(model=model_id,
                                               prompt=prompt,
@@ -769,6 +802,7 @@ def main(args: argparse.Namespace):
         logprobs=args.logprobs,
         best_of=args.best_of,
         request_rate=args.request_rate,
+        burstiness=args.burstiness,
         disable_tqdm=args.disable_tqdm,
         profile=args.profile,
         selected_percentile_metrics=args.percentile_metrics.split(","),
@@ -807,6 +841,7 @@ def main(args: argparse.Namespace):
         # Traffic
         result_json["request_rate"] = (
             args.request_rate if args.request_rate < float("inf") else "inf")
+        result_json["burstiness"] = args.burstiness
         result_json["max_concurrency"] = args.max_concurrency

         # Merge with benchmark result
@@ -922,8 +957,20 @@ def main(args: argparse.Namespace):
         default=float("inf"),
         help="Number of requests per second. If this is inf, "
         "then all the requests are sent at time 0. "
-        "Otherwise, we use Poisson process to synthesize "
-        "the request arrival times.",
+        "Otherwise, we use Poisson process or gamma distribution "
+        "to synthesize the request arrival times.",
     )
+    parser.add_argument(
+        "--burstiness",
+        type=float,
+        default=1.0,
+        help="Burstiness factor of the request generation. "
+        "Only takes effect when request_rate is not inf. "
+        "Default value is 1, which follows Poisson process. "
+        "Otherwise, the request intervals follow a gamma distribution. "
+        "A lower burstiness value (0 < burstiness < 1) results in more "
+        "bursty requests. A higher burstiness value (burstiness > 1) "
+        "results in a more uniform arrival of requests.",
+    )
     parser.add_argument("--seed", type=int, default=0)
     parser.add_argument(
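
As a usage illustration (a hypothetical invocation; every flag except the new --burstiness is assumed from the benchmark's existing options): python3 benchmarks/benchmark_serving.py --backend vllm --model facebook/opt-125m --request-rate 10 --burstiness 0.5 would send burstier-than-Poisson traffic averaging 10 requests/s.
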
3 changes: 2 additions & 1 deletion cmake/cpu_extension.cmake

@@ -18,6 +18,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 list(APPEND CXX_COMPILE_FLAGS
     "-fopenmp"
+    "-mf16c"
     "-DVLLM_CPU_EXTENSION")

 execute_process(COMMAND cat /proc/cpuinfo
@@ -92,7 +93,7 @@ if (AVX512_FOUND AND NOT AVX512_DISABLED)
   FetchContent_Declare(
     oneDNN
     GIT_REPOSITORY https://github.com/oneapi-src/oneDNN.git
-    GIT_TAG v3.5.3
+    GIT_TAG v3.6
     GIT_PROGRESS TRUE
     GIT_SHALLOW TRUE
   )

10 changes: 10 additions & 0 deletions csrc/cpu/attention.cpp

@@ -22,6 +22,16 @@ struct KernelVecType<float> {
   using v_load_vec_type = vec_op::FP32Vec16;
 };

+template <>
+struct KernelVecType<c10::Half> {
+  using q_load_vec_type = vec_op::FP16Vec8;
+  using q_vec_type = vec_op::FP32Vec16;
+  using k_load_vec_type = vec_op::FP16Vec16;
+  using k_vec_type = vec_op::FP32Vec16;
+  using qk_acc_vec_type = vec_op::FP32Vec16;
+  using v_load_vec_type = vec_op::FP16Vec16;
+};
+
 #ifdef __AVX512BF16__
 template <>
 struct KernelVecType<c10::BFloat16> {