From 4b41095feb40e9a267cd12acabe0b893314a2bfa Mon Sep 17 00:00:00 2001
From: Daniele <d.trifiro@me.com>
Date: Sat, 1 Jun 2024 06:06:45 +0200
Subject: [PATCH 01/93] [CI/Build] CMakeLists: build all extensions' cmake
 targets at the same time (#5034)

---
 setup.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/setup.py b/setup.py
index ab42020b8c45..c35fbf2e15ae 100644
--- a/setup.py
+++ b/setup.py
@@ -188,19 +188,22 @@ def build_extensions(self) -> None:
         if not os.path.exists(self.build_temp):
             os.makedirs(self.build_temp)
 
+        targets = []
         # Build all the extensions
         for ext in self.extensions:
             self.configure(ext)
+            targets.append(remove_prefix(ext.name, "vllm."))
 
-            ext_target_name = remove_prefix(ext.name, "vllm.")
-            num_jobs, _ = self.compute_num_jobs()
+        num_jobs, _ = self.compute_num_jobs()
 
-            build_args = [
-                '--build', '.', '--target', ext_target_name, '-j',
-                str(num_jobs)
-            ]
+        build_args = [
+            "--build",
+            ".",
+            f"-j={num_jobs}",
+            *[f"--target={name}" for name in targets],
+        ]
 
-            subprocess.check_call(['cmake', *build_args], cwd=self.build_temp)
+        subprocess.check_call(["cmake", *build_args], cwd=self.build_temp)
 
 
 def _is_cuda() -> bool:

From 045812f3a0c17d906944493156247013139b1536 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Sat, 1 Jun 2024 02:45:32 -0400
Subject: [PATCH 02/93] [Kernel] Refactor CUTLASS kernels to always take scales
 that reside on the GPU (#5137)

---
 ...ue.hpp => broadcast_load_epilogue_c2x.hpp} |  50 ++-
 .../broadcast_load_epilogue_c3x.hpp           | 389 ++++++++++++++++++
 .../cutlass_w8a8/scaled_mm_dq_c2x.cu          |  14 +-
 .../cutlass_w8a8/scaled_mm_dq_c3x.cu          |  20 +-
 pyproject.toml                                |   2 +-
 tests/kernels/test_cutlass.py                 |  13 +-
 .../compressed_tensors_w8a8_statictensor.py   |  33 +-
 7 files changed, 445 insertions(+), 76 deletions(-)
 rename csrc/quantization/cutlass_w8a8/{cutlass_visitor_2x_broadcast_epilogue.hpp => broadcast_load_epilogue_c2x.hpp} (86%)
 create mode 100644 csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp

diff --git a/csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
similarity index 86%
rename from csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp
rename to csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
index ddbee15e54ab..c4c6b18654ee 100644
--- a/csrc/quantization/cutlass_w8a8/cutlass_visitor_2x_broadcast_epilogue.hpp
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c2x.hpp
@@ -33,20 +33,27 @@
 //
 // This file is a modified excerpt of
 // include/cutlass/epilogue/fusion/visitor_load.hpp from
-// https://github.com/NVIDIA/cutlass It's beem modified to support either
-// row/column or scalar broadcasting, like is already supported in CUTLASS 3.x.
-// Important because this saves us a factor 4x on the number of kernels
-// compiled.
+// https://github.com/NVIDIA/cutlass v3.5.0
+// It has been modified to support either
+// row/column or scalar broadcasting where the tensor being loaded from is
+// always passed in via a device pointer. This lets one compiled kernel handle
+// all cases of per-tensor or per-channel/per-token quantization.
+//
+// This interface also allows the scales to be passed in as tensors that
+// consistently reside on the device, which avoids an issue with a previous
+// implementation where scalars needed to be on the CPU since they
+// were passed in via float values. This created a potential performance hazard
+// if scales were initially on the device, and caused torch.compile graph
+// breaks when moving scales to the CPU.
 //
 #pragma once
 
+// Turn off clang-format for the entire file to keep it close to upstream
 // clang-format off
 
 #include "cutlass/epilogue/threadblock/fusion/visitor_2x.hpp"
 #include "cute/tensor.hpp"
 
-// clang-format on
-
 namespace cutlass::epilogue::threadblock {
 
 using namespace cute;
@@ -59,9 +66,11 @@ template<
 >
 struct VisitorRowOrScalarBroadcast {
 
+  // This struct has been modified: row_broadcast selects whether ptr_row is a
+  // row vector (true) or a single scalar to be broadcast (false).
   struct Arguments {
     Element const* ptr_row = nullptr;
-    Element null_default = Element(0);
+    bool row_broadcast = true;
     StrideMNL dRow = {};
   };
 
@@ -125,25 +134,25 @@ struct VisitorRowOrScalarBroadcast {
       auto coord_v = filter(tC_cRow);
       auto dst_v = filter(tC_rRow);
 
-      if (params_ptr->ptr_row) {
+      if (params_ptr->row_broadcast) {
         // In this case we are loading from a row vector and broadcasting
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < size(src_v); ++i) {
           bool guard = get<1>(coord_v(i)) < n;
-          cutlass::arch::global_load<VecType, sizeof(VecType)>(dst_v(i), (void const*)&src_v(i), guard);
+          cutlass::arch::global_load<VecType, sizeof(VecType)>(
+              dst_v(i), (void const*)&src_v(i), guard);
         }
       } else {
         // In this case we are loading from a scalar and broadcasting
         VecType filled_vec;
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < VecLength; i++) {
-          reinterpret_cast<Element*>(&filled_vec)[i] = params_ptr->null_default;
+          reinterpret_cast<Element*>(&filled_vec)[i] = *(params_ptr->ptr_row);
         }
 
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < size(src_v); ++i) {
-          if(get<1>(coord_v(i)) < n)
-          {
+          if (get<1>(coord_v(i)) < n) {
             dst_v(i) = filled_vec;
           }
         }
@@ -208,9 +217,11 @@ template<
 >
 struct VisitorColOrScalarBroadcast {
 
+  // This struct has been modified: col_broadcast selects whether ptr_col is a
+  // column vector (true) or a single scalar to be broadcast (false).
   struct Arguments {
     Element const* ptr_col = nullptr;
-    Element null_default = Element(0);
+    bool col_broadcast = true;
     StrideMNL dCol = {};
   };
 
@@ -230,11 +241,6 @@ struct VisitorColOrScalarBroadcast {
 
   struct SharedStorage { };
 
-  // Global load type
-  static int constexpr vec_bits = ThreadMap::kElementsPerAccess * sizeof_bits<Element>::value;
-  using VecType = uint_bit_t<cute::min(128, vec_bits)>;
-  static int constexpr VecLength = sizeof(VecType) / sizeof(Element);
-
   CUTLASS_HOST_DEVICE
   VisitorColOrScalarBroadcast() { }
 
@@ -267,7 +273,7 @@ struct VisitorColOrScalarBroadcast {
     int m;
 
     // This function is modified from VisitorColBroadcast
-    CUTLASS_DEVICE void
+    CUTLASS_DEVICE void 
     begin_epilogue() {
       clear(tC_rCol);
 
@@ -277,7 +283,7 @@ struct VisitorColOrScalarBroadcast {
         pred(i) = get<0>(tC_cCol(i)) < m;
       }
 
-      if (params_ptr->ptr_col) {
+      if (params_ptr->col_broadcast) {
         // In this case we are loading from a column vector and broadcasting
         copy_if(pred, tC_gCol, tC_rCol);
       } else {
@@ -286,8 +292,8 @@ struct VisitorColOrScalarBroadcast {
 
         CUTLASS_PRAGMA_UNROLL
         for (int i = 0; i < size(dst_v); ++i) {
-          if(pred(i)){
-             dst_v(i) = params_ptr->null_default;
+          if (pred(i)) {
+            dst_v(i) = *(params_ptr->ptr_col);
           }
         }
       }
diff --git a/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
new file mode 100644
index 000000000000..8f38bbf50790
--- /dev/null
+++ b/csrc/quantization/cutlass_w8a8/broadcast_load_epilogue_c3x.hpp
@@ -0,0 +1,389 @@
+/***************************************************************************************************
+ * Copyright (c) 2023 - 2024 NVIDIA CORPORATION & AFFILIATES. All rights
+ *reserved. SPDX-License-Identifier: BSD-3-Clause
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ *ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ *LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ *CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ *SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ *INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ *CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ *ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ *POSSIBILITY OF SUCH DAMAGE.
+ *
+ **************************************************************************************************/
+
+//
+// This file is a modified excerpt of
+// include/cutlass/epilogue/fusion/sm90_visitor_load_tma_warpspecialized.hpp
+// from https://github.com/NVIDIA/cutlass v3.5.0
+// It has been modified to support either row/column or scalar broadcasting
+// where the tensor being loaded from is always passed in via a device pointer.
+// This lets one compiled kernel handle all cases of per-tensor or
+// per-channel/per-token quantization.
+//
+// This interface also allows the scales to be passed in as tensors that
+// consistently reside on the device, which avoids an issue with a previous
+// implementation where scalars needed to be on the CPU since they
+// were passed in via float values. This created a potential performance hazard
+// if scales were initially on the device, and caused torch.compile graph
+// breaks when moving scales to the CPU.
+//
+#pragma once
+
+// Turn off clang-format for the entire file to keep it close to upstream
+// clang-format off
+
+#include "cutlass/cutlass.h"
+#include "cutlass/arch/barrier.h"
+
+#include "cute/tensor.hpp"
+#include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
+
+namespace cutlass::epilogue::fusion {
+
+using namespace cute;
+using namespace detail;
+
+// Row vector broadcast
+template<
+  // Row bcast reuses the mbarriers from the epilogue subtile load pipeline, so this must be at least
+  // ceil_div(StagesC, epi tiles per CTA tile) + 1 to ensure no data races
+  int Stages,
+  class CtaTileShapeMNK,
+  class Element,
+  class StrideMNL = Stride<_0,_1,_0>,
+  int Alignment = 128 / sizeof_bits_v<Element>
+>
+struct Sm90RowOrScalarBroadcast {
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_0,_1, _0>>) || // row vector broadcast, e.g. per-col alpha/bias
+    (cute::is_same_v<StrideMNL, Stride<_0,_1,int>>));  // batched row vector broadcast
+
+  // Accumulator doesn't distribute row elements evenly amongst threads so we must buffer in smem
+  struct SharedStorage {
+    alignas(16) array_aligned<Element, size<1>(CtaTileShapeMNK{}) * Stages> smem_row;
+  };
+
+  // This struct has been modified: row_broadcast selects whether ptr_row is a
+  // row vector (true) or a single scalar to be broadcast (false), instead of
+  // containing a scalar that is valid if ptr_row is null.
+  struct Arguments {
+    Element const* ptr_row = nullptr;
+    bool row_broadcast = true;
+    StrideMNL dRow = {};
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90RowOrScalarBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90RowOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params),
+        smem_row(const_cast<Element*>(shared_storage.smem_row.data())) { }
+
+  Params params;
+  Element* smem_row;
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return true;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return (!params.row_broadcast && *(params.ptr_row) == Element(0));
+  }
+
+  template <int EpiTiles, class GTensor, class STensor>
+  struct ProducerLoadCallbacks : EmptyProducerLoadCallbacks {
+    CUTLASS_DEVICE
+    ProducerLoadCallbacks(GTensor&& gRow, STensor&& sRow, Params const& params)
+      : gRow(cute::forward<GTensor>(gRow)),
+        sRow(cute::forward<STensor>(sRow)),
+        params(params) {}
+
+    GTensor gRow;                                                                                 // (CTA_M,CTA_N)
+    STensor sRow;                                                                                 // (CTA_M,CTA_N,PIPE)
+    Params const& params;
+
+    CUTLASS_DEVICE void
+    begin(uint64_t* full_mbarrier_ptr, int load_iteration, bool issue_tma_load) {
+      if (!params.row_broadcast || params.ptr_row == nullptr) {
+        return;  // no row vector to stage: scalar mode reads *ptr_row in previsit()
+      }
+
+      if (issue_tma_load) {
+        // Increment the expect-tx count of the first subtile's mbarrier by the row vector's byte-size
+        constexpr uint32_t copy_bytes = size<1>(CtaTileShapeMNK{}) * sizeof_bits_v<Element> / 8;
+        cutlass::arch::ClusterTransactionBarrier::expect_transaction(full_mbarrier_ptr, copy_bytes);
+        // Issue the TMA bulk copy
+        auto bulk_copy = Copy_Atom<SM90_BULK_COPY_AUTO, Element>{}.with(*full_mbarrier_ptr);
+        // Filter so we don't issue redundant copies over stride-0 modes
+        int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
+        copy(bulk_copy, filter(gRow), filter(sRow(_,_,bcast_pipe_index)));
+      }
+    }
+  };
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    auto [m, n, k, l] = args.tile_coord_mnkl;
+    Tensor mRow = make_tensor(make_gmem_ptr(params.ptr_row), make_shape(M,N,L), params.dRow);
+    Tensor gRow = local_tile(mRow, take<0,2>(args.tile_shape_mnk), make_coord(m,n,l));            // (CTA_M,CTA_N)
+    Tensor sRow = make_tensor(make_smem_ptr(smem_row),                                            // (CTA_M,CTA_N,PIPE)
+                    make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
+                    make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
+
+    constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
+    return ProducerLoadCallbacks<EpiTiles, decltype(gRow), decltype(sRow)>(
+      cute::move(gRow), cute::move(sRow), params);
+  }
+
+  template <int EpiTiles, class RTensor, class STensor>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(RTensor&& tCrRow, STensor&& tCsRow, Params const& params)
+      : tCrRow(cute::forward<RTensor>(tCrRow)),
+        tCsRow(cute::forward<STensor>(tCsRow)),
+        params(params) {}
+
+    RTensor tCrRow;                                                               // (CPY,CPY_M,CPY_N)
+    STensor tCsRow;                                                               // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
+    Params const& params;
+
+    CUTLASS_DEVICE void
+    previsit(int epi_m, int epi_n, int load_iteration, bool is_producer_load_needed) {
+      if (!params.row_broadcast) {
+        fill(tCrRow, *(params.ptr_row));
+        return;
+      }
+
+      if (epi_m == 0) { // Assumes M-major subtile loop
+        // Filter so we don't issue redundant copies over stride-0 modes
+        // (only works if 0-strides are in same location, which is by construction)
+        int bcast_pipe_index = (load_iteration / EpiTiles) % Stages;
+        copy_aligned(filter(tCsRow(_,_,_,epi_m,epi_n,bcast_pipe_index)), filter(tCrRow));
+      }
+    }
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> frg_row;
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_row[i] = tCrRow(epi_v * FragmentSize + i);
+      }
+
+      return frg_row;
+    }
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    Tensor sRow = make_tensor(make_smem_ptr(smem_row),                                            // (CTA_M,CTA_N,PIPE)
+                    make_shape(size<0>(CtaTileShapeMNK{}), size<1>(CtaTileShapeMNK{}), Stages),
+                    make_stride(_0{},_1{},size<1>(CtaTileShapeMNK{})));
+    Tensor tCsRow = sm90_partition_for_epilogue<ReferenceSrc>(                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N,PIPE)
+                      sRow, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrRow = make_tensor_like(take<0,3>(tCsRow));                                           // (CPY,CPY_M,CPY_N)
+
+    constexpr int EpiTiles = decltype(size<1>(zipped_divide(make_layout(take<0,2>(args.tile_shape_mnk)), args.epi_tile)))::value;
+    return ConsumerStoreCallbacks<EpiTiles, decltype(tCrRow), decltype(tCsRow)>(
+      cute::move(tCrRow), cute::move(tCsRow), params);
+  }
+};
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+// Column vector broadcast
+template<
+  int Stages,
+  class CtaTileShapeMNK,
+  class Element,
+  class StrideMNL = Stride<_1,_0,_0>,
+  int Alignment = 128 / sizeof_bits_v<Element>
+>
+struct Sm90ColOrScalarBroadcast {
+  static_assert(Stages == 0, "Column broadcast doesn't support smem usage yet");
+  static_assert(Alignment * sizeof_bits_v<Element> % 128 == 0, "sub-16B alignment not supported yet");
+  static_assert(
+    (cute::is_same_v<StrideMNL, Stride<_1,_0, _0>>) || // col vector broadcast, e.g. per-row alpha/bias
+    (cute::is_same_v<StrideMNL, Stride<_1,_0,int>>));  // batched col vector broadcast, e.g. batched per-row bias
+
+  // Accumulator distributes col elements evenly amongst threads so we can just directly load from gmem
+  struct SharedStorage { };
+
+  // This struct has been modified: col_broadcast selects whether ptr_col is a
+  // column vector (true) or a single scalar to be broadcast (false), instead
+  // of containing a scalar that is valid if ptr_col is null.
+  struct Arguments {
+    Element const* ptr_col = nullptr;
+    bool col_broadcast = true;
+    StrideMNL dCol = {};
+  };
+
+  using Params = Arguments;
+
+  template <class ProblemShape>
+  static constexpr Params
+  to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, void* workspace) {
+    return args;
+  }
+
+  template <class ProblemShape>
+  static size_t
+  get_workspace_size(ProblemShape const& problem_shape, Arguments const& args) {
+    return 0;
+  }
+
+  template <class ProblemShape>
+  static cutlass::Status
+  initialize_workspace(ProblemShape const& problem_shape, Arguments const& args, void* workspace, cudaStream_t stream,
+    CudaHostAdapter* cuda_adapter = nullptr) {
+    return cutlass::Status::kSuccess;
+  }
+
+  CUTLASS_DEVICE bool
+  is_producer_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_C_load_needed() const {
+    return false;
+  }
+
+  CUTLASS_DEVICE bool
+  is_zero() const {
+    return (!params.col_broadcast && *(params.ptr_col) == Element(0));
+  }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColOrScalarBroadcast() { }
+
+  CUTLASS_HOST_DEVICE
+  Sm90ColOrScalarBroadcast(Params const& params, SharedStorage const& shared_storage)
+      : params(params) { }
+
+  Params params;
+
+  template <class... Args>
+  CUTLASS_DEVICE auto
+  get_producer_load_callbacks(ProducerLoadArgs<Args...> const& args) {
+    return EmptyProducerLoadCallbacks{};
+  }
+
+  template<class GTensor, class RTensor>
+  struct ConsumerStoreCallbacks : EmptyConsumerStoreCallbacks {
+    CUTLASS_DEVICE
+    ConsumerStoreCallbacks(GTensor&& tCgCol, RTensor&& tCrCol, Params const& params)
+      : tCgCol(cute::forward<GTensor>(tCgCol)),
+        tCrCol(cute::forward<RTensor>(tCrCol)),
+        params(params) {}
+
+    GTensor tCgCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    RTensor tCrCol;                                                                    // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+    Params const& params;
+
+    CUTLASS_DEVICE void
+    begin() {
+      if (!params.col_broadcast) {
+        fill(tCrCol, *(params.ptr_col));
+        return;
+      }
+
+      // Filter so we don't issue redundant copies over stride-0 modes
+      // (only works if 0-strides are in same location, which is by construction)
+      copy_aligned(filter(tCgCol), filter(tCrCol));
+    }
+
+    template <typename ElementAccumulator, int FragmentSize>
+    CUTLASS_DEVICE Array<Element, FragmentSize>
+    visit(Array<ElementAccumulator, FragmentSize> const& frg_acc, int epi_v, int epi_m, int epi_n) {
+      Array<Element, FragmentSize> frg_col;
+      Tensor tCrCol_mn = tCrCol(_,_,_,epi_m,epi_n);
+
+      CUTLASS_PRAGMA_UNROLL
+      for (int i = 0; i < FragmentSize; ++i) {
+        frg_col[i] = tCrCol_mn(epi_v * FragmentSize + i);
+      }
+
+      return frg_col;
+    }
+
+  };
+
+  template <
+    bool ReferenceSrc, // do register tensors reference the src or dst layout of the tiled copy
+    class... Args
+  >
+  CUTLASS_DEVICE auto
+  get_consumer_store_callbacks(ConsumerStoreArgs<Args...> const& args) {
+
+    auto [M, N, K, L] = args.problem_shape_mnkl;
+    Tensor mCol = make_tensor(make_gmem_ptr(params.ptr_col), make_shape(M,N,L), params.dCol);
+    Tensor tCgCol = sm90_partition_for_epilogue<ReferenceSrc>(                         // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+      mCol, args.tile_shape_mnk, args.tile_coord_mnkl, args.epi_tile, args.tiled_copy, args.thread_idx);
+    Tensor tCrCol = make_tensor_like(tCgCol);                                          // (CPY,CPY_M,CPY_N,EPI_M,EPI_N)
+
+    return ConsumerStoreCallbacks<decltype(tCgCol), decltype(tCrCol)>(
+      cute::move(tCgCol), cute::move(tCrCol), params);
+  }
+};
+
+}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
index 3a6b8a226e18..65870df0e8fc 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
@@ -22,7 +22,7 @@
 #include "cutlass/epilogue/threadblock/fusion/visitors.hpp"
 #include "cutlass/gemm/kernel/default_gemm_universal_with_visitor.h"
 
-#include "cutlass_visitor_2x_broadcast_epilogue.hpp"
+#include "broadcast_load_epilogue_c2x.hpp"
 #include "common.hpp"
 // clang-format on
 
@@ -145,17 +145,11 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
   auto a_scales_ptr = a_scales.data_ptr<float>();
   auto b_scales_ptr = b_scales.data_ptr<float>();
 
-  // If A and B are quantized per-tensor, then these scale tensors are scalars,
-  // and they are passed in via the second argument.
   using ScaleAArgs = typename Gemm::ScaleA::Arguments;
-  ScaleAArgs a_args = a_scales.numel() == 1
-                          ? ScaleAArgs{nullptr, a_scales.item<float>(), {}}
-                          : ScaleAArgs{a_scales.data_ptr<float>(), {}, {}};
-
   using ScaleBArgs = typename Gemm::ScaleB::Arguments;
-  ScaleBArgs b_args = b_scales.numel() == 1
-                          ? ScaleBArgs{nullptr, b_scales.item<float>(), {}}
-                          : ScaleBArgs{b_scales.data_ptr<float>(), {}, {}};
+
+  ScaleBArgs b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
+  ScaleAArgs a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
 
   typename Gemm::EVTCompute0::Arguments evt0_compute_args{b_args};
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
index 531414bc4516..2383760abcdb 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
@@ -18,11 +18,14 @@
 #include "cute/atom/mma_atom.hpp"
 #include "cutlass/numeric_types.h"
 
+#include "cutlass/util/device_memory.h"
+
 #include "cutlass/gemm/device/gemm_universal_adapter.h"
 #include "cutlass/gemm/kernel/gemm_universal.hpp"
 #include "cutlass/epilogue/collective/collective_builder.hpp"
 #include "cutlass/gemm/collective/collective_builder.hpp"
 
+#include "broadcast_load_epilogue_c3x.hpp"
 #include "common.hpp"
 // clang-format on
 
@@ -65,7 +68,7 @@ struct cutlass_3x_gemm {
 
   using Accum = cutlass::epilogue::fusion::Sm90AccFetch;
 
-  using ScaleA = cutlass::epilogue::fusion::Sm90ColBroadcast<
+  using ScaleA = cutlass::epilogue::fusion::Sm90ColOrScalarBroadcast<
       0 /*Stages*/, typename EpilogueDescriptor::TileShape, float,
       Stride<Int<1>, Int<0>, Int<0>>>;
 
@@ -73,7 +76,7 @@ struct cutlass_3x_gemm {
       cutlass::epilogue::collective::detail::RowBroadcastDescriptor<
           EpilogueDescriptor, float>;
 
-  using ScaleB = cutlass::epilogue::fusion::Sm90RowBroadcast<
+  using ScaleB = cutlass::epilogue::fusion::Sm90RowOrScalarBroadcast<
       ScaleBDescriptor::Stages, typename EpilogueDescriptor::TileShape,
       typename ScaleBDescriptor::Element, Stride<Int<0>, Int<1>, Int<0>>>;
 
@@ -166,13 +169,9 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
 
   using ScaleA_Args = typename Gemm::ScaleA::Arguments;
   using ScaleB_Args = typename Gemm::ScaleB::Arguments;
-  ScaleA_Args a_args = a_scales.numel() == 1
-                           ? ScaleA_Args{nullptr, a_scales.item<float>(), {}}
-                           : ScaleA_Args{a_scales.data_ptr<float>(), {}, {}};
 
-  ScaleB_Args b_args = b_scales.numel() == 1
-                           ? ScaleB_Args{nullptr, b_scales.item<float>(), {}}
-                           : ScaleB_Args{b_scales.data_ptr<float>(), {}, {}};
+  ScaleA_Args a_args{a_scales.data_ptr<float>(), a_scales.numel() != 1, {}};
+  ScaleB_Args b_args{b_scales.data_ptr<float>(), b_scales.numel() != 1, {}};
 
   args.epilogue.thread = {a_args, {b_args}};
 
@@ -182,10 +181,11 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
   CUTLASS_CHECK(gemm_op.can_implement(args));
 
   size_t workspace_size = gemm_op.get_workspace_size(args);
-  TORCH_CHECK(workspace_size == 0);
+  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
 
   auto stream = at::cuda::getCurrentCUDAStream(a.get_device());
-  cutlass::Status status = gemm_op.run(args, stream);
+
+  cutlass::Status status = gemm_op.run(args, workspace.get(), stream);
   CUTLASS_CHECK(status);
 }
 }  // namespace
diff --git a/pyproject.toml b/pyproject.toml
index 0e9096fb4c03..06f150009aa8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -59,7 +59,7 @@ exclude = [
 ]
 
 [tool.codespell]
-ignore-words-list = "dout, te, indicies"
+ignore-words-list = "dout, te, indicies, subtile"
 skip = "./tests/prompts,./benchmarks/sonnet.txt,./tests/lora/data,./build"
 
 [tool.isort]
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index d5e9c258925c..6ca62e3e0000 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -227,14 +227,21 @@ def forward(self, a):
                                         self.out_dtype)
 
 
-def test_cutlass_cuda_graph():
+@pytest.mark.parametrize("per_act_token", [True, False])
+@pytest.mark.parametrize("per_out_ch", [True, False])
+def test_cutlass_cuda_graph(per_act_token: bool, per_out_ch: bool):
     m, n, k = 512, 512, 512
 
     a = to_int8(torch.randn((m, k), device="cuda"))
     b = to_int8(torch.randn((n, k), device="cuda").t())
 
-    scale_a = (torch.randn((m, 1), device="cuda", dtype=torch.float32) / 10)
-    scale_b = (torch.randn((1, n), device="cuda", dtype=torch.float32) / 10)
+    m_a_scales = m if per_act_token else 1
+    n_b_scales = n if per_out_ch else 1
+
+    scale_a = (torch.randn(
+        (m_a_scales, 1), device="cuda", dtype=torch.float32) / 10)
+    scale_b = (torch.randn(
+        (1, n_b_scales), device="cuda", dtype=torch.float32) / 10)
 
     # Construct a trivial model with a single layer that calls a CUTLASS kernel
     model = CutlassLayer(b, scale_a, scale_b, torch.bfloat16)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 64a88b01cd26..7e3e932cfe14 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -41,46 +41,19 @@ def create_weights(self, layer: torch.nn.Module,
 
         # TODO: remove zero_point parameters once the configs given remove them
 
-        # Note on input/weight scales and zero_points
-        #
-        # When the scales have a single value, it is required that they be
-        # on the CPU for 2 reasons,
-        # 1. Performance:
-        #   When the scales (input_scale/weight_scales) have only a single
-        #   value, we perform a scalar broadcast of that value during the
-        #   quant/dequant operations. The "quant" and the "gemm+dequant"
-        #   kernels accept the Scalar by-value. These tensors are allocated
-        #   on the CPU in order to avoid the GPU-to-CPU copy when passing
-        #   by-value.
-        #
-        # 2. CUDA Graphs:
-        #   CUDA Graphs don't support GPU-to-CPU copy operations during
-        #   stream capture.
-        #
-        # TODO: zero-points are not supported yet. But we expect a similar
-        # pattern.
-
         is_tensor_partitioned = len(output_partition_sizes) != 1
         weight_scale_dim = sum(
             output_partition_sizes) if is_tensor_partitioned else 1
-        weight_scale_device = "cpu" if weight_scale_dim == 1 else "cuda"
 
-        input_scale = Parameter(torch.empty(1,
-                                            device="cpu",
-                                            dtype=torch.float32),
+        input_scale = Parameter(torch.empty(1, dtype=torch.float32),
                                 requires_grad=False)
-        input_zero_point = Parameter(torch.empty(1,
-                                                 device="cpu",
-                                                 dtype=torch.int8),
+        input_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                      requires_grad=False)
 
         weight_scale = Parameter(torch.empty(weight_scale_dim,
-                                             device=weight_scale_device,
                                              dtype=torch.float32),
                                  requires_grad=False)
-        weight_zero_point = Parameter(torch.empty(1,
-                                                  device="cpu",
-                                                  dtype=torch.int8),
+        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
                                       requires_grad=False)
 
         weight = Parameter(torch.empty(sum(output_partition_sizes),

From db09745f6572cd17e9b03a3d0b12348294f53551 Mon Sep 17 00:00:00 2001
From: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Date: Sat, 1 Jun 2024 14:16:07 +0530
Subject: [PATCH 03/93] [Kernel] Update Cutlass fp8 configs (#5144)

Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Robert Shaw <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
---
 .../cutlass_benchmarks/w8a8_benchmarks.py     | 352 ++++++++++++++++++
 .../cutlass_benchmarks/weight_shapes.py       |  37 ++
 .../cutlass_w8a8/scaled_mm_dq_c3x.cu          | 104 +++++-
 tests/kernels/test_cutlass.py                 |   2 +-
 4 files changed, 480 insertions(+), 15 deletions(-)
 create mode 100644 benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
 create mode 100644 benchmarks/cutlass_benchmarks/weight_shapes.py

diff --git a/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
new file mode 100644
index 000000000000..6de56f618700
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/w8a8_benchmarks.py
@@ -0,0 +1,352 @@
+import argparse
+import copy
+import itertools
+import pickle as pkl
+import time
+from typing import Callable, Iterable, List, Tuple
+
+import torch
+import torch.utils.benchmark as TBenchmark
+from torch.utils.benchmark import Measurement as TMeasurement
+from weight_shapes import WEIGHT_SHAPES
+
+from vllm import _custom_ops as ops
+
+DEFAULT_MODELS = list(WEIGHT_SHAPES.keys())[1:]
+DEFAULT_BATCH_SIZES = [1, 16, 32, 64, 128, 256, 512]
+DEFAULT_TP_SIZES = [1]
+
+# helpers
+
+
+def to_fp8(tensor: torch.tensor) -> torch.tensor:
+    """Clamp to the float8_e4m3fn range, round, and cast to that dtype."""
+    limits = torch.finfo(torch.float8_e4m3fn)
+    return tensor.clamp(limits.min, limits.max).round().to(torch.float8_e4m3fn)
+
+
+def to_int8(tensor: torch.tensor) -> torch.tensor:
+    return tensor.clamp(-128, 127).round().to(torch.int8)  # clamp, round, cast
+
+
+def make_rand_tensors(dtype: torch.dtype, m: int, n: int,
+                      k: int) -> Tuple[torch.tensor, torch.tensor]:
+    """Return random CUDA operands: a is (m, k), b is an (n, k) tensor's .t()."""
+    a = torch.randn((m, k), device='cuda') * 5
+    b = torch.randn((n, k), device='cuda').t() * 5
+
+    # Pick the quantizer for the requested dtype; anything else is rejected.
+    quantize = {torch.int8: to_int8, torch.float8_e4m3fn: to_fp8}.get(dtype)
+    if quantize is None:
+        raise ValueError("unsupported dtype")
+
+    return quantize(a), quantize(b)
+
+
+# impl
+
+
+def pytorch_i8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                    scale_b: torch.tensor,
+                    out_dtype: torch.dtype) -> torch.tensor:
+    return torch.mm(a, b)  # scale_a, scale_b and out_dtype are not used
+
+
+def pytorch_fp8_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                     scale_b: torch.tensor,
+                     out_dtype: torch.dtype) -> torch.tensor:
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype)
+
+
+def pytorch_fp8_impl_fast_accum(a: torch.tensor, b: torch.tensor,
+                                scale_a: torch.tensor, scale_b: torch.tensor,
+                                out_dtype: torch.dtype) -> torch.tensor:
+    return torch._scaled_mm(a,
+                            b,
+                            scale_a=scale_a,
+                            scale_b=scale_b,
+                            out_dtype=out_dtype,
+                            use_fast_accum=True)
+
+
+def cutlass_impl(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+                 scale_b: torch.tensor,
+                 out_dtype: torch.dtype) -> torch.tensor:
+    return ops.cutlass_scaled_mm_dq(a,
+                                    b,
+                                    scale_a,
+                                    scale_b,
+                                    out_dtype=out_dtype)
+
+
+# bench
+def bench_fn(a: torch.tensor, b: torch.tensor, scale_a: torch.tensor,
+             scale_b: torch.tensor, out_dtype: torch.dtype, label: str,
+             sub_label: str, fn: Callable, description: str) -> TMeasurement:
+    """Time fn(a, b, scale_a, scale_b, out_dtype) with blocked_autorange."""
+    # Names made visible to the timed statement below.
+    timer_globals = {
+        "a": a,
+        "b": b,
+        "scale_a": scale_a,
+        "scale_b": scale_b,
+        "out_dtype": out_dtype,
+        "fn": fn,
+    }
+    timer = TBenchmark.Timer(
+        stmt="fn(a, b, scale_a, scale_b, out_dtype)",
+        globals=timer_globals,
+        label=label,
+        sub_label=sub_label,
+        description=description,
+    )
+    return timer.blocked_autorange(min_run_time=1)
+
+
+def bench_int8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+               sub_label: str) -> Iterable[TMeasurement]:
+    """Time the bf16 torch.mm baseline and the CUTLASS int8 scaled GEMM."""
+    assert dtype == torch.int8
+    a, b = make_rand_tensors(torch.int8, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    timers = []
+    # pytorch impl
+    timers.append(
+        bench_fn(a.to(dtype=torch.bfloat16, device="cuda"),
+                 b.to(dtype=torch.bfloat16, device="cuda"), scale_a, scale_b,
+                 torch.bfloat16, label, sub_label, pytorch_i8_impl,
+                 "pytorch_bf16_bf16_bf16_matmul-no-scales"))
+
+    # cutlass impl: scales stay on the GPU, where the CUTLASS kernels take them
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_i8_i8_bf16_scaled_mm"))
+
+    return timers
+
+
+def bench_fp8(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+              sub_label: str) -> Iterable[TMeasurement]:
+    """Time torch._scaled_mm variants and the CUTLASS fp8 scaled GEMM."""
+    assert dtype == torch.float8_e4m3fn
+    a, b = make_rand_tensors(torch.float8_e4m3fn, m, n, k)
+    scale_a = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+    scale_b = torch.tensor(1.0, device="cuda", dtype=torch.float32)
+
+    timers = []
+
+    # pytorch impl: bf16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_bf16_scaled_mm"))
+
+    # pytorch impl: bf16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_bf16_scaled_mm_fast_accum"))
+
+    # pytorch impl: fp16 output, without fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl, "pytorch_fp8_fp8_fp16_scaled_mm"))
+
+    # pytorch impl: fp16 output, with fp8 fast accum
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 pytorch_fp8_impl_fast_accum,
+                 "pytorch_fp8_fp8_fp16_scaled_mm_fast_accum"))
+
+    # cutlass impl: bf16 output (scales stay on the GPU for the kernel)
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.bfloat16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_bf16_scaled_mm"))
+
+    # cutlass impl: fp16 output (scales stay on the GPU for the kernel)
+    timers.append(
+        bench_fn(a, b, scale_a, scale_b, torch.float16, label, sub_label,
+                 cutlass_impl, "cutlass_fp8_fp8_fp16_scaled_mm"))
+    return timers
+
+
+def bench(dtype: torch.dtype, m: int, k: int, n: int, label: str,
+          sub_label: str) -> Iterable[TMeasurement]:
+    """Dispatch to the int8 or fp8 benchmark suite for the given dtype."""
+    suites = {torch.int8: bench_int8, torch.float8_e4m3fn: bench_fp8}
+    if dtype not in suites:
+        raise ValueError("unsupported type")
+    return suites[dtype](dtype, m, k, n, label, sub_label)
+
+
+# runner
+def print_timers(timers: Iterable[TMeasurement]):
+    """Print a comparison table of the given measurements."""
+    TBenchmark.Compare(timers).print()
+
+
+def run(dtype: torch.dtype,
+        MKNs: Iterable[Tuple[int, int, int]]) -> Iterable[TMeasurement]:
+    """Benchmark each (m, k, n) shape, printing its comparison table."""
+    collected = []
+    for m, k, n in MKNs:
+        label, sub_label = f"scaled-{dtype}-gemm", f"MKN=({m}x{k}x{n})"
+        timers = bench(dtype, m, k, n, label, sub_label)
+        print_timers(timers)
+        collected.extend(timers)
+
+    return collected
+
+
+# output makers
+def make_output(data: Iterable[TMeasurement],
+                MKNs: Iterable[Tuple[int, int, int]],
+                base_description: str,
+                timestamp=None):
+    """Print every measurement and pickle them; MKNs is accepted but unused."""
+    print(f"== All Results {base_description} ====")
+    print_timers(data)
+    # pickle all the results
+    if timestamp is None:
+        timestamp = int(time.time())
+    with open(f"{base_description}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(data, f)
+
+
+# argparse runners
+
+
+def run_square_bench(args):
+    """Benchmark square GEMMs (M == K == N) from dim_start through dim_end."""
+    dim_sizes = list(
+        range(args.dim_start, args.dim_end + 1, args.dim_increment))
+    MKNs = [(dim, dim, dim) for dim in dim_sizes]
+    data = run(args.dtype, MKNs)
+    make_output(data, MKNs, f"square_bench-{args.dtype}")
+
+
+def run_range_bench(args):
+    """Sweep [dim_start, dim_end); the *-constant flags pin M, K or N."""
+    sweep = list(range(args.dim_start, args.dim_end, args.dim_increment))
+    count = len(sweep)
+    Ms = sweep if args.m_constant is None else [args.m_constant] * count
+    Ks = sweep if args.k_constant is None else [args.k_constant] * count
+    Ns = sweep if args.n_constant is None else [args.n_constant] * count
+    MKNs = list(zip(Ms, Ks, Ns))
+    data = run(args.dtype, MKNs)
+    make_output(data, MKNs, f"range_bench-{args.dtype}")
+
+
+def run_model_bench(args):
+    """Benchmark the GEMM shapes of each selected model at each TP size.
+
+    Reads args.models, args.tp_sizes, args.batch_sizes and args.dtype. Every
+    batch size is paired with every (K, N) weight shape of a model, the
+    per-model tables are printed again at the end, and all measurements are
+    pickled to model_bench-<dtype>-<timestamp>.pkl.
+    """
+    print("Benchmarking models:")
+    for i, model in enumerate(args.models):
+        print(f"[{i}]  {model}")
+
+    def model_shapes(model_name: str, tp_size: int) -> List[Tuple[int, int]]:
+        # Work on a deep copy so the shared WEIGHT_SHAPES table is not
+        # modified when the split dimension is divided by the TP size.
+        shapes = copy.deepcopy(WEIGHT_SHAPES[model_name])
+        for KN, tp_split_dim in shapes:
+            KN[tp_split_dim] //= tp_size
+        return [KN for KN, _ in shapes]
+
+    # One benchmark run per (model, tp_size) combination.
+    model_bench_data = []
+    models_tps = list(itertools.product(args.models, args.tp_sizes))
+    for model, tp_size in models_tps:
+        KNs = model_shapes(model, tp_size)
+        # Cross every batch size (M) with every sharded (K, N) shape.
+        MKNs = [(m, k, n) for m in args.batch_sizes for k, n in KNs]
+        model_bench_data.append(run(args.dtype, MKNs))
+
+    # Print all results
+    for data, (model, tp_size) in zip(model_bench_data, models_tps):
+        print(f"== Results {args.dtype} {model}-TP{tp_size} ====")
+        print_timers(data)
+
+    timestamp = int(time.time())
+
+    # pickle all data
+    all_data = list(itertools.chain.from_iterable(model_bench_data))
+    with open(f"model_bench-{args.dtype}-{timestamp}.pkl", "wb") as f:
+        pkl.dump(all_data, f)
+
+
+if __name__ == '__main__':
+    # CLI entry point; to_torch_dtype converts the --dtype string for argparse.
+    def to_torch_dtype(dt):
+        if dt == "int8":
+            return torch.int8
+        if dt == "fp8":
+            return torch.float8_e4m3fn
+        raise ValueError("unsupported dtype")
+
+    parser = argparse.ArgumentParser(
+        description="""
+Benchmark Cutlass GEMM.
+
+    To run square GEMMs:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 square_bench --dim-start 128 --dim-end 512 --dim-increment 64
+    
+    To run constant N and K and sweep M:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 range_bench --dim-start 128 --dim-end 512 --dim-increment 64 --n-constant 16384 --k-constant 16384
+    
+    To run dimensions from a model:
+        python3 ./benchmarks/cutlass_benchmarks/w8a8_benchmarks.py --dtype fp8 model_bench --models meta-llama/Llama-2-7b-hf --batch-sizes 16 --tp-sizes 1
+    
+    Output:
+        - a .pkl file, that is a list of raw torch.utils.benchmark.Measurement objects for the pytorch and cutlass implementations for the various GEMMs.
+            """,  # noqa: E501
+        formatter_class=argparse.RawTextHelpFormatter)
+
+    parser.add_argument("--dtype",
+                        type=to_torch_dtype,
+                        required=True,
+                        help="Available options are ['int8', 'fp8']")
+    # A sub-command is mandatory: without one, args.func would not be set.
+    subparsers = parser.add_subparsers(dest="cmd", required=True)
+    square_parser = subparsers.add_parser("square_bench")
+    square_parser.add_argument("--dim-start", type=int, required=True)
+    square_parser.add_argument("--dim-end", type=int, required=True)
+    square_parser.add_argument("--dim-increment", type=int, required=True)
+    square_parser.set_defaults(func=run_square_bench)
+
+    range_parser = subparsers.add_parser("range_bench")
+    range_parser.add_argument("--dim-start", type=int, required=True)
+    range_parser.add_argument("--dim-end", type=int, required=True)
+    range_parser.add_argument("--dim-increment", type=int, required=True)
+    range_parser.add_argument("--m-constant", type=int, default=None)
+    range_parser.add_argument("--n-constant", type=int, default=None)
+    range_parser.add_argument("--k-constant", type=int, default=None)
+    range_parser.set_defaults(func=run_range_bench)
+
+    model_parser = subparsers.add_parser("model_bench")
+    model_parser.add_argument("--models",
+                              nargs="+",
+                              type=str,
+                              default=DEFAULT_MODELS,
+                              choices=WEIGHT_SHAPES.keys())
+    model_parser.add_argument("--tp-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_TP_SIZES)
+    model_parser.add_argument("--batch-sizes",
+                              nargs="+",
+                              type=int,
+                              default=DEFAULT_BATCH_SIZES)
+    model_parser.set_defaults(func=run_model_bench)
+
+    args = parser.parse_args()
+    args.func(args)
diff --git a/benchmarks/cutlass_benchmarks/weight_shapes.py b/benchmarks/cutlass_benchmarks/weight_shapes.py
new file mode 100644
index 000000000000..7ad4a53d376b
--- /dev/null
+++ b/benchmarks/cutlass_benchmarks/weight_shapes.py
@@ -0,0 +1,37 @@
+# Weight Shapes are in the format
+# ([K, N], TP_SPLIT_DIM)
+# Example:
+#  A shape of ([14336, 4096], 0) indicates the following GEMM shape,
+#   - TP1 : K = 14336, N = 4096
+#   - TP2 : K = 7168, N = 4096
+#  A shape of ([4096, 6144], 1) indicates the following GEMM shape,
+#   - TP1 : K = 4096, N = 6144
+#   - TP4 : K = 4096, N = 1536
+
+# TP1 shapes
+WEIGHT_SHAPES = {
+    "mistralai/Mistral-7B-v0.1": [
+        ([4096, 6144], 1),
+        ([4096, 4096], 0),
+        ([4096, 28672], 1),
+        ([14336, 4096], 0),
+    ],
+    "meta-llama/Llama-2-7b-hf": [
+        ([4096, 12288], 1),
+        ([4096, 4096], 0),
+        ([4096, 22016], 1),
+        ([11008, 4096], 0),
+    ],
+    "meta-llama/Llama-2-13b-hf": [
+        ([5120, 15360], 1),
+        ([5120, 5120], 0),
+        ([5120, 27648], 1),
+        ([13824, 5120], 0),
+    ],
+    "meta-llama/Llama-2-70b-hf": [
+        ([8192, 10240], 1),
+        ([8192, 8192], 0),
+        ([8192, 57344], 1),
+        ([28672, 8192], 0),
+    ],
+}
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
index 2383760abcdb..4c1aec03a3ca 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
@@ -51,6 +51,11 @@ using namespace cute;
 
 namespace {
 
+uint32_t next_pow_2(uint32_t const num) {  // smallest power of 2 >= num
+  if (num <= 1) return num;  // 0 and 1 are returned unchanged
+  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
+
 template <typename ElementAB_, typename ElementD_, typename TileShape,
           typename ClusterShape, typename KernelSchedule,
           typename EpilogueSchedule>
@@ -188,8 +193,89 @@ void cutlass_scaled_mm_dq_dispatcher(torch::Tensor& out, torch::Tensor const& a,
   cutlass::Status status = gemm_op.run(args, workspace.get(), stream);
   CUTLASS_CHECK(status);
 }
+
+template <typename InType, typename OutType, int32_t M>
+struct sm90_fp8_config {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_128, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
+                      EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType>
+struct sm90_fp8_config<InType, OutType, 128> {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _128, _128>;
+  using ClusterShape = Shape<_2, _1, _1>;
+
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
+                      EpilogueSchedule>;
+};
+
+template <typename InType, typename OutType>
+struct sm90_fp8_config<InType, OutType, 64> {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  using KernelSchedule =
+      cutlass::gemm::KernelTmaWarpSpecializedPingpongFP8FastAccum;
+  using EpilogueSchedule = typename cutlass::epilogue::TmaWarpSpecialized;
+  using TileShape = Shape<_64, _64, _128>;
+  using ClusterShape = Shape<_1, _8, _1>;
+
+  using Cutlass3xGemm =
+      cutlass_3x_gemm<InType, OutType, TileShape, ClusterShape, KernelSchedule,
+                      EpilogueSchedule>;
+};
+
 }  // namespace
 
+template <typename InType, typename OutType>
+void cutlass_scaled_mm_dq_sm90_fp8_dispatch(torch::Tensor& out,
+                                            torch::Tensor const& a,
+                                            torch::Tensor const& b,
+                                            torch::Tensor const& a_scales,
+                                            torch::Tensor const& b_scales) {
+  static_assert(std::is_same<InType, cutlass::float_e4m3_t>());
+  TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
+  TORCH_CHECK(a_scales.dtype() == torch::kFloat32);
+  TORCH_CHECK(b_scales.dtype() == torch::kFloat32);
+
+  using Cutlass3xGemmDefault =
+      typename sm90_fp8_config<InType, OutType, 0>::Cutlass3xGemm;
+  using Cutlass3xGemmM64 =
+      typename sm90_fp8_config<InType, OutType, 64>::Cutlass3xGemm;
+  using Cutlass3xGemmM128 =
+      typename sm90_fp8_config<InType, OutType, 128>::Cutlass3xGemm;
+
+  uint32_t const m = a.size(0);
+  uint32_t const mp2 =
+      std::max(static_cast<uint32_t>(64), next_pow_2(m));  // next power of 2
+
+  if (mp2 <= 64) {
+    // m in [1, 64]
+    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM64>(
+        out, a, b, a_scales, b_scales);
+  } else if (mp2 <= 128) {
+    // m in (64, 128]
+    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmM128>(
+        out, a, b, a_scales, b_scales);
+  } else {
+    // m in (128, inf)
+    return cutlass_scaled_mm_dq_dispatcher<Cutlass3xGemmDefault>(
+        out, a, b, a_scales, b_scales);
+  }
+}
+
 void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
                                torch::Tensor const& b,
                                torch::Tensor const& a_scales,
@@ -223,24 +309,14 @@ void cutlass_scaled_mm_dq_sm90(torch::Tensor& out, torch::Tensor const& a,
     TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
     TORCH_CHECK(b.dtype() == torch::kFloat8_e4m3fn);
 
-    using TileShape = Shape<_128, _128, _128>;
-    using ClusterShape = Shape<_1, _2, _1>;
-    using KernelSchedule =
-        typename cutlass::gemm::KernelCpAsyncWarpSpecializedCooperative;
-    using EpilogueSchedule =
-        typename cutlass::epilogue::TmaWarpSpecializedCooperative;
-
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_3x_gemm<cutlass::float_e4m3_t, cutlass::bfloat16_t, TileShape,
-                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
+      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                                    cutlass::bfloat16_t>(
           out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
-
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_3x_gemm<cutlass::float_e4m3_t, cutlass::half_t, TileShape,
-                          ClusterShape, KernelSchedule, EpilogueSchedule>>(
+      return cutlass_scaled_mm_dq_sm90_fp8_dispatch<cutlass::float_e4m3_t,
+                                                    cutlass::half_t>(
           out, a, b, a_scales, b_scales);
     }
   }
diff --git a/tests/kernels/test_cutlass.py b/tests/kernels/test_cutlass.py
index 6ca62e3e0000..276ecf00246c 100644
--- a/tests/kernels/test_cutlass.py
+++ b/tests/kernels/test_cutlass.py
@@ -82,7 +82,7 @@ def cutlass_int8_gemm_helper(m: int,
     assert torch.allclose(out, baseline, rtol=1e-1, atol=1e0)
 
 
-@pytest.mark.parametrize("m", [512, 222, 33, 1])
+@pytest.mark.parametrize("m", [512, 222, 100, 33, 1])
 @pytest.mark.parametrize("n", [2048, 256, 1024])
 @pytest.mark.parametrize("k", [128, 496, 1024])
 @pytest.mark.parametrize("per_act_token", [True, False])

From 46b6b26711a0a7f38a5793cd4488c6540ca72dd8 Mon Sep 17 00:00:00 2001
From: Ye Cao <952129620@qq.com>
Date: Sun, 2 Jun 2024 01:11:22 +0800
Subject: [PATCH 04/93] [Minor] Fix the path typo in loader.py:
 save_sharded_states.py -> save_sharded_state.py  (#5151)

Signed-off-by: Ye Cao <caoye.cao@alibaba-inc.com>
---
 vllm/model_executor/model_loader/loader.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index fa9866abf7d2..65501450c5f7 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -407,7 +407,7 @@ class ShardedStateLoader(BaseModelLoader):
     Model loader that directly loads each worker's model state dict, which
     enables a fast load path for large tensor-parallel models where each worker
     only needs to read its own shard rather than the entire checkpoint. See
-    `examples/save_sharded_states.py` for creating a sharded checkpoint.
+    `examples/save_sharded_state.py` for creating a sharded checkpoint.
     """
 
     DEFAULT_PATTERN = "model-rank-{rank}-part-{part}.safetensors"

From 5b5c2b9f20c98fc1102acdac1bf1ffad066e4153 Mon Sep 17 00:00:00 2001
From: Nadav Shmayovits <45605409+NadavShmayo@users.noreply.github.com>
Date: Sat, 1 Jun 2024 20:18:50 +0300
Subject: [PATCH 05/93] [Bugfix] Fix call to init_logger in openai server
 (#4765)

---
 vllm/entrypoints/openai/api_server.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 97b35262329e..95417718b51f 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -36,7 +36,7 @@
 openai_serving_completion: OpenAIServingCompletion
 openai_serving_embedding: OpenAIServingEmbedding
 
-logger = init_logger(__name__)
+logger = init_logger('vllm.entrypoints.openai.api_server')
 
 _running_tasks: Set[asyncio.Task] = set()
 

From cb6b7a0248e670c56b87e67f436d6f65ca0d753f Mon Sep 17 00:00:00 2001
From: chenqianfzh <51831990+chenqianfzh@users.noreply.github.com>
Date: Sat, 1 Jun 2024 13:51:10 -0700
Subject: [PATCH 06/93] [Feature][Kernel] Support bitsandbytes quantization and
 QLoRA (#4776)

---
 examples/lora_with_quantization_inference.py  | 140 ++++++++++
 requirements-dev.txt                          |   3 +
 tests/quantization/test_bitsandbytes.py       |  80 ++++++
 vllm/config.py                                |   9 +-
 vllm/engine/arg_utils.py                      |  38 ++-
 vllm/model_executor/layers/linear.py          |  41 ++-
 .../layers/quantization/__init__.py           |   3 +
 .../layers/quantization/bitsandbytes.py       | 175 +++++++++++++
 vllm/model_executor/model_loader/loader.py    | 247 +++++++++++++++++-
 .../model_loader/weight_utils.py              |  16 +-
 vllm/model_executor/models/llama.py           |   8 +
 11 files changed, 752 insertions(+), 8 deletions(-)
 create mode 100644 examples/lora_with_quantization_inference.py
 create mode 100644 tests/quantization/test_bitsandbytes.py
 create mode 100644 vllm/model_executor/layers/quantization/bitsandbytes.py

diff --git a/examples/lora_with_quantization_inference.py b/examples/lora_with_quantization_inference.py
new file mode 100644
index 000000000000..3b2347c1115e
--- /dev/null
+++ b/examples/lora_with_quantization_inference.py
@@ -0,0 +1,140 @@
+"""
+This example shows how to use LoRA with different quantization techniques
+for offline inference.
+
+Requires HuggingFace credentials for access.
+"""
+
+import gc
+from typing import List, Optional, Tuple
+
+import torch
+from huggingface_hub import snapshot_download
+
+from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
+from vllm.lora.request import LoRARequest
+
+
+def create_test_prompts(
+        lora_path: str
+) -> List[Tuple[str, SamplingParams, Optional[LoRARequest]]]:
+    return [
+        # this is an example of using quantization without LoRA
+        ("My name is",
+         SamplingParams(temperature=0.0,
+                        logprobs=1,
+                        prompt_logprobs=1,
+                        max_tokens=128), None),
+        # the next three examples use quantization with LoRA
+        ("my name is",
+         SamplingParams(temperature=0.0,
+                        logprobs=1,
+                        prompt_logprobs=1,
+                        max_tokens=128),
+         LoRARequest("lora-test-1", 1, lora_path)),
+        ("The capital of USA is",
+         SamplingParams(temperature=0.0,
+                        logprobs=1,
+                        prompt_logprobs=1,
+                        max_tokens=128),
+         LoRARequest("lora-test-2", 1, lora_path)),
+        ("The capital of France is",
+         SamplingParams(temperature=0.0,
+                        logprobs=1,
+                        prompt_logprobs=1,
+                        max_tokens=128),
+         LoRARequest("lora-test-3", 1, lora_path)),
+    ]
+
+
+def process_requests(engine: LLMEngine,
+                     test_prompts: List[Tuple[str, SamplingParams,
+                                              Optional[LoRARequest]]]):
+    """Continuously process a list of prompts and handle the outputs."""
+    request_id = 0
+
+    while test_prompts or engine.has_unfinished_requests():
+        if test_prompts:
+            prompt, sampling_params, lora_request = test_prompts.pop(0)
+            engine.add_request(str(request_id),
+                               prompt,
+                               sampling_params,
+                               lora_request=lora_request)
+            request_id += 1
+
+        request_outputs: List[RequestOutput] = engine.step()
+        for request_output in request_outputs:
+            if request_output.finished:
+                print("----------------------------------------------------")
+                print(f"Prompt: {request_output.prompt}")
+                print(f"Output: {request_output.outputs[0].text}")
+
+
+def initialize_engine(model: str, quantization: str,
+                      lora_repo: Optional[str]) -> LLMEngine:
+    """Initialize the LLMEngine."""
+
+    if quantization == "bitsandbytes":
+        # QLoRA (https://arxiv.org/abs/2305.14314) is a quantization technique.
+        # It quantizes the model when loading, with some config info from the
+        # LoRA adapter repo. So need to set the parameter of load_format and
+        # qlora_adapter_name_or_path as below.
+        engine_args = EngineArgs(
+            model=model,
+            quantization=quantization,
+            qlora_adapter_name_or_path=lora_repo,
+            load_format="bitsandbytes",
+            enable_lora=True,
+            max_lora_rank=64,
+            # set it only in GPUs of limited memory
+            enforce_eager=True)
+    else:
+        engine_args = EngineArgs(
+            model=model,
+            quantization=quantization,
+            enable_lora=True,
+            max_loras=4,
+            # set it only in GPUs of limited memory
+            enforce_eager=True)
+    return LLMEngine.from_engine_args(engine_args)
+
+
+def main():
+    """Main function that sets up and runs the prompt processing."""
+
+    test_configs = [{
+        "name": "qlora_inference_example",
+        'model': "huggyllama/llama-7b",
+        'quantization': "bitsandbytes",
+        'lora_repo': 'timdettmers/qlora-flan-7b'
+    }, {
+        "name": "AWQ_inference_with_lora_example",
+        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ',
+        'quantization': "awq",
+        'lora_repo': 'jashing/tinyllama-colorist-lora'
+    }, {
+        "name": "GPTQ_inference_with_lora_example",
+        'model': 'TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ',
+        'quantization': "gptq",
+        'lora_repo': 'jashing/tinyllama-colorist-lora'
+    }]
+
+    for test_config in test_configs:
+        print(
+            f"~~~~~~~~~~~~~~~~ Running: {test_config['name']} ~~~~~~~~~~~~~~~~"
+        )
+        engine = initialize_engine(test_config['model'],
+                                   test_config['quantization'],
+                                   test_config['lora_repo'])
+        lora_path = snapshot_download(repo_id=test_config['lora_repo'])
+        test_prompts = create_test_prompts(lora_path)
+        process_requests(engine, test_prompts)
+
+        # Clean up the GPU memory for the next test
+        del engine
+        gc.collect()
+        torch.cuda.empty_cache()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 4329a4fd0fbe..22cc53fd3a72 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -37,3 +37,6 @@ aiohttp
 
 # Multimodal
 pillow
+
+# quantization
+bitsandbytes==0.42.0
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
new file mode 100644
index 000000000000..4e9feb3c4814
--- /dev/null
+++ b/tests/quantization/test_bitsandbytes.py
@@ -0,0 +1,80 @@
+'''Tests whether bitsandbytes computation is enabled correctly.
+
+Run `pytest tests/quantization/test_bitsandbytes.py`.
+'''
+import pytest
+import torch
+
+from vllm import SamplingParams
+from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
+
+capability = torch.cuda.get_device_capability()
+capability = capability[0] * 10 + capability[1]
+
+
+@pytest.mark.skipif(
+    capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
+    reason='bitsandbytes is not supported on this GPU type.')
+def test_load_bnb_model(vllm_runner) -> None:
+    '''Load a bitsandbytes model; check weight dtypes and greedy outputs.'''
+    llm = vllm_runner('huggyllama/llama-7b',
+                      quantization='bitsandbytes',
+                      load_format='bitsandbytes',
+                      enforce_eager=True)
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+
+    # check the weights in MLP & SelfAttention are quantized to torch.uint8
+    qweight = model.model.layers[0].mlp.gate_up_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].mlp.down_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].self_attn.o_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    qweight = model.model.layers[0].self_attn.qkv_proj.qweight
+    assert qweight.dtype == torch.uint8, (
+        f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
+
+    # some weights should not be quantized
+    weight = model.lm_head.weight
+    assert weight.dtype != torch.uint8, (
+        'lm_head weight dtype should not be torch.uint8')
+
+    weight = model.model.embed_tokens.weight
+    assert weight.dtype != torch.uint8, (
+        'embed_tokens weight dtype should not be torch.uint8')
+
+    weight = model.model.layers[0].input_layernorm.weight
+    assert weight.dtype != torch.uint8, (
+        'input_layernorm weight dtype should not be torch.uint8')
+
+    weight = model.model.layers[0].post_attention_layernorm.weight
+    assert weight.dtype != torch.uint8, (
+        'post_attention_layernorm weight dtype should not be torch.uint8')
+
+    # check the output of the model is expected
+    sampling_params = SamplingParams(temperature=0.0,
+                                     logprobs=1,
+                                     prompt_logprobs=1,
+                                     max_tokens=8)
+
+    prompts = ['That which does not kill us', 'To be or not to be,']
+    expected_outputs = [
+        'That which does not kill us makes us stronger.',
+        'To be or not to be, that is the question.'
+    ]
+    outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+    assert len(outputs) == len(prompts)
+
+    for output, expected in zip(outputs, expected_outputs):
+        # compare the first line of the output
+        actual_output = output[1][0].split('\n', 1)[0]
+        expected_output = expected.split('\n', 1)[0]
+        assert actual_output == expected_output, (
+            f'Expected: {expected_output}, but got: {actual_output}')
diff --git a/vllm/config.py b/vllm/config.py
index ea372eda38d2..61cd66aab80c 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -273,6 +273,12 @@ def verify_with_parallel_config(
                 "must be divisible by pipeline parallel size "
                 f"({pipeline_parallel_size}).")
 
+        if self.quantization == "bitsandbytes" and (
+                parallel_config.tensor_parallel_size > 1
+                or parallel_config.pipeline_parallel_size > 1):
+            raise ValueError(
+                "BitAndBytes quantization with TP or PP is not supported yet.")
+
     def get_hf_config_sliding_window(self) -> Optional[int]:
         """Get the sliding window size, or None if disabled.
         """
@@ -359,7 +365,7 @@ def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
     def get_num_attention_heads(self,
                                 parallel_config: "ParallelConfig") -> int:
         return self.hf_text_config.num_attention_heads // \
-                    parallel_config.tensor_parallel_size
+            parallel_config.tensor_parallel_size
 
     def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
         total_num_hidden_layers = self.hf_text_config.num_hidden_layers
@@ -519,6 +525,7 @@ class LoadFormat(str, enum.Enum):
     DUMMY = "dummy"
     TENSORIZER = "tensorizer"
     SHARDED_STATE = "sharded_state"
+    BITSANDBYTES = "bitsandbytes"
 
 
 @dataclass
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 24d42b791b5d..bceb6e5fb064 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -96,6 +96,8 @@ class EngineArgs:
     ngram_prompt_lookup_max: Optional[int] = None
     ngram_prompt_lookup_min: Optional[int] = None
 
+    qlora_adapter_name_or_path: Optional[str] = None
+
     def __post_init__(self):
         if self.tokenizer is None:
             self.tokenizer = self.model
@@ -163,7 +165,8 @@ def add_cli_args(
             type=str,
             default=EngineArgs.load_format,
             choices=[
-                'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer'
+                'auto', 'pt', 'safetensors', 'npcache', 'dummy', 'tensorizer',
+                'bitsandbytes'
             ],
             help='The format of the model weights to load.\n\n'
             '* "auto" will try to load the weights in the safetensors format '
@@ -177,7 +180,9 @@ def add_cli_args(
             'which is mainly for profiling.\n'
             '* "tensorizer" will load the weights using tensorizer from '
             'CoreWeave. See the Tensorize vLLM Model script in the Examples'
-            'section for more information.\n')
+            'section for more information.\n'
+            '* "bitsandbytes" will load the weights using bitsandbytes '
+            'quantization.\n')
         parser.add_argument(
             '--dtype',
             type=str,
@@ -558,7 +563,10 @@ def add_cli_args(
             "will also be used in `model_name` tag content of "
             "prometheus metrics, if multiple names provided, metrics"
             "tag will take the first one.")
-
+        parser.add_argument('--qlora-adapter-name-or-path',
+                            type=str,
+                            default=None,
+                            help='Name or path of the QLoRA adapter.')
         return parser
 
     @classmethod
@@ -570,6 +578,23 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_engine_config(self, ) -> EngineConfig:
+
+        # bitsandbytes quantization needs a specific model loader
+        # so we make sure the quant method and the load format are consistent
+        if (self.quantization == "bitsandbytes" or
+            self.qlora_adapter_name_or_path is not None) and \
+            self.load_format != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes quantization and QLoRA adapter only support "
+                f"'bitsandbytes' load format, but got {self.load_format}")
+
+        if (self.load_format == "bitsandbytes" or
+            self.qlora_adapter_name_or_path is not None) and \
+            self.quantization != "bitsandbytes":
+            raise ValueError(
+                "BitsAndBytes load format and QLoRA adapter only support "
+                f"'bitsandbytes' quantization, but got {self.quantization}")
+
         device_config = DeviceConfig(self.device)
         model_config = ModelConfig(
             self.model, self.tokenizer, self.tokenizer_mode,
@@ -637,6 +662,13 @@ def create_engine_config(self, ) -> EngineConfig:
             max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras
             and self.max_cpu_loras > 0 else None) if self.enable_lora else None
 
+        if self.qlora_adapter_name_or_path is not None and \
+            self.qlora_adapter_name_or_path != "":
+            if self.model_loader_extra_config is None:
+                self.model_loader_extra_config = {}
+            self.model_loader_extra_config[
+                "qlora_adapter_name_or_path"] = self.qlora_adapter_name_or_path
+
         load_config = LoadConfig(
             load_format=self.load_format,
             download_dir=self.download_dir,
diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py
index 1b18efd5177f..78b306bbe6b3 100644
--- a/vllm/model_executor/layers/linear.py
+++ b/vllm/model_executor/layers/linear.py
@@ -1,5 +1,5 @@
 from abc import abstractmethod
-from typing import List, Optional, Set
+from typing import Dict, List, Optional, Set, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -28,6 +28,21 @@ def adjust_marlin_shard(param, shard_size, shard_offset):
     return shard_size * marlin_tile_size, shard_offset * marlin_tile_size
 
 
+def adjust_bitsandbytes_shard(param: Parameter,
+                              qkv_offsets: Dict[str, Tuple[int, int]],
+                              loaded_shard_id: str) -> Tuple[int, int]:
+    """Rescale a shard's (offset, size) from the unquantized fused layout to
+    rows of the packed parameter. Returns (size, offset), in that order."""
+    unpacked_rows, _ = qkv_offsets["total"]
+    shard_offset, shard_size = qkv_offsets[loaded_shard_id]
+    packed_rows = param.data.shape[0]
+
+    def _rescale(value: int) -> int:
+        # Multiply first so the floor division is applied only once.
+        return value * packed_rows // unpacked_rows
+    return _rescale(shard_size), _rescale(shard_offset)
+
+
 class LinearMethodBase(QuantizeMethodBase):
     """Base class for different (maybe quantized) linear methods."""
 
@@ -39,7 +54,7 @@ def create_weights(self, layer: torch.nn.Module,
                        **extra_weight_attrs):
         """Create weights for a linear layer. 
            The weights will be set as attributes of the layer.
-        
+
         Args:
             layer: The layer that is using the LinearMethodBase factory.
             input_size_per_partition: Size of the weight input dim on rank X.
@@ -424,6 +439,12 @@ def weight_loader(self,
                 shard_size, shard_offset = adjust_marlin_shard(
                     param, shard_size, shard_offset)
 
+            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
+            if use_bitsandbytes:
+                shard_size = loaded_weight.shape[output_dim]
+                shard_offset = loaded_weight.shape[output_dim] * \
+                    loaded_shard_id
+
             param_data = param_data.narrow(output_dim, shard_offset,
                                            shard_size)
             start_idx = tp_rank * shard_size
@@ -636,6 +657,22 @@ def weight_loader(self,
                 shard_size, shard_offset = adjust_marlin_shard(
                     param, shard_size, shard_offset)
 
+            use_bitsandbytes = getattr(param, "use_bitsandbytes", False)
+            if use_bitsandbytes:
+                orig_qkv_offsets = {
+                    "q": (0, self.num_heads * self.head_size),
+                    "k": (self.num_heads * self.head_size,
+                          self.num_kv_heads * self.head_size),
+                    "v":
+                    ((self.num_heads + self.num_kv_heads) * self.head_size,
+                     self.num_kv_heads * self.head_size),
+                    "total":
+                    ((self.num_heads + 2 * self.num_kv_heads) * self.head_size,
+                     0)
+                }
+                shard_size, shard_offset = adjust_bitsandbytes_shard(
+                    param, orig_qkv_offsets, loaded_shard_id)
+
             param_data = param_data.narrow(output_dim, shard_offset,
                                            shard_size)
             if loaded_shard_id == "q":
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 7b9abe1b629a..0bc42beb6625 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -4,6 +4,8 @@
 from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
+from vllm.model_executor.layers.quantization.bitsandbytes import (
+    BitsAndBytesConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsConfig)
 from vllm.model_executor.layers.quantization.deepspeedfp import (
@@ -30,6 +32,7 @@
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
     "sparseml": CompressedTensorsConfig,
+    "bitsandbytes": BitsAndBytesConfig,
 }
 
 
diff --git a/vllm/model_executor/layers/quantization/bitsandbytes.py b/vllm/model_executor/layers/quantization/bitsandbytes.py
new file mode 100644
index 000000000000..969958d9b544
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/bitsandbytes.py
@@ -0,0 +1,175 @@
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.nn.parameter import Parameter
+
+from vllm.model_executor.layers.linear import (LinearBase, LinearMethodBase,
+                                               set_weight_attrs)
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+
+
+class BitsAndBytesConfig(QuantizationConfig):
+    """Config class for BitsAndBytes Quantization.
+
+    Holds the QLoRA adapter location (an empty string when no adapter is
+    used) and the names of the modules that are quantized.
+
+    Reference: https://arxiv.org/abs/2305.14314
+    """
+
+    def __init__(self, adapter_name_or_path: str,
+                 target_modules: List[str]) -> None:
+        self.adapter_name_or_path = adapter_name_or_path
+        self.target_modules = target_modules
+
+    def __repr__(self) -> str:
+        return ("BitsAndBytesConfig("
+                f"adapter_name_or_path={self.adapter_name_or_path})")
+
+    @classmethod
+    def get_name(cls) -> str:
+        return "bitsandbytes"
+
+    @classmethod
+    def get_supported_act_dtypes(cls) -> List[torch.dtype]:
+        return [torch.float32, torch.float16, torch.bfloat16]
+
+    @classmethod
+    def get_min_capability(cls) -> int:
+        return 70
+
+    @staticmethod
+    def get_config_filenames() -> List[str]:
+        return ["adapter_config.json"]
+
+    @classmethod
+    def from_config(cls, config: Dict[str, Any]) -> "BitsAndBytesConfig":
+        """Build the config from an adapter config dict.
+
+        "adapter_name_or_path" is required. An empty value selects the
+        default projection modules; otherwise "target_modules" is read.
+        """
+        adapter_name = cls.get_from_keys(config, ["adapter_name_or_path"])
+        if adapter_name == "":
+            target_modules = [
+                "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj",
+                "v_proj", "o_proj"
+            ]
+        else:
+            target_modules = cls.get_from_keys(config, ["target_modules"])
+        return cls(adapter_name, target_modules)
+
+    def get_quant_method(
+            self,
+            layer: torch.nn.Module) -> Optional["BitsAndBytesLinearMethod"]:
+        if isinstance(layer, LinearBase):
+            return BitsAndBytesLinearMethod(self)
+        return None
+
+    def get_scaled_act_names(self) -> List[str]:
+        return ["gelu", "gelu_fast", "gelu_new", "gelu_pytorch_tanh"]
+
+
+class BitsAndBytesLinearMethod(LinearMethodBase):
+    """Linear method for BitsAndBytes.
+
+    Args:
+       quant_config: The BitsAndBytes quantization config.
+    """
+
+    def __init__(self, quant_config: BitsAndBytesConfig):
+        import re
+        try:
+            import bitsandbytes
+            # Compare numerically; as strings "0.100.0" sorts below "0.42.0".
+            digits = re.findall(r"\d+", bitsandbytes.__version__)
+            if tuple(map(int, digits[:2])) < (0, 42):
+                raise ImportError("bitsandbytes version is wrong. Please "
+                                  "install bitsandbytes>=0.42.0.")
+        except ImportError as err:
+            raise ImportError("Please install bitsandbytes>=0.42.0 via "
+                              "`pip install bitsandbytes>=0.42.0` to use "
+                              "bitsandbytes quantizer.") from err
+
+        self.quant_config = quant_config
+
+    def create_weights(self, layer: torch.nn.Module,
+                       input_size_per_partition: int,
+                       output_partition_sizes: List[int], input_size: int,
+                       output_size: int, params_dtype: torch.dtype,
+                       **extra_weight_attrs):
+        """Register the packed ``qweight`` parameter on ``layer``.
+
+        ``qweight`` is a uint8 tensor of shape [numel // quant_ratio, 1]:
+        numel is the element count of the unquantized weight partition and
+        quant_ratio the size in bytes of one ``params_dtype`` element.
+
+        Raises:
+            ValueError: if numel is not divisible by quant_ratio.
+        """
+        bits_info = (torch.finfo
+                     if params_dtype.is_floating_point else torch.iinfo)
+        quant_ratio = (bits_info(params_dtype).bits //
+                       torch.iinfo(torch.uint8).bits)
+        numel = input_size_per_partition * sum(output_partition_sizes)
+
+        if numel % quant_ratio != 0:
+            raise ValueError(
+                "The input size is not aligned with the quantized "
+                "weight shape. ")
+        qweight = Parameter(
+            torch.empty(numel // quant_ratio, 1, dtype=torch.uint8),
+            requires_grad=False,
+        )
+
+        set_weight_attrs(
+            qweight,
+            {
+                "input_dim": 0,
+                # In bitsandbytes, a tensor of shape [n,m] is quantized to
+                # [n*m/pack_ratio, 1], so the output_dim is 0
+                "output_dim": 0,
+                "pack_factor": quant_ratio,
+                "use_bitsandbytes": True,
+            })
+        layer.register_parameter("qweight", qweight)
+        set_weight_attrs(qweight, extra_weight_attrs)
+
+    def apply(self,
+              layer: torch.nn.Module,
+              x: torch.Tensor,
+              bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+        """Apply the quantized layer: ``x`` is cast to bfloat16, multiplied
+        by each shard with ``matmul_4bit``, and the results are written side
+        by side along dimension 1. The output is cast back to ``x.dtype``
+        before ``bias`` (if any) is added."""
+        # only load the bitsandbytes module when needed
+        from bitsandbytes import matmul_4bit
+        original_type = x.dtype
+        bf_x = x.to(torch.bfloat16)
+
+        qweight = layer.qweight
+        # Both attributes are attached to qweight by the model loader.
+        quant_states = qweight.bnb_quant_state
+        offsets = qweight.bnb_shard_offsets
+
+        out_dim_0 = x.shape[0]
+        out_dim_1 = sum(state.shape[0] for state in quant_states.values())
+        out = torch.empty(out_dim_0, out_dim_1,
+                          dtype=torch.bfloat16, device=x.device)
+
+        current_index = 0
+        for i in range(len(quant_states)):
+            output_size = quant_states[i].shape[0]
+            # It is more efficient to use out kwarg like
+            # matmul_4bit(..., out = ...).  Infeasible now due to the bug
+            # https://github.com/TimDettmers/bitsandbytes/issues/1235.
+            # Need to change  after the bug is fixed.
+            out[:, current_index:current_index + output_size] = matmul_4bit(
+                bf_x, qweight[offsets[i]:offsets[i + 1]].t(), quant_states[i])
+            current_index += output_size
+
+        out = out.to(original_type)
+        if bias is not None:
+            out += bias
+        return out
diff --git a/vllm/model_executor/model_loader/loader.py b/vllm/model_executor/model_loader/loader.py
index 65501450c5f7..491a8ccc4f7e 100644
--- a/vllm/model_executor/model_loader/loader.py
+++ b/vllm/model_executor/model_loader/loader.py
@@ -1,13 +1,18 @@
 # ruff: noqa: SIM117
 import collections
 import copy
+import fnmatch
 import glob
+import json
+import math
 import os
 from abc import ABC, abstractmethod
 from typing import Any, Dict, Generator, List, Optional, Tuple, Type
 
 import huggingface_hub
+import numpy as np
 import torch
+from huggingface_hub import HfApi, hf_hub_download
 from torch import nn
 
 from vllm.config import (CacheConfig, DeviceConfig, LoadConfig, LoadFormat,
@@ -30,6 +35,7 @@
     np_cache_weights_iterator, pt_weights_iterator,
     safetensors_weights_iterator)
 from vllm.model_executor.models.vlm_base import VisionLanguageModelBase
+from vllm.model_executor.utils import set_weight_attrs
 
 logger = init_logger(__name__)
 
@@ -146,7 +152,7 @@ def __init__(self, load_config: LoadConfig):
     def _maybe_download_from_modelscope(
             self, model: str, revision: Optional[str]) -> Optional[str]:
         """Download model from ModelScope hub if VLLM_USE_MODELSCOPE is True.
-        
+
         Returns the path to the downloaded model, or None if the model is not
         downloaded from ModelScope."""
         if VLLM_USE_MODELSCOPE:
@@ -268,6 +274,7 @@ def load_model(self, *, model_config: ModelConfig,
                                                model,
                                                "fall_back_to_pt_during_load",
                                                True)), )
+
             for _, module in model.named_modules():
                 quant_method = getattr(module, "quant_method", None)
                 if quant_method is not None:
@@ -560,6 +567,241 @@ def save_model(
             )
 
 
+class BitsAndBytesModelLoader(BaseModelLoader):
+    """Model loader to load model weights with BitsAndBytes quantization."""
+
+    default_target_modules = [
+        "gate_proj", "down_proj", "up_proj", "q_proj", "k_proj", "v_proj",
+        "o_proj"
+    ]
+
+    possible_config_file_names = ["adapter_config.json"]
+
+    def __init__(self, load_config: LoadConfig):
+        """Pick the modules to quantize.
+
+        Only the target modules are quantized, not the whole model: they
+        come from the QLoRA adapter's config file when the extra loader
+        config names an adapter, and from the class defaults otherwise.
+        """
+        super().__init__(load_config)
+
+        extra_config = load_config.model_loader_extra_config
+        adapter_key = "qlora_adapter_name_or_path"
+        if extra_config and adapter_key in extra_config:
+            adapter_config_path = self._get_config_file(
+                extra_config[adapter_key])
+            with open(adapter_config_path, "r") as config_file:
+                adapter_config = json.load(config_file)
+                self.target_modules = adapter_config["target_modules"]
+        else:
+            # No adapter configured.
+            self.target_modules = self.default_target_modules
+
+    def _get_config_file(self, qlora_adapter: str) -> str:
+        """Return the path of the adapter config file for ``qlora_adapter``,
+        a local directory or a Hugging Face repo id (downloaded on demand).
+
+        Raises:
+            ValueError: if none of ``possible_config_file_names`` exists.
+        """
+        if os.path.isdir(qlora_adapter):
+            # Only an existing path is returned; a miss falls through.
+            for file in self.possible_config_file_names:
+                candidate = os.path.join(qlora_adapter, file)
+                if os.path.exists(candidate):
+                    return candidate
+        else:
+            repo_files = HfApi().list_repo_files(repo_id=qlora_adapter)
+            for file in self.possible_config_file_names:
+                if file in repo_files:
+                    return hf_hub_download(repo_id=qlora_adapter,
+                                           filename=file)
+
+        raise ValueError(
+            f"Cannot find adapter config file in {qlora_adapter}")
+
+    def _get_weight_files(
+            self,
+            model_name_or_path: str,
+            allowed_patterns: List[str],
+            revision: Optional[str] = None) -> Tuple[List[str], str]:
+        """Retrieve weight files. Download the files if necessary.
+
+        Return the files and the pattern of the first allowed pattern that
+        matches; raise RuntimeError when none does.
+        """
+        if os.path.isdir(model_name_or_path):
+            for pattern in allowed_patterns:
+                weight_files = glob.glob(
+                    os.path.join(model_name_or_path, pattern))
+                if weight_files:
+                    return weight_files, pattern
+        else:
+            # List the same revision that is downloaded below.
+            repo_files = HfApi().list_repo_files(
+                repo_id=model_name_or_path, revision=revision)
+            for pattern in allowed_patterns:
+                if fnmatch.filter(repo_files, pattern):
+                    hf_folder = download_weights_from_hf(
+                        model_name_or_path, self.load_config.download_dir,
+                        [pattern], revision)
+                    return glob.glob(os.path.join(hf_folder, pattern)), pattern
+
+        raise RuntimeError(
+            f"No model weights found in: `{model_name_or_path}`")
+
+    def _prepare_weights(self, model_name_or_path: str,
+                         revision: Optional[str]) -> Tuple[List[str], bool]:
+        """Resolve the weight files to load.
+
+        Returns the file list and whether the files are safetensors. Files
+        of other formats are passed through
+        ``filter_files_not_needed_for_inference`` first.
+        """
+        st_glob = "*.safetensors"
+        weight_files, pattern = self._get_weight_files(
+            model_name_or_path, [st_glob, "*.bin", "*.pt"], revision)
+        is_safetensors = pattern == st_glob
+        if not is_safetensors:
+            weight_files = filter_files_not_needed_for_inference(weight_files)
+        if not weight_files:
+            raise RuntimeError(
+                f"Cannot find any model weights with `{model_name_or_path}`")
+        return weight_files, is_safetensors
+
+    def _get_quantized_weights_iterator(
+        self, model_name_or_path: str, revision: Optional[str]
+    ) -> Tuple[Generator[Tuple[str, torch.Tensor], None, None], Dict[str,
+                                                                     Any]]:
+        """Get an iterator to the model weights with bitsandbytes quantization,
+        as well as the quantization state dictionary, which is only filled
+        while the iterator is consumed."""
+        import re
+
+        # only load the bitsandbytes module when needed
+        try:
+            import bitsandbytes
+            # Compare numerically; as strings "0.100.0" sorts below "0.42.0".
+            digits = re.findall(r"\d+", bitsandbytes.__version__)
+            if tuple(map(int, digits[:2])) < (0, 42):
+                raise ImportError("bitsandbytes version is wrong. Please "
+                                  "install bitsandbytes>=0.42.0.")
+            from bitsandbytes.functional import quantize_4bit
+        except ImportError as err:
+            raise ImportError("Please install bitsandbytes>=0.42.0 via "
+                              "`pip install bitsandbytes>=0.42.0` to use "
+                              "bitsandbytes quantizer.") from err
+
+        hf_weights_files, use_safetensors = self._prepare_weights(
+            model_name_or_path, revision)
+        quant_state_dict = {}
+        if use_safetensors:
+            weight_iterator = safetensors_weights_iterator(hf_weights_files)
+        else:
+            weight_iterator = pt_weights_iterator(hf_weights_files)
+
+        def generator():
+            for weight_name, weight_tensor in weight_iterator:
+                if any(target_module in weight_name
+                       for target_module in self.target_modules):
+                    weight_name = weight_name.replace(".weight", ".qweight")
+                    #  bitsandbytes requires data in GPU
+                    loaded_weight = weight_tensor.cuda().data
+                    with set_default_torch_dtype(torch.float32):
+                        processed_weight, quant_state = quantize_4bit(
+                            loaded_weight, compress_statistics=True,
+                            quant_type="nf4")
+                    quant_state_dict[weight_name] = quant_state
+                else:
+                    processed_weight = weight_tensor
+                yield weight_name, processed_weight
+
+        return generator(), quant_state_dict
+
+    def _load_weights(self, model_config: ModelConfig,
+                      model: nn.Module) -> None:
+        """Load quantized weights into ``model`` and attach to each quantized
+        parameter its per-shard quant states (``bnb_quant_state``) and
+        offsets into the packed parameter (``bnb_shard_offsets``).
+
+        Raises:
+            AttributeError: if ``model`` lacks a required attribute.
+            ValueError: if a parameter or its ``pack_factor`` is missing.
+        """
+        if not hasattr(model, 'load_weights'):
+            raise AttributeError(
+                "The required method 'load_weights' is not defined in class"
+                f" {type(model).__name__}.")
+
+        if not hasattr(model, 'bitsandbytes_stacked_params_mapping'):
+            raise AttributeError(
+                f"Model {type(model).__name__} does not support BitsAndBytes "
+                "quantization yet.")
+
+        logger.info("Loading weights with BitsAndBytes quantization. "
+                    " May take a while ...")
+
+        qweight_iterator, quant_state_dict = (
+            self._get_quantized_weights_iterator(model_config.model,
+                                                 model_config.revision))
+
+        model.load_weights(qweight_iterator)
+
+        param_dict = dict(model.named_parameters())
+        stacked_quant_state_dict: Dict[str, Dict[int, Any]] = {}
+        for weight_name, quant_state in quant_state_dict.items():
+            # Map a shard (e.g. q_proj) onto its fused parameter and index;
+            # weights without a mapping keep their name and use index 0.
+            param_name, shard_index = weight_name, 0
+            for shard_name, (fused_name, index) in (
+                    model.bitsandbytes_stacked_params_mapping.items()):
+                if shard_name in weight_name:
+                    shard_index = index
+                    param_name = weight_name.replace(shard_name, fused_name)
+                    break
+
+            if param_name not in param_dict:
+                raise ValueError(
+                    f"Parameter {param_name} not found in the model.")
+            stacked_quant_state_dict.setdefault(
+                param_name, {})[shard_index] = quant_state
+
+        # save quant_states and offsets as the attributes of the parameters
+        for param_name, quant_states in stacked_quant_state_dict.items():
+            param = param_dict[param_name]
+            set_weight_attrs(param, {"bnb_quant_state": quant_states})
+
+            pack_ratio = getattr(param, "pack_factor", -1)
+            if pack_ratio == -1:
+                raise ValueError(
+                    f"pack_factor not set for parameter {param_name}.")
+            # Offsets follow shard-index order, not checkpoint arrival order.
+            num_elements = [
+                math.prod(quant_states[index].shape) // pack_ratio
+                for index in sorted(quant_states)
+            ]
+            offsets = np.concatenate(([0], np.cumsum(num_elements)))
+            set_weight_attrs(param, {"bnb_shard_offsets": offsets})
+
+    def load_model(self, *, model_config: ModelConfig,
+                   device_config: DeviceConfig,
+                   lora_config: Optional[LoRAConfig],
+                   vision_language_config: Optional[VisionLanguageConfig],
+                   parallel_config: ParallelConfig,
+                   scheduler_config: SchedulerConfig,
+                   cache_config: CacheConfig) -> nn.Module:
+        """Initialize the model under the configured dtype and device
+        contexts, load its quantized weights, and return it in eval mode."""
+        with set_default_torch_dtype(model_config.dtype):
+            with torch.device(device_config.device):
+                model = _initialize_model(model_config, self.load_config,
+                                          lora_config, vision_language_config,
+                                          cache_config)
+                self._load_weights(model_config, model)
+        return model.eval()
+
+
 def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     """Get a model loader based on the load format."""
 
@@ -575,4 +817,7 @@ def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
     if load_config.load_format == LoadFormat.SHARDED_STATE:
         return ShardedStateLoader(load_config)
 
+    if load_config.load_format == LoadFormat.BITSANDBYTES:
+        return BitsAndBytesModelLoader(load_config)
+
     return DefaultModelLoader(load_config)
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index a251828b45eb..e76d5798fd2e 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -145,7 +145,17 @@ def get_quant_config(model_config: ModelConfig,
 
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
-    model_name_or_path = model_config.model
+    # In case of bitsandbytes/QLoRA, get quant config from the adapter model.
+    if model_config.quantization == "bitsandbytes":
+        if (not load_config.model_loader_extra_config
+                or "qlora_adapter_name_or_path"
+                not in load_config.model_loader_extra_config):
+            return quant_cls.from_config({"adapter_name_or_path": ""})
+        model_name_or_path = load_config.model_loader_extra_config[
+            "qlora_adapter_name_or_path"]
+
+    else:
+        model_name_or_path = model_config.model
     is_local = os.path.isdir(model_name_or_path)
     if not is_local:
         # Download the config files.
@@ -184,6 +194,10 @@ def get_quant_config(model_config: ModelConfig,
     quant_config_file = quant_config_files[0]
     with open(quant_config_file, "r") as f:
         config = json.load(f)
+
+        if model_config.quantization == "bitsandbytes":
+            config["adapter_name_or_path"] = model_name_or_path
+
     return quant_cls.from_config(config)
 
 
diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py
index 2ca55f9270fc..d83ee9a201c0 100644
--- a/vllm/model_executor/models/llama.py
+++ b/vllm/model_executor/models/llama.py
@@ -319,6 +319,14 @@ class LlamaForCausalLM(nn.Module):
         "lm_head": "output_embeddings",
     }
     embedding_padding_modules = ["lm_head"]
+    bitsandbytes_stacked_params_mapping = {
+        # shard_name, weight_name, index
+        "q_proj": ("qkv_proj", 0),
+        "k_proj": ("qkv_proj", 1),
+        "v_proj": ("qkv_proj", 2),
+        "gate_proj": ("gate_up_proj", 0),
+        "up_proj": ("gate_up_proj", 1),
+    }
 
     def __init__(
         self,

From 9c2a759e29ce039864bdbb94e2eedc8c9be6f0c3 Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Sat, 1 Jun 2024 15:40:25 -0700
Subject: [PATCH 07/93] [Bugfix] Remove deprecated @abstractproperty (#5174)

---
 vllm/core/evictor_v1.py     | 5 +++--
 vllm/core/evictor_v2.py     | 5 +++--
 vllm/lora/worker_manager.py | 5 +++--
 3 files changed, 9 insertions(+), 6 deletions(-)

diff --git a/vllm/core/evictor_v1.py b/vllm/core/evictor_v1.py
index aa51dd693887..5db5a08a5bb6 100644
--- a/vllm/core/evictor_v1.py
+++ b/vllm/core/evictor_v1.py
@@ -1,5 +1,5 @@
 import enum
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
 from typing import OrderedDict
 
 from vllm.block import PhysicalTokenBlock
@@ -44,7 +44,8 @@ def remove(self, block_hash: int) -> PhysicalTokenBlock:
         """
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def num_blocks(self) -> int:
         pass
 
diff --git a/vllm/core/evictor_v2.py b/vllm/core/evictor_v2.py
index 57759b29347f..3dd12e2e2513 100644
--- a/vllm/core/evictor_v2.py
+++ b/vllm/core/evictor_v2.py
@@ -1,5 +1,5 @@
 import enum
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
 from typing import OrderedDict, Tuple
 
 
@@ -46,7 +46,8 @@ def remove(self, block_id: int):
         """Remove a given block id from the cache."""
         pass
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def num_blocks(self) -> int:
         pass
 
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index d67ce67172e3..4657757bd484 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -1,4 +1,4 @@
-from abc import ABC, abstractmethod, abstractproperty
+from abc import ABC, abstractmethod
 from contextlib import contextmanager
 from typing import Any, Dict, List, Literal, Optional, Set, Type, Union
 
@@ -42,7 +42,8 @@ def dummy_lora_cache(self):
         yield
         self._cached_dummy_lora = False
 
-    @abstractproperty
+    @property
+    @abstractmethod
     def is_enabled(self) -> bool:
         ...
 

From fd82eff32576278a87bf68663493671067529b51 Mon Sep 17 00:00:00 2001
From: Daniil Arapov <59310708+Delviet@users.noreply.github.com>
Date: Sun, 2 Jun 2024 01:53:52 +0300
Subject: [PATCH 08/93] [Bugfix]: Fix issues related to prefix caching example
 (#5177) (#5180)

---
 examples/offline_inference_with_prefix.py | 47 ++++++++++++++++++-----
 1 file changed, 37 insertions(+), 10 deletions(-)

diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py
index 7ed0563f14e0..166e98549b53 100644
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -1,5 +1,8 @@
+from time import time
+
 from vllm import LLM, SamplingParams
 
+# Common prefix.
 prefix = (
     "You are an expert school principal, skilled in effectively managing "
     "faculty and staff. Draft 10-15 questions for a potential first grade "
@@ -18,36 +21,60 @@
     "The capital of France is",
     "The future of AI is",
 ]
+
+generating_prompts = [prefix + prompt for prompt in prompts]
+
 # Create a sampling params object.
 sampling_params = SamplingParams(temperature=0.0)
 
 # Create an LLM.
-llm = LLM(model="facebook/opt-125m", enable_prefix_caching=True)
+regular_llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.4)
 
-generating_prompts = [prefix + prompt for prompt in prompts]
+prefix_cached_llm = LLM(model="facebook/opt-125m",
+                        enable_prefix_caching=True,
+                        gpu_memory_utilization=0.4)
+print("Results without `enable_prefix_caching`")
 
 # Generate texts from the prompts. The output is a list of RequestOutput objects
 # that contain the prompt, generated text, and other information.
-outputs = llm.generate(generating_prompts, sampling_params)
+start_time_regular = time()
+outputs = regular_llm.generate(generating_prompts, sampling_params)
+duration_regular = time() - start_time_regular
+
+regular_generated_texts = []
 # Print the outputs.
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
+    regular_generated_texts.append(generated_text)
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
 
 print("-" * 80)
 
 # The llm.generate call will batch all prompts and send the batch at once
-# if resources allow. The prefix will only be cached after the first batch
-# is processed, so we need to call generate once to calculate the prefix
-# and cache it.
-outputs = llm.generate(generating_prompts[0], sampling_params)
+# if resources allow.
+start_time_cached = time()
+outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
+duration_cached = time() - start_time_cached
 
-# Subsequent batches can leverage the cached prefix
-outputs = llm.generate(generating_prompts, sampling_params)
+print("Results with `enable_prefix_caching`")
 
-# Print the outputs. You should see the same outputs as before
+cached_generated_texts = []
+# Print the outputs. You should see the same outputs as before.
 for output in outputs:
     prompt = output.prompt
     generated_text = output.outputs[0].text
+    cached_generated_texts.append(generated_text)
     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
+
+print("-" * 80)
+
+# Compare the results and display the speedup
+generated_same = all([
+    regular_generated_texts[i] == cached_generated_texts[i]
+    for i in range(len(prompts))
+])
+print(f"Generated answers are the same: {generated_same}")
+
+speedup = round(duration_regular / duration_cached, 2)
+print(f"Speed up of cached generation compared to the regular is: {speedup}")

From 5b6b8ed2123049c568b743fb1ed7a441cba1e759 Mon Sep 17 00:00:00 2001
From: Robert Shaw
 <114415538+robertgshaw2-neuralmagic@users.noreply.github.com>
Date: Sat, 1 Jun 2024 19:35:41 -0400
Subject: [PATCH 09/93] [BugFix] Prevent `LLM.encode` for non-generation Models
  (#5184)

Co-authored-by: mgoin <michael@neuralmagic.com>
---
 vllm/entrypoints/llm.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 8a4245f93679..7f1c6ef9603a 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -285,6 +285,11 @@ def generate(
             considered legacy and may be deprecated in the future. You should
             instead pass them via the ``inputs`` parameter.
         """
+        if self.llm_engine.model_config.embedding_mode:
+            raise ValueError(
+                "LLM.generate() is only supported for generation models "
+                "(XForCausalLM).")
+
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
                 prompts=cast(Optional[Union[str, List[str]]], prompts),
@@ -429,6 +434,11 @@ def encode(
             considered legacy and may be deprecated in the future. You should
             instead pass them via the ``inputs`` parameter.
         """
+        if not self.llm_engine.model_config.embedding_mode:
+            raise ValueError(
+                "LLM.encode() is only supported for embedding models (XModel)."
+            )
+
         if prompt_token_ids is not None or multi_modal_data is not None:
             inputs = self._convert_v1_inputs(
                 prompts=cast(Optional[Union[str, List[str]]], prompts),

From 15650a31ff3f1d5c3171e00d639073717be12bf2 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Sat, 1 Jun 2024 21:21:53 -0500
Subject: [PATCH 10/93] Update test_ignore_eos (#4898)

---
 tests/samplers/test_ignore_eos.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 864657a3c2b2..67b5168bea0e 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -7,25 +7,26 @@
 
 from vllm import SamplingParams
 
-MODELS = ["facebook/opt-125m"]
+# We also test with llama because it has generation_config to specify EOS
+# (past regression).
+MODELS = ["facebook/opt-125m", "meta-llama/Llama-2-7b-hf"]
 
 
 @pytest.mark.parametrize("model", MODELS)
 @pytest.mark.parametrize("dtype", ["half"])
-@pytest.mark.parametrize("max_tokens", [1024])
-def test_beam_search_single_input(
+@pytest.mark.parametrize("max_tokens", [512])
+def test_ignore_eos(
     vllm_runner,
     example_prompts,
     model: str,
     dtype: str,
     max_tokens: int,
 ) -> None:
-    example_prompts = "1 + 1 is"
-
     vllm_model = vllm_runner(model, dtype=dtype)
     sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
-    ignore_eos_output = vllm_model.model.generate(
-        example_prompts, sampling_params=sampling_params)
-    print(len(ignore_eos_output[0].outputs[0].token_ids))
-    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) < 10
-    assert max_tokens - len(ignore_eos_output[0].outputs[0].token_ids) >= 0
+
+    for prompt in example_prompts:
+        ignore_eos_output = vllm_model.model.generate(
+            prompt, sampling_params=sampling_params)
+        output_length = len(ignore_eos_output[0].outputs[0].token_ids)
+        assert output_length == max_tokens

From dc64b0788a1bd117f5ea382acdbf676a04ca898c Mon Sep 17 00:00:00 2001
From: Avinash Raj <avistylein3105@gmail.com>
Date: Sun, 2 Jun 2024 13:36:13 +0530
Subject: [PATCH 11/93] [Frontend][OpenAI] Support for returning max_model_len
 on /v1/models response (#4643)

---
 vllm/entrypoints/openai/protocol.py       | 1 +
 vllm/entrypoints/openai/serving_engine.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index e380212a4d76..bbd61a2c5dd5 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -82,6 +82,7 @@ class ModelCard(OpenAIBaseModel):
     owned_by: str = "vllm"
     root: Optional[str] = None
     parent: Optional[str] = None
+    max_model_len: Optional[int] = None
     permission: List[ModelPermission] = Field(default_factory=list)
 
 
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index 066acdf1c019..ae659d19c878 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -62,6 +62,7 @@ async def show_available_models(self) -> ModelList:
         """Show available models. Right now we only have one model."""
         model_cards = [
             ModelCard(id=served_model_name,
+                      max_model_len=self.max_model_len,
                       root=self.served_model_names[0],
                       permission=[ModelPermission()])
             for served_model_name in self.served_model_names

From bfc6bc751f93214831ad6fd7d7a46dcdcf352126 Mon Sep 17 00:00:00 2001
From: Divakar Verma <137818590+divakar-amd@users.noreply.github.com>
Date: Sun, 2 Jun 2024 16:13:26 -0500
Subject: [PATCH 12/93] [Kernel][ROCm][AMD] enable fused topk_softmax kernel
 for moe layer (#4927)

This PR enables the fused topk_softmax kernel used in the MoE layer on HIP (ROCm).
---
 CMakeLists.txt                                |  8 ++--
 Dockerfile.rocm                               |  1 +
 csrc/cuda_compat.h                            |  4 ++
 csrc/moe/topk_softmax_kernels.cu              | 27 +++++++----
 setup.py                                      |  2 +-
 .../layers/fused_moe/fused_moe.py             | 46 ++++++++-----------
 6 files changed, 45 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 5f991af61d9b..a197063f3360 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -311,6 +311,9 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
   message(STATUS "Enabling C extension.")
   add_dependencies(default _C)
 
+  message(STATUS "Enabling moe extension.")
+  add_dependencies(default _moe_C)
+
   # Enable punica if -DVLLM_INSTALL_PUNICA_KERNELS=ON or
   # VLLM_INSTALL_PUNICA_KERNELS is set in the environment and
   # there are supported target arches.
@@ -320,8 +323,3 @@ if(VLLM_GPU_LANG STREQUAL "CUDA" OR VLLM_GPU_LANG STREQUAL "HIP")
     add_dependencies(default _punica_C)
   endif()
 endif()
-
-if(VLLM_GPU_LANG STREQUAL "CUDA")
-  message(STATUS "Enabling moe extension.")
-  add_dependencies(default _moe_C)
-endif()
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index 9bfe8446a519..e30a2aaf3020 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -108,6 +108,7 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     && python3 setup.py install \
     && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \
     && cd ..
 
 
diff --git a/csrc/cuda_compat.h b/csrc/cuda_compat.h
index 5909e5eaf5e6..82e55613d915 100644
--- a/csrc/cuda_compat.h
+++ b/csrc/cuda_compat.h
@@ -19,8 +19,12 @@
 #ifndef USE_ROCM
   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) \
     __shfl_xor_sync(uint32_t(-1), var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor_sync(uint32_t(-1), var, lane_mask, width)
 #else
   #define VLLM_SHFL_XOR_SYNC(var, lane_mask) __shfl_xor(var, lane_mask)
+  #define VLLM_SHFL_XOR_SYNC_WIDTH(var, lane_mask, width) \
+    __shfl_xor(var, lane_mask, width)
 #endif
 
 #ifndef USE_ROCM
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 8c65f40fe836..6ba4fcdb3a3f 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -19,15 +19,22 @@
 #include <torch/extension.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
+#include "../cuda_compat.h"
 
-#include <cub/cub.cuh>
-#include <cub/util_type.cuh>
+#ifndef USE_ROCM
+    #include <cub/util_type.cuh>
+    #include <cub/cub.cuh>
+#else
+    #include <hipcub/util_type.hpp>
+    #include <hipcub/hipcub.hpp>
+#endif
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
 
 namespace vllm {
 namespace moe {
 
-static constexpr int WARP_SIZE = 32;
-
 /// Aligned array type
 template <
     typename T,
@@ -265,7 +272,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
     for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
     {
-        thread_max = max(thread_max, __shfl_xor_sync(0xFFFFFFFF, thread_max, mask, THREADS_PER_ROW));
+        thread_max = max(thread_max, VLLM_SHFL_XOR_SYNC_WIDTH(thread_max, mask, THREADS_PER_ROW));
     }
 
     // From this point, thread max in all the threads have the max within the row.
@@ -282,7 +289,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
     for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
     {
-        row_sum += __shfl_xor_sync(0xFFFFFFFF, row_sum, mask, THREADS_PER_ROW);
+        row_sum += VLLM_SHFL_XOR_SYNC_WIDTH(row_sum, mask, THREADS_PER_ROW);
     }
 
     // From this point, all threads have the max and the sum for their rows in the thread_max and thread_sum variables
@@ -332,8 +339,8 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__
 #pragma unroll
         for (int mask = THREADS_PER_ROW / 2; mask > 0; mask /= 2)
         {
-            float other_max = __shfl_xor_sync(0xFFFFFFFF, max_val, mask, THREADS_PER_ROW);
-            int other_expert = __shfl_xor_sync(0xFFFFFFFF, expert, mask, THREADS_PER_ROW);
+            float other_max = VLLM_SHFL_XOR_SYNC_WIDTH(max_val, mask, THREADS_PER_ROW);
+            int other_expert = VLLM_SHFL_XOR_SYNC_WIDTH(expert, mask, THREADS_PER_ROW);
 
             // We want lower indices to "win" in every thread so we break ties this way
             if (other_max > max_val || (other_max == max_val && other_expert < expert))
@@ -383,7 +390,7 @@ struct TopkConstants
 {
     static constexpr int ELTS_PER_LDG = BYTES_PER_LDG / sizeof(float);
     static_assert(EXPERTS / (ELTS_PER_LDG * WARP_SIZE) == 0 || EXPERTS % (ELTS_PER_LDG * WARP_SIZE) == 0, "");
-    static constexpr int VECs_PER_THREAD = std::max(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
+    static constexpr int VECs_PER_THREAD = MAX(1, EXPERTS / (ELTS_PER_LDG * WARP_SIZE));
     static constexpr int VPT = VECs_PER_THREAD * ELTS_PER_LDG;
     static constexpr int THREADS_PER_ROW = EXPERTS / VPT;
     static constexpr int ROWS_PER_WARP = WARP_SIZE / THREADS_PER_ROW;
@@ -396,7 +403,7 @@ void topkGatingSoftmaxLauncherHelper(const float* input, const bool* finished, f
 {
     static constexpr std::size_t MAX_BYTES_PER_LDG = 16;
 
-    static constexpr int BYTES_PER_LDG = std::min(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
+    static constexpr int BYTES_PER_LDG = MIN(MAX_BYTES_PER_LDG, sizeof(float) * EXPERTS);
     using Constants = detail::TopkConstants<EXPERTS, BYTES_PER_LDG>;
     static constexpr int VPT = Constants::VPT;
     static constexpr int ROWS_PER_WARP = Constants::ROWS_PER_WARP;
diff --git a/setup.py b/setup.py
index c35fbf2e15ae..cadc89379bec 100644
--- a/setup.py
+++ b/setup.py
@@ -402,7 +402,7 @@ def _read_requirements(filename: str) -> List[str]:
 
 ext_modules = []
 
-if _is_cuda():
+if _is_cuda() or _is_hip():
     ext_modules.append(CMakeExtension(name="vllm._moe_C"))
 
 if not _is_neuron():
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index bb7938b3715b..20a3c9f6f893 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -8,9 +8,9 @@
 import triton
 import triton.language as tl
 
+import vllm._moe_C as moe_kernels
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
-from vllm.utils import is_hip
 
 logger = init_logger(__name__)
 
@@ -319,34 +319,26 @@ def fused_topk(
 
     M, _ = hidden_states.shape
 
-    if is_hip():
-        # The MoE kernels are not yet supported on ROCm.
-        routing_weights = torch.softmax(gating_output,
-                                        dim=-1,
-                                        dtype=torch.float32)
-        topk_weights, topk_ids = torch.topk(routing_weights, topk, dim=-1)
-    else:
-        import vllm._moe_C as moe_kernels
-
-        topk_weights = torch.empty(M,
-                                   topk,
-                                   dtype=torch.float32,
-                                   device=hidden_states.device)
-        topk_ids = torch.empty(M,
+    topk_weights = torch.empty(M,
                                topk,
-                               dtype=torch.int32,
+                               dtype=torch.float32,
                                device=hidden_states.device)
-        token_expert_indicies = torch.empty(M,
-                                            topk,
-                                            dtype=torch.int32,
-                                            device=hidden_states.device)
-        moe_kernels.topk_softmax(
-            topk_weights,
-            topk_ids,
-            token_expert_indicies,
-            gating_output.float(),  # TODO(woosuk): Optimize this.
-        )
-        del token_expert_indicies  # Not used. Will be used in the future.
+    topk_ids = torch.empty(M,
+                           topk,
+                           dtype=torch.int32,
+                           device=hidden_states.device)
+    token_expert_indicies = torch.empty(M,
+                                        topk,
+                                        dtype=torch.int32,
+                                        device=hidden_states.device)
+    moe_kernels.topk_softmax(
+        topk_weights,
+        topk_ids,
+        token_expert_indicies,
+        gating_output.float(),  # TODO(woosuk): Optimize this.
+    )
+    del token_expert_indicies  # Not used. Will be used in the future.
+
     if renormalize:
         topk_weights = topk_weights / topk_weights.sum(dim=-1, keepdim=True)
     return topk_weights, topk_ids

From 5008643e87e92f899967b2e35cb89b85c1ad2d4c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 07:05:50 +0800
Subject: [PATCH 13/93] [Misc] Simplify code and fix type annotations in
 `conftest.py` (#5118)

---
 tests/conftest.py | 95 +++++++++++++++++++++--------------------------
 1 file changed, 43 insertions(+), 52 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 0b44b1761c9a..bb4525a32ebb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -6,6 +6,7 @@
 
 import pytest
 import torch
+import torch.nn.functional as F
 from PIL import Image
 from transformers import (AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
                           LlavaConfig, LlavaForConditionalGeneration)
@@ -14,9 +15,9 @@
 from vllm import LLM, SamplingParams
 from vllm.config import TokenizerPoolConfig, VisionLanguageConfig
 from vllm.distributed import destroy_model_parallel
-from vllm.inputs import PromptInputs
+from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.sequence import MultiModalData
+from vllm.sequence import MultiModalData, SampleLogprobs
 
 logger = init_logger(__name__)
 
@@ -192,10 +193,11 @@ def generate(
         prompts: List[str],
         images: Optional[List[Image.Image]] = None,
         **kwargs,
-    ) -> List[Tuple[List[int], str]]:
-        outputs: List[Tuple[List[int], str]] = []
+    ) -> List[Tuple[List[List[int]], List[str]]]:
         if images:
             assert len(prompts) == len(images)
+
+        outputs: List[Tuple[List[List[int]], List[str]]] = []
         for i, prompt in enumerate(prompts):
             processor_kwargs: Dict[str, Any] = {
                 "text": prompt,
@@ -205,17 +207,13 @@ def generate(
                 processor_kwargs["images"] = images[i]
 
             inputs = self.processor(**processor_kwargs)
-            inputs = {
-                key: value.cuda() if value is not None else None
-                for key, value in inputs.items()
-            }
 
             output_ids = self.model.generate(
-                **inputs,
+                **inputs.to("cuda"),
                 use_cache=True,
                 **kwargs,
             )
-            output_str = self.tokenizer.batch_decode(
+            output_str = self.processor.batch_decode(
                 output_ids,
                 skip_special_tokens=True,
                 clean_up_tokenization_spaces=False,
@@ -228,23 +226,22 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional["torch.Tensor"] = None,
+        images: Optional[List[Image.Image]] = None,
     ) -> List[Tuple[List[int], str]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
                                 max_new_tokens=max_tokens,
                                 images=images)
-        for i in range(len(outputs)):
-            output_ids, output_str = outputs[i]
-            outputs[i] = (output_ids[0], output_str[0])
-        return outputs
+
+        return [(output_ids[0], output_str[0])
+                for output_ids, output_str in outputs]
 
     def generate_beam_search(
         self,
         prompts: List[str],
         beam_width: int,
         max_tokens: int,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> List[Tuple[List[List[int]], List[str]]]:
         outputs = self.generate(prompts,
                                 do_sample=False,
                                 max_new_tokens=max_tokens,
@@ -286,9 +283,7 @@ def generate_greedy_logprobs(
                 if self.model.get_output_embeddings().bias is not None:
                     logits += self.model.get_output_embeddings(
                     ).bias.unsqueeze(0)
-                logprobs = torch.nn.functional.log_softmax(logits,
-                                                           dim=-1,
-                                                           dtype=torch.float32)
+                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
                 seq_logprobs.append(logprobs)
             all_logprobs.append(seq_logprobs)
         return all_logprobs
@@ -298,10 +293,10 @@ def generate_greedy_logprobs_limit(
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
-    ) -> List[Tuple[List[int], str]]:
-        all_logprobs = []
-        all_output_ids = []
-        all_output_strs = []
+    ) -> List[Tuple[List[int], str, List[Dict[int, float]]]]:
+        all_logprobs: List[List[Dict[int, float]]] = []
+        all_output_ids: List[List[int]] = []
+        all_output_strs: List[str] = []
 
         for prompt in prompts:
             input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
@@ -314,7 +309,7 @@ def generate_greedy_logprobs_limit(
                 return_dict_in_generate=True,
             )
 
-            seq_logprobs = []
+            seq_logprobs: List[torch.Tensor] = []
             for _, hidden_states in enumerate(output.hidden_states):
                 last_hidden_states = hidden_states[-1][0]
                 logits = torch.matmul(
@@ -325,13 +320,11 @@ def generate_greedy_logprobs_limit(
                            None) is not None:
                     logits += self.model.get_output_embeddings(
                     ).bias.unsqueeze(0)
-                logprobs = torch.nn.functional.log_softmax(logits,
-                                                           dim=-1,
-                                                           dtype=torch.float32)
+                logprobs = F.log_softmax(logits, dim=-1, dtype=torch.float32)
                 seq_logprobs.append(logprobs)
 
             # convert to dict
-            seq_logprobs_lst = []
+            seq_logprobs_lst: List[Dict[int, float]] = []
             for tok_idx, tok_logprobs in enumerate(seq_logprobs):
                 # drop prompt logprobs
                 if tok_idx == 0:
@@ -557,20 +550,19 @@ def __init__(
         tokenizer_name: Optional[str] = None,
         # Use smaller max model length, otherwise bigger model cannot run due
         # to kv cache size limit.
-        max_model_len=1024,
+        max_model_len: int = 1024,
         dtype: str = "half",
         disable_log_stats: bool = True,
         tensor_parallel_size: int = 1,
         block_size: int = 16,
         enable_chunked_prefill: bool = False,
-        swap_space=4,
-        trust_remote_code: bool = True,
+        swap_space: int = 4,
         **kwargs,
     ) -> None:
         self.model = LLM(
             model=model_name,
             tokenizer=tokenizer_name,
-            trust_remote_code=trust_remote_code,
+            trust_remote_code=True,
             dtype=dtype,
             swap_space=swap_space,
             disable_log_stats=disable_log_stats,
@@ -585,32 +577,31 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional["torch.Tensor"] = None,
-    ) -> List[Tuple[List[int], str]]:
+        images: Optional[torch.Tensor] = None,
+    ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
-            assert len(prompts) == images.shape[0]
+            assert len(prompts) == len(images)
 
-        prompt_inputs: List[PromptInputs] = []
+        prompt_inputs: List[TextPrompt] = []
         for i, prompt in enumerate(prompts):
-            image = None if images is None else images[i:i + 1]
-            mm_data = None if image is None else MultiModalData(
-                type=MultiModalData.Type.IMAGE,
-                data=image,
-            )
+            prompt = TextPrompt(prompt=prompt)
+            if images is not None:
+                prompt["multi_modal_data"] = MultiModalData(
+                    type=MultiModalData.Type.IMAGE,
+                    data=images[i:i + 1],
+                )
 
-            prompt_inputs.append({
-                "prompt": prompt,
-                "multi_modal_data": mm_data,
-            })
+            prompt_inputs.append(prompt)
 
         req_outputs = self.model.generate(prompt_inputs,
                                           sampling_params=sampling_params)
-        outputs = []
+
+        outputs: List[Tuple[List[List[int]], List[str]]] = []
         for req_output in req_outputs:
             prompt_str = req_output.prompt
             prompt_ids = req_output.prompt_token_ids
-            req_sample_output_ids = []
-            req_sample_output_strs = []
+            req_sample_output_ids: List[List[int]] = []
+            req_sample_output_strs: List[str] = []
             for sample in req_output.outputs:
                 output_str = sample.text
                 output_ids = sample.token_ids
@@ -623,12 +614,12 @@ def generate_w_logprobs(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         assert sampling_params.logprobs is not None
 
         req_outputs = self.model.generate(prompts,
                                           sampling_params=sampling_params)
-        outputs = []
+        outputs: List[Tuple[List[int], str, Optional[SampleLogprobs]]] = []
         for req_output in req_outputs:
             for sample in req_output.outputs:
                 output_str = sample.text
@@ -653,7 +644,7 @@ def generate_greedy_logprobs(
         prompts: List[str],
         max_tokens: int,
         num_logprobs: int,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> List[Tuple[List[int], str, Optional[SampleLogprobs]]]:
         greedy_logprobs_params = SamplingParams(temperature=0.0,
                                                 max_tokens=max_tokens,
                                                 logprobs=num_logprobs)
@@ -667,7 +658,7 @@ def generate_beam_search(
         prompts: List[str],
         beam_width: int,
         max_tokens: int,
-    ) -> List[Tuple[List[int], str]]:
+    ) -> List[Tuple[List[List[int]], List[str]]]:
         beam_search_params = SamplingParams(n=beam_width,
                                             use_beam_search=True,
                                             temperature=0.0,

From c070e44996d3813c5469d35b25ca500e5005c351 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 3 Jun 2024 13:56:41 +0800
Subject: [PATCH 14/93] [Core] Support image processor (#4197)

---
 .github/workflows/mypy.yaml                   |   1 +
 docs/source/conf.py                           |  14 +-
 .../dev/multimodal/multimodal_index.rst       |  51 ++++++
 docs/source/index.rst                         |   6 +-
 docs/source/models/supported_models.rst       |   4 +
 docs/source/models/vlm.rst                    |  56 +++++++
 examples/llava_example.py                     |  29 ++--
 format.sh                                     |   1 +
 requirements-common.txt                       |   1 +
 requirements-dev.txt                          |   3 -
 tests/conftest.py                             |  45 ++---
 tests/models/test_llava.py                    |  60 ++++---
 tests/multimodal/__init__.py                  |   0
 tests/multimodal/test_processor.py            |  98 +++++++++++
 tests/spec_decode/e2e/conftest.py             |   3 +-
 tests/tokenization/test_image_processor.py    |  20 +++
 vllm/config.py                                |   6 +-
 vllm/engine/arg_utils.py                      | 108 ++++++++----
 vllm/entrypoints/llm.py                       |  25 +--
 vllm/model_executor/models/llava.py           |  73 +++++---
 vllm/multimodal/__init__.py                   |   7 +
 vllm/multimodal/base.py                       | 126 ++++++++++++++
 vllm/multimodal/image.py                      | 141 ++++++++++++++++
 vllm/multimodal/registry.py                   | 156 ++++++++++++++++++
 vllm/sequence.py                              |  32 +---
 vllm/transformers_utils/image_processor.py    |  45 +++++
 vllm/worker/cpu_model_runner.py               |  57 ++++---
 vllm/worker/embedding_model_runner.py         |  10 +-
 vllm/worker/model_runner.py                   | 120 +++++++-------
 29 files changed, 1042 insertions(+), 256 deletions(-)
 create mode 100644 docs/source/dev/multimodal/multimodal_index.rst
 create mode 100644 docs/source/models/vlm.rst
 create mode 100644 tests/multimodal/__init__.py
 create mode 100644 tests/multimodal/test_processor.py
 create mode 100644 tests/tokenization/test_image_processor.py
 create mode 100644 vllm/multimodal/__init__.py
 create mode 100644 vllm/multimodal/base.py
 create mode 100644 vllm/multimodal/image.py
 create mode 100644 vllm/multimodal/registry.py
 create mode 100644 vllm/transformers_utils/image_processor.py

diff --git a/.github/workflows/mypy.yaml b/.github/workflows/mypy.yaml
index a20753d8a770..22e6c2ef0101 100644
--- a/.github/workflows/mypy.yaml
+++ b/.github/workflows/mypy.yaml
@@ -37,6 +37,7 @@ jobs:
         mypy vllm/distributed --config-file pyproject.toml
         mypy vllm/entrypoints --config-file pyproject.toml
         mypy vllm/executor --config-file pyproject.toml
+        mypy vllm/multimodal --config-file pyproject.toml
         mypy vllm/usage --config-file pyproject.toml
         mypy vllm/*.py --config-file pyproject.toml
         mypy vllm/transformers_utils --config-file pyproject.toml
diff --git a/docs/source/conf.py b/docs/source/conf.py
index cfebc2ff9bb3..f1a7013edd33 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -90,6 +90,7 @@ def setup(app):
     "sentencepiece",
     "vllm.cuda_utils",
     "vllm._C",
+    "PIL",
     "numpy",
     "tqdm",
     "tensorizer",
@@ -116,12 +117,13 @@ def add_line(self, line: str, source: str, *lineno: int) -> None:
 autodoc.ClassDocumenter = MockedClassDocumenter
 
 intersphinx_mapping = {
-    'python': ('https://docs.python.org/3', None),
-    'typing_extensions':
-    ('https://typing-extensions.readthedocs.io/en/latest', None),
-    'numpy': ('https://numpy.org/doc/stable', None),
-    'torch': ('https://pytorch.org/docs/stable', None),
-    'psutil': ('https://psutil.readthedocs.io/en/stable', None),
+    "python": ("https://docs.python.org/3", None),
+    "typing_extensions":
+    ("https://typing-extensions.readthedocs.io/en/latest", None),
+    "pillow": ("https://pillow.readthedocs.io/en/stable", None),
+    "numpy": ("https://numpy.org/doc/stable", None),
+    "torch": ("https://pytorch.org/docs/stable", None),
+    "psutil": ("https://psutil.readthedocs.io/en/stable", None),
 }
 
 autodoc_preserve_defaults = True
diff --git a/docs/source/dev/multimodal/multimodal_index.rst b/docs/source/dev/multimodal/multimodal_index.rst
new file mode 100644
index 000000000000..a25eceecc276
--- /dev/null
+++ b/docs/source/dev/multimodal/multimodal_index.rst
@@ -0,0 +1,51 @@
+Multi-Modality
+==============
+
+.. currentmodule:: vllm.multimodal
+    
+vLLM provides experimental support for multi-modal models through the :mod:`vllm.multimodal` package.
+
+:class:`vllm.inputs.PromptStrictInputs` accepts an additional attribute ``multi_modal_data``
+which allows you to pass in multi-modal input alongside text and token prompts.
+
+By default, vLLM models do not support multi-modal inputs. To enable multi-modal support for a model,
+you must decorate the model class with :meth:`MULTIMODAL_REGISTRY.register_dummy_data <MultiModalRegistry.register_dummy_data>`,
+as well as :meth:`MULTIMODAL_REGISTRY.register_input <MultiModalRegistry.register_input>` for each modality type that the model should support.
+
+.. contents::
+   :local:
+   :backlinks: none
+
+Module Contents
++++++++++++++++
+
+.. automodule:: vllm.multimodal
+
+Registry
+--------
+
+.. data:: vllm.multimodal.MULTIMODAL_REGISTRY
+
+    The global :class:`MultiModalRegistry` which is used by model runners.
+
+.. autoclass:: vllm.multimodal.MultiModalRegistry
+    :members:
+    :show-inheritance:
+
+Base Classes
+------------
+
+.. autoclass:: vllm.multimodal.MultiModalData
+    :members:
+    :show-inheritance:
+
+.. autoclass:: vllm.multimodal.MultiModalPlugin
+    :members:
+    :show-inheritance:
+
+Image Classes
+-------------
+
+.. automodule:: vllm.multimodal.image
+    :members:
+    :show-inheritance:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 5f18fe9ae0a7..fad3c3b05b0c 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -88,6 +88,7 @@ Documentation
    models/adding_model
    models/engine_args
    models/lora
+   models/vlm
    models/performance
 
 .. toctree::
@@ -99,17 +100,18 @@ Documentation
    quantization/fp8_e4m3_kvcache
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Developer Documentation
    
    dev/sampling_params
    dev/offline_inference/offline_index
    dev/engine/engine_index
    dev/kernel/paged_attention
+   dev/multimodal/multimodal_index
    dev/dockerfile/dockerfile
 
 .. toctree::
-   :maxdepth: 2
+   :maxdepth: 1
    :caption: Community
 
    community/meetups
diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 82e71e61975c..24fa83df7d75 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -87,6 +87,10 @@ Alongside each architecture, we include some popular models that use it.
     - LLaMA, Llama 2, Meta Llama 3, Vicuna, Alpaca, Yi
     - :code:`meta-llama/Meta-Llama-3-8B-Instruct`, :code:`meta-llama/Meta-Llama-3-70B-Instruct`, :code:`meta-llama/Llama-2-13b-hf`, :code:`meta-llama/Llama-2-70b-hf`, :code:`openlm-research/open_llama_13b`, :code:`lmsys/vicuna-13b-v1.3`, :code:`01-ai/Yi-6B`, :code:`01-ai/Yi-34B`, etc.
     - ✅︎
+  * - :code:`LlavaForConditionalGeneration`
+    - LLaVA-1.5
+    - :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
+    -
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM
     - :code:`openbmb/MiniCPM-2B-sft-bf16`, :code:`openbmb/MiniCPM-2B-dpo-bf16`, etc.
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
new file mode 100644
index 000000000000..52afda747aab
--- /dev/null
+++ b/docs/source/models/vlm.rst
@@ -0,0 +1,56 @@
+.. _vlm:
+
+Using VLMs
+==========
+
+This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
+
+Engine Arguments
+----------------
+
+The following :ref:`engine arguments <engine_args>` are specific to VLMs:
+
+.. argparse::
+    :module: vllm.engine.arg_utils
+    :func: _vlm_engine_args_parser
+    :prog: -m vllm.entrypoints.openai.api_server
+    :nodefaultconst:
+
+Offline Batched Inference
+-------------------------
+
+To initialize a VLM, pass the aforementioned arguments to the ``LLM`` class when instantiating the engine.
+
+.. code-block:: python
+
+    llm = LLM(
+        model="llava-hf/llava-1.5-7b-hf",
+        image_input_type="pixel_values",
+        image_token_id=32000,
+        image_input_shape="1,3,336,336",
+        image_feature_size=576,
+    )
+
+For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
+
+* ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
+* ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.
+
+.. code-block:: python
+
+    prompt = "<image>" * 576 + (
+        "\nUSER: What is the content of this image?\nASSISTANT:")
+
+    # Load the image using PIL.Image
+    image = ...
+
+    outputs = llm.generate({
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
+    })
+
+    for o in outputs:
+        generated_text = o.outputs[0].text
+        print(generated_text)
+
+A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
diff --git a/examples/llava_example.py b/examples/llava_example.py
index 60250c4303fb..980d7bf9f8a3 100644
--- a/examples/llava_example.py
+++ b/examples/llava_example.py
@@ -3,33 +3,36 @@
 import subprocess
 
 import torch
+from PIL import Image
 
 from vllm import LLM
-from vllm.sequence import MultiModalData
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 
 # The assets are located at `s3://air-example-data-2/vllm_opensource_llava/`.
+# You can use `.buildkite/download-images.sh` to download them
 
 
-def run_llava_pixel_values():
+def run_llava_pixel_values(*, disable_image_processor: bool = False):
     llm = LLM(
         model="llava-hf/llava-1.5-7b-hf",
         image_input_type="pixel_values",
         image_token_id=32000,
         image_input_shape="1,3,336,336",
         image_feature_size=576,
+        disable_image_processor=disable_image_processor,
     )
 
     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")
 
-    # This should be provided by another online or offline component.
-    image = torch.load("images/stop_sign_pixel_values.pt")
+    if disable_image_processor:
+        image = torch.load("images/stop_sign_pixel_values.pt")
+    else:
+        image = Image.open("images/stop_sign.jpg")
 
     outputs = llm.generate({
-        "prompt":
-        prompt,
-        "multi_modal_data":
-        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+        "prompt": prompt,
+        "multi_modal_data": ImagePixelData(image),
     })
 
     for o in outputs:
@@ -49,15 +52,13 @@ def run_llava_image_features():
     prompt = "<image>" * 576 + (
         "\nUSER: What is the content of this image?\nASSISTANT:")
 
-    # This should be provided by another online or offline component.
-    image = torch.load("images/stop_sign_image_features.pt")
+    image: torch.Tensor = torch.load("images/stop_sign_image_features.pt")
 
     outputs = llm.generate({
-        "prompt":
-        prompt,
-        "multi_modal_data":
-        MultiModalData(type=MultiModalData.Type.IMAGE, data=image),
+        "prompt": prompt,
+        "multi_modal_data": ImageFeatureData(image),
     })
+
     for o in outputs:
         generated_text = o.outputs[0].text
         print(generated_text)
diff --git a/format.sh b/format.sh
index 4fd7071a0278..18984935013b 100755
--- a/format.sh
+++ b/format.sh
@@ -101,6 +101,7 @@ mypy vllm/core --config-file pyproject.toml
 mypy vllm/distributed --config-file pyproject.toml
 mypy vllm/entrypoints --config-file pyproject.toml
 mypy vllm/executor --config-file pyproject.toml
+mypy vllm/multimodal --config-file pyproject.toml
 mypy vllm/usage --config-file pyproject.toml
 mypy vllm/*.py --config-file pyproject.toml
 mypy vllm/transformers_utils --config-file pyproject.toml
diff --git a/requirements-common.txt b/requirements-common.txt
index 3ea22276f63f..f41873570aa6 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -12,6 +12,7 @@ aiohttp
 openai
 uvicorn[standard]
 pydantic >= 2.0  # Required for OpenAI server.
+pillow  # Required for image processing
 prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
diff --git a/requirements-dev.txt b/requirements-dev.txt
index 22cc53fd3a72..837ed9d495e1 100644
--- a/requirements-dev.txt
+++ b/requirements-dev.txt
@@ -35,8 +35,5 @@ sentence-transformers # required for embedding
 # Benchmarking
 aiohttp
 
-# Multimodal
-pillow
-
 # quantization
 bitsandbytes==0.42.0
diff --git a/tests/conftest.py b/tests/conftest.py
index bb4525a32ebb..796f498bb28a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -17,7 +17,9 @@
 from vllm.distributed import destroy_model_parallel
 from vllm.inputs import TextPrompt
 from vllm.logger import init_logger
-from vllm.sequence import MultiModalData, SampleLogprobs
+from vllm.multimodal import MultiModalData
+from vllm.multimodal.image import ImageFeatureData, ImagePixelData
+from vllm.sequence import SampleLogprobs
 
 logger = init_logger(__name__)
 
@@ -26,6 +28,7 @@
 _LONG_PROMPTS = [os.path.join(_TEST_DIR, "prompts", "summary.txt")]
 
 # Multi modal related
+# You can use `.buildkite/download-images.sh` to download the assets
 _PIXEL_VALUES_FILES = [
     os.path.join(_TEST_DIR, "images", filename) for filename in
     ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
@@ -91,17 +94,23 @@ def hf_images() -> List[Image.Image]:
 
 
 @pytest.fixture()
-def vllm_images(request) -> "torch.Tensor":
+def vllm_images(request) -> List[MultiModalData]:
     vision_language_config = request.getfixturevalue("model_and_config")[1]
-    all_images = []
     if vision_language_config.image_input_type == (
             VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
-        filenames = _IMAGE_FEATURES_FILES
+        return [
+            ImageFeatureData(torch.load(filename))
+            for filename in _IMAGE_FEATURES_FILES
+        ]
     else:
-        filenames = _PIXEL_VALUES_FILES
-    for filename in filenames:
-        all_images.append(torch.load(filename))
-    return torch.concat(all_images, dim=0)
+        return [
+            ImagePixelData(Image.open(filename)) for filename in _IMAGE_FILES
+        ]
+
+
+@pytest.fixture()
+def vllm_image_tensors(request) -> List[torch.Tensor]:
+    return [torch.load(filename) for filename in _PIXEL_VALUES_FILES]
 
 
 @pytest.fixture()
@@ -577,23 +586,17 @@ def generate(
         self,
         prompts: List[str],
         sampling_params: SamplingParams,
-        images: Optional[torch.Tensor] = None,
+        images: Optional[List[MultiModalData]] = None,
     ) -> List[Tuple[List[List[int]], List[str]]]:
         if images is not None:
             assert len(prompts) == len(images)
 
-        prompt_inputs: List[TextPrompt] = []
-        for i, prompt in enumerate(prompts):
-            prompt = TextPrompt(prompt=prompt)
-            if images is not None:
-                prompt["multi_modal_data"] = MultiModalData(
-                    type=MultiModalData.Type.IMAGE,
-                    data=images[i:i + 1],
-                )
-
-            prompt_inputs.append(prompt)
+        inputs = [TextPrompt(prompt=prompt) for prompt in prompts]
+        if images is not None:
+            for i, image in enumerate(images):
+                inputs[i]["multi_modal_data"] = image
 
-        req_outputs = self.model.generate(prompt_inputs,
+        req_outputs = self.model.generate(inputs,
                                           sampling_params=sampling_params)
 
         outputs: List[Tuple[List[List[int]], List[str]]] = []
@@ -632,7 +635,7 @@ def generate_greedy(
         self,
         prompts: List[str],
         max_tokens: int,
-        images: Optional[torch.Tensor] = None,
+        images: Optional[List[MultiModalData]] = None,
     ) -> List[Tuple[List[int], str]]:
         greedy_params = SamplingParams(temperature=0.0, max_tokens=max_tokens)
         outputs = self.generate(prompts, greedy_params, images=images)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f86cd3fa88f5..cc0685ca9c5e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,7 +1,7 @@
 import gc
 from dataclasses import fields
 from enum import Enum
-from typing import Dict, List, Tuple
+from typing import Any, Dict, List, Tuple
 
 import pytest
 import torch
@@ -9,36 +9,50 @@
 
 from vllm.config import VisionLanguageConfig
 
+
+def iter_llava_configs(model_name: str):
+    image_hw_to_feature_size = {
+        (336, 336): 576,
+    }
+
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+            (VisionLanguageConfig.ImageInputType.IMAGE_FEATURES, (1, f, 1024)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=32000,
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+
+
 model_and_vl_config = [
-    ("llava-hf/llava-1.5-7b-hf",
-     VisionLanguageConfig(
-         image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-         image_feature_size=576,
-         image_token_id=32000,
-         image_input_shape=(1, 3, 336, 336))),
-    ("llava-hf/llava-1.5-7b-hf",
-     VisionLanguageConfig(
-         image_input_type=VisionLanguageConfig.ImageInputType.IMAGE_FEATURES,
-         image_feature_size=576,
-         image_token_id=32000,
-         image_input_shape=(1, 576, 1024)))
+    *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
+    # Not enough memory
+    # *iter_llava_configs("llava-hf/llava-1.5-13b-hf"),
 ]
 
 
-def as_dict(vision_language_config: VisionLanguageConfig) -> Dict:
+def as_dict(vlm_config: VisionLanguageConfig) -> Dict[str, Any]:
     """Flatten vision language config to pure args.
 
     Compatible with what llm entrypoint expects.
     """
     result = {}
-    for field in fields(vision_language_config):
-        value = getattr(vision_language_config, field.name)
+    for field in fields(vlm_config):
+        value = getattr(vlm_config, field.name)
         if isinstance(value, Enum):
             result[field.name] = value.name.lower()
         elif isinstance(value, tuple):
             result[field.name] = ",".join([str(item) for item in value])
         else:
             result[field.name] = value
+
+    result["disable_image_processor"] = vlm_config.image_processor is None
+
     return result
 
 
@@ -67,18 +81,19 @@ def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
-                vllm_image_prompts, vllm_images, model_and_config: tuple,
-                dtype: str, max_tokens: int, worker_use_ray: bool) -> None:
+                vllm_image_prompts, vllm_images, model_and_config, dtype: str,
+                max_tokens: int, worker_use_ray: bool) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
-    For huggingface runner, we provide the raw images as input.
-    For vllm runner, we provide image tensors and corresponding
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
     vision language config as input.
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
     model_id, vision_language_config = model_and_config
+
     hf_model = hf_runner(model_id, dtype=dtype)
     hf_outputs = hf_model.generate_greedy(hf_image_prompts,
                                           max_tokens,
@@ -88,6 +103,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
     vllm_model = vllm_runner(model_id,
                              dtype=dtype,
                              worker_use_ray=worker_use_ray,
+                             enforce_eager=True,
                              **as_dict(vision_language_config))
     vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                               max_tokens,
@@ -105,3 +121,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
+
+
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
+# (Requires multiple GPUs)
diff --git a/tests/multimodal/__init__.py b/tests/multimodal/__init__.py
new file mode 100644
index 000000000000..e69de29bb2d1
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
new file mode 100644
index 000000000000..4aeae633d07f
--- /dev/null
+++ b/tests/multimodal/test_processor.py
@@ -0,0 +1,98 @@
+import numpy as np
+import pytest
+from transformers import CLIPImageProcessor
+
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import ImagePixelData
+
+
+@pytest.mark.parametrize("dtype", ["half", "bfloat16", "float"])
+def test_clip_image_processor(hf_images, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+
+    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
+    assert isinstance(hf_processor, CLIPImageProcessor)
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+
+    for image in hf_images:
+        hf_result = hf_processor.preprocess(
+            image,
+            return_tensors="np",
+        )
+        vllm_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+
+        assert hf_result.keys() == vllm_result.keys()
+        for key, hf_arr in hf_result.items():
+            vllm_arr: np.ndarray = vllm_result[key].numpy()
+
+            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
+            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+
+
+@pytest.mark.parametrize("dtype", ["float"])
+def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
+    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=32000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=576,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+
+    for image, tensor in zip(hf_images, vllm_image_tensors):
+        image_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+        tensor_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(tensor),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+
+        assert image_result.keys() == tensor_result.keys()
+        for key, image_arr in image_result.items():
+            tensor_arr: np.ndarray = tensor_result[key].numpy()
+
+            assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
+
+            # The examples in PR#3042 have slightly different preprocessing from
+            # HuggingFace's LlavaProcessor, causing the test to fail.
+            # assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
diff --git a/tests/spec_decode/e2e/conftest.py b/tests/spec_decode/e2e/conftest.py
index 7c5840baf359..1d060e265848 100644
--- a/tests/spec_decode/e2e/conftest.py
+++ b/tests/spec_decode/e2e/conftest.py
@@ -18,9 +18,10 @@
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.lora.request import LoRARequest
 from vllm.model_executor.utils import set_random_seed
+from vllm.multimodal import MultiModalData
 from vllm.outputs import RequestOutput
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import Logprob, MultiModalData
+from vllm.sequence import Logprob
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, random_uuid
 
diff --git a/tests/tokenization/test_image_processor.py b/tests/tokenization/test_image_processor.py
new file mode 100644
index 000000000000..5ba232336741
--- /dev/null
+++ b/tests/tokenization/test_image_processor.py
@@ -0,0 +1,20 @@
+import pytest
+from transformers.image_processing_utils import BaseImageProcessor
+
+from vllm.transformers_utils.image_processor import get_image_processor
+
+IMAGE_PROCESSOR_NAMES = [
+    "llava-hf/llava-1.5-7b-hf",
+    "llava-hf/llava-v1.6-34b-hf",
+]
+
+
+@pytest.mark.parametrize("processor_name", IMAGE_PROCESSOR_NAMES)
+def test_image_processor_revision(processor_name: str):
+    # Assume that the "main" branch always exists
+    image_processor = get_image_processor(processor_name, revision="main")
+    assert isinstance(image_processor, BaseImageProcessor)
+
+    # Assume that a "never" branch never exists
+    with pytest.raises(OSError, match='not a valid git identifier'):
+        get_image_processor(processor_name, revision="never")
diff --git a/vllm/config.py b/vllm/config.py
index 61cd66aab80c..6f2249f412dd 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1126,10 +1126,12 @@ class ImageInputType(enum.Enum):
     # worst case scenario (biggest supported resolution).
     image_input_shape: tuple
     image_feature_size: int
+    # The image processor to load from HuggingFace
+    image_processor: Optional[str]
+    image_processor_revision: Optional[str]
 
     @classmethod
-    def get_image_input_enum_type(
-            cls, value: str) -> "VisionLanguageConfig.ImageInputType":
+    def get_image_input_enum_type(cls, value: str) -> ImageInputType:
         """Get the image input type from a string."""
         try:
             return cls.ImageInputType[value.upper()]
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index bceb6e5fb064..a2a98a05ad97 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -3,6 +3,7 @@
 import argparse
 import dataclasses
 import json
+import warnings
 from dataclasses import dataclass
 from typing import List, Optional, Tuple, Union
 
@@ -84,6 +85,10 @@ class EngineArgs:
     image_token_id: Optional[int] = None
     image_input_shape: Optional[str] = None
     image_feature_size: Optional[int] = None
+    image_processor: Optional[str] = None
+    image_processor_revision: Optional[str] = None
+    disable_image_processor: bool = False
+
     scheduler_delay_factor: float = 0.0
     enable_chunked_prefill: bool = False
 
@@ -102,6 +107,53 @@ def __post_init__(self):
         if self.tokenizer is None:
             self.tokenizer = self.model
 
+    @staticmethod
+    def add_cli_args_for_vlm(
+            parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
+        parser.add_argument('--image-input-type',
+                            type=nullable_str,
+                            default=None,
+                            choices=[
+                                t.name.lower()
+                                for t in VisionLanguageConfig.ImageInputType
+                            ],
+                            help=('The image input type passed into vLLM.'))
+        parser.add_argument('--image-token-id',
+                            type=int,
+                            default=None,
+                            help=('Input id for image token.'))
+        parser.add_argument(
+            '--image-input-shape',
+            type=nullable_str,
+            default=None,
+            help=('The biggest image input shape (worst for memory footprint) '
+                  'given an input type. Only used for vLLM\'s profile_run.'))
+        parser.add_argument(
+            '--image-feature-size',
+            type=int,
+            default=None,
+            help=('The image feature size along the context dimension.'))
+        parser.add_argument(
+            '--image-processor',
+            type=str,
+            default=EngineArgs.image_processor,
+            help='Name or path of the huggingface image processor to use. '
+            'If unspecified, model name or path will be used.')
+        parser.add_argument(
+            '--image-processor-revision',
+            type=str,
+            default=None,
+            help='Revision of the huggingface image processor version to use. '
+            'It can be a branch name, a tag name, or a commit id. '
+            'If unspecified, will use the default version.')
+        parser.add_argument(
+            '--disable-image-processor',
+            action='store_true',
+            help='Disables the use of image processor, even if one is defined '
+            'for the model on huggingface.')
+
+        return parser
+
     @staticmethod
     def add_cli_args(
             parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
@@ -117,7 +169,8 @@ def add_cli_args(
             '--tokenizer',
             type=nullable_str,
             default=EngineArgs.tokenizer,
-            help='Name or path of the huggingface tokenizer to use.')
+            help='Name or path of the huggingface tokenizer to use. '
+            'If unspecified, model name or path will be used.')
         parser.add_argument(
             '--skip-tokenizer-init',
             action='store_true',
@@ -140,9 +193,9 @@ def add_cli_args(
             '--tokenizer-revision',
             type=nullable_str,
             default=None,
-            help='The specific tokenizer version to use. It can be a branch '
-            'name, a tag name, or a commit id. If unspecified, will use '
-            'the default version.')
+            help='Revision of the huggingface tokenizer to use. '
+            'It can be a branch name, a tag name, or a commit id. '
+            'If unspecified, will use the default version.')
         parser.add_argument(
             '--tokenizer-mode',
             type=str,
@@ -460,31 +513,10 @@ def add_cli_args(
                             default=EngineArgs.device,
                             choices=["auto", "cuda", "neuron", "cpu"],
                             help='Device type for vLLM execution.')
+
         # Related to Vision-language models such as llava
-        parser.add_argument(
-            '--image-input-type',
-            type=nullable_str,
-            default=None,
-            choices=[
-                t.name.lower() for t in VisionLanguageConfig.ImageInputType
-            ],
-            help=('The image input type passed into vLLM. '
-                  'Should be one of "pixel_values" or "image_features".'))
-        parser.add_argument('--image-token-id',
-                            type=int,
-                            default=None,
-                            help=('Input id for image token.'))
-        parser.add_argument(
-            '--image-input-shape',
-            type=nullable_str,
-            default=None,
-            help=('The biggest image input shape (worst for memory footprint) '
-                  'given an input type. Only used for vLLM\'s profile_run.'))
-        parser.add_argument(
-            '--image-feature-size',
-            type=int,
-            default=None,
-            help=('The image feature size along the context dimension.'))
+        parser = EngineArgs.add_cli_args_for_vlm(parser)
+
         parser.add_argument(
             '--scheduler-delay-factor',
             type=float,
@@ -503,7 +535,6 @@ def add_cli_args(
             default=EngineArgs.speculative_model,
             help=
             'The name of the draft model to be used in speculative decoding.')
-
         parser.add_argument(
             '--num-speculative-tokens',
             type=int,
@@ -681,12 +712,27 @@ def create_engine_config(self, ) -> EngineConfig:
                 raise ValueError(
                     'Specify `image_token_id`, `image_input_shape` and '
                     '`image_feature_size` together with `image_input_type`.')
+
+            if self.image_processor is None:
+                self.image_processor = self.model
+            if self.disable_image_processor:
+                if self.image_processor != self.model:
+                    warnings.warn(
+                        "You've specified an image processor "
+                        f"({self.image_processor}) but also disabled "
+                        "it via `--disable-image-processor`.",
+                        stacklevel=2)
+
+                self.image_processor = None
+
             vision_language_config = VisionLanguageConfig(
                 image_input_type=VisionLanguageConfig.
                 get_image_input_enum_type(self.image_input_type),
                 image_token_id=self.image_token_id,
                 image_input_shape=str_to_int_tuple(self.image_input_shape),
                 image_feature_size=self.image_feature_size,
+                image_processor=self.image_processor,
+                image_processor_revision=self.image_processor_revision,
             )
         else:
             vision_language_config = None
@@ -749,3 +795,7 @@ def _engine_args_parser():
 def _async_engine_args_parser():
     return AsyncEngineArgs.add_cli_args(argparse.ArgumentParser(),
                                         async_args_only=True)
+
+
+def _vlm_engine_args_parser():
+    return EngineArgs.add_cli_args_for_vlm(argparse.ArgumentParser())
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index 7f1c6ef9603a..ad9404898d53 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -14,7 +14,6 @@
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import MultiModalData
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, deprecate_kwargs
 
@@ -173,7 +172,6 @@ def generate(
         prompt_token_ids: Optional[List[int]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -186,7 +184,6 @@ def generate(
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -200,7 +197,6 @@ def generate(
         prompt_token_ids: List[int],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -214,7 +210,6 @@ def generate(
         prompt_token_ids: List[List[int]],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -226,7 +221,6 @@ def generate(
         prompt_token_ids: Union[List[int], List[List[int]]],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -245,7 +239,6 @@ def generate(
 
     @deprecate_kwargs("prompts",
                       "prompt_token_ids",
-                      "multi_modal_data",
                       is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
                       additional_message="Please use the 'inputs' parameter "
                       "instead.")
@@ -258,7 +251,6 @@ def generate(
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -290,11 +282,10 @@ def generate(
                 "LLM.generate() is only supported for generation models "
                 "(XForCausalLM).")
 
-        if prompt_token_ids is not None or multi_modal_data is not None:
+        if prompt_token_ids is not None:
             inputs = self._convert_v1_inputs(
                 prompts=cast(Optional[Union[str, List[str]]], prompts),
                 prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
             )
         else:
             inputs = cast(
@@ -323,7 +314,6 @@ def encode(
         prompt_token_ids: Optional[List[int]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -336,7 +326,6 @@ def encode(
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -350,7 +339,6 @@ def encode(
         prompt_token_ids: List[int],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -364,7 +352,6 @@ def encode(
         prompt_token_ids: List[List[int]],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -376,7 +363,6 @@ def encode(
         prompt_token_ids: Union[List[int], List[List[int]]],
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -395,7 +381,6 @@ def encode(
 
     @deprecate_kwargs("prompts",
                       "prompt_token_ids",
-                      "multi_modal_data",
                       is_deprecated=lambda: LLM.DEPRECATE_LEGACY,
                       additional_message="Please use the 'inputs' parameter "
                       "instead.")
@@ -408,7 +393,6 @@ def encode(
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
         lora_request: Optional[LoRARequest] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -439,11 +423,10 @@ def encode(
                 "LLM.encode() is only supported for embedding models (XModel)."
             )
 
-        if prompt_token_ids is not None or multi_modal_data is not None:
+        if prompt_token_ids is not None:
             inputs = self._convert_v1_inputs(
                 prompts=cast(Optional[Union[str, List[str]]], prompts),
                 prompt_token_ids=prompt_token_ids,
-                multi_modal_data=multi_modal_data,
             )
         else:
             inputs = cast(
@@ -468,7 +451,6 @@ def _convert_v1_inputs(
         self,
         prompts: Optional[Union[str, List[str]]],
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]],
-        multi_modal_data: Optional[MultiModalData],
     ):
         # skip_tokenizer_init is now checked in engine
 
@@ -508,9 +490,6 @@ def _convert_v1_inputs(
                 else:
                     raise AssertionError
 
-            if multi_modal_data is not None:
-                item["multi_modal_data"] = multi_modal_data
-
             inputs.append(item)
 
         return inputs
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index fbd763809728..3332bcc57846 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -17,6 +17,8 @@
 from vllm.model_executor.model_loader.weight_utils import default_weight_loader
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.multimodal.image import get_dummy_image_data
 from vllm.sequence import SamplerOutput
 
 from .vlm_base import VisionLanguageModelBase
@@ -82,6 +84,9 @@ class LlavaImageFeatureInputs(TypedDict):
 LlavaImageInputs = Union[LlavaImagePixelInputs, LlavaImageFeatureInputs]
 
 
+@MULTIMODAL_REGISTRY.register_image_feature_input()
+@MULTIMODAL_REGISTRY.register_image_pixel_input()
+@MULTIMODAL_REGISTRY.register_dummy_data(get_dummy_image_data)
 class LlavaForConditionalGeneration(VisionLanguageModelBase):
 
     def __init__(self,
@@ -131,30 +136,41 @@ def _validate_image_data(self, data: torch.Tensor) -> torch.Tensor:
         return data
 
     def _parse_and_validate_image_input(
-            self, data: object) -> Optional[LlavaImageInputs]:
+            self, **kwargs: object) -> Optional[LlavaImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_features = kwargs.pop("image_features", None)
+
         expected_input_type = self.vision_language_config.image_input_type
         ImageInputType = VisionLanguageConfig.ImageInputType
 
-        if data is None:
-            return None
-
         if expected_input_type == ImageInputType.PIXEL_VALUES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image pixel vector should be a tensor, "
-                                f"but received type: {type(data)}")
+            if image_features is not None:
+                raise ValueError(
+                    "Expected pixel values but got image features")
+            if pixel_values is None:
+                return None
+
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values")
 
             return LlavaImagePixelInputs(
                 type="pixel_values",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(pixel_values),
             )
-        elif expected_input_type == ImageInputType.IMAGE_FEATURES:
-            if not isinstance(data, torch.Tensor):
-                raise TypeError("Image feature vector should be a tensor, "
-                                f"but received type: {type(data)}")
+
+        if expected_input_type == ImageInputType.IMAGE_FEATURES:
+            if pixel_values is not None:
+                raise ValueError(
+                    "Expected image features but got pixel values")
+            if image_features is None:
+                return None
+
+            if not isinstance(image_features, torch.Tensor):
+                raise ValueError("Incorrect type of image features")
 
             return LlavaImageFeatureInputs(
                 type="image_features",
-                data=self._validate_image_data(data),
+                data=self._validate_image_data(image_features),
             )
 
         return None
@@ -201,12 +217,14 @@ def _process_image_input(self,
 
         return self.multi_modal_projector(image_features)
 
-    def forward(self,
-                input_ids: torch.Tensor,
-                positions: torch.Tensor,
-                kv_caches: List[torch.Tensor],
-                attn_metadata: AttentionMetadata,
-                image_input: Optional[torch.Tensor] = None) -> SamplerOutput:
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs: object,
+    ) -> SamplerOutput:
         """Run forward pass for Llava 1.5.
 
         One key thing to understand is the `input_ids` already accounts for the
@@ -227,10 +245,10 @@ def forward(self,
         This way, the `positions` and `attn_metadata` are consistent
         with the `input_ids`.
 
-        The model takes two types of image inputs: 
+        The model takes two types of image inputs:
         PIXEL_VALUES and IMAGE_FEATURES.
         The following shows how each maps to huggingface implementation.
-        PIXEL_VALUES: 
+        PIXEL_VALUES:
         - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
         IMAGE_FEATURES:
         - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
@@ -239,14 +257,15 @@ def forward(self,
         Args:
             input_ids: Flattened (concatenated) input_ids corresponding to a
                 batch.
-            image_input: A batch of image inputs.
-                For PIXEL_VALUES, expecting [1, 3, 336, 336].
-                For IMAGE_FEATURES, expecting [1, 576, 1024].
+            pixel_values: For PIXEL_VALUES, expects a batch with shape
+                [1, 3, 336, 336].
+            image_features: For IMAGE_FEATURES, expects a batch with shape
+                [1, 576, 1024].
         """
-        parsed_image_input = self._parse_and_validate_image_input(image_input)
+        image_input = self._parse_and_validate_image_input(**kwargs)
 
-        if parsed_image_input is not None:
-            vision_embeddings = self._process_image_input(parsed_image_input)
+        if image_input is not None:
+            vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
             inputs_embeds = _merge_vision_embeddings(
diff --git a/vllm/multimodal/__init__.py b/vllm/multimodal/__init__.py
new file mode 100644
index 000000000000..270012e7d1c3
--- /dev/null
+++ b/vllm/multimodal/__init__.py
@@ -0,0 +1,7 @@
+from .base import MultiModalData, MultiModalPlugin
+from .registry import MULTIMODAL_REGISTRY, MultiModalRegistry
+
+__all__ = [
+    "MultiModalData", "MultiModalPlugin", "MULTIMODAL_REGISTRY",
+    "MultiModalRegistry"
+]
diff --git a/vllm/multimodal/base.py b/vllm/multimodal/base.py
new file mode 100644
index 000000000000..847752449ba8
--- /dev/null
+++ b/vllm/multimodal/base.py
@@ -0,0 +1,126 @@
+from abc import ABC, abstractmethod
+from typing import (TYPE_CHECKING, Callable, Dict, Generic, Optional, Type,
+                    TypeVar)
+
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+
+if TYPE_CHECKING:
+    import torch
+    from torch import nn
+
+logger = init_logger(__name__)
+
+
+class MultiModalData:
+    """
+    Base class that contains multi-modal data.
+
+    To add a new modality, add a new file under ``multimodal`` directory.
+
+    In this new file, subclass :class:`~MultiModalData` and
+    :class:`~MultiModalPlugin`.
+
+    Finally, register the new plugin to
+    :const:`vllm.multimodal.MULTIMODAL_REGISTRY`.
+    This enables models to call :meth:`MultiModalRegistry.register_input` for
+    the new modality.
+    """
+    pass
+
+
+D = TypeVar("D", bound=MultiModalData)
+N = TypeVar("N", bound=Type["nn.Module"])
+
+MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
+                                    Dict[str, "torch.Tensor"]]
+"""Return a dictionary to be passed as keyword arguments to
+:meth:`torch.nn.Module.forward`. This is similar in concept to tokenizers
+and processors in HuggingFace Transformers."""
+
+
+class MultiModalPlugin(ABC, Generic[D]):
+    """
+    Base class that defines data processing logic for a specific modality.
+
+    In particular, we adopt a registry pattern to dispatch data processing
+    according to the model being used (considering that different models may
+    process the same data differently). This registry is in turn used by
+    :class:`~MultiModalRegistry` which acts at a higher level
+    (i.e., the modality of the data).
+    """
+
+    @classmethod
+    def get_model_cls(cls, model_config: ModelConfig) -> Type["nn.Module"]:
+        # Avoid circular import
+        from vllm.model_executor.model_loader import get_model_architecture
+
+        return get_model_architecture(model_config)[0]
+
+    def __init__(self) -> None:
+        self._input_processors: Dict[Type["nn.Module"],
+                                     MultiModalInputProcessor[D]] = {}
+
+    @abstractmethod
+    def get_data_type(self) -> Type[D]:
+        """
+        Get the modality (subclass of :class:`~MultiModalData`) served by
+        this plugin.
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def _default_input_processor(
+            self, data: D, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+        """Return a dictionary to be passed as keyword arguments to
+        :meth:`torch.nn.Module.forward`. This is similar in concept to
+        tokenizers and processors in HuggingFace Transformers.
+        """
+        raise NotImplementedError
+
+    def register_input_processor(self,
+                                 processor: Optional[
+                                     MultiModalInputProcessor[D]] = None):
+        """
+        Register an input processor to a model class.
+        
+        When the model receives input data that matches the modality served by
+        this plugin (see :meth:`get_data_type`), the provided input processor is
+        applied to preprocess the data. If `None` is provided, then the default
+        input processor is applied instead.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._input_processors:
+                logger.warning(
+                    "Model class %s already has an input processor "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._input_processors[model_cls] = processor \
+                or self._default_input_processor
+
+            return model_cls
+
+        return wrapper
+
+    def process_input(
+            self, data: D, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, "torch.Tensor"]:
+        """
+        Apply an input processor to a :class:`~MultiModalData` instance passed
+        to the model.
+        
+        The model is identified by ``model_config``. ``vlm_config`` is
+        for compatibility purposes and may be merged into ``model_config``
+        in the near future.
+        """
+        model_cls = self.get_model_cls(model_config)
+
+        processor = self._input_processors.get(model_cls)
+        if processor is None:
+            raise KeyError(f"No input processor in {self} is registered for "
+                           f"model class {model_cls.__name__}.")
+
+        return processor(data, model_config, vlm_config)
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
new file mode 100644
index 000000000000..b964e9ee4262
--- /dev/null
+++ b/vllm/multimodal/image.py
@@ -0,0 +1,141 @@
+from typing import Dict, Tuple, Type, Union
+
+import torch
+from PIL import Image
+
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+from vllm.sequence import SequenceData
+from vllm.transformers_utils.image_processor import cached_get_image_processor
+
+from .base import MultiModalData, MultiModalPlugin
+
+logger = init_logger(__name__)
+
+
+def _get_dummy_seq_data(seq_len: int,
+                        vlm_config: VisionLanguageConfig) -> SequenceData:
+    # NOTE: We assume that <image> token is repeated `image_feature_size` times
+    # and then concatenated with the text prompt
+    # TODO: Enable other ways of inserting the image into the prompt
+
+    token_ids = [vlm_config.image_token_id] * vlm_config.image_feature_size
+    token_ids += [0] * (seq_len - vlm_config.image_feature_size)
+
+    return SequenceData(token_ids)
+
+
+def _get_dummy_values(vlm_config: VisionLanguageConfig) -> torch.Tensor:
+    if vlm_config.image_processor is None:
+        values_dtype = torch.float16
+    else:
+        values_dtype = torch.uint8
+
+    return torch.zeros(vlm_config.image_input_shape, dtype=values_dtype)
+
+
+def get_dummy_image_data(
+    seq_len: int,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Tuple[SequenceData, MultiModalData]:
+    """Standard dummy data factory for image data (to be used in
+    :meth:`vllm.multimodal.MultiModalRegistry.register_dummy_data`)."""
+    seq_data = _get_dummy_seq_data(seq_len, vlm_config)
+    values = _get_dummy_values(vlm_config)
+
+    config_input_type = vlm_config.image_input_type
+    ImageInputType = VisionLanguageConfig.ImageInputType
+
+    fake_mm_data: MultiModalData
+    if config_input_type == ImageInputType.PIXEL_VALUES:
+        fake_mm_data = ImagePixelData(values)
+    elif config_input_type == ImageInputType.IMAGE_FEATURES:
+        fake_mm_data = ImageFeatureData(values)
+    else:
+        raise NotImplementedError
+
+    return seq_data, fake_mm_data
+
+
+class ImagePixelData(MultiModalData):
+    """
+    The pixel data of an image. Can be one of:
+
+    - :class:`PIL.Image`: An image object. Requires that a HuggingFace
+      processor is available to the model.
+    - :class:`torch.Tensor`: The raw pixel data which is passed to the model
+      without additional pre-processing.
+    """
+
+    def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
+        if isinstance(image, Image.Image):
+            # So that this class can be created inside the Image context manager
+            image.load()
+
+        self.image = image
+
+
+class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
+
+    def get_data_type(self) -> Type[ImagePixelData]:
+        return ImagePixelData
+
+    def _get_hf_image_processor(self, model_config: ModelConfig,
+                                vlm_config: VisionLanguageConfig):
+        if vlm_config is None or vlm_config.image_processor is None:
+            return None
+
+        return cached_get_image_processor(
+            vlm_config.image_processor,
+            trust_remote_code=model_config.trust_remote_code,
+            revision=vlm_config.image_processor_revision,
+        )
+
+    def _default_input_processor(
+            self, data: ImagePixelData, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+        image = data.image
+        image_processor = self._get_hf_image_processor(model_config,
+                                                       vlm_config)
+        # PIL images use the HF processor; tensors are only cast to dtype.
+        if isinstance(image, Image.Image):
+            if image_processor is None:
+                raise RuntimeError("No HuggingFace processor is available "
+                                   "to process the image object")
+            try:
+                return image_processor.preprocess(image, return_tensors="pt") \
+                    .to(model_config.dtype).data
+            except Exception:
+                logger.error("Failed to process image (%s)", image)
+                raise
+        elif isinstance(image, torch.Tensor):
+            pixel_values = image.to(model_config.dtype)
+
+            return {"pixel_values": pixel_values}
+
+        raise TypeError(f"Invalid image type: {type(image)}")
+
+
+class ImageFeatureData(MultiModalData):
+    """
+    The feature vector of an image, passed directly to the model.
+
+    This should be the output of the vision tower.
+    """
+
+    def __init__(self, image_features: torch.Tensor) -> None:
+        self.image_features = image_features
+
+
+class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
+
+    def get_data_type(self) -> Type[ImageFeatureData]:
+        return ImageFeatureData
+
+    def _default_input_processor(
+            self, data: ImageFeatureData, model_config: ModelConfig,
+            vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
+        image_features = data.image_features.to(model_config.dtype)
+
+        return {"image_features": image_features}
diff --git a/vllm/multimodal/registry.py b/vllm/multimodal/registry.py
new file mode 100644
index 000000000000..4789ce5ce4cf
--- /dev/null
+++ b/vllm/multimodal/registry.py
@@ -0,0 +1,156 @@
+import functools
+from typing import (TYPE_CHECKING, Any, Callable, Dict, Optional, Sequence,
+                    Tuple, Type, TypeVar)
+
+from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+
+from .base import MultiModalData, MultiModalPlugin
+from .image import (ImageFeatureData, ImageFeaturePlugin, ImagePixelData,
+                    ImagePixelPlugin)
+
+if TYPE_CHECKING:
+    import torch
+    from torch import nn
+
+    from vllm.sequence import SequenceData
+
+logger = init_logger(__name__)
+
+D = TypeVar("D", bound=MultiModalData)
+N = TypeVar("N", bound=Type["nn.Module"])
+
+MultiModalInputProcessor = Callable[[D, ModelConfig, VisionLanguageConfig],
+                                    Dict[str, "torch.Tensor"]]
+MultiModalDummyFactory = Callable[[int, ModelConfig, VisionLanguageConfig],
+                                  Tuple["SequenceData", MultiModalData]]
+
+
+class MultiModalRegistry:
+    """
+    This registry is used by model runners to dispatch data processing
+    according to its modality and the target model.
+    """
+
+    DEFAULT_PLUGINS = (ImageFeaturePlugin(), ImagePixelPlugin())
+
+    def __init__(self,
+                 *,
+                 plugins: Sequence[MultiModalPlugin[Any]] = DEFAULT_PLUGINS
+                 ) -> None:
+        self._plugins_by_data_type = {p.get_data_type(): p for p in plugins}
+        self._dummy_factories_by_model_type: Dict[Type["nn.Module"],
+                                                  MultiModalDummyFactory] = {}
+
+    def register_plugin(self, plugin: MultiModalPlugin[Any]) -> None:
+        data_type = plugin.get_data_type()
+
+        if data_type in self._plugins_by_data_type:
+            logger.warning(
+                "A plugin is already registered for data type %s, "
+                "and will be overwritten by the new plugin %s.", data_type,
+                plugin)
+
+        self._plugins_by_data_type[data_type] = plugin
+
+    def _get_plugin_for_data_type(self, data_type: Type[MultiModalData]):
+        for typ in data_type.mro():
+            plugin = self._plugins_by_data_type.get(typ)
+            if plugin is not None:
+                return plugin
+
+        msg = f"Unknown multi-modal data type: {data_type}"
+        raise NotImplementedError(msg)
+
+    def register_dummy_data(self, factory: MultiModalDummyFactory):
+        """
+        Register a dummy data factory to a model class.
+
+        During memory profiling, the provided function is invoked to create
+        dummy data to be inputted into the model. The modality and shape of
+        the dummy data should be an upper bound of what the model would receive
+        at inference time.
+        """
+
+        def wrapper(model_cls: N) -> N:
+            if model_cls in self._dummy_factories_by_model_type:
+                logger.warning(
+                    "Model class %s already has dummy data "
+                    "registered to %s. It is overwritten by the new one.",
+                    model_cls, self)
+
+            self._dummy_factories_by_model_type[model_cls] = factory
+
+            return model_cls
+
+        return wrapper
+
+    def dummy_data_for_profiling(self, seq_len: int, model_config: ModelConfig,
+                                 vlm_config: VisionLanguageConfig):
+        """Create dummy data for memory profiling."""
+        model_cls = MultiModalPlugin.get_model_cls(model_config)
+        dummy_factory = self._dummy_factories_by_model_type.get(model_cls)
+        if dummy_factory is None:
+            msg = f"No dummy data defined for model class: {model_cls}"
+            raise NotImplementedError(msg)
+
+        return dummy_factory(seq_len, model_config, vlm_config)
+
+    def register_input(
+            self,
+            data_type: Type[D],
+            processor: Optional[MultiModalInputProcessor[D]] = None):
+        """
+        Register an input processor for a specific modality to a model class.
+
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        return self._get_plugin_for_data_type(data_type) \
+            .register_input_processor(processor)
+
+    def register_image_pixel_input(
+            self,
+            processor: Optional[
+                MultiModalInputProcessor[ImagePixelData]] = None):
+        """
+        Register an input processor for image pixel data to a model class.
+
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        return self.register_input(ImagePixelData, processor)
+
+    def register_image_feature_input(
+        self,
+        processor: Optional[
+            MultiModalInputProcessor[ImageFeatureData]] = None):
+        """
+        Register an input processor for image feature data to a model class.
+
+        See :meth:`MultiModalPlugin.register_input_processor` for more details.
+        """
+        return self.register_input(ImageFeatureData, processor)
+
+    def process_input(self, data: MultiModalData, model_config: ModelConfig,
+                      vlm_config: VisionLanguageConfig):
+        """
+        Apply an input processor to a :class:`~MultiModalData` instance passed
+        to the model.
+        
+        See :meth:`MultiModalPlugin.process_input` for more details.
+        """
+        return self._get_plugin_for_data_type(type(data)) \
+            .process_input(data, model_config, vlm_config)
+
+    def create_input_processor(self, model_config: ModelConfig,
+                               vlm_config: VisionLanguageConfig):
+        """
+        Create an input processor (see :meth:`process_input`) for a
+        specific model.
+        """
+        return functools.partial(self.process_input,
+                                 model_config=model_config,
+                                 vlm_config=vlm_config)
+
+
+MULTIMODAL_REGISTRY = MultiModalRegistry()
+"""The global :class:`~MultiModalRegistry` which is used by model runners."""
diff --git a/vllm/sequence.py b/vllm/sequence.py
index ac5c234d052b..2f27bf33b166 100644
--- a/vllm/sequence.py
+++ b/vllm/sequence.py
@@ -5,6 +5,8 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Dict, List, Optional, Tuple, Union
 
+import torch
+
 from vllm.block import LogicalTokenBlock
 from vllm.inputs import LLMInputs
 from vllm.lora.request import LoRARequest
@@ -12,8 +14,7 @@
 from vllm.sampling_params import SamplingParams
 
 if TYPE_CHECKING:
-    import torch
-
+    from vllm.multimodal import MultiModalData
     from vllm.spec_decode.metrics import SpecDecodeWorkerMetrics
 
 
@@ -398,25 +399,6 @@ class SequenceGroupState:
     generator: Optional = None  # type: ignore
 
 
-class MultiModalData:
-    """Multi modal request.
-
-    Args:
-        type: The data type.
-        data: The actual data.
-        The required shape and semantic meaning of it depends on the vision
-        language config of the hosted model.
-        See `VisionLanguageConfig` in `config.py`.
-    """
-
-    class Type(enum.Enum):
-        IMAGE = enum.auto()
-
-    def __init__(self, type: Type, data: "torch.Tensor"):
-        self.type = type
-        self.data = data
-
-
 class SequenceGroup:
     """A group of sequences that are generated from the same prompt.
 
@@ -473,7 +455,7 @@ def prompt_token_ids(self) -> List[int]:
         return next(iter(self.seqs_dict.values())).prompt_token_ids
 
     @property
-    def multi_modal_data(self) -> Optional[MultiModalData]:
+    def multi_modal_data(self) -> Optional["MultiModalData"]:
         # All sequences in the group should have the same multi-modal data.
         # We use the multi-modal data of an arbitrary sequence.
         return next(iter(self.seqs_dict.values())).multi_modal_data
@@ -655,7 +637,7 @@ def __init__(
         lora_request: Optional[LoRARequest] = None,
         computed_block_nums: Optional[List[int]] = None,
         state: Optional[SequenceGroupState] = None,
-        multi_modal_data: Optional[MultiModalData] = None,
+        multi_modal_data: Optional["MultiModalData"] = None,
         encoder_seq_data: Optional[SequenceData] = None,
         cross_block_table: Optional[List[int]] = None,
     ) -> None:
@@ -798,13 +780,13 @@ class SamplerOutput:
     outputs: List[CompletionSequenceGroupOutput]
 
     # On-device tensor containing probabilities of each token.
-    sampled_token_probs: Optional["torch.Tensor"] = None
+    sampled_token_probs: Optional[torch.Tensor] = None
 
     # On-device tensor containing the logprobs of each token.
     logprobs: Optional["torch.Tensor"] = None
 
     # On-device tensor containing the sampled token ids.
-    sampled_token_ids: Optional["torch.Tensor"] = None
+    sampled_token_ids: Optional[torch.Tensor] = None
 
     # Spec decode metrics populated by workers.
     spec_decode_worker_metrics: Optional["SpecDecodeWorkerMetrics"] = None
diff --git a/vllm/transformers_utils/image_processor.py b/vllm/transformers_utils/image_processor.py
new file mode 100644
index 000000000000..3239b1d0cfa2
--- /dev/null
+++ b/vllm/transformers_utils/image_processor.py
@@ -0,0 +1,45 @@
+from functools import lru_cache
+from typing import Optional
+
+from transformers import AutoImageProcessor
+from transformers.image_processing_utils import BaseImageProcessor
+
+from vllm.logger import init_logger
+
+logger = init_logger(__name__)
+
+
+def get_image_processor(
+    processor_name: str,
+    *args,
+    trust_remote_code: bool = False,
+    revision: Optional[str] = None,
+    **kwargs,
+) -> BaseImageProcessor:
+    """Gets an image processor for the given model name via HuggingFace.
+
+    Extra arguments are forwarded to `AutoImageProcessor.from_pretrained`.
+    If that call raises `ValueError` while `trust_remote_code` is False, a
+    `RuntimeError` suggesting that flag is raised instead, chained to it.
+    """
+    try:
+        return AutoImageProcessor.from_pretrained(
+            processor_name,
+            *args,
+            trust_remote_code=trust_remote_code,
+            revision=revision,
+            **kwargs)
+    except ValueError as e:
+        # AutoImageProcessor (unlike AutoTokenizer) does not single out missing
+        # processor classes, so hint at --trust-remote-code on any ValueError.
+        if trust_remote_code:
+            raise
+        raise RuntimeError(
+            "Failed to load the image processor. If the image processor is "
+            "a custom processor not yet available in the HuggingFace "
+            "transformers library, consider setting "
+            "`trust_remote_code=True` in LLM or using the "
+            "`--trust-remote-code` flag in the CLI.") from e
+
+
+cached_get_image_processor = lru_cache(get_image_processor)
diff --git a/vllm/worker/cpu_model_runner.py b/vllm/worker/cpu_model_runner.py
index bc88f2c5bed6..eaf43247d4fc 100644
--- a/vllm/worker/cpu_model_runner.py
+++ b/vllm/worker/cpu_model_runner.py
@@ -1,4 +1,5 @@
-from typing import List, Optional, Tuple
+from collections import defaultdict
+from typing import Dict, List, Optional, Tuple
 
 import torch
 from torch import nn
@@ -11,6 +12,7 @@
 from vllm.logger import init_logger
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sequence import SamplerOutput, SequenceGroupMetadata
 from vllm.utils import make_tensor_with_pad
 
@@ -63,6 +65,16 @@ def __init__(
             self.block_size,
         )
 
+        # Create processor for multi-modal data
+        if self.vision_language_config is not None:
+            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+                .create_input_processor(
+                    self.model_config,
+                    self.vision_language_config,
+                )
+        else:
+            self.multi_modal_input_processor = None
+
         # Lazy initialization.
         self.model: nn.Module  # Set after init_Model
 
@@ -80,14 +92,15 @@ def load_model(self) -> None:
     def _prepare_prompt(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
-    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int],
-               Optional[torch.Tensor]]:
+    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, List[int], Dict[
+            str, torch.Tensor]]:
         assert len(seq_group_metadata_list) > 0
         input_tokens: List[int] = []
         input_positions: List[int] = []
         slot_mapping: List[int] = []
         seq_lens: List[int] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_kwargs_list: Dict[str,
+                                      List[torch.Tensor]] = defaultdict(list)
 
         for seq_group_metadata in seq_group_metadata_list:
             assert seq_group_metadata.is_prompt
@@ -108,9 +121,17 @@ def _prepare_prompt(
             # is always the first token in the sequence.
             input_positions.extend(list(range(computed_len, seq_len)))
 
-            if seq_group_metadata.multi_modal_data:
-                multi_modal_input_list.append(
-                    seq_group_metadata.multi_modal_data.data)
+            mm_data = seq_group_metadata.multi_modal_data
+            if mm_data is not None:
+                # Process multi-modal data
+                if self.multi_modal_input_processor is None:
+                    raise ValueError(
+                        "Multi-modal inputs are only supported by "
+                        "vision language models.")
+
+                mm_kwargs = self.multi_modal_input_processor(mm_data)
+                for k, v in mm_kwargs.items():
+                    multi_modal_kwargs_list[k].append(v)
 
             # Compute the slot mapping.
             block_table = seq_group_metadata.block_tables[seq_id]
@@ -134,14 +155,10 @@ def _prepare_prompt(
                 slot = block_number * self.block_size + block_offset
                 slot_mapping.append(slot)
 
-        if multi_modal_input_list:
-            assert self.vision_language_config, (
-                "Multi-modal inputs are only supported by "
-                "vision language models.")
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
+        multi_modal_kwargs = {
+            k: torch.cat(v, dim=0).to(self.device)
+            for k, v in multi_modal_kwargs_list.items()
+        }
 
         num_prompt_tokens = len(input_tokens)
 
@@ -167,7 +184,7 @@ def _prepare_prompt(
             slot_mapping=slot_mapping,
         )
         return (input_tokens, input_positions, attn_metadata, seq_lens,
-                multi_modal_input)
+                multi_modal_kwargs)
 
     def _prepare_decode(
         self,
@@ -257,8 +274,8 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: List[SequenceGroupMetadata],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Optional[torch.Tensor]]:
-        multi_modal_input = None
+               Optional[Dict[str, torch.Tensor]]]:
+        multi_modal_kwargs = None
         if self.is_driver_worker:
             # NOTE: We assume that all sequences in the group are all prompts or
             # all decodes.
@@ -266,7 +283,7 @@ def prepare_input_tensors(
             # Prepare input tensors.
             if is_prompt:
                 (input_tokens, input_positions, attn_metadata, seq_lens,
-                 multi_modal_input
+                 multi_modal_kwargs
                  ) = self._prepare_prompt(seq_group_metadata_list)
             else:
                 (input_tokens, input_positions,
@@ -307,7 +324,7 @@ def prepare_input_tensors(
             )
 
         return (input_tokens, input_positions, attn_metadata,
-                sampling_metadata, multi_modal_input)
+                sampling_metadata, multi_modal_kwargs)
 
     @torch.inference_mode()
     def execute_model(
diff --git a/vllm/worker/embedding_model_runner.py b/vllm/worker/embedding_model_runner.py
index 0ba1200696ca..465130d10e2f 100644
--- a/vllm/worker/embedding_model_runner.py
+++ b/vllm/worker/embedding_model_runner.py
@@ -90,7 +90,7 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, PoolingMetadata,
-               Set[LoRARequest], LoRAMapping, torch.Tensor]:
+               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             # Prepare input tensors.
@@ -102,7 +102,7 @@ def prepare_input_tensors(
                 _,
                 lora_mapping,
                 lora_requests,
-                multi_modal_input,
+                multi_modal_kwargs,
                 slot_mapping,
                 num_prefill_tokens,
                 num_decode_tokens,
@@ -117,7 +117,7 @@ def prepare_input_tensors(
                 "input_positions": input_positions,
                 "lora_requests": lora_requests,
                 "lora_mapping": lora_mapping,
-                "multi_modal_input": multi_modal_input,
+                "multi_modal_kwargs": multi_modal_kwargs,
                 "num_prefill_tokens": num_prefill_tokens,
                 "num_decode_tokens": num_decode_tokens,
                 "slot_mapping": slot_mapping,
@@ -132,7 +132,7 @@ def prepare_input_tensors(
             input_positions = metadata_dict.pop("input_positions")
             lora_mapping = metadata_dict.pop("lora_mapping")
             lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_input = metadata_dict.pop("multi_modal_input")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
             if metadata_dict:
                 attn_metadata = self.attn_backend.make_metadata(
                     **metadata_dict)
@@ -143,7 +143,7 @@ def prepare_input_tensors(
                                                prompt_lens=None)
 
         return (input_tokens, input_positions, attn_metadata, pooling_metadata,
-                lora_requests, lora_mapping, multi_modal_input)
+                lora_requests, lora_mapping, multi_modal_kwargs)
 
     def _prepare_pooling(
         self,
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 47aa70dc617a..63ec22d79694 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,5 +1,6 @@
 import time
 import warnings
+from collections import defaultdict
 from typing import Dict, List, NamedTuple, Optional, Set, Tuple, Union
 
 import numpy as np
@@ -18,9 +19,9 @@
 from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.model_loader import get_model
+from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.sampling_params import SamplingParams
-from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData,
-                           SequenceGroupMetadata)
+from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
 from vllm.utils import (CudaMemoryProfiler, get_kv_cache_torch_dtype, is_hip,
                         is_pin_memory_available, make_tensor_with_pad)
 
@@ -44,7 +45,7 @@ class ModelInput(NamedTuple):
     query_lens: List[int]
     lora_mapping: Optional[LoRAMapping]
     lora_requests: Set[LoRARequest]
-    multi_modal_input: Optional[torch.Tensor]
+    multi_modal_kwargs: Dict[str, torch.Tensor]
     slot_mapping: torch.Tensor
     num_prefill_tokens: int
     num_decode_tokens: int
@@ -60,7 +61,7 @@ def empty(cls, device):
             query_lens=[],
             lora_mapping=None,
             lora_requests=set(),
-            multi_modal_input=None,
+            multi_modal_kwargs={},
             slot_mapping=torch.empty(0, device=device),
             num_prefill_tokens=0,
             num_decode_tokens=0,
@@ -122,6 +123,16 @@ def __init__(
             self.block_size,
         )
 
+        # Create processor for multi-modal data
+        if self.vision_language_config is not None:
+            self.multi_modal_input_processor = MULTIMODAL_REGISTRY \
+                .create_input_processor(
+                    self.model_config,
+                    self.vision_language_config,
+                )
+        else:
+            self.multi_modal_input_processor = None
+
         # Lazy initialization
         self.model: nn.Module  # Set after load_model
         # Set if the backend is flashinfer.
@@ -242,7 +253,8 @@ def _prepare_model_input(
         context_lens: List[int] = []
         query_lens: List[int] = []
         block_tables: List[List[int]] = []
-        multi_modal_input_list: List[torch.Tensor] = []
+        multi_modal_kwargs_list: Dict[str,
+                                      List[torch.Tensor]] = defaultdict(list)
         decode_only = True
         num_prefills = 0
         num_prefill_tokens = 0
@@ -417,9 +429,17 @@ def _prepare_model_input(
                      and seq_group_metadata.sampling_params.prompt_logprobs
                      else 1))
 
-                if seq_group_metadata.multi_modal_data:
-                    multi_modal_input_list.append(
-                        seq_group_metadata.multi_modal_data.data)
+                mm_data = seq_group_metadata.multi_modal_data
+                if mm_data is not None:
+                    # Process multi-modal data
+                    if self.multi_modal_input_processor is None:
+                        raise ValueError(
+                            "Multi-modal inputs are only supported by "
+                            "vision language models.")
+
+                    mm_kwargs = self.multi_modal_input_processor(mm_data)
+                    for k, v in mm_kwargs.items():
+                        multi_modal_kwargs_list[k].append(v)
 
                 if _is_block_tables_empty(seq_group_metadata.block_tables):
                     # During memory profiling, the block tables are not
@@ -508,16 +528,6 @@ def _prepare_model_input(
         context_lens_tensor = torch.tensor(context_lens,
                                            dtype=torch.int,
                                            device=self.device)
-
-        if multi_modal_input_list:
-            assert self.vision_language_config, (
-                "Multi-modal inputs are only supported by "
-                "vision language models.")
-            multi_modal_input = torch.cat(multi_modal_input_list,
-                                          dim=0).to(self.device)
-        else:
-            multi_modal_input = None
-
         query_lens_tensor = torch.tensor(query_lens,
                                          dtype=torch.long,
                                          device=self.device)
@@ -614,6 +624,11 @@ def _prepare_model_input(
         else:
             lora_mapping = None
 
+        multi_modal_kwargs = {
+            k: torch.cat(v, dim=0).to(self.device)
+            for k, v in multi_modal_kwargs_list.items()
+        }
+
         return ModelInput(
             input_tokens=input_tokens_tensor,
             input_positions=input_positions_tensor,
@@ -622,7 +637,7 @@ def _prepare_model_input(
             query_lens=query_lens,
             lora_mapping=lora_mapping,
             lora_requests=lora_requests,
-            multi_modal_input=multi_modal_input,
+            multi_modal_kwargs=multi_modal_kwargs,
             slot_mapping=slot_mapping_tensor,
             num_prefill_tokens=num_prefill_tokens,
             num_decode_tokens=num_decode_tokens,
@@ -633,7 +648,7 @@ def prepare_input_tensors(
         self,
         seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
     ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata,
-               Set[LoRARequest], LoRAMapping, torch.Tensor]:
+               Set[LoRARequest], LoRAMapping, Dict[str, torch.Tensor]]:
         if self.is_driver_worker:
             assert seq_group_metadata_list is not None
             # Prepare input tensors.
@@ -645,7 +660,7 @@ def prepare_input_tensors(
                 query_lens,
                 lora_mapping,
                 lora_requests,
-                multi_modal_input,
+                multi_modal_kwargs,
                 slot_mapping,
                 num_prefill_tokens,
                 num_decode_tokens,
@@ -662,7 +677,7 @@ def prepare_input_tensors(
                 sampling_metadata.selected_token_indices,
                 "lora_requests": lora_requests,
                 "lora_mapping": lora_mapping,
-                "multi_modal_input": multi_modal_input,
+                "multi_modal_kwargs": multi_modal_kwargs,
                 "num_prefill_tokens": num_prefill_tokens,
                 "num_decode_tokens": num_decode_tokens,
                 "slot_mapping": slot_mapping,
@@ -679,7 +694,7 @@ def prepare_input_tensors(
                 "selected_token_indices")
             lora_mapping = metadata_dict.pop("lora_mapping")
             lora_requests = metadata_dict.pop("lora_requests")
-            multi_modal_input = metadata_dict.pop("multi_modal_input")
+            multi_modal_kwargs = metadata_dict.pop("multi_modal_kwargs")
             if metadata_dict:
                 attn_metadata = self.attn_backend.make_metadata(
                     **metadata_dict)
@@ -694,7 +709,7 @@ def prepare_input_tensors(
 
         return (input_tokens, input_positions, attn_metadata,
                 sampling_metadata, lora_requests, lora_mapping,
-                multi_modal_input)
+                multi_modal_kwargs)
 
     @torch.inference_mode()
     def execute_model(
@@ -703,7 +718,7 @@ def execute_model(
         kv_caches: List[torch.Tensor],
     ) -> Optional[SamplerOutput]:
         (input_tokens, input_positions, attn_metadata, sampling_metadata,
-         lora_requests, lora_mapping, multi_modal_input
+         lora_requests, lora_mapping, multi_modal_kwargs
          ) = self.prepare_input_tensors(seq_group_metadata_list)
 
         if self.lora_config:
@@ -717,15 +732,14 @@ def execute_model(
             model_executable = self.graph_runners[graph_batch_size]
         else:
             model_executable = self.model
-        execute_model_kwargs = {
-            "input_ids": input_tokens,
-            "positions": input_positions,
-            "kv_caches": kv_caches,
-            "attn_metadata": attn_metadata,
-        }
-        if self.vision_language_config:
-            execute_model_kwargs.update({"image_input": multi_modal_input})
-        hidden_states = model_executable(**execute_model_kwargs)
+
+        hidden_states = model_executable(
+            input_ids=input_tokens,
+            positions=input_positions,
+            kv_caches=kv_caches,
+            attn_metadata=attn_metadata,
+            **multi_modal_kwargs,
+        )
 
         # Compute the logits.
         logits = self.model.compute_logits(hidden_states, sampling_metadata)
@@ -781,16 +795,24 @@ def profile_run(self) -> None:
         # To exercise the worst scenario for GPU memory consumption,
         # the number of seqs (batch_size) is chosen to maximize the number
         # of images processed.
-        if self.vision_language_config:
+        model_config = self.model_config
+        vlm_config = self.vision_language_config
+
+        if vlm_config:
             max_num_seqs = min(
                 max_num_seqs,
-                int(max_num_batched_tokens /
-                    self.vision_language_config.image_feature_size))
+                int(max_num_batched_tokens / vlm_config.image_feature_size))
         for group_id in range(max_num_seqs):
             seq_len = (max_num_batched_tokens // max_num_seqs +
                        (group_id < max_num_batched_tokens % max_num_seqs))
-            seq_data, fake_multi_modal_input = _prepare_fake_inputs(
-                seq_len, self.vision_language_config)
+
+            if vlm_config is None:
+                seq_data = SequenceData([0] * seq_len)
+                dummy_multi_modal_data = None
+            else:
+                seq_data, dummy_multi_modal_data = MULTIMODAL_REGISTRY \
+                    .dummy_data_for_profiling(seq_len, model_config, vlm_config)
+
             seq = SequenceGroupMetadata(
                 request_id=str(group_id),
                 is_prompt=True,
@@ -799,7 +821,7 @@ def profile_run(self) -> None:
                 block_tables=None,
                 lora_request=dummy_lora_requests_per_seq[group_id]
                 if dummy_lora_requests_per_seq else None,
-                multi_modal_data=fake_multi_modal_input,
+                multi_modal_data=dummy_multi_modal_data,
             )
             seqs.append(seq)
 
@@ -1034,24 +1056,6 @@ def _get_graph_batch_size(batch_size: int) -> int:
                 _BATCH_SIZE_ALIGNMENT * _BATCH_SIZE_ALIGNMENT)
 
 
-def _prepare_fake_inputs(
-        seq_len: int, vision_language_config: Optional[VisionLanguageConfig]):
-    """Prepare fake inputs for profile run."""
-    if vision_language_config:
-        prompt_tokens = [
-            vision_language_config.image_token_id
-        ] * vision_language_config.image_feature_size + [0] * (
-            seq_len - vision_language_config.image_feature_size)
-        fake_image_input = MultiModalData(
-            type=MultiModalData.Type.IMAGE,
-            data=torch.zeros(vision_language_config.image_input_shape,
-                             dtype=torch.float16))
-    else:
-        prompt_tokens = [0] * seq_len
-        fake_image_input = None
-    return SequenceData(prompt_tokens), fake_image_input
-
-
 def _is_block_tables_empty(block_tables: Union[None, Dict]):
     """
     Check if block_tables is None or a dictionary with all None values.

From 314398cc8019934dee4992d26a929f6f94277c3b Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Mon, 3 Jun 2024 09:39:31 -0700
Subject: [PATCH 15/93] [Core] Remove unnecessary copies in flash attn backend
 (#5138)

---
 requirements-cuda.txt                 |  2 +-
 vllm/attention/backends/flash_attn.py | 13 +++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/requirements-cuda.txt b/requirements-cuda.txt
index 5109f1735617..353617983596 100644
--- a/requirements-cuda.txt
+++ b/requirements-cuda.txt
@@ -6,4 +6,4 @@ ray >= 2.9
 nvidia-ml-py # for pynvml package
 torch == 2.3.0
 xformers == 0.0.26.post1  # Requires PyTorch 2.3.0
-vllm-flash-attn == 2.5.8.post2  # Requires PyTorch 2.3.0
+vllm-flash-attn == 2.5.9  # Requires PyTorch 2.3.0
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 0b9d6283493f..070c074e511b 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -317,7 +317,7 @@ def forward(
                 # normal attention
                 # When block_tables are not filled, it means q and k are the
                 # prompt, and they have the same length.
-                out = flash_attn_varlen_func(
+                flash_attn_varlen_func(
                     q=query,
                     k=key,
                     v=value,
@@ -329,14 +329,13 @@ def forward(
                     causal=True,
                     window_size=self.sliding_window,
                     alibi_slopes=self.alibi_slopes,
+                    out=output[:num_prefill_tokens],
                 )
-                assert output[:num_prefill_tokens].shape == out.shape
-                output[:num_prefill_tokens] = out
             else:
                 # prefix-enabled attention
                 assert prefill_meta.seq_lens is not None
                 max_seq_len = max(prefill_meta.seq_lens)
-                output[:num_prefill_tokens] = flash_attn_varlen_func(
+                flash_attn_varlen_func(
                     q=query,
                     k=key_cache,
                     v=value_cache,
@@ -348,11 +347,12 @@ def forward(
                     causal=True,
                     alibi_slopes=self.alibi_slopes,
                     block_table=prefill_meta.block_tables,
+                    out=output[:num_prefill_tokens],
                 )
 
         if decode_meta := attn_metadata.decode_metadata:
             # Decoding run.
-            output[num_prefill_tokens:] = flash_attn_with_kvcache(
+            flash_attn_with_kvcache(
                 decode_query.unsqueeze(1),
                 key_cache,
                 value_cache,
@@ -361,7 +361,8 @@ def forward(
                 softmax_scale=self.scale,
                 causal=True,
                 alibi_slopes=self.alibi_slopes,
-            ).squeeze(1)
+                out=output[num_prefill_tokens:].unsqueeze(1),
+            )
 
         # Reshape the output tensor.
         return output.view(num_tokens, hidden_size)

From 1ebb77243e02364ac77553739daf5be607dc1e90 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Mon, 3 Jun 2024 12:52:30 -0400
Subject: [PATCH 16/93] [Kernel] Pass a device pointer into the quantize kernel
 for the scales (#5159)

---
 csrc/ops.h                                        |  4 ++--
 .../compressed_tensors/int8_quant_kernels.cu      | 15 +++++++++------
 tests/kernels/test_int8_quant.py                  |  4 +++-
 vllm/_custom_ops.py                               |  2 +-
 .../compressed_tensors_w8a8_statictensor.py       |  2 +-
 5 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/csrc/ops.h b/csrc/ops.h
index 567d9fae4bd2..4952e826ec8a 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -94,8 +94,8 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
 
 #endif
 
-void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor& input,
-                              float scale);
+void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
+                              torch::Tensor const& scale);
 
 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                      torch::Tensor lookup_table);
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 4902e4c23434..11baa5d414c1 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -28,9 +28,10 @@ namespace vllm {
 template <typename scalar_t, typename scale_type>
 __global__ void static_scaled_int8_quant_kernel(
     const scalar_t* __restrict__ input, int8_t* __restrict__ out,
-    scale_type scale, const int hidden_size) {
+    const scale_type* scale_ptr, const int hidden_size) {
   const int tid = threadIdx.x;
   const int token_idx = blockIdx.x;
+  scale_type scale = *scale_ptr;
 
   for (int i = tid; i < hidden_size; i += blockDim.x) {
     out[token_idx * hidden_size + i] =
@@ -39,11 +40,13 @@ __global__ void static_scaled_int8_quant_kernel(
 }
 }  // namespace vllm
 
-void static_scaled_int8_quant(torch::Tensor& out,    // [..., hidden_size]
-                              torch::Tensor& input,  // [..., hidden_size]
-                              float scale) {
+void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
+                              torch::Tensor const& input,  // [..., hidden_size]
+                              torch::Tensor const& scale) {
   TORCH_CHECK(input.is_contiguous());
   TORCH_CHECK(out.is_contiguous());
+  TORCH_CHECK(scale.numel() == 1);
+
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
   dim3 grid(num_tokens);
@@ -53,7 +56,7 @@ void static_scaled_int8_quant(torch::Tensor& out,    // [..., hidden_size]
       input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
         vllm::static_scaled_int8_quant_kernel<scalar_t, float>
             <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
-                                         out.data_ptr<int8_t>(), scale,
-                                         hidden_size);
+                                         out.data_ptr<int8_t>(),
+                                         scale.data_ptr<float>(), hidden_size);
       });
 }
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index b9aa00ce13f5..29890118c93d 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -26,6 +26,8 @@ def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
         torch.iinfo(torch.int8).min,
         torch.iinfo(torch.int8).max).to(torch.int8)
     out2 = torch.empty_like(x, dtype=torch.int8)
-    ops.static_scaled_int8_quant(out2, x, scale)
+    scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
+
+    ops.static_scaled_int8_quant(out2, x, scale_argument)
     assert torch.allclose(out1, out2,
                           atol=1)  # big atol to account for rounding errors
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 22cf5a44e341..8a6f6d96d81f 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -265,7 +265,7 @@ def scaled_fp8_quant(
 
 # int8
 def static_scaled_int8_quant(input: torch.Tensor,
-                             scale: float) -> torch.Tensor:
+                             scale: torch.Tensor) -> torch.Tensor:
     """
     Quantize the input tensor to int8 and return the quantized tensor.
 
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 7e3e932cfe14..2dfc6e2b0778 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -97,7 +97,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         act_scale = layer.input_scale
 
         # Input quantize
-        x_q = custom_ops.static_scaled_int8_quant(x, act_scale[0].item())
+        x_q = custom_ops.static_scaled_int8_quant(x, act_scale)
 
         return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
                                                weight_scale, x.dtype)

From 48e8e3fd08be0aae4b2cee83c5c889656b51f610 Mon Sep 17 00:00:00 2001
From: Yuan <yuan.zhou@intel.com>
Date: Tue, 4 Jun 2024 01:39:50 +0800
Subject: [PATCH 17/93] [CI/BUILD] enable intel queue for longer CPU tests
 (#4113)

---
 .buildkite/run-cpu-test.sh          |  14 +++-
 .buildkite/test-template.j2         |   2 +
 Dockerfile.cpu                      |   6 +-
 csrc/cpu/pos_encoding.cpp           | 101 ++++++++++++++--------------
 tests/conftest.py                   |  37 ++++++----
 tests/models/test_aqlm.py           |  11 +--
 tests/models/test_big_models.py     |  10 ++-
 tests/models/test_fp8.py            |  11 +--
 tests/models/test_gptq_marlin.py    |  11 +--
 tests/models/test_gptq_marlin_24.py |  11 +--
 tests/models/test_marlin.py         |  13 ++--
 11 files changed, 138 insertions(+), 89 deletions(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index 414045fe163e..d1200ee84dfe 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -10,5 +10,15 @@ remove_docker_container() { docker rm -f cpu-test || true; }
 trap remove_docker_container EXIT
 remove_docker_container
 
-# Run the image and launch offline inference
-docker run --network host --env VLLM_CPU_KVCACHE_SPACE=1 --name cpu-test cpu-test python3 vllm/examples/offline_inference.py
+# Run the image
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+
+# offline inference
+docker exec cpu-test bash -c "python3 examples/offline_inference.py"
+
+# Run basic model test
+docker exec cpu-test bash -c "cd tests;
+  pip install pytest Pillow protobuf
+  bash ../.buildkite/download-images.sh
+  cd ../
+  pytest -v -s tests/models --ignore=tests/models/test_llava.py --ignore=tests/models/test_embedding.py --ignore=tests/models/test_registry.py"
diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 265833e2ccf6..7e986c988407 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -40,6 +40,8 @@ steps:
 
   - label: "Intel Test"
     depends_on: ~
+    agents:
+      queue: intel
     command: bash .buildkite/run-cpu-test.sh
 
   {% for step in steps %}
diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index aec79824213f..ae23e27b413b 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -1,6 +1,6 @@
 # This vLLM Dockerfile is used to construct image that can build and run vLLM on x86 CPU platform.
 
-FROM ubuntu:22.04
+FROM ubuntu:22.04 AS cpu-test-1
 
 RUN apt-get update  -y \
     && apt-get install -y git wget vim numactl gcc-12 g++-12 python3 python3-pip \
@@ -9,6 +9,8 @@ RUN apt-get update  -y \
 RUN pip install --upgrade pip \
     && pip install wheel packaging ninja setuptools>=49.4.0 numpy
 
+FROM cpu-test-1 AS build
+
 COPY ./ /workspace/vllm
 
 WORKDIR /workspace/vllm
@@ -19,4 +21,6 @@ RUN VLLM_TARGET_DEVICE=cpu python3 setup.py install
 
 WORKDIR /workspace/
 
+RUN ln -s /workspace/vllm/tests  && ln -s /workspace/vllm/examples && ln -s /workspace/vllm/benchmarks
+
 CMD ["/bin/bash"]
diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp
index 73bf77e46f53..e8aead17ae5a 100644
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -21,73 +21,74 @@ void rotary_embedding_impl(
   constexpr int VEC_ELEM_NUM = scalar_vec_t::get_elem_num();
 
   const int embed_dim = rot_dim / 2;
-  TORCH_CHECK(embed_dim % VEC_ELEM_NUM == 0);
+  bool flag = (embed_dim % VEC_ELEM_NUM == 0);
+  const int loop_upper = flag ? embed_dim : embed_dim - VEC_ELEM_NUM;
 
-#pragma omp parallel for
-  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
-    int64_t pos = positions[token_idx];
-    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
+  auto compute_loop = [&](const int64_t token_head, const scalar_t* cache_ptr,
+                          scalar_t* qk) {
+    int j = 0;
+    for (; j < loop_upper; j += VEC_ELEM_NUM) {
+      const int rot_offset = j;
+      const int x_index = rot_offset;
+      const int y_index = embed_dim + rot_offset;
 
-    for (int i = 0; i < num_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head =
-          token_idx * query_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+      const int64_t out_x = token_head + x_index;
+      const int64_t out_y = token_head + y_index;
 
-        const int64_t out_x = token_head + x_index;
-        const int64_t out_y = token_head + y_index;
+      const scalar_vec_t cos(cache_ptr + x_index);
+      const scalar_vec_t sin(cache_ptr + y_index);
 
-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+      const scalar_vec_t q_x(qk + out_x);
+      const scalar_vec_t q_y(qk + out_y);
 
-        const scalar_vec_t q_x(query + out_x);
-        const scalar_vec_t q_y(query + out_y);
+      vec_op::FP32Vec8 fp32_cos(cos);
+      vec_op::FP32Vec8 fp32_sin(sin);
 
-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+      vec_op::FP32Vec8 fp32_q_x(q_x);
+      vec_op::FP32Vec8 fp32_q_y(q_y);
 
-        vec_op::FP32Vec8 fp32_q_x(q_x);
-        vec_op::FP32Vec8 fp32_q_y(q_y);
+      auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+      scalar_vec_t(out1).save(qk + out_x);
 
-        auto out1 = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
-        scalar_vec_t(out1).save(query + out_x);
-
-        auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
-        scalar_vec_t(out2).save(query + out_y);
-      }
+      auto out2 = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      scalar_vec_t(out2).save(qk + out_y);
     }
-
-    for (int i = 0; i < num_kv_heads; ++i) {
-      const int head_idx = i;
-      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
-      for (int j = 0; j < embed_dim; j += VEC_ELEM_NUM) {
-        const int rot_offset = j;
-        const int x_index = rot_offset;
-        const int y_index = embed_dim + rot_offset;
+    if (!flag) {
+      for (; j < embed_dim; ++j) {
+        const int x_index = j;
+        const int y_index = embed_dim + j;
 
         const int64_t out_x = token_head + x_index;
         const int64_t out_y = token_head + y_index;
 
-        const scalar_vec_t cos(cache_ptr + x_index);
-        const scalar_vec_t sin(cache_ptr + y_index);
+        const float fp32_cos = cache_ptr[x_index];
+        const float fp32_sin = cache_ptr[y_index];
 
-        const scalar_vec_t k_x(key + out_x);
-        const scalar_vec_t k_y(key + out_y);
+        const float fp32_q_x = qk[out_x];
+        const float fp32_q_y = qk[out_y];
 
-        vec_op::FP32Vec8 fp32_cos(cos);
-        vec_op::FP32Vec8 fp32_sin(sin);
+        qk[out_x] = fp32_q_x * fp32_cos - fp32_q_y * fp32_sin;
+        qk[out_y] = fp32_q_y * fp32_cos + fp32_q_x * fp32_sin;
+      }
+    }
+  };
 
-        vec_op::FP32Vec8 fp32_k_x(k_x);
-        vec_op::FP32Vec8 fp32_k_y(k_y);
+#pragma omp parallel for
+  for (int token_idx = 0; token_idx < num_tokens; ++token_idx) {
+    int64_t pos = positions[token_idx];
+    const scalar_t* cache_ptr = cos_sin_cache + pos * rot_dim;
 
-        auto out1 = fp32_k_x * fp32_cos - fp32_k_y * fp32_sin;
-        scalar_vec_t(out1).save(key + out_x);
-        auto out2 = fp32_k_y * fp32_cos + fp32_k_x * fp32_sin;
-        scalar_vec_t(out2).save(key + out_y);
-      }
+    for (int i = 0; i < num_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head =
+          token_idx * query_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, query);
+    }
+
+    for (int i = 0; i < num_kv_heads; ++i) {
+      const int head_idx = i;
+      const int64_t token_head = token_idx * key_stride + head_idx * head_size;
+      compute_loop(token_head, cache_ptr, key);
     }
   }
 }
diff --git a/tests/conftest.py b/tests/conftest.py
index 796f498bb28a..8fcd91305e3a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -20,6 +20,7 @@
 from vllm.multimodal import MultiModalData
 from vllm.multimodal.image import ImageFeatureData, ImagePixelData
 from vllm.sequence import SampleLogprobs
+from vllm.utils import is_cpu
 
 logger = init_logger(__name__)
 
@@ -60,7 +61,8 @@ def cleanup():
     with contextlib.suppress(AssertionError):
         torch.distributed.destroy_process_group()
     gc.collect()
-    torch.cuda.empty_cache()
+    if not is_cpu():
+        torch.cuda.empty_cache()
 
 
 @pytest.fixture()
@@ -153,6 +155,12 @@ def example_long_prompts() -> List[str]:
 
 class HfRunner:
 
+    def wrap_device(self, input: any):
+        if not is_cpu():
+            return input.to("cuda")
+        else:
+            return input.to("cpu")
+
     def __init__(
         self,
         model_name: str,
@@ -167,17 +175,18 @@ def __init__(
         if model_name in _EMBEDDING_MODELS:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
-            self.model = SentenceTransformer(
-                model_name,
-                device="cpu",
-            ).to(dtype=torch_dtype).cuda()
+            self.model = self.wrap_device(
+                SentenceTransformer(
+                    model_name,
+                    device="cpu",
+                ).to(dtype=torch_dtype))
         else:
-            self.model = AutoModelForCausalLM.from_pretrained(
-                model_name,
-                torch_dtype=torch_dtype,
-                trust_remote_code=True,
-                token=access_token,
-            ).cuda()
+            self.model = self.wrap_device(
+                AutoModelForCausalLM.from_pretrained(
+                    model_name,
+                    torch_dtype=torch_dtype,
+                    trust_remote_code=True,
+                ))
 
         self.tokenizer = AutoTokenizer.from_pretrained(
             model_name,
@@ -218,7 +227,7 @@ def generate(
             inputs = self.processor(**processor_kwargs)
 
             output_ids = self.model.generate(
-                **inputs.to("cuda"),
+                **self.wrap_device(inputs),
                 use_cache=True,
                 **kwargs,
             )
@@ -275,7 +284,7 @@ def generate_greedy_logprobs(
         for prompt in prompts:
             input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
             output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
@@ -310,7 +319,7 @@ def generate_greedy_logprobs_limit(
         for prompt in prompts:
             input_ids = self.tokenizer(prompt, return_tensors="pt").input_ids
             output = self.model.generate(
-                input_ids.cuda(),
+                self.wrap_device(input_ids),
                 use_cache=True,
                 do_sample=False,
                 max_new_tokens=max_tokens,
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index a7abc011f57d..85d74f7f5b03 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -8,10 +8,13 @@
 
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-aqlm_not_supported = (capability <
-                      QUANTIZATION_METHODS["aqlm"].get_min_capability())
+aqlm_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    aqlm_not_supported = (capability <
+                          QUANTIZATION_METHODS["aqlm"].get_min_capability())
 
 # In this test we hardcode prompts and generations for the model so we don't
 # need to require the AQLM package as a dependency
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index 8116b796287a..fd1253f73c93 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -8,6 +8,7 @@
 import sys
 
 import pytest
+import torch
 
 MODELS = [
     "meta-llama/Llama-2-7b-hf",
@@ -36,9 +37,14 @@
     "mosaicml/mpt-7b",
 ]
 
+# TODO: remove this once CPU float16 support is ready
+target_dtype = "float"
+if torch.cuda.is_available():
+    target_dtype = "half"
+
 
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 @pytest.mark.parametrize("max_tokens", [32])
 def test_models(
     hf_runner,
@@ -78,7 +84,7 @@ def test_models(
 
 @pytest.mark.skip("Slow and not useful (just prints model).")
 @pytest.mark.parametrize("model", MODELS)
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", [target_dtype])
 def test_model_print(
     vllm_runner,
     model: str,
diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py
index 0a5819ea3f05..61aee0d0a6e9 100644
--- a/tests/models/test_fp8.py
+++ b/tests/models/test_fp8.py
@@ -67,10 +67,13 @@
     },
 }
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-fp8_not_supported = (capability <
-                     QUANTIZATION_METHODS["fp8"].get_min_capability())
+fp8_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    fp8_not_supported = (capability <
+                         QUANTIZATION_METHODS["fp8"].get_min_capability())
 
 
 @pytest.mark.skipif(fp8_not_supported,
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index 561d4a175658..da549cae0054 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -21,10 +21,13 @@
 
 MAX_MODEL_LEN = 1024
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-gptq_marlin_not_supported = (
-    capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
+gptq_marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    gptq_marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["gptq_marlin"].get_min_capability())
 
 MODELS = [
     # act_order==False, group_size=channelwise
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index 3e6ffb7f90fc..cc35ee803ff0 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -14,10 +14,13 @@
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index d3770fa69f6f..585c5ad686d1 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -23,10 +23,15 @@
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-capability = torch.cuda.get_device_capability()
-capability = capability[0] * 10 + capability[1]
-marlin_not_supported = (capability <
-                        QUANTIZATION_METHODS["marlin"].get_min_capability())
+from .utils import check_logprobs_close
+
+marlin_not_supported = True
+
+if torch.cuda.is_available():
+    capability = torch.cuda.get_device_capability()
+    capability = capability[0] * 10 + capability[1]
+    marlin_not_supported = (
+        capability < QUANTIZATION_METHODS["marlin"].get_min_capability())
 
 
 @dataclass

From a6f07255d3bbf450bf5adbd68a225ded3821bb80 Mon Sep 17 00:00:00 2001
From: Kaiyang Chen <48289729+Kaiyang-Chen@users.noreply.github.com>
Date: Tue, 4 Jun 2024 04:37:11 +0800
Subject: [PATCH 18/93] [Misc]: Implement CPU/GPU swapping in BlockManagerV2
 (#3834)

---
 format.sh                                  |   2 +-
 tests/core/block/e2e/test_correctness.py   |  49 ++++++--
 tests/core/block/test_block_manager_v2.py  |  58 ++++++++-
 vllm/config.py                             |  28 +++--
 vllm/core/block/block_table.py             |   4 +
 vllm/core/block/common.py                  |   1 -
 vllm/core/block/cpu_gpu_block_allocator.py |  82 +++++++++++-
 vllm/core/block/interfaces.py              |  36 +++++-
 vllm/core/block/naive_block.py             |  66 +++++++++-
 vllm/core/block/prefix_caching_block.py    |  78 ++++++++++++
 vllm/core/block_manager_v1.py              |   6 +-
 vllm/core/block_manager_v2.py              | 140 +++++++++++++++++++--
 vllm/core/embedding_model_block_manager.py |   3 +-
 vllm/core/interfaces.py                    |   3 +-
 vllm/core/scheduler.py                     |  13 +-
 vllm/engine/arg_utils.py                   |   9 ++
 16 files changed, 529 insertions(+), 49 deletions(-)

diff --git a/format.sh b/format.sh
index 18984935013b..8cdb46a41fc6 100755
--- a/format.sh
+++ b/format.sh
@@ -118,7 +118,7 @@ mypy vllm/model_executor --config-file pyproject.toml
 # https://github.com/codespell-project/codespell/issues/1915
 # Avoiding the "./" prefix and using "/**" globs for directories appears to solve the problem
 CODESPELL_EXCLUDES=(
-    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,tests/lora/data/**,build/**'
+    '--skip' 'tests/prompts/**,./benchmarks/sonnet.txt,*tests/lora/data/**,build/**'
 )
 
 
diff --git a/tests/core/block/e2e/test_correctness.py b/tests/core/block/e2e/test_correctness.py
index 3713ef2fed4d..ad253635e0ba 100644
--- a/tests/core/block/e2e/test_correctness.py
+++ b/tests/core/block/e2e/test_correctness.py
@@ -24,7 +24,13 @@
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "use_v2_block_manager": True,
+    "preemption_mode": "swap"
+}, {
+    "use_v2_block_manager": True,
+    "preemption_mode": "recompute"
+}])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
@@ -95,7 +101,13 @@ def test_v1_v2_greedy_equality_with_preemption(baseline_llm_generator,
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "use_v2_block_manager": True,
+    "preemption_mode": "swap"
+}, {
+    "use_v2_block_manager": True,
+    "preemption_mode": "recompute"
+}])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
@@ -179,11 +191,18 @@ def test_v1_v2_greedy_equality_with_cow(baseline_llm_generator,
 }])
 @pytest.mark.parametrize(
     "test_llm_kwargs",
-    [{
-        # We run one test with block_size < lookahead_slots, one test with
-        # block_size > lookahead_slots
-        "num_lookahead_slots": 10,
-    }])
+    [
+        {
+            # We run one test with block_size < lookahead_slots, one test with
+            # block_size > lookahead_slots
+            "num_lookahead_slots": 10,
+            "preemption_mode": "swap",
+        },
+        {
+            "num_lookahead_slots": 10,
+            "preemption_mode": "recompute",
+        }
+    ])
 @pytest.mark.parametrize("batch_size", [4])
 @pytest.mark.parametrize("seed", [1])
 def test_lookahead_greedy_equality_with_preemption(baseline_llm_generator,
@@ -322,7 +341,13 @@ def test_chunked_prefill_block_manager_v2(baseline_llm_generator,
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "use_v2_block_manager": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{"use_v2_block_manager": True}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "use_v2_block_manager": True,
+    "preemption_mode": "swap"
+}, {
+    "use_v2_block_manager": True,
+    "preemption_mode": "recompute"
+}])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
 def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
@@ -397,7 +422,13 @@ def test_v1_v2_greedy_equality_prefix_caching_enabled_with_preemption(
 @pytest.mark.parametrize("baseline_llm_kwargs", [{
     "enable_prefix_caching": False
 }])
-@pytest.mark.parametrize("test_llm_kwargs", [{"enable_prefix_caching": True}])
+@pytest.mark.parametrize("test_llm_kwargs", [{
+    "enable_prefix_caching": True,
+    "preemption_mode": "swap"
+}, {
+    "enable_prefix_caching": True,
+    "preemption_mode": "recompute"
+}])
 @pytest.mark.parametrize("batch_size", [10])
 @pytest.mark.parametrize("seed", [1])
 def test_auto_prefix_caching_with_preemption(baseline_llm_generator,
diff --git a/tests/core/block/test_block_manager_v2.py b/tests/core/block/test_block_manager_v2.py
index f98fc0e21727..d0ca09c4be0d 100644
--- a/tests/core/block/test_block_manager_v2.py
+++ b/tests/core/block/test_block_manager_v2.py
@@ -7,7 +7,8 @@
 from vllm.sequence import Logprob, SequenceStatus
 from vllm.utils import chunk_list
 
-from ..utils import create_seq_group, create_seq_group_encoder_decoder
+from ..utils import (create_dummy_prompt, create_seq_group,
+                     create_seq_group_encoder_decoder)
 
 
 @pytest.mark.parametrize("block_size", [16])
@@ -255,6 +256,61 @@ def test_append_slots(block_size, prompt_len, num_slots_to_append,
     assert num_consumed_blocks == expected_consumed_blocks
 
 
+@pytest.mark.parametrize("block_size", [8])
+@pytest.mark.parametrize("num_cpu_blocks", [4])
+@pytest.mark.parametrize("num_gpu_blocks", [4])
+@pytest.mark.parametrize("num_lookahead_slots", [0, 2, 10])
+@pytest.mark.parametrize("enable_caching", [False, True])
+def test_swap(block_size, num_cpu_blocks, num_gpu_blocks, num_lookahead_slots,
+              enable_caching):
+    """Verify the block count on the src/dest device is correct after swapping
+        a sequence group in/out (no missing or extra blocks).
+    """
+    block_manager = BlockSpaceManagerV2(block_size,
+                                        num_cpu_blocks,
+                                        num_gpu_blocks,
+                                        watermark=0,
+                                        enable_caching=enable_caching)
+    prompt, seq_group = create_dummy_prompt("1", prompt_length=block_size - 1)
+    prompt.status = SequenceStatus.WAITING
+    block_manager.allocate(seq_group)
+    # Emulate a forward pass by appending a single token.
+    # The block manager then knows how many unprocessed
+    # tokens will be written in the next forward pass.
+    token_id = 0
+    prompt.status = SequenceStatus.RUNNING
+    prompt.append_token_id(token_id, {token_id: Logprob(0.0)})
+
+    # Swap seq group from GPU -> CPU.
+    gpu_blocks = block_manager.get_block_table(prompt)
+    assert block_manager.can_swap_out(seq_group)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_out(seq_group)
+    mapping_keys = [key for key, _ in mapping]
+    assert mapping_keys == gpu_blocks
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_cpu_blocks == after_cpu_blocks + len(gpu_blocks)
+    assert before_gpu_blocks + len(gpu_blocks) == after_gpu_blocks
+    prompt.status = SequenceStatus.SWAPPED
+
+    # Swap seq group from CPU -> GPU.
+    assert block_manager.can_swap_in(seq_group, num_lookahead_slots)
+    before_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    before_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    mapping = block_manager.swap_in(seq_group)
+    cpu_blocks = block_manager.get_block_table(prompt)
+    mapping_keys = [key for key, _ in mapping]
+    assert mapping_keys == [cpu_blocks[0]]
+    after_cpu_blocks = block_manager.get_num_free_cpu_blocks()
+    after_gpu_blocks = block_manager.get_num_free_gpu_blocks()
+    assert before_gpu_blocks == after_gpu_blocks + len(cpu_blocks)
+
+
+# TODO(cade/kaiyang): add comprehensive tests for swapping at allocator level.
+
+
 @pytest.mark.parametrize("block_size", [8, 16])
 @pytest.mark.parametrize("prompt_len", [10, 300, 1000])
 @pytest.mark.parametrize("num_slots_to_append", [50])
diff --git a/vllm/config.py b/vllm/config.py
index 6f2249f412dd..d4f938163ed8 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -683,19 +683,24 @@ class SchedulerConfig:
         enable_chunked_prefill: If True, prefill requests can be chunked based
             on the remaining max_num_batched_tokens.
         embedding_mode: Whether the running model is for embedding.
+        preemption_mode: Whether to perform preemption by swapping or 
+            recomputation. If not specified, we determine the mode as follows:
+            We use recomputation by default since it incurs lower overhead than
+            swapping. However, when the sequence group has multiple sequences
+            (e.g., beam search), recomputation is not currently supported. In
+            such a case, we use swapping instead.
     """
 
-    def __init__(
-        self,
-        max_num_batched_tokens: Optional[int],
-        max_num_seqs: int,
-        max_model_len: int,
-        use_v2_block_manager: bool = False,
-        num_lookahead_slots: int = 0,
-        delay_factor: float = 0.0,
-        enable_chunked_prefill: bool = False,
-        embedding_mode: Optional[bool] = False,
-    ) -> None:
+    def __init__(self,
+                 max_num_batched_tokens: Optional[int],
+                 max_num_seqs: int,
+                 max_model_len: int,
+                 use_v2_block_manager: bool = False,
+                 num_lookahead_slots: int = 0,
+                 delay_factor: float = 0.0,
+                 enable_chunked_prefill: bool = False,
+                 embedding_mode: Optional[bool] = False,
+                 preemption_mode: Optional[str] = None) -> None:
         if max_num_batched_tokens is not None:
             self.max_num_batched_tokens = max_num_batched_tokens
         else:
@@ -721,6 +726,7 @@ def __init__(
         self.delay_factor = delay_factor
         self.chunked_prefill_enabled = enable_chunked_prefill
         self.embedding_mode = embedding_mode
+        self.preemption_mode = preemption_mode
 
         self._verify_args()
 
diff --git a/vllm/core/block/block_table.py b/vllm/core/block/block_table.py
index 26c704b8de90..26f378ba24b7 100644
--- a/vllm/core/block/block_table.py
+++ b/vllm/core/block/block_table.py
@@ -283,6 +283,10 @@ def _get_all_token_ids(self) -> List[int]:
     def _is_allocated(self) -> bool:
         return len(self._blocks) > 0
 
+    @property
+    def blocks(self) -> Optional[List[Block]]:
+        return self._blocks
+
     @property
     def _num_empty_slots(self) -> int:
         assert self._is_allocated
diff --git a/vllm/core/block/common.py b/vllm/core/block/common.py
index 4d7a12165cb0..d2787d69616f 100644
--- a/vllm/core/block/common.py
+++ b/vllm/core/block/common.py
@@ -140,7 +140,6 @@ def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
         assert refcount != 0
         if refcount > 1:
             src_block_id = block_id
-
             # Decrement refcount of the old block.
             self._allocator.free(block)
 
diff --git a/vllm/core/block/cpu_gpu_block_allocator.py b/vllm/core/block/cpu_gpu_block_allocator.py
index d28a68437697..255aae9d1731 100644
--- a/vllm/core/block/cpu_gpu_block_allocator.py
+++ b/vllm/core/block/cpu_gpu_block_allocator.py
@@ -90,11 +90,8 @@ def create(
             gpu_block_allocator=gpu_allocator,
         )
 
-    def __init__(
-        self,
-        cpu_block_allocator: BlockAllocator,
-        gpu_block_allocator: BlockAllocator,
-    ):
+    def __init__(self, cpu_block_allocator: BlockAllocator,
+                 gpu_block_allocator: BlockAllocator):
         assert not (
             cpu_block_allocator.all_block_ids
             & gpu_block_allocator.all_block_ids
@@ -105,6 +102,7 @@ def __init__(
             Device.GPU: gpu_block_allocator,
         }
 
+        self._swap_mapping: Dict[int, int] = {}
         self._null_block: Optional[Block] = None
 
         self._block_ids_to_allocator: Dict[int, BlockAllocator] = {}
@@ -198,6 +196,68 @@ def get_num_free_blocks(self, device: Device) -> int:
     def get_num_total_blocks(self, device: Device) -> int:
         return self._allocators[device].get_num_total_blocks()
 
+    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
+        """Returns the zero-offset block id on certain device given the 
+        absolute block id.
+
+        Args:
+            device (Device): The device for which to query relative block id.
+            absolute_id (int): The absolute block id for the block in the
+                whole allocator.
+
+        Returns:
+            int: The zero-offset block id on certain device.
+        """
+        return self._allocators[device].get_physical_block_id(absolute_id)
+
+    def swap(self, blocks: List[Block], source_device: Device,
+             dest_device: Device) -> Dict[int, int]:
+        """Execute the swap for the given blocks from source_device
+        on to dest_device, save the current swap mapping and append 
+        them to the accumulated `self._swap_mapping` for each 
+        scheduling move.
+
+        Args:
+            blocks: List of blocks to be swapped.
+            source_device (Device): Device to swap the 'blocks' from.
+            dest_device (Device): Device to swap the 'blocks' to.
+        
+        Returns:
+            Dict[int, int]: Swap mapping from source_device
+                on to dest_device.
+        """
+        source_block_ids = [block.block_id for block in blocks]
+        self._allocators[source_device].swap_out(blocks)
+        self._allocators[dest_device].swap_in(blocks)
+        dest_block_ids = [block.block_id for block in blocks]
+
+        current_swap_mapping: Dict[int, int] = {}
+        for src, dest in zip(source_block_ids, dest_block_ids):
+            if src is not None and dest is not None:
+                self._swap_mapping[src] = dest
+                current_swap_mapping[src] = dest
+        return current_swap_mapping
+
+    def get_num_blocks_touched(self,
+                               blocks: List[Block],
+                               device: Device,
+                               num_lookahead_slots: int = 0) -> int:
+        """Returns the number of blocks that will be touched by
+        swapping in/out the given blocks on to the 'device'.
+
+        Args:
+            blocks: List of blocks to be swapped.
+            device (Device): Device to swap the 'blocks' on.
+            num_lookahead_slots (int): Number of lookahead slots used in 
+                speculative decoding, default to 0.
+
+        Returns:
+            int: the number of blocks that will be touched by
+                swapping in/out the given blocks on to the 'device'.
+        """
+        return self._allocators[device].get_num_blocks_touched(
+            blocks, num_lookahead_slots)
+
     def clear_copy_on_writes(self) -> List[Tuple[int, int]]:
         """Clears the copy-on-write (CoW) state and returns the mapping of
             source to destination block IDs.
@@ -240,6 +300,18 @@ def promote_to_immutable_block(self, block: Block) -> BlockId:
     def cow_block_if_not_appendable(self, block: Block) -> Optional[BlockId]:
         raise NotImplementedError
 
+    def get_and_reset_swaps(self) -> List[Tuple[int, int]]:
+        """Returns and clears the mapping of source to destination block IDs.
+        Will be called after every swapping operation for now, and after every
+        schedule when BlockManagerV2 becomes default. Currently not useful.
+
+        Returns:
+            List[Tuple[int, int]]: A mapping of source to destination block IDs.
+        """
+        mapping = self._swap_mapping.copy()
+        self._swap_mapping.clear()
+        return list(mapping.items())
+
 
 class NullBlock(Block):
     """
diff --git a/vllm/core/block/interfaces.py b/vllm/core/block/interfaces.py
index 8fc4c601106c..4b20856a1b42 100644
--- a/vllm/core/block/interfaces.py
+++ b/vllm/core/block/interfaces.py
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import FrozenSet, List, Optional, Protocol, Tuple
+from typing import Dict, FrozenSet, List, Optional, Protocol, Tuple
 
 from vllm.utils import Device
 
@@ -116,6 +116,18 @@ def get_num_total_blocks(self) -> int:
     def get_num_free_blocks(self) -> int:
         pass
 
+    @abstractmethod
+    def get_physical_block_id(self, absolute_id: int) -> int:
+        pass
+
+    @abstractmethod
+    def swap_out(self, blocks: List[Block]) -> None:
+        pass
+
+    @abstractmethod
+    def swap_in(self, blocks: List[Block]) -> None:
+        pass
+
     @property
     @abstractmethod
     def all_block_ids(self) -> FrozenSet[int]:
@@ -149,6 +161,12 @@ def promote_to_immutable_block(self, block: Block) -> BlockId:
         """NOTE: This should not be used besides Block"""
         pass
 
+    @abstractmethod
+    def get_num_blocks_touched(self,
+                               blocks: List[Block],
+                               num_lookahead_slots: int = 0) -> int:
+        pass
+
     class NoFreeBlocksError(ValueError):
         pass
 
@@ -204,6 +222,22 @@ def get_common_computed_block_ids(
             self, seq_block_ids: List[List[int]]) -> List[int]:
         pass
 
+    @abstractmethod
+    def get_num_blocks_touched(self,
+                               blocks: List[Block],
+                               device: Device,
+                               num_lookahead_slots: int = 0) -> int:
+        pass
+
+    @abstractmethod
+    def swap(self, blocks: List[Block], source_device: Device,
+             dest_device: Device) -> Dict[int, int]:
+        pass
+
+    @abstractmethod
+    def get_physical_block_id(self, device: Device, absolute_id: int) -> int:
+        pass
+
     @abstractmethod
     def allocate_or_get_null_block(self) -> Block:
         """
diff --git a/vllm/core/block/naive_block.py b/vllm/core/block/naive_block.py
index ae0193087825..d033787122d7 100644
--- a/vllm/core/block/naive_block.py
+++ b/vllm/core/block/naive_block.py
@@ -3,6 +3,7 @@
 from vllm.core.block.common import (CopyOnWriteTracker, RefCounter,
                                     get_all_blocks_recursively)
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
+from vllm.utils import cdiv
 
 Refcount = int
 
@@ -95,8 +96,6 @@ def allocate_mutable(self,
     def free(self, block: Block) -> None:
         assert block.block_id is not None
         self._free_block_id(block.block_id)
-
-        # Mark the block as having no allocation.
         block.block_id = None
 
     def fork(self, last_block: Block) -> List[Block]:
@@ -153,6 +152,19 @@ def _free_block_id(self, block_id: BlockId) -> None:
         if refcount == 0:
             self._free_block_indices.add(block_id)
 
+    def get_physical_block_id(self, absolute_id: int) -> int:
+        """Returns the zero-offset block id on certain block allocator
+        given the absolute block id.
+
+        Args:
+            absolute_id (int): The absolute block id for the block
+                in the whole allocator.
+
+        Returns:
+            int: The zero-offset block id on certain device.
+        """
+        return sorted(self._all_block_indices).index(absolute_id)
+
     @property
     def refcounter(self):
         return self._refcounter
@@ -213,6 +225,56 @@ def get_common_computed_block_ids(
     def promote_to_immutable_block(self, block: Block) -> BlockId:
         raise NotImplementedError
 
+    def get_num_blocks_touched(self,
+                               blocks: List[Block],
+                               num_lookahead_slots: int = 0) -> int:
+        """Determine the number of blocks that will be touched by
+        swapping in/out the given blocks from certain sequence
+        group with the provided num_lookahead_slots.
+
+        Args:
+            blocks (List[Block]): The potential blocks to swap.
+            num_lookahead_slots (int): number of lookahead slots (0 for swap 
+                out).
+        
+        Returns:
+            int: the number of blocks that will be touched by
+                swapping in/out the given blocks and num_lookahead_slots.
+        """
+        # NOTE: for naive block, we use set to eliminate common blocks among
+        # seqs, also we compare the empty slots in the mutable blocks with
+        # lookahead slots to get the number of unique new block that are
+        # needed.
+        old_block_set = set()
+        new_block_count = 0
+        # TODO(cade): verify logic; the cdiv branch may undercount by one block.
+        for block in blocks:
+            if not block.is_full and num_lookahead_slots != 0:
+                if block.num_empty_slots >= num_lookahead_slots:
+                    new_block_count += 1
+                else:
+                    new_block_count += cdiv(
+                        num_lookahead_slots - block.num_empty_slots,
+                        self._block_size)
+            else:
+                old_block_set.add(block.block_id)
+        num_touched_blocks = new_block_count + len(old_block_set)
+        return num_touched_blocks
+
+    def swap_out(self, blocks: List[Block]) -> None:
+        for block in blocks:
+            self.free(block)
+
+    def swap_in(self, blocks: List[Block]) -> None:
+        for block in blocks:
+            if block.is_full:
+                alloc = self.allocate_immutable(block.prev_block,
+                                                block.token_ids)
+            else:
+                alloc = self.allocate_mutable(block.prev_block)
+                alloc.append_token_ids(block.token_ids)
+            block.block_id = alloc.block_id
+
 
 class NaiveBlock(Block):
     """An implementation of the Block class that does not support prefix
diff --git a/vllm/core/block/prefix_caching_block.py b/vllm/core/block/prefix_caching_block.py
index 4eb32f145b05..405e9705659d 100644
--- a/vllm/core/block/prefix_caching_block.py
+++ b/vllm/core/block/prefix_caching_block.py
@@ -1,4 +1,5 @@
 """Token blocks."""
+
 from itertools import takewhile
 from os.path import commonprefix
 from typing import Dict, FrozenSet, Iterable, List, Optional, Tuple
@@ -8,6 +9,7 @@
 from vllm.core.block.interfaces import Block, BlockAllocator, BlockId, Device
 from vllm.core.block.naive_block import NaiveBlock, NaiveBlockAllocator
 from vllm.core.evictor_v2 import EvictionPolicy, Evictor, make_evictor
+from vllm.utils import cdiv
 
 PrefixHash = int
 
@@ -294,10 +296,29 @@ def get_num_free_blocks(self, device: Optional[Device] = None) -> int:
     def get_num_total_blocks(self) -> int:
         return self._hashless_allocator.get_num_total_blocks()
 
+    def get_physical_block_id(self, absolute_id: int) -> int:
+        """Returns the zero-offset block id on certain block allocator
+        given the absolute block id.
+
+        Args:
+            absolute_id (int): The absolute block id for the block 
+                in whole allocator.
+
+        Returns:
+            int: The zero-offset block id on a certain device.
+        """
+        return sorted(self.all_block_ids).index(absolute_id)
+
     @property
     def all_block_ids(self) -> FrozenSet[int]:
         return self._hashless_allocator.all_block_ids
 
+    def is_block_cached(self, block: Block) -> bool:
+        assert block.content_hash is not None
+        if block.content_hash in self._cached_blocks:
+            return True
+        return False
+
     def promote_to_immutable_block(self, block: Block) -> BlockId:
         """Once a mutable block is full, it can be promoted to an immutable
         block. This means that its content can be referenced by future blocks
@@ -411,6 +432,63 @@ def get_common_computed_block_ids(
             if ids != []
         ])
 
+    def get_num_blocks_touched(self,
+                               blocks: List[Block],
+                               num_lookahead_slots: int = 0) -> int:
+        """Determine the number of blocks that will be touched by
+        swapping in/out the given blocks from certain sequence
+        group with the provided num_lookahead_slots.
+
+        Args:
+            blocks (List[Block]): The potential blocks to swap.
+            num_lookahead_slots (int): number of lookahead slots (0 for 
+                swap out).
+        
+        Returns:
+            int: the number of blocks that will be touched by
+                swapping in/out the given blocks and num_lookahead_slots.
+        """
+        num_touched_blocks = 0
+        for block in blocks:
+            if not block.is_full:
+                if block.num_empty_slots >= num_lookahead_slots:
+                    num_touched_blocks += 1
+                else:
+                    num_touched_blocks += cdiv(
+                        num_lookahead_slots - block.num_empty_slots,
+                        self._block_size)
+            else:
+                if not self.is_block_cached(block):
+                    num_touched_blocks += 1
+        return num_touched_blocks
+
+    def swap_out(self, blocks: List[Block]) -> None:
+        """Execute the swap out actions. Basically just free the 
+        given blocks.
+
+        Args:
+            blocks: List of blocks to be swapped out.
+        """
+        for block in blocks:
+            self.free(block)
+
+    def swap_in(self, blocks: List[Block]) -> None:
+        """Execute the swap in actions. Change the block id from
+        old allocator to current allocator for each block to finish 
+        the block table update. 
+
+        Args:
+            blocks: List of blocks to be swapped in.
+        """
+        for block in blocks:
+            if block.is_full:
+                alloc = self.allocate_immutable(block.prev_block,
+                                                block.token_ids)
+            else:
+                alloc = self.allocate_mutable(block.prev_block)
+                alloc.append_token_ids(block.token_ids)
+            block.block_id = alloc.block_id
+
 
 class PrefixCachingBlock(Block):
     """A block implementation that supports prefix caching.
diff --git a/vllm/core/block_manager_v1.py b/vllm/core/block_manager_v1.py
index 201cba309f6e..4010aaf02b82 100644
--- a/vllm/core/block_manager_v1.py
+++ b/vllm/core/block_manager_v1.py
@@ -541,11 +541,7 @@ def _swap_block_table(
 
         return new_block_table
 
-    def swap_in(self,
-                seq_group: SequenceGroup,
-                num_lookahead_slots: int = 0) -> List[Tuple[int, int]]:
-        assert (num_lookahead_slots == 0
-                ), "BlockSpaceManagerV1 does not support lookahead allocation"
+    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
 
         request_id = seq_group.request_id
 
diff --git a/vllm/core/block_manager_v2.py b/vllm/core/block_manager_v2.py
index cad42ab3c1ba..121092cf189b 100644
--- a/vllm/core/block_manager_v2.py
+++ b/vllm/core/block_manager_v2.py
@@ -1,10 +1,12 @@
 """A block manager that manages token blocks."""
+from itertools import chain
 from typing import Dict, List, Optional
 from typing import Sequence as GenericSequence
 from typing import Tuple
 
 from vllm.core.block.block_table import BlockTable
 from vllm.core.block.cpu_gpu_block_allocator import CpuGpuBlockAllocator
+from vllm.core.block.interfaces import Block
 from vllm.core.block.utils import check_no_caching_or_swa_for_blockmgr_encdec
 from vllm.core.interfaces import AllocStatus, BlockSpaceManager
 from vllm.sequence import Sequence, SequenceGroup, SequenceStatus
@@ -217,7 +219,6 @@ def append_slots(
             num_lookahead_slots=num_lookahead_slots,
             num_computed_slots=seq.data.get_num_computed_tokens(),
         )
-
         # Return any new copy-on-writes.
         new_cows = self.block_allocator.clear_copy_on_writes()
         return new_cows
@@ -297,20 +298,145 @@ def fork(self, parent_seq: Sequence, child_seq: Sequence) -> None:
 
     def can_swap_in(self, seq_group: SequenceGroup,
                     num_lookahead_slots: int) -> AllocStatus:
-        return AllocStatus.LATER
+        """Returns the AllocStatus for the given sequence_group 
+        with num_lookahead_slots.
+
+        Args:
+            seq_group (SequenceGroup): The sequence group to swap in.
+            num_lookahead_slots (int): Number of lookahead slots used in
+                speculative decoding.
+
+        Returns:
+            AllocStatus: The AllocStatus for the given sequence group.
+        """
+        return self._can_swap(seq_group, Device.GPU, SequenceStatus.SWAPPED,
+                              num_lookahead_slots)
+
+    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
+        """Returns the block id mapping (from CPU to GPU) generated by
+        swapping in the SWAPPED sequences of the given seq_group.
 
-    def swap_in(self, seq_group: SequenceGroup,
-                num_lookahead_slots: int) -> List[Tuple[int, int]]:
-        raise NotImplementedError
+        Args:
+            seq_group (SequenceGroup): The sequence group to swap in.
+
+        Returns:
+            List[Tuple[int, int]]: The mapping of swapping block from CPU 
+                to GPU.
+        """
+        blocks = self._get_blocks_for_swap(seq_group, SequenceStatus.SWAPPED)
+        current_swap_mapping = self.block_allocator.swap(
+            blocks=blocks, source_device=Device.CPU, dest_device=Device.GPU)
+
+        block_number_mapping = {
+            self.block_allocator.get_physical_block_id(Device.CPU,
+                                                       cpu_block_id):
+            self.block_allocator.get_physical_block_id(Device.GPU,
+                                                       gpu_block_id)
+            for cpu_block_id, gpu_block_id in current_swap_mapping.items()
+        }
+        # convert to list of tuples once here
+        return list(block_number_mapping.items())
 
     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
+        """Returns whether we can swap out the given sequence_group 
+        to CPU. No lookahead slots are requested for a swap-out.
+
+        Args:
+            seq_group (SequenceGroup): The sequence group to swap out.
+                Only its sequences in RUNNING status are considered when
+                counting the blocks that need CPU space.
+
+        Returns:
+            bool: Whether it's possible to swap out current sequence group.
+        """
+        alloc_status = self._can_swap(seq_group, Device.CPU,
+                                      SequenceStatus.RUNNING)
+        if alloc_status == AllocStatus.OK:
+            return True
         return False
 
-    def swap_out(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
-        raise NotImplementedError
+    def swap_out(self, sequence_group: SequenceGroup) -> List[Tuple[int, int]]:
+        """Returns the block id mapping (from GPU to CPU) generated by
+        swapping out the RUNNING sequences of the given sequence_group.
+
+        Args:
+            sequence_group (SequenceGroup): The sequence group to swap out.
+
+        Returns:
+            List[Tuple[int, int]]: The mapping of swapping block from 
+                GPU to CPU.
+        """
+        blocks = self._get_blocks_for_swap(sequence_group,
+                                           SequenceStatus.RUNNING)
+        current_swap_mapping = self.block_allocator.swap(
+            blocks=blocks, source_device=Device.GPU, dest_device=Device.CPU)
+        block_number_mapping = {
+            self.block_allocator.get_physical_block_id(Device.GPU,
+                                                       gpu_block_id):
+            self.block_allocator.get_physical_block_id(Device.CPU,
+                                                       cpu_block_id)
+            for gpu_block_id, cpu_block_id in current_swap_mapping.items()
+        }
+        # convert to list of tuples once here
+        return list(block_number_mapping.items())
 
     def get_num_free_gpu_blocks(self) -> int:
         return self.block_allocator.get_num_free_blocks(Device.GPU)
 
     def get_num_free_cpu_blocks(self) -> int:
         return self.block_allocator.get_num_free_blocks(Device.CPU)
+
+    def _can_swap(self,
+                  seq_group: SequenceGroup,
+                  device: Device,
+                  status: SequenceStatus,
+                  num_lookahead_slots: int = 0) -> AllocStatus:
+        """Returns the AllocStatus for swapping in/out the given sequence_group 
+        on to the 'device'.
+
+        Args:
+            seq_group (SequenceGroup): The sequence group to swap in/out.
+            device (Device): The destination device to swap 'seq_group' onto.
+            status (SequenceStatus): The status of sequence which is needed
+                for action. RUNNING for swap out and SWAPPED for swap in
+            num_lookahead_slots (int): Number of lookahead slots used in 
+                speculative decoding, default to 0.
+
+        Returns:
+            AllocStatus: The AllocStatus for swapping in/out the given 
+                sequence_group on to the 'device'.
+        """
+        blocks = self._get_blocks_for_swap(seq_group, status)
+        num_blocks_touched = self.block_allocator.get_num_blocks_touched(
+            blocks, device, num_lookahead_slots)
+        watermark_blocks = 0
+        if device == Device.GPU:
+            watermark_blocks = self.watermark_blocks
+        if self.block_allocator.get_num_total_blocks(
+                device) < num_blocks_touched:
+            return AllocStatus.NEVER
+        elif self.block_allocator.get_num_free_blocks(
+                device) - num_blocks_touched >= watermark_blocks:
+            return AllocStatus.OK
+        else:
+            return AllocStatus.LATER
+
+    def _get_blocks_for_swap(self, seq_group: SequenceGroup,
+                             status: SequenceStatus) -> List[Block]:
+        """Returns the list of blocks those are touched by the seq_group
+        
+        Args:
+            seq_group (SequenceGroup): The sequence group to swap in/out.
+            status (SequenceStatus): The status a sequence must have to be
+                included. RUNNING for swap out and SWAPPED for swap in.
+
+        Returns:
+            The list of blocks that are touched by the seq_group.
+        """
+        blocks: Dict[int, List[Block]] = {}
+        for seq in seq_group.get_seqs(status=status):
+            block_table = self.block_tables[seq.seq_id]
+            if block_table.blocks is not None:
+                blocks[seq.seq_id] = block_table.blocks
+        combined_blocks = list(chain(*blocks.values()))
+        return combined_blocks
diff --git a/vllm/core/embedding_model_block_manager.py b/vllm/core/embedding_model_block_manager.py
index a09d79ec3c42..f2d67306d7ce 100644
--- a/vllm/core/embedding_model_block_manager.py
+++ b/vllm/core/embedding_model_block_manager.py
@@ -46,8 +46,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
                     num_lookahead_slots: int) -> AllocStatus:
         return AllocStatus.OK
 
-    def swap_in(self, seq_group: SequenceGroup,
-                num_lookahead_slots: int) -> List[Tuple[int, int]]:
+    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         return None  # type: ignore
 
     def can_swap_out(self, seq_group: SequenceGroup) -> bool:
diff --git a/vllm/core/interfaces.py b/vllm/core/interfaces.py
index 689cbc2179ee..8759ee06795b 100644
--- a/vllm/core/interfaces.py
+++ b/vllm/core/interfaces.py
@@ -73,8 +73,7 @@ def can_swap_in(self, seq_group: SequenceGroup,
         pass
 
     @abstractmethod
-    def swap_in(self, seq_group: SequenceGroup,
-                num_lookahead_slots: int) -> List[Tuple[int, int]]:
+    def swap_in(self, seq_group: SequenceGroup) -> List[Tuple[int, int]]:
         pass
 
     @abstractmethod
diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 7c70b1b244f7..399665082f83 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -297,6 +297,8 @@ def __init__(
         self.prev_prompt = False
         # Latency of the last prompt step
         self.last_prompt_latency = 0.0
+        # User-specified preemption mode: "swap", "recompute", or None (auto).
+        self.user_specified_preemption_mode = scheduler_config.preemption_mode
 
         # The following field is test-only. It is used to inject artificial
         # preemption.
@@ -522,7 +524,9 @@ def _schedule_swapped(
             seq_group = swapped_queue[0]
 
             # If the sequence group cannot be swapped in, stop.
-            alloc_status = self.block_manager.can_swap_in(seq_group)
+            is_prefill = seq_group.is_prefill()
+            alloc_status = self.block_manager.can_swap_in(
+                seq_group, self._get_num_lookahead_slots(is_prefill))
             if alloc_status == AllocStatus.LATER:
                 break
             elif alloc_status == AllocStatus.NEVER:
@@ -1067,12 +1071,17 @@ def _preempt(
         # over sequence groups with a single sequence.
         # TODO(woosuk): Support recomputation for sequence groups with multiple
         # sequences. This may require a more sophisticated CUDA kernel.
-        if preemption_mode is None:
+        if self.user_specified_preemption_mode is None:
             if seq_group.get_max_num_running_seqs() == 1:
                 preemption_mode = PreemptionMode.RECOMPUTE
             else:
                 preemption_mode = PreemptionMode.SWAP
 
+        elif self.user_specified_preemption_mode == "swap":
+            preemption_mode = PreemptionMode.SWAP
+        else:
+            preemption_mode = PreemptionMode.RECOMPUTE
+
         if self.num_cumulative_preemption % 50 == 0:
             logger.warning(
                 "Sequence group %s is preempted by %s mode because there is "
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index a2a98a05ad97..54911250cea9 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -79,6 +79,7 @@ class EngineArgs:
     num_gpu_blocks_override: Optional[int] = None
     num_lookahead_slots: int = 0
     model_loader_extra_config: Optional[dict] = None
+    preemption_mode: Optional[str] = None
 
     # Related to Vision-language models such as llava
     image_input_type: Optional[str] = None
@@ -579,6 +580,13 @@ def add_cli_args(
                             'corresponding to the chosen load_format. '
                             'This should be a JSON string that will be '
                             'parsed into a dictionary.')
+        parser.add_argument(
+            '--preemption_mode',
+            type=str,
+            default=None,
+            help='If \'recompute\', the engine performs preemption by '
+            'recomputing; If \'swap\', the engine performs preemption by '
+            'block swapping.')
 
         parser.add_argument(
             "--served-model-name",
@@ -682,6 +690,7 @@ def create_engine_config(self, ) -> EngineConfig:
             delay_factor=self.scheduler_delay_factor,
             enable_chunked_prefill=self.enable_chunked_prefill,
             embedding_mode=model_config.embedding_mode,
+            preemption_mode=self.preemption_mode,
         )
         lora_config = LoRAConfig(
             max_lora_rank=self.max_lora_rank,

From 198d7845156044eaffe9306a74a63830d995a761 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 3 Jun 2024 16:16:43 -0700
Subject: [PATCH 19/93] New CI template on AWS stack (#5110)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-template-aws.j2 | 59 +++++++++++++++++++++++++++++++++
 1 file changed, 59 insertions(+)
 create mode 100644 .buildkite/test-template-aws.j2

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
new file mode 100644
index 000000000000..9f7d07acca29
--- /dev/null
+++ b/.buildkite/test-template-aws.j2
@@ -0,0 +1,59 @@
+{% set docker_image = "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" %}
+{% set default_working_dir = "/vllm-workspace/tests" %}
+
+steps:
+  - label: ":docker: build image"
+    agents:
+      queue: cpu_queue
+    commands:
+      - "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7"
+      - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
+      - "docker push {{ docker_image }}"
+    env:
+      DOCKER_BUILDKIT: "1"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+  - wait
+
+  {% for step in steps %}
+  - label: "{{ step.label }}"
+    agents:
+      {% if step.no_gpu %}
+      queue: cpu_queue
+      {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
+      queue: gpu_4_queue
+      {% else %}
+      queue: gpu_1_queue
+      {% endif %}
+    soft_fail: true
+    {% if step.parallelism %}
+    parallelism: {{ step.parallelism }}
+    {% endif %}
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 5
+        - exit_status: -10  # Agent was lost
+          limit: 5
+    plugins:
+      - docker#v5.2.0:
+          image: {{ docker_image }}
+          always-pull: true
+          propagate-environment: true
+          {% if not step.no_gpu %}
+          gpus: all
+          {% endif %}
+          command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
+          environment:
+            - VLLM_USAGE_SOURCE=ci-test
+            - HF_TOKEN
+            {% if step.label == "Speculative decoding tests" %}
+            - VLLM_ATTENTION_BACKEND=XFORMERS
+            {% endif %}
+          volumes:
+            - /dev/shm:/dev/shm
+  {% endfor %}

From 1923dcb3e4e774ba04a20b95cbf396fe137952e2 Mon Sep 17 00:00:00 2001
From: Breno Faria <breno@veltefaria.de>
Date: Tue, 4 Jun 2024 01:25:29 +0200
Subject: [PATCH 20/93] [FRONTEND] OpenAI `tools` support named functions
 (#5032)

---
 .../serving/openai_compatible_server.md       |  13 +-
 tests/entrypoints/test_openai_server.py       | 185 ++++++++++++++++++
 tests/utils.py                                |   3 +-
 vllm/entrypoints/openai/protocol.py           |  57 +++++-
 vllm/entrypoints/openai/serving_chat.py       |  37 +++-
 .../guided_decoding/__init__.py               |  30 ++-
 6 files changed, 314 insertions(+), 11 deletions(-)

diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index 15a8761eb573..a912949352b8 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -109,4 +109,15 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/)
 :module: vllm.entrypoints.openai.cli_args
 :func: make_arg_parser
 :prog: -m vllm.entrypoints.openai.api_server
-```
\ No newline at end of file
+```
+
+## Tool calling in the chat completion API
+vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but are on the roadmap.
+
+To use a named function you need to define the function in the `tools` parameter and call it in the `tool_choice` parameter. 
+
+It is the caller's responsibility to prompt the model with the tool information; vLLM will not automatically manipulate the prompt. **This may change in the future.**
+
+vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
+
+Please refer to the OpenAI API reference documentation for more information.
\ No newline at end of file
diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index c4c1f8fe3afe..79b1ee3fb5d9 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -905,6 +905,191 @@ async def test_guided_choice_chat_logprobs(server, client: openai.AsyncOpenAI,
         for token in top_logprobs)
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend",
+                         ["outlines", "lm-format-enforcer"])
+async def test_named_tool_use(server, client: openai.AsyncOpenAI,
+                              guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    # non-streaming
+
+    chat_completion = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        })
+    message = chat_completion.choices[0].message
+    assert len(message.content) == 0
+    json_string = message.tool_calls[0].function.arguments
+    json1 = json.loads(json_string)
+    jsonschema.validate(instance=json1, schema=TEST_SCHEMA)
+
+    messages.append({"role": "assistant", "content": json_string})
+    messages.append({
+        "role":
+        "user",
+        "content":
+        "Give me another one with a different name and age"
+    })
+
+    # streaming
+
+    stream = await client.chat.completions.create(
+        model=MODEL_NAME,
+        messages=messages,
+        max_tokens=1000,
+        tools=[{
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name",
+                "description": "This is a dummy function",
+                "parameters": TEST_SCHEMA
+            }
+        }],
+        tool_choice={
+            "type": "function",
+            "function": {
+                "name": "dummy_function_name"
+            }
+        },
+        stream=True)
+
+    output = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        assert delta.content is None or len(delta.content) == 0
+        if delta.tool_calls:
+            output.append(delta.tool_calls[0].function.arguments)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    json2 = json.loads("".join(output))
+    jsonschema.validate(instance=json2, schema=TEST_SCHEMA)
+    assert json1["name"] != json2["name"]
+    assert json1["age"] != json2["age"]
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_required_tool_use_not_yet_supported(
+        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="required")
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice="auto")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("guided_decoding_backend", ["outlines"])
+async def test_inconsistent_tool_choice_and_tools(
+        server, client: openai.AsyncOpenAI, guided_decoding_backend: str):
+    messages = [{
+        "role": "system",
+        "content": "you are a helpful assistant"
+    }, {
+        "role":
+        "user",
+        "content":
+        f"Give an example JSON for an employee profile that "
+        f"fits this schema: {TEST_SCHEMA}"
+    }]
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(model=MODEL_NAME,
+                                             messages=messages,
+                                             max_tokens=1000,
+                                             tool_choice={
+                                                 "type": "function",
+                                                 "function": {
+                                                     "name":
+                                                     "dummy_function_name"
+                                                 }
+                                             })
+
+    with pytest.raises(openai.BadRequestError):
+        await client.chat.completions.create(
+            model=MODEL_NAME,
+            messages=messages,
+            max_tokens=1000,
+            tools=[{
+                "type": "function",
+                "function": {
+                    "name": "dummy_function_name",
+                    "description": "This is a dummy function",
+                    "parameters": TEST_SCHEMA
+                }
+            }],
+            tool_choice={
+                "type": "function",
+                "function": {
+                    "name": "nondefined_function_name"
+                }
+            })
+
+
 @pytest.mark.asyncio
 async def test_response_format_json_object(server, client: openai.AsyncOpenAI):
     for _ in range(2):
diff --git a/tests/utils.py b/tests/utils.py
index 329842911e15..cc8b86276947 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -24,7 +24,8 @@ def __init__(self, args):
         env = os.environ.copy()
         env["PYTHONUNBUFFERED"] = "1"
         self.proc = subprocess.Popen(
-            ["python3", "-m", "vllm.entrypoints.openai.api_server"] + args,
+            [sys.executable, "-m", "vllm.entrypoints.openai.api_server"] +
+            args,
             env=env,
             stdout=sys.stdout,
             stderr=sys.stderr,
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index bbd61a2c5dd5..15bdae38d1d4 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -102,6 +102,26 @@ class ResponseFormat(OpenAIBaseModel):
     type: Literal["text", "json_object"]
 
 
+class FunctionDefinition(OpenAIBaseModel):
+    name: str  # looked up by name when a tool is chosen via `tool_choice`
+    description: Optional[str] = None
+    parameters: Optional[Dict[str, Any]] = None  # copied into `guided_json`
+
+
+class ChatCompletionToolsParam(OpenAIBaseModel):  # one entry of `tools`
+    type: Literal["function"] = "function"  # the only value accepted
+    function: FunctionDefinition
+
+
+class ChatCompletionNamedFunction(OpenAIBaseModel):
+    name: str  # must match the name of a function listed in `tools`
+
+
+class ChatCompletionNamedToolChoiceParam(OpenAIBaseModel):
+    function: ChatCompletionNamedFunction  # the function the caller selected
+    type: Literal["function"] = "function"  # the only value accepted
+
+
 class ChatCompletionRequest(OpenAIBaseModel):
     # Ordered by official OpenAI API documentation
     # https://platform.openai.com/docs/api-reference/chat/create
@@ -122,6 +142,9 @@ class ChatCompletionRequest(OpenAIBaseModel):
     stream: Optional[bool] = False
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
+    tools: Optional[List[ChatCompletionToolsParam]] = None
+    tool_choice: Optional[Union[Literal["none"],
+                                ChatCompletionNamedToolChoiceParam]] = "none"
     user: Optional[str] = None
 
     # doc: begin-chat-completion-sampling-params
@@ -245,10 +268,27 @@ def check_guided_decoding_count(cls, data):
             "guided_regex" in data and data["guided_regex"] is not None,
             "guided_choice" in data and data["guided_choice"] is not None
         ])
+        # you can only use one kind of guided decoding
         if guide_count > 1:
             raise ValueError(
                 "You can only use one kind of guided decoding "
                 "('guided_json', 'guided_regex' or 'guided_choice').")
+        # Guided decoding and tools are mutually exclusive. `guide_count > 1`
+        # has already raised above, so test `>= 1` to catch a single guide.
+        if guide_count >= 1 and data.get("tool_choice", "none") != "none":
+            raise ValueError(
+                "You can only either use guided decoding or tools, not both.")
+        return data
+
+    @model_validator(mode="before")
+    @classmethod
+    def check_tool_choice(cls, data):
+        # A non-"none" choice must be a dict (named tool) and needs `tools`.
+        choice = data["tool_choice"] if "tool_choice" in data else "none"
+        if choice != "none" and not isinstance(choice, dict):
+            raise ValueError("Currently only named tools are supported.")
+        if choice != "none" and ("tools" not in data or data["tools"] is None):
+            raise ValueError("When using `tool_choice`, `tools` must be set.")
         return data
 
     @model_validator(mode="before")
@@ -506,9 +546,21 @@ class EmbeddingResponse(BaseModel):
     usage: UsageInfo
 
 
+class FunctionCall(OpenAIBaseModel):
+    name: str
+    arguments: str  # serving_chat.py fills this with the generated text
+
+
+class ToolCall(OpenAIBaseModel):  # `id` is auto-generated when omitted
+    id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}")
+    type: Literal["function"] = "function"  # the only value accepted
+    function: FunctionCall
+
+
 class ChatMessage(OpenAIBaseModel):
     role: str
     content: str
+    tool_calls: List[ToolCall] = Field(default_factory=list)
 
 
 class ChatCompletionLogProb(OpenAIBaseModel):
@@ -535,7 +587,7 @@ class ChatCompletionResponseChoice(OpenAIBaseModel):
 
 class ChatCompletionResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: str = "chat.completion"
+    object: Literal["chat.completion"] = "chat.completion"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseChoice]
@@ -545,6 +597,7 @@ class ChatCompletionResponse(OpenAIBaseModel):
 class DeltaMessage(OpenAIBaseModel):
     role: Optional[str] = None
     content: Optional[str] = None
+    tool_calls: List[ToolCall] = Field(default_factory=list)
 
 
 class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
@@ -557,7 +610,7 @@ class ChatCompletionResponseStreamChoice(OpenAIBaseModel):
 
 class ChatCompletionStreamResponse(OpenAIBaseModel):
     id: str = Field(default_factory=lambda: f"chatcmpl-{random_uuid()}")
-    object: str = "chat.completion.chunk"
+    object: Literal["chat.completion.chunk"] = "chat.completion.chunk"
     created: int = Field(default_factory=lambda: int(time.time()))
     model: str
     choices: List[ChatCompletionResponseStreamChoice]
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index cc5b896e0e56..7b52e1095246 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -14,10 +14,11 @@
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionContentPartParam, ChatCompletionLogProb,
     ChatCompletionLogProbs, ChatCompletionLogProbsContent,
-    ChatCompletionMessageParam, ChatCompletionRequest, ChatCompletionResponse,
+    ChatCompletionMessageParam, ChatCompletionNamedToolChoiceParam,
+    ChatCompletionRequest, ChatCompletionResponse,
     ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
     ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse,
-    UsageInfo)
+    FunctionCall, ToolCall, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
                                                     OpenAIServing)
 from vllm.logger import init_logger
@@ -298,11 +299,24 @@ async def chat_completion_stream_generator(
                     delta_text = output.text[len(previous_texts[i]):]
                     previous_texts[i] = output.text
                     previous_num_tokens[i] = len(output.token_ids)
+
+                    if request.tool_choice and type(
+                            request.tool_choice
+                    ) is ChatCompletionNamedToolChoiceParam:
+                        delta_message = DeltaMessage(tool_calls=[
+                            ToolCall(function=FunctionCall(
+                                name=request.tool_choice.function.name,
+                                arguments=delta_text))
+                        ])
+                    else:
+                        delta_message = DeltaMessage(content=delta_text)
+
                     if output.finish_reason is None:
                         # Send token-by-token response for each request.n
+
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
-                            delta=DeltaMessage(content=delta_text),
+                            delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=None)
                         chunk = ChatCompletionStreamResponse(
@@ -324,7 +338,7 @@ async def chat_completion_stream_generator(
                         )
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
-                            delta=DeltaMessage(content=delta_text),
+                            delta=delta_message,
                             logprobs=logprobs,
                             finish_reason=output.finish_reason,
                             stop_reason=output.stop_reason)
@@ -381,9 +395,22 @@ async def chat_completion_full_generator(
             else:
                 logprobs = None
 
+            if request.tool_choice and type(
+                    request.tool_choice) is ChatCompletionNamedToolChoiceParam:
+                message = ChatMessage(
+                    role=role,
+                    content="",
+                    tool_calls=[
+                        ToolCall(function=FunctionCall(
+                            name=request.tool_choice.function.name,
+                            arguments=output.text))
+                    ])
+            elif not request.tool_choice or request.tool_choice == "none":
+                message = ChatMessage(role=role, content=output.text)
+
             choice_data = ChatCompletionResponseChoice(
                 index=output.index,
-                message=ChatMessage(role=role, content=output.text),
+                message=message,
                 logprobs=logprobs,
                 finish_reason=output.finish_reason,
                 stop_reason=output.stop_reason)
diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py
index 0558d6c95d97..50aa3ec379f4 100644
--- a/vllm/model_executor/guided_decoding/__init__.py
+++ b/vllm/model_executor/guided_decoding/__init__.py
@@ -1,7 +1,8 @@
 from typing import Optional, Union
 
-from vllm.entrypoints.openai.protocol import (ChatCompletionRequest,
-                                              CompletionRequest)
+from vllm.entrypoints.openai.protocol import (
+    ChatCompletionNamedToolChoiceParam, ChatCompletionRequest,
+    CompletionRequest)
 from vllm.model_executor.guided_decoding.lm_format_enforcer_decoding import (
     get_lm_format_enforcer_guided_decoding_logits_processor)
 from vllm.model_executor.guided_decoding.outlines_decoding import (
@@ -13,6 +14,8 @@ async def get_guided_decoding_logits_processor(
         guided_decoding_backend: str, request: Union[CompletionRequest,
                                                      ChatCompletionRequest],
         tokenizer) -> Optional[LogitsProcessor]:
+    request = _adapt_request_for_tool_use(request)
+
     if guided_decoding_backend == 'outlines':
         return await get_outlines_guided_decoding_logits_processor(
             request, tokenizer)
@@ -23,3 +26,26 @@ async def get_guided_decoding_logits_processor(
     raise ValueError(
         f"Unknown guided decoding backend '{guided_decoding_backend}'. "
         "Must be one of 'outlines, 'lm-format-enforcer'")
+
+
+def _adapt_request_for_tool_use(request: Union[CompletionRequest,
+                                               ChatCompletionRequest]):
+    """Map a named `tool_choice` onto guided JSON; may mutate `request`.
+
+    Raises ValueError when the chosen tool name is absent from `tools`.
+    """
+    # The legacy completion API has no tool use, and "none" disables it.
+    if isinstance(request, CompletionRequest) or request.tool_choice == "none":
+        return request
+
+    # Only a named tool choice is translated; anything else passes through.
+    if isinstance(request.tool_choice, ChatCompletionNamedToolChoiceParam):
+        tool_name = request.tool_choice.function.name
+        # `tools` may be None on a hand-built request: treat it as empty so
+        # the lookup below fails with the ValueError, not a TypeError.
+        tools = {t.function.name: t.function for t in request.tools or ()}
+        if tool_name not in tools:
+            raise ValueError(
+                f"Tool '{tool_name}' has not been passed in `tools`.")
+        request.guided_json = tools[tool_name].parameters
+    return request

From fa0bba25dad10ddc5ce7b2d35c10381901f3ecd9 Mon Sep 17 00:00:00 2001
From: Toshiki Kataoka <tos.lunar@gmail.com>
Date: Tue, 4 Jun 2024 09:59:30 +0900
Subject: [PATCH 21/93] [Bugfix] Support `prompt_logprobs==0` (#5217)

---
 tests/entrypoints/test_openai_server.py       | 12 ++++++++----
 vllm/entrypoints/openai/serving_completion.py |  2 +-
 vllm/model_executor/sampling_metadata.py      |  2 +-
 vllm/worker/model_runner.py                   |  2 +-
 4 files changed, 11 insertions(+), 7 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index 79b1ee3fb5d9..edd457107d31 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -223,7 +223,7 @@ async def test_zero_logprobs(server, client: openai.AsyncOpenAI,
     assert choice.logprobs is not None
     assert choice.logprobs.token_logprobs is not None
     assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) <= 1
+    assert len(choice.logprobs.top_logprobs[0]) == 1
 
 
 @pytest.mark.asyncio
@@ -245,7 +245,7 @@ async def test_some_logprobs(server, client: openai.AsyncOpenAI,
     assert choice.logprobs is not None
     assert choice.logprobs.token_logprobs is not None
     assert choice.logprobs.top_logprobs is not None
-    assert len(choice.logprobs.top_logprobs[0]) <= 6
+    assert 5 <= len(choice.logprobs.top_logprobs[0]) <= 6
 
 
 @pytest.mark.asyncio
@@ -1216,8 +1216,9 @@ async def test_guided_grammar(server, client: openai.AsyncOpenAI):
     "model_name",
     [MODEL_NAME, "zephyr-lora", "zephyr-lora2"],
 )
+@pytest.mark.parametrize("logprobs_arg", [1, 0])
 async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
-                                       model_name: str):
+                                       model_name: str, logprobs_arg: int):
     tokenizer = get_tokenizer(tokenizer_name=MODEL_NAME)
     # test using text and token IDs
     for prompt in ("Hello, my name is", [0, 0, 0, 0, 0]):
@@ -1226,7 +1227,7 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                                                      max_tokens=5,
                                                      temperature=0.0,
                                                      echo=True,
-                                                     logprobs=1)
+                                                     logprobs=logprobs_arg)
 
         prompt_text = tokenizer.decode(prompt) if isinstance(prompt,
                                                              list) else prompt
@@ -1239,6 +1240,9 @@ async def test_echo_logprob_completion(server, client: openai.AsyncOpenAI,
                 and logprobs.token_logprobs[0] is None)
         assert (len(logprobs.top_logprobs) > 5
                 and logprobs.top_logprobs[0] is None)
+        for top_logprobs in logprobs.top_logprobs[1:]:
+            assert max(logprobs_arg,
+                       1) <= len(top_logprobs) <= logprobs_arg + 1
         assert len(logprobs.tokens) > 5
 
 
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 2fb122edaf98..572878b5527d 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -312,7 +312,7 @@ def request_output_to_completion_response(
                 elif request.echo and request.max_tokens > 0:
                     token_ids = prompt_token_ids + output.token_ids
                     top_logprobs = (prompt_logprobs + output.logprobs
-                                    if request.logprobs else None)
+                                    if request.logprobs is not None else None)
                     output_text = prompt_text + output.text
                 else:
                     token_ids = output.token_ids
diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 9969c45963e9..0b3b41e69d6b 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -233,7 +233,7 @@ def _prepare_seq_groups(
         logits = hidden_states[selected_token_indices]
         """
 
-        if sampling_params.prompt_logprobs:
+        if sampling_params.prompt_logprobs is not None:
             selected_token_indices.extend(
                 range(model_output_idx, model_output_idx + prompt_logprob_len))
         model_output_idx += prompt_logprob_len
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 63ec22d79694..67c03ad60008 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -427,7 +427,7 @@ def _prepare_model_input(
                     [lora_id] *
                     (query_len if seq_group_metadata.sampling_params
                      and seq_group_metadata.sampling_params.prompt_logprobs
-                     else 1))
+                     is not None else 1))
 
                 mm_data = seq_group_metadata.multi_modal_data
                 if mm_data is not None:

From d8b71e308ec6d2fcf6ef810b79c328f69616b8f9 Mon Sep 17 00:00:00 2001
From: Zhuohan Li <zhuohan123@gmail.com>
Date: Mon, 3 Jun 2024 19:36:41 -0700
Subject: [PATCH 22/93] [Bugfix] Add warmup for prefix caching example (#5235)

---
 examples/offline_inference_with_prefix.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/examples/offline_inference_with_prefix.py b/examples/offline_inference_with_prefix.py
index 166e98549b53..04c2843792a1 100644
--- a/examples/offline_inference_with_prefix.py
+++ b/examples/offline_inference_with_prefix.py
@@ -51,8 +51,10 @@
 
 print("-" * 80)
 
-# The llm.generate call will batch all prompts and send the batch at once
-# if resources allow.
+# Warmup so that the shared prompt's KV cache is computed.
+prefix_cached_llm.generate(generating_prompts[0], sampling_params)
+
+# Generate with prefix caching.
 start_time_cached = time()
 outputs = prefix_cached_llm.generate(generating_prompts, sampling_params)
 duration_cached = time() - start_time_cached

From 1d88071536686d14d4440534c8695e91108fb31d Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 3 Jun 2024 20:06:59 -0700
Subject: [PATCH 23/93] [Kernel] Enhance MoE benchmarking & tuning script
 (#4921)

---
 benchmarks/kernels/benchmark_mixtral_moe.py   | 239 -------------
 benchmarks/kernels/benchmark_moe.py           | 319 ++++++++++++++++++
 .../layers/fused_moe/fused_moe.py             |  41 ++-
 3 files changed, 346 insertions(+), 253 deletions(-)
 delete mode 100644 benchmarks/kernels/benchmark_mixtral_moe.py
 create mode 100644 benchmarks/kernels/benchmark_moe.py

diff --git a/benchmarks/kernels/benchmark_mixtral_moe.py b/benchmarks/kernels/benchmark_mixtral_moe.py
deleted file mode 100644
index 196ec8cfce88..000000000000
--- a/benchmarks/kernels/benchmark_mixtral_moe.py
+++ /dev/null
@@ -1,239 +0,0 @@
-import argparse
-import json
-import os
-import sys
-
-import torch
-import torch.nn.functional as F
-import triton
-from tqdm import tqdm
-
-from vllm.model_executor.layers.fused_moe import (fused_moe,
-                                                  get_config_file_name)
-
-
-def main(model, tp_size, gpu, dtype: str):
-    os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu)
-    method = fused_moe
-    for bs in [
-            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
-            2048, 3072, 4096
-    ]:
-        run_grid(bs,
-                 model=model,
-                 method=method,
-                 gpu=gpu,
-                 tp_size=tp_size,
-                 dtype=dtype)
-
-
-def run_grid(bs, model, method, gpu, tp_size, dtype: str):
-    if model == '8x7B':
-        d_model = 4096
-        model_intermediate_size = 14336
-        num_layers = 32
-    elif model == '8x22B':
-        d_model = 6144
-        model_intermediate_size = 16384
-        num_layers = 56
-    else:
-        raise ValueError(f'Unsupported Mixtral model {model}')
-    num_total_experts = 8
-    top_k = 2
-    # tp_size = 2
-    num_calls = 100
-
-    num_warmup_trials = 1
-    num_trials = 1
-
-    configs = []
-
-    for block_size_n in [32, 64, 128, 256]:
-        for block_size_m in [16, 32, 64, 128, 256]:
-            for block_size_k in [64, 128, 256]:
-                for group_size_m in [1, 16, 32, 64]:
-                    for num_warps in [4, 8]:
-                        for num_stages in [2, 3, 4, 5]:
-                            configs.append({
-                                "BLOCK_SIZE_M": block_size_m,
-                                "BLOCK_SIZE_N": block_size_n,
-                                "BLOCK_SIZE_K": block_size_k,
-                                "GROUP_SIZE_M": group_size_m,
-                                "num_warps": num_warps,
-                                "num_stages": num_stages,
-                            })
-
-    best_config = None
-    best_time_us = 1e20
-
-    print(f'{tp_size=} {bs=}')
-
-    for config in tqdm(configs):
-        # warmup
-        try:
-            for _ in range(num_warmup_trials):
-                run_timing(
-                    num_calls=num_calls,
-                    bs=bs,
-                    d_model=d_model,
-                    num_total_experts=num_total_experts,
-                    top_k=top_k,
-                    tp_size=tp_size,
-                    model_intermediate_size=model_intermediate_size,
-                    method=method,
-                    config=config,
-                    dtype=dtype,
-                )
-        except triton.runtime.autotuner.OutOfResources:
-            continue
-
-        # trial
-        for _ in range(num_trials):
-            kernel_dur_ms = run_timing(
-                num_calls=num_calls,
-                bs=bs,
-                d_model=d_model,
-                num_total_experts=num_total_experts,
-                top_k=top_k,
-                tp_size=tp_size,
-                model_intermediate_size=model_intermediate_size,
-                method=method,
-                config=config,
-                dtype=dtype,
-            )
-
-            kernel_dur_us = 1000 * kernel_dur_ms
-            model_dur_ms = kernel_dur_ms * num_layers
-
-            if kernel_dur_us < best_time_us:
-                best_config = config
-                best_time_us = kernel_dur_us
-
-                tqdm.write(
-                    f'{kernel_dur_us=:.1f} {model_dur_ms=:.1f}'
-                    f' {bs=} {tp_size=} {top_k=} {num_total_experts=} '
-                    f'{d_model=} {model_intermediate_size=} {num_layers=}')
-
-    print("best_time_us", best_time_us)
-    print("best_config", best_config)
-
-    # holds Dict[str, Dict[str, int]]
-    filename = get_config_file_name(num_total_experts,
-                                    model_intermediate_size // tp_size,
-                                    "float8" if dtype == "float8" else None)
-    print(f"writing config to file {filename}")
-    existing_content = {}
-    if os.path.exists(filename):
-        with open(filename, "r") as f:
-            existing_content = json.load(f)
-    existing_content[str(bs)] = best_config
-    with open(filename, "w") as f:
-        json.dump(existing_content, f, indent=4)
-        f.write("\n")
-
-
-def run_timing(num_calls: int, bs: int, d_model: int, num_total_experts: int,
-               top_k: int, tp_size: int, model_intermediate_size: int, method,
-               config, dtype: str) -> float:
-    shard_intermediate_size = model_intermediate_size // tp_size
-
-    hidden_states = torch.rand(
-        (bs, d_model),
-        device="cuda:0",
-        dtype=torch.float16,
-    )
-
-    w1 = torch.rand(
-        (num_total_experts, 2 * shard_intermediate_size, d_model),
-        device=hidden_states.device,
-        dtype=hidden_states.dtype,
-    )
-
-    w2 = torch.rand(
-        (num_total_experts, d_model, shard_intermediate_size),
-        device=hidden_states.device,
-        dtype=hidden_states.dtype,
-    )
-
-    w1_scale = None
-    w2_scale = None
-    a1_scale = None
-    a2_scale = None
-
-    if dtype == "float8":
-        w1 = w1.to(torch.float8_e4m3fn)
-        w2 = w2.to(torch.float8_e4m3fn)
-        w1_scale = torch.ones(num_total_experts,
-                              device=hidden_states.device,
-                              dtype=torch.float32)
-        w2_scale = torch.ones(num_total_experts,
-                              device=hidden_states.device,
-                              dtype=torch.float32)
-        a1_scale = torch.ones(1,
-                              device=hidden_states.device,
-                              dtype=torch.float32)
-        a2_scale = torch.ones(1,
-                              device=hidden_states.device,
-                              dtype=torch.float32)
-
-    gating_output = F.softmax(torch.rand(
-        (num_calls, bs, num_total_experts),
-        device=hidden_states.device,
-        dtype=torch.float32,
-    ),
-                              dim=-1)
-
-    start_event = torch.cuda.Event(enable_timing=True)
-    end_event = torch.cuda.Event(enable_timing=True)
-
-    start_event.record()
-    for i in range(num_calls):
-        hidden_states = method(
-            hidden_states=hidden_states,
-            w1=w1,
-            w2=w2,
-            w1_scale=w1_scale,
-            w2_scale=w2_scale,
-            a1_scale=a1_scale,
-            a2_scale=a2_scale,
-            gating_output=gating_output[i],
-            topk=2,
-            renormalize=True,
-            inplace=True,
-            override_config=config,
-            use_fp8=dtype == "float8",
-        )
-    end_event.record()
-    end_event.synchronize()
-
-    dur_ms = start_event.elapsed_time(end_event) / num_calls
-    return dur_ms
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(
-        prog='benchmark_mixtral_moe',
-        description='Benchmark and tune the fused_moe kernel',
-    )
-    parser.add_argument(
-        '--dtype',
-        type=str,
-        default='auto',
-        choices=['float8', 'float16'],
-        help='Data type used for fused_moe kernel computations',
-    )
-    parser.add_argument('--model',
-                        type=str,
-                        default='8x7B',
-                        choices=['8x7B', '8x22B'],
-                        help='The Mixtral model to benchmark')
-    parser.add_argument('--tp-size',
-                        type=int,
-                        default=2,
-                        help='Tensor paralleli size')
-    parser.add_argument('--gpu',
-                        type=int,
-                        default=0,
-                        help="GPU ID for benchmarking")
-    args = parser.parse_args()
-    sys.exit(main(args.model, args.tp_size, args.gpu, args.dtype))
diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
new file mode 100644
index 000000000000..d6fa39a4d30e
--- /dev/null
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -0,0 +1,319 @@
+import argparse
+import time
+from datetime import datetime
+from typing import Any, Dict, List, Tuple
+
+import ray
+import torch
+import triton
+from ray.experimental.tqdm_ray import tqdm
+from transformers import AutoConfig
+
+from vllm.model_executor.layers.fused_moe.fused_moe import *
+
+
+def benchmark_config(
+    config: Dict[str, int],
+    num_tokens: int,
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8: bool,
+    num_iters: int = 100,
+) -> float:  # mean latency per fused_moe call, in microseconds
+    init_dtype = torch.float16 if use_fp8 else dtype  # fp8: cast happens below
+    x = torch.randn(num_tokens, hidden_size, dtype=dtype)
+    w1 = torch.randn(num_experts,
+                     shard_intermediate_size,
+                     hidden_size,
+                     dtype=init_dtype)
+    w2 = torch.randn(num_experts,
+                     hidden_size,
+                     shard_intermediate_size // 2,
+                     dtype=init_dtype)
+    gating_output = torch.randn(num_iters,
+                                num_tokens,
+                                num_experts,
+                                dtype=torch.float32)
+
+    w1_scale = None
+    w2_scale = None
+    a1_scale = None
+    a2_scale = None
+    if use_fp8:
+        w1_scale = torch.randn(num_experts, dtype=torch.float32)
+        w2_scale = torch.randn(num_experts, dtype=torch.float32)
+        a1_scale = torch.randn(1, dtype=torch.float32)
+        a2_scale = torch.randn(1, dtype=torch.float32)
+
+        w1 = w1.to(torch.float8_e4m3fn)
+        w2 = w2.to(torch.float8_e4m3fn)
+
+    input_gating = torch.empty(num_tokens, num_experts, dtype=torch.float32)
+
+    def prepare(i: int):  # load the i-th random gating logits in place
+        input_gating.copy_(gating_output[i])
+
+    def run():  # single fused_moe call using the config under test
+        fused_moe(
+            x,
+            w1,
+            w2,
+            input_gating,
+            topk,
+            renormalize=True,
+            inplace=True,
+            override_config=config,
+            use_fp8=use_fp8,
+            w1_scale=w1_scale,
+            w2_scale=w2_scale,
+            a1_scale=a1_scale,
+            a2_scale=a2_scale,
+        )
+
+    # JIT compilation & warmup
+    run()
+    torch.cuda.synchronize()
+
+    # Capture 10 invocations with CUDA graph
+    graph = torch.cuda.CUDAGraph()
+    with torch.cuda.graph(graph):
+        for _ in range(10):
+            run()
+    torch.cuda.synchronize()
+
+    # Warmup: replay the captured graph a few times before timing
+    for _ in range(5):
+        graph.replay()
+    torch.cuda.synchronize()
+
+    start_event = torch.cuda.Event(enable_timing=True)
+    end_event = torch.cuda.Event(enable_timing=True)
+
+    latencies = []
+    for i in range(num_iters):
+        prepare(i)  # new gating input, copied outside the timed region
+        torch.cuda.synchronize()
+
+        start_event.record()
+        graph.replay()
+        end_event.record()
+        end_event.synchronize()
+        latencies.append(start_event.elapsed_time(end_event))
+    avg = sum(latencies) / (num_iters * 10) * 1000  # us per call (10/replay)
+    graph.reset()
+    return avg
+
+
+def get_configs_compute_bound() -> List[Dict[str, int]]:
+    """Return the reduced (fast-to-tune) search space of kernel configs."""
+    # TODO(woosuk): Increase the search space and use a performance model to
+    # prune the search space.
+    return [
+        {
+            "BLOCK_SIZE_M": tile_m,
+            "BLOCK_SIZE_N": tile_n,
+            "BLOCK_SIZE_K": tile_k,
+            "GROUP_SIZE_M": group_m,
+            "num_warps": warps,
+            "num_stages": stages,
+        }
+        for stages in [2, 3, 4, 5]
+        for tile_m in [16, 32, 64, 128, 256]
+        for tile_k in [64, 128, 256]
+        for tile_n in [32, 64, 128, 256]
+        for warps in [4, 8]
+        for group_m in [1, 16, 32, 64]
+    ]
+
+
+@ray.remote(num_gpus=1)
+class BenchmarkWorker:
+    """Ray actor bound to one GPU that times and tunes fused_moe configs."""
+
+    def __init__(self, seed: int) -> None:
+        # New tensors default to CUDA; seed all GPUs for repeatable inputs.
+        torch.set_default_device("cuda")
+        torch.cuda.manual_seed_all(seed)
+        self.seed = seed
+
+    def benchmark(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8: bool,
+    ) -> Tuple[Dict[str, int], float]:
+        """Time the stored (or default) config; return (config, latency)."""
+        torch.cuda.manual_seed_all(self.seed)
+
+        dtype_str = "float8" if use_fp8 else None
+        # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+        # is the intermediate size after silu_and_mul.
+        op_config = get_moe_configs(num_experts, shard_intermediate_size // 2,
+                                    dtype_str)
+        if op_config is None:
+            config = get_default_config(num_tokens, num_experts,
+                                        shard_intermediate_size, hidden_size,
+                                        topk, dtype_str)
+        else:
+            # Use the stored entry whose batch size is closest to ours.
+            config = op_config[min(op_config.keys(),
+                                   key=lambda x: abs(x - num_tokens))]
+        kernel_time = benchmark_config(config, num_tokens, num_experts,
+                                       shard_intermediate_size, hidden_size,
+                                       topk, dtype, use_fp8)
+        return config, kernel_time
+
+    def tune(
+        self,
+        num_tokens: int,
+        num_experts: int,
+        shard_intermediate_size: int,
+        hidden_size: int,
+        topk: int,
+        dtype: torch.dtype,
+        use_fp8: bool,
+        search_space: List[Dict[str, int]],
+    ) -> Dict[str, int]:
+        """Return the fastest config in `search_space`, or None if none ran."""
+        best_config = None
+        best_time = float("inf")
+        for config in tqdm(search_space):
+            try:
+                kernel_time = benchmark_config(config, num_tokens, num_experts,
+                                               shard_intermediate_size,
+                                               hidden_size, topk, dtype,
+                                               use_fp8, num_iters=10)
+            except triton.runtime.autotuner.OutOfResources:
+                # Some configurations may be invalid and fail to compile.
+                continue
+
+            if kernel_time < best_time:
+                best_time = kernel_time
+                best_config = config
+        now = datetime.now()
+        print(f"[{now.ctime()}] Completed tuning for batch_size={num_tokens}")
+        return best_config
+
+
+def sort_config(config: Dict[str, int]) -> Dict[str, int]:
+    return {
+        "BLOCK_SIZE_M": config["BLOCK_SIZE_M"],
+        "BLOCK_SIZE_N": config["BLOCK_SIZE_N"],
+        "BLOCK_SIZE_K": config["BLOCK_SIZE_K"],
+        "GROUP_SIZE_M": config["GROUP_SIZE_M"],
+        "num_warps": config["num_warps"],
+        "num_stages": config["num_stages"],
+    }
+
+
+def save_configs(
+    configs: Dict[int, Dict[str, int]],
+    num_experts: int,
+    shard_intermediate_size: int,
+    hidden_size: int,
+    topk: int,
+    dtype: torch.dtype,
+    use_fp8: bool,
+) -> None:
+    dtype_str = "float8" if use_fp8 else None
+    # NOTE(woosuk): The current naming convention uses w2.shape[2], which
+    # is the intermediate size after silu_and_mul.
+    filename = get_config_file_name(num_experts, shard_intermediate_size // 2,
+                                    dtype_str)
+    print(f"Writing best config to {filename}...")
+    with open(filename, "w") as f:
+        json.dump(configs, f, indent=4)
+        f.write("\n")
+
+
+def main(args: argparse.Namespace):
+    print(args)
+
+    config = AutoConfig.from_pretrained(args.model)
+    if config.architectures[0] == "DbrxForCausalLM":
+        E = config.ffn_config.moe_num_experts
+        topk = config.ffn_config.moe_top_k
+        intermediate_size = config.ffn_config.ffn_hidden_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+    else:
+        # Default: Mixtral.
+        E = config.num_local_experts
+        topk = config.num_experts_per_tok
+        intermediate_size = config.intermediate_size
+        shard_intermediate_size = 2 * intermediate_size // args.tp_size
+
+    hidden_size = config.hidden_size
+    dtype = config.torch_dtype
+    use_fp8 = args.dtype == "fp8"
+
+    if args.batch_size is None:
+        batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
+    else:
+        batch_sizes = [args.batch_size]
+
+    ray.init()
+    num_gpus = int(ray.available_resources()["GPU"])
+    workers = [BenchmarkWorker.remote(args.seed) for _ in range(num_gpus)]
+
+    def _distribute(method: str, inputs: List[Any]) -> List[Any]:
+        outputs = []
+        worker_idx = 0
+        for input_args in inputs:
+            worker = workers[worker_idx]
+            worker_method = getattr(worker, method)
+            output = worker_method.remote(*input_args)
+            outputs.append(output)
+            worker_idx = (worker_idx + 1) % num_gpus
+        return ray.get(outputs)
+
+    if args.tune:
+        search_space = get_configs_compute_bound()
+        print(f"Start tuning over {len(search_space)} configurations...")
+
+        start = time.time()
+        configs = _distribute(
+            "tune", [(batch_size, E, shard_intermediate_size, hidden_size,
+                      topk, dtype, use_fp8, search_space)
+                     for batch_size in batch_sizes])
+        best_configs = {
+            M: sort_config(config)
+            for M, config in zip(batch_sizes, configs)
+        }
+        save_configs(best_configs, E, shard_intermediate_size, hidden_size,
+                     topk, dtype, use_fp8)
+        end = time.time()
+        print(f"Tuning took {end - start:.2f} seconds")
+    else:
+        outputs = _distribute("benchmark",
+                              [(batch_size, E, shard_intermediate_size,
+                                hidden_size, topk, dtype, use_fp8)
+                               for batch_size in batch_sizes])
+
+        for batch_size, (config, kernel_time) in zip(batch_sizes, outputs):
+            print(f"Batch size: {batch_size}, config: {config}")
+            print(f"Kernel time: {kernel_time:.2f} us")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model",
+                        type=str,
+                        default="mistralai/Mixtral-8x7B-Instruct-v0.1")
+    parser.add_argument("--tp-size", "-tp", type=int, default=2)
+    parser.add_argument("--dtype",
+                        type=str,
+                        choices=["auto", "fp8"],
+                        default="auto")
+    parser.add_argument("--seed", type=int, default=0)
+    parser.add_argument("--batch-size", type=int, required=False)
+    parser.add_argument("--tune", action="store_true")
+    args = parser.parse_args()
+
+    main(args)
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 20a3c9f6f893..1c6947137a1c 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -308,6 +308,30 @@ def get_moe_configs(E: int, N: int,
     return None
 
 
+def get_default_config(
+    M: int,
+    E: int,
+    N: int,
+    K: int,
+    topk: int,
+    dtype: Optional[str],
+) -> Dict[str, int]:
+    config = {
+        'BLOCK_SIZE_M': 64,
+        'BLOCK_SIZE_N': 64,
+        'BLOCK_SIZE_K': 32,
+        'GROUP_SIZE_M': 8
+    }
+    if M <= E:
+        config = {
+            'BLOCK_SIZE_M': 16,
+            'BLOCK_SIZE_N': 32,
+            'BLOCK_SIZE_K': 64,
+            'GROUP_SIZE_M': 1
+        }
+    return config
+
+
 def fused_topk(
     hidden_states: torch.Tensor,
     gating_output: torch.Tensor,
@@ -382,20 +406,9 @@ def fused_experts(hidden_states: torch.Tensor,
             config = configs[min(configs.keys(), key=lambda x: abs(x - M))]
         else:
             # Else use the default config
-            config = {
-                'BLOCK_SIZE_M': 64,
-                'BLOCK_SIZE_N': 64,
-                'BLOCK_SIZE_K': 32,
-                'GROUP_SIZE_M': 8
-            }
-
-            if M <= E:
-                config = {
-                    'BLOCK_SIZE_M': 16,
-                    'BLOCK_SIZE_N': 32,
-                    'BLOCK_SIZE_K': 64,
-                    'GROUP_SIZE_M': 1
-                }
+            config = get_default_config(M, E, N, w1.shape[2],
+                                        topk_ids.shape[1],
+                                        "float8" if use_fp8 else None)
 
     intermediate_cache1 = torch.empty((M, topk_ids.shape[1], N),
                                       device=hidden_states.device,

From 789905582ea2f1b40cd5478c38795ff49e5b4f78 Mon Sep 17 00:00:00 2001
From: afeldman-nm <156691304+afeldman-nm@users.noreply.github.com>
Date: Mon, 3 Jun 2024 23:32:57 -0400
Subject: [PATCH 24/93] [Bugfix]: During testing, use pytest monkeypatch for
 safely overriding the env var that indicates the vLLM backend (#5210)

---
 tests/kernels/test_attention_selector.py | 27 +++++++++---------------
 tests/kernels/utils.py                   | 22 +++++++++++++++++++
 2 files changed, 32 insertions(+), 17 deletions(-)
 create mode 100644 tests/kernels/utils.py

diff --git a/tests/kernels/test_attention_selector.py b/tests/kernels/test_attention_selector.py
index f439afa9b7d2..79e03c7478de 100644
--- a/tests/kernels/test_attention_selector.py
+++ b/tests/kernels/test_attention_selector.py
@@ -1,21 +1,22 @@
-import os
 from unittest.mock import patch
 
 import pytest
 import torch
 
+from tests.kernels.utils import (STR_FLASH_ATTN_VAL, STR_INVALID_VAL,
+                                 override_backend_env_variable)
 from vllm.attention.selector import which_attn_to_use
 
 
 @pytest.mark.parametrize(
     "name", ["TORCH_SDPA", "ROCM_FLASH", "XFORMERS", "FLASHINFER"])
 @pytest.mark.parametrize("device", ["cpu", "hip", "cuda"])
-def test_env(name: str, device: str):
+def test_env(name: str, device: str, monkeypatch):
     """Test that the attention selector can be set via environment variable.
     Note that we do not test FlashAttn because it is the default backend.
     """
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
-    os.environ["VLLM_ATTENTION_BACKEND"] = name
+
+    override_backend_env_variable(monkeypatch, name)
 
     if device == "cpu":
         with patch("vllm.attention.selector.is_cpu", return_value=True):
@@ -32,14 +33,11 @@ def test_env(name: str, device: str):
                                     torch.float16, 16)
         assert backend.name == name
 
-    if name_backup is not None:
-        os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
-
 
-def test_flash_attn():
+def test_flash_attn(monkeypatch):
     """Test FlashAttn validation."""
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
-    os.environ["VLLM_ATTENTION_BACKEND"] = "FLASH_ATTN"
+
+    override_backend_env_variable(monkeypatch, STR_FLASH_ATTN_VAL)
 
     # Unsupported CUDA arch
     with patch("torch.cuda.get_device_capability", return_value=[7, 5]):
@@ -71,14 +69,9 @@ def test_flash_attn():
     backend = which_attn_to_use(8, 17, 8, None, torch.float16, None, 16)
     assert backend.name != "FLASH_ATTN"
 
-    if name_backup is not None:
-        os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
-
 
-def test_invalid_env():
+def test_invalid_env(monkeypatch):
     """Throw an exception if the backend name is invalid."""
-    name_backup = os.environ.get("VLLM_ATTENTION_BACKEND", None)
-    os.environ["VLLM_ATTENTION_BACKEND"] = "INVALID"
+    override_backend_env_variable(monkeypatch, STR_INVALID_VAL)
     with pytest.raises(ValueError):
         which_attn_to_use(8, 16, 8, None, torch.float16, None, 16)
-    os.environ["VLLM_ATTENTION_BACKEND"] = name_backup
diff --git a/tests/kernels/utils.py b/tests/kernels/utils.py
new file mode 100644
index 000000000000..b401eb87d3ec
--- /dev/null
+++ b/tests/kernels/utils.py
@@ -0,0 +1,22 @@
+"""Kernel test utils"""
+
+import pytest
+
+STR_BACKEND_ENV_VAR: str = "VLLM_ATTENTION_BACKEND"
+STR_FLASH_ATTN_VAL: str = "FLASH_ATTN"
+STR_INVALID_VAL: str = "INVALID"
+
+
+def override_backend_env_variable(mpatch: pytest.MonkeyPatch,
+                                  backend_name: str) -> None:
+    '''
+    Override the environment variable indicating the vLLM backend temporarily,
+    using pytest monkeypatch to ensure that the env vars get
+    reset once the test context exits.
+
+    Arguments:
+
+    * mpatch: pytest monkeypatch instance
+    * backend_name: attention backend name to force
+    '''
+    mpatch.setenv(STR_BACKEND_ENV_VAR, backend_name)

From 0e8a84dbee444e8f00a6ff953520590a37042089 Mon Sep 17 00:00:00 2001
From: zifeitong <zifei.tong@parasail.io>
Date: Mon, 3 Jun 2024 20:55:50 -0700
Subject: [PATCH 25/93] [Bugfix] Fix torch.compile() error when using
 MultiprocessingGPUExecutor (#5229)

---
 vllm/executor/multiproc_gpu_executor.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/vllm/executor/multiproc_gpu_executor.py b/vllm/executor/multiproc_gpu_executor.py
index 8fa54454907b..bd1cac2ab9b5 100644
--- a/vllm/executor/multiproc_gpu_executor.py
+++ b/vllm/executor/multiproc_gpu_executor.py
@@ -34,6 +34,9 @@ def _init_executor(self) -> None:
         # Ensure that VLLM_INSTANCE_ID is set, to be inherited by workers
         os.environ["VLLM_INSTANCE_ID"] = get_vllm_instance_id()
 
+        # Disable torch async compiling which won't work with daemonic processes
+        os.environ["TORCHINDUCTOR_COMPILE_THREADS"] = "1"
+
         from torch.cuda import device_count
         assert world_size <= device_count(), (
             "please set tensor_parallel_size to less than max local gpu count")

From 88368d3e8171dee85ec0d3823c370ae0d33cbc5a Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Tue, 4 Jun 2024 12:01:46 +0800
Subject: [PATCH 26/93] [CI/Build] Add inputs tests (#5215)

---
 .buildkite/test-pipeline.yaml      |  8 +++++++-
 tests/multimodal/test_processor.py | 11 +++++++----
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 21cbd9ba1378..4edd1cadfb2f 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -62,7 +62,6 @@ steps:
   mirror_hardwares: [amd]
 
   commands:
-  - pytest -v -s test_inputs.py
   - pytest -v -s entrypoints -m llm
   - pytest -v -s entrypoints -m openai
 
@@ -79,6 +78,13 @@ steps:
     - python3 llava_example.py
     - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
 
+- label: Inputs Test
+  #mirror_hardwares: [amd]
+  commands:
+    - bash ../.buildkite/download-images.sh
+    - pytest -v -s test_inputs.py
+    - pytest -v -s multimodal
+
 - label: Kernels Test %N
   #mirror_hardwares: [amd]
   command: pytest -v -s kernels --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
index 4aeae633d07f..3df28e782dd8 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -6,8 +6,10 @@
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import ImagePixelData
 
+from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
 
-@pytest.mark.parametrize("dtype", ["half", "bfloat16", "float"])
+
+@pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(hf_images, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
     IMAGE_HEIGHT = IMAGE_WIDTH = 33
@@ -36,8 +38,8 @@ def test_clip_image_processor(hf_images, dtype):
     for image in hf_images:
         hf_result = hf_processor.preprocess(
             image,
-            return_tensors="np",
-        )
+            return_tensors="pt",
+        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
         vllm_result = MULTIMODAL_REGISTRY.process_input(
             ImagePixelData(image),
             model_config=model_config,
@@ -45,7 +47,8 @@ def test_clip_image_processor(hf_images, dtype):
         )
 
         assert hf_result.keys() == vllm_result.keys()
-        for key, hf_arr in hf_result.items():
+        for key, hf_tensor in hf_result.items():
+            hf_arr: np.ndarray = hf_tensor.numpy()
             vllm_arr: np.ndarray = vllm_result[key].numpy()
 
             assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"

From 756340ae28795fabf391536b91cd2dc1c470095e Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <fujie_email@sina.com>
Date: Wed, 5 Jun 2024 00:57:51 +0800
Subject: [PATCH 27/93] [Bugfix] Fix a bug caused by pip install
 setuptools>=49.4.0 for CPU backend (#5249)

---
 Dockerfile.cpu                                   | 2 +-
 docs/source/getting_started/cpu-installation.rst | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Dockerfile.cpu b/Dockerfile.cpu
index ae23e27b413b..403a1cd0391b 100644
--- a/Dockerfile.cpu
+++ b/Dockerfile.cpu
@@ -7,7 +7,7 @@ RUN apt-get update  -y \
     && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12
 
 RUN pip install --upgrade pip \
-    && pip install wheel packaging ninja setuptools>=49.4.0 numpy
+    && pip install wheel packaging ninja "setuptools>=49.4.0" numpy
 
 FROM cpu-test-1 AS build
 
diff --git a/docs/source/getting_started/cpu-installation.rst b/docs/source/getting_started/cpu-installation.rst
index ba8b0645adcd..5270253cae9a 100644
--- a/docs/source/getting_started/cpu-installation.rst
+++ b/docs/source/getting_started/cpu-installation.rst
@@ -54,7 +54,7 @@ Build from source
 .. code-block:: console
 
     $ pip install --upgrade pip
-    $ pip install wheel packaging ninja setuptools>=49.4.0 numpy
+    $ pip install wheel packaging ninja "setuptools>=49.4.0" numpy
     $ pip install -v -r requirements-cpu.txt --extra-index-url https://download.pytorch.org/whl/cpu
 
 - Finally, build and install vLLM CPU backend: 

From 789553f76a6b690037f25265da378f778512d904 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Tue, 4 Jun 2024 09:58:47 -0700
Subject: [PATCH 28/93] [Kernel] Add back batch size 1536 and 3072 to MoE
 tuning (#5242)

---
 benchmarks/kernels/benchmark_moe.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index d6fa39a4d30e..2edc63142d02 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -254,7 +254,9 @@ def main(args: argparse.Namespace):
     use_fp8 = args.dtype == "fp8"
 
     if args.batch_size is None:
-        batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096]
+        batch_sizes = [
+            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+        ]
     else:
         batch_sizes = [args.batch_size]
 

From c57b71eedf67b4bfd871138230cf5add42659b07 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Wed, 5 Jun 2024 01:09:19 +0800
Subject: [PATCH 29/93] [CI/Build] Simplify model loading for `HfRunner`
 (#5251)

---
 tests/conftest.py              | 28 ++++++++++++++++------------
 tests/models/test_embedding.py |  2 +-
 tests/models/test_llava.py     |  2 +-
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 8fcd91305e3a..93b161914b53 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,14 +2,15 @@
 import gc
 import logging
 import os
-from typing import Any, Dict, List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import pytest
 import torch
+import torch.nn as nn
 import torch.nn.functional as F
 from PIL import Image
-from transformers import (AutoModelForCausalLM, AutoProcessor, AutoTokenizer,
-                          LlavaConfig, LlavaForConditionalGeneration)
+from transformers import (AutoModelForCausalLM, AutoModelForVision2Seq,
+                          AutoProcessor, AutoTokenizer, BatchEncoding)
 
 from tests.nm_utils.logging import make_logger
 from vllm import LLM, SamplingParams
@@ -146,16 +147,12 @@ def example_long_prompts() -> List[str]:
     "float": torch.float,
 }
 
-AutoModelForCausalLM.register(LlavaConfig, LlavaForConditionalGeneration)
-
-_EMBEDDING_MODELS = [
-    "intfloat/e5-mistral-7b-instruct",
-]
+_T = TypeVar("_T", nn.Module, torch.Tensor, BatchEncoding)
 
 
 class HfRunner:
 
-    def wrap_device(self, input: any):
+    def wrap_device(self, input: _T) -> _T:
         if not is_cpu():
             return input.to("cuda")
         else:
@@ -165,14 +162,16 @@ def __init__(
         self,
         model_name: str,
         dtype: str = "half",
-        access_token: Optional[str] = None,
+        *,
+        is_embedding_model: bool = False,
+        is_vision_model: bool = False,
     ) -> None:
         assert dtype in _STR_DTYPE_TO_TORCH_DTYPE
         torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
 
         self.model_name = model_name
 
-        if model_name in _EMBEDDING_MODELS:
+        if is_embedding_model:
             # Lazy init required for AMD CI
             from sentence_transformers import SentenceTransformer
             self.model = self.wrap_device(
@@ -181,8 +180,13 @@ def __init__(
                     device="cpu",
                 ).to(dtype=torch_dtype))
         else:
+            if is_vision_model:
+                auto_cls = AutoModelForVision2Seq
+            else:
+                auto_cls = AutoModelForCausalLM
+
             self.model = self.wrap_device(
-                AutoModelForCausalLM.from_pretrained(
+                auto_cls.from_pretrained(
                     model_name,
                     torch_dtype=torch_dtype,
                     trust_remote_code=True,
diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py
index 59bf054913f7..668ed3a520a3 100644
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@@ -28,7 +28,7 @@ def test_models(
     model: str,
     dtype: str,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype)
+    hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True)
     hf_outputs = hf_model.encode(example_prompts)
     del hf_model
 
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index cc0685ca9c5e..839a9f78d1bb 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -94,7 +94,7 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
     """
     model_id, vision_language_config = model_and_config
 
-    hf_model = hf_runner(model_id, dtype=dtype)
+    hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
     hf_outputs = hf_model.generate_greedy(hf_image_prompts,
                                           max_tokens,
                                           images=hf_images)

From 14ec8df77f3b79358036ee70a0ce631bc5b06407 Mon Sep 17 00:00:00 2001
From: "Li, Jiang" <jiang1.li@intel.com>
Date: Wed, 5 Jun 2024 01:26:40 +0800
Subject: [PATCH 30/93] [CI/Build] Reducing CPU CI execution time (#5241)

---
 .buildkite/run-cpu-test.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh
index d1200ee84dfe..6a86bc0ebfb6 100644
--- a/.buildkite/run-cpu-test.sh
+++ b/.buildkite/run-cpu-test.sh
@@ -11,7 +11,7 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image
-docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
+docker run -itd -v ~/.cache/huggingface:/root/.cache/huggingface --cpuset-cpus=48-95 --cpuset-mems=1 --network host -e HF_TOKEN --env VLLM_CPU_KVCACHE_SPACE=4 --name cpu-test cpu-test
 
 # offline inference
 docker exec cpu-test bash -c "python3 examples/offline_inference.py"

From 3b6f9d6b8fa5a86968cb4d9cf444e3aed2900f59 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Tue, 4 Jun 2024 13:34:53 -0500
Subject: [PATCH 31/93] [CI] mark AMD test as softfail to prevent blockage
 (#5256)

---
 .buildkite/test-template.j2 | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-template.j2 b/.buildkite/test-template.j2
index 7e986c988407..4a20a462b98e 100644
--- a/.buildkite/test-template.j2
+++ b/.buildkite/test-template.j2
@@ -4,7 +4,7 @@
 
 steps:
   - label: ":docker: build image"
-    commands: 
+    commands:
       - "docker build --build-arg max_jobs=16 --tag {{ docker_image }} --target test --progress plain ."
       - "docker push {{ docker_image }}"
     env:
@@ -28,6 +28,7 @@ steps:
         command: bash .buildkite/run-amd-test.sh "cd {{ (step.working_dir or default_working_dir) | safe  }} ; {{ step.command  or (step.commands | join(" ; ")) | safe }}"
         env:
           DOCKER_BUILDKIT: "1"
+        soft_fail: true
     {% endif %}
     {% endfor %}
 
@@ -36,7 +37,7 @@ steps:
     agents:
       queue: neuron
     command: bash .buildkite/run-neuron-test.sh
-    soft_fail: true
+    soft_fail: false
 
   - label: "Intel Test"
     depends_on: ~

From 06bcc9724c115403a92f3543025291bd248ae2d4 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Tue, 4 Jun 2024 15:52:28 -0400
Subject: [PATCH 32/93] [Misc] Add transformers version to collect_env.py
 (#5259)

---
 collect_env.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/collect_env.py b/collect_env.py
index c89f8c64eddc..3990c45078cb 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -64,6 +64,7 @@
     "triton",
     "optree",
     "nccl",
+    "transformers",
 }
 
 DEFAULT_PIP_PATTERNS = {
@@ -75,6 +76,7 @@
     "optree",
     "onnx",
     "nccl",
+    "transformers",
     # UPSTREAM SYNC: needed for sparsity
     "nm-magic-wand-nightly",
 }

From c3a46ddef8d09cfa41aecb448c81ed84479d0ebd Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Tue, 4 Jun 2024 15:29:09 -0700
Subject: [PATCH 33/93] [Misc] update collect env (#5261)

---
 collect_env.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/collect_env.py b/collect_env.py
index 3990c45078cb..a1fee64d39e8 100644
--- a/collect_env.py
+++ b/collect_env.py
@@ -605,6 +605,11 @@ def get_version_or_na(cfg, prefix):
 {conda_packages}
 """.strip()
 
+# both the above code and the following code use `strip()` to
+# remove leading/trailing whitespaces, so we need to add a newline
+# in between to separate the two sections
+env_info_fmt += "\n"
+
 env_info_fmt += """
 ROCM Version: {rocm_version}
 Neuron SDK Version: {neuron_sdk_version}

From c6bcf664ab6b05e32ef20a6b4a2c3fdb5cdca1ea Mon Sep 17 00:00:00 2001
From: zifeitong <zifei.tong@parasail.io>
Date: Tue, 4 Jun 2024 19:37:28 -0700
Subject: [PATCH 34/93] [Bugfix] Fix prompt_logprobs when
 SamplingParams.detokenize is set to True (#5226)

---
 tests/samplers/test_logprobs.py             | 27 ++++++++++++++-------
 vllm/engine/output_processor/single_step.py |  8 +++---
 2 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 40d054cd472b..61720cccf50b 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -12,6 +12,7 @@
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("chunked_prefill_token_size", [1, 4, 16, -1])
 @pytest.mark.parametrize("num_top_logprobs", [6])  # 32000 == vocab_size
+@pytest.mark.parametrize("detokenize", [True, False])
 def test_get_prompt_logprobs(
     hf_runner,
     vllm_runner,
@@ -19,6 +20,7 @@ def test_get_prompt_logprobs(
     dtype,
     chunked_prefill_token_size: int,
     num_top_logprobs: int,
+    detokenize: bool,
     example_prompts,
 ):
     max_num_seqs = 256
@@ -48,7 +50,8 @@ def test_get_prompt_logprobs(
     vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
                                           logprobs=num_top_logprobs,
                                           prompt_logprobs=num_top_logprobs,
-                                          temperature=0.0)
+                                          temperature=0.0,
+                                          detokenize=detokenize)
     vllm_results = vllm_model.model.generate(
         example_prompts, sampling_params=vllm_sampling_params)
 
@@ -65,11 +68,16 @@ def test_get_prompt_logprobs(
             top_logprob = next(iter(top_logprobs.values()))
             output_string_from_most_likely_tokens.append(
                 top_logprob.decoded_token)
-        output_string_from_most_likely_tokens = "".join(
-            output_string_from_most_likely_tokens)
-        assert output_text == output_string_from_most_likely_tokens, (
-            "The output text from the top logprob for each token position "
-            "should be the same as the output text in the result.")
+
+        if detokenize:
+            output_string_from_most_likely_tokens = "".join(
+                output_string_from_most_likely_tokens)
+            assert output_text == output_string_from_most_likely_tokens, (
+                "The output text from the top logprob for each token position "
+                "should be the same as the output text in the result.")
+        else:
+            assert output_text == ''
+            assert output_string_from_most_likely_tokens == [None] * max_tokens
 
         # The first prompt logprob is always None
         assert result.prompt_logprobs[0] is None
@@ -98,9 +106,10 @@ def test_get_prompt_logprobs(
                                            hf_logprob[i][-1][token_id].item(),
                                            atol=1e-2,
                                            rtol=1e-2)
-                assert isinstance(sample_logprob.decoded_token, str), (
-                    "The token should be decoded by the time it is returned "
-                    " to the user.")
+                if detokenize:
+                    assert isinstance(sample_logprob.decoded_token, str), (
+                        "The token should be decoded by the time it is returned"
+                        " to the user.")
 
     # Test if prompt logprobs are correctly set.
     for vllm_result in vllm_results:
diff --git a/vllm/engine/output_processor/single_step.py b/vllm/engine/output_processor/single_step.py
index 44de1d7ec560..cad44f476f06 100644
--- a/vllm/engine/output_processor/single_step.py
+++ b/vllm/engine/output_processor/single_step.py
@@ -60,10 +60,10 @@ def process_prompt_logprob(self, seq_group: SequenceGroup,
         assert len(outputs) == 1, ("Single step should only has 1 output.")
         output = outputs[0]
         prompt_logprobs = output.prompt_logprobs
-        if (prompt_logprobs is not None
-                and seq_group.sampling_params.detokenize and self.detokenizer):
-            self.detokenizer.decode_prompt_logprobs_inplace(
-                seq_group, prompt_logprobs)
+        if prompt_logprobs is not None:
+            if seq_group.sampling_params.detokenize and self.detokenizer:
+                self.detokenizer.decode_prompt_logprobs_inplace(
+                    seq_group, prompt_logprobs)
             if not seq_group.prompt_logprobs:
                 # The first prompt token's logprob is None because it doesn't
                 # have tokens that are precedent.

From f5d9197d05920e5196c47ff80fc52f4a91c49db7 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 5 Jun 2024 09:18:19 -0700
Subject: [PATCH 35/93] [Misc] Add CustomOp interface for device portability
 (#5255)

---
 tests/kernels/test_activation.py              |  4 +-
 tests/kernels/test_layernorm.py               |  2 +-
 tests/kernels/test_pos_encoding.py            |  7 ++-
 vllm/model_executor/custom_op.py              | 60 +++++++++++++++++++
 vllm/model_executor/layers/activation.py      | 34 +++++++----
 vllm/model_executor/layers/layernorm.py       | 10 ++--
 .../model_executor/layers/rotary_embedding.py | 10 ++--
 7 files changed, 100 insertions(+), 27 deletions(-)
 create mode 100644 vllm/model_executor/custom_op.py

diff --git a/tests/kernels/test_activation.py b/tests/kernels/test_activation.py
index a624c4ca9ee6..a4b9f91c7688 100644
--- a/tests/kernels/test_activation.py
+++ b/tests/kernels/test_activation.py
@@ -44,7 +44,7 @@ def test_act_and_mul(
     elif activation == "gelu_tanh":
         layer = GeluAndMul(approximate="tanh")
     out = layer(x)
-    ref_out = layer._forward(x)
+    ref_out = layer.forward_native(x)
     # The SiLU and GELU implementations are equivalent to the native PyTorch
     # implementations, so we can do exact comparison.
     assert torch.allclose(out, ref_out, atol=0.0, rtol=0.0)
@@ -72,7 +72,7 @@ def test_activation(
     x = torch.randn(num_tokens, d, dtype=dtype)
     layer = activation()
     out = layer(x)
-    ref_out = layer._forward(x)
+    ref_out = layer.forward_native(x)
     assert torch.allclose(out,
                           ref_out,
                           atol=get_default_atol(out),
diff --git a/tests/kernels/test_layernorm.py b/tests/kernels/test_layernorm.py
index 210d59e4f32f..a635e6c12c59 100644
--- a/tests/kernels/test_layernorm.py
+++ b/tests/kernels/test_layernorm.py
@@ -42,7 +42,7 @@ def test_rms_norm(
 
     # NOTE(woosuk): The reference implementation should be executed first
     # because the custom kernel is in-place.
-    ref_out = layer._forward(x, residual)
+    ref_out = layer.forward_native(x, residual)
     out = layer(x, residual)
     # NOTE(woosuk): LayerNorm operators (including RMS) typically have larger
     # numerical errors than other operators because they involve reductions.
diff --git a/tests/kernels/test_pos_encoding.py b/tests/kernels/test_pos_encoding.py
index fbabc02bf9a9..e564e325112a 100644
--- a/tests/kernels/test_pos_encoding.py
+++ b/tests/kernels/test_pos_encoding.py
@@ -64,7 +64,7 @@ def test_rotary_embedding(
 
     # NOTE(woosuk): The reference implementation should be executed first
     # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key)
+    ref_query, ref_key = rope.forward_native(positions, query, key)
     out_query, out_key = rope.forward(positions, query, key)
     # Compare the results.
     assert torch.allclose(out_query,
@@ -121,7 +121,7 @@ def test_batched_rotary_embedding(
 
     # NOTE(woosuk): The reference implementation should be executed first
     # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key)
+    ref_query, ref_key = rope.forward_native(positions, query, key)
     out_query, out_key = rope.forward(positions,
                                       query,
                                       key,
@@ -195,7 +195,8 @@ def test_batched_rotary_embedding_multi_lora(
 
     # NOTE(woosuk): The reference implementation should be executed first
     # because the custom kernel is in-place.
-    ref_query, ref_key = rope._forward(positions, query, key, query_offsets)
+    ref_query, ref_key = rope.forward_native(positions, query, key,
+                                             query_offsets)
     out_query, out_key = rope.forward(positions, query, key,
                                       query_offsets.flatten())
     # Compare the results.
diff --git a/vllm/model_executor/custom_op.py b/vllm/model_executor/custom_op.py
new file mode 100644
index 000000000000..1d49213cd4ab
--- /dev/null
+++ b/vllm/model_executor/custom_op.py
@@ -0,0 +1,60 @@
+import torch.nn as nn
+
+from vllm.utils import is_cpu, is_hip
+
+
+class CustomOp(nn.Module):
+
+    def __init__(self, *args, **kwargs):
+        super().__init__()
+        self._forward_method = self.dispatch_forward()
+
+    def forward(self, *args, **kwargs):
+        return self._forward_method(*args, **kwargs)
+
+    def forward_native(self, *args, **kwargs):
+        """PyTorch-native implementation of the forward method.
+
+        This method is optional. If implemented, it can be used with compilers
+        such as torch.compile or PyTorch XLA. Also, it can be used for testing
+        purposes.
+        """
+        raise NotImplementedError
+
+    def forward_cuda(self, *args, **kwargs):
+        raise NotImplementedError
+
+    def forward_hip(self, *args, **kwargs):
+        # By default, we assume that HIP ops are compatible with CUDA ops.
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_xpu(self, *args, **kwargs):
+        # By default, we assume that XPU ops are compatible with CUDA ops.
+        # NOTE(woosuk): This is a placeholder for future extensions.
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_cpu(self, *args, **kwargs):
+        # By default, we assume that CPU ops are compatible with CUDA ops.
+        return self.forward_cuda(*args, **kwargs)
+
+    def forward_tpu(self, *args, **kwargs):
+        # By default, we assume that TPU ops are compatible with the
+        # PyTorch-native implementation.
+        # NOTE(woosuk): This is a placeholder for future extensions.
+        return self.forward_native(*args, **kwargs)
+
+    def forward_gaudi(self, *args, **kwargs):
+        # By default, we assume that Gaudi ops are compatible with the
+        # PyTorch-native implementation.
+        # NOTE(woosuk): This is a placeholder for future extensions.
+        return self.forward_native(*args, **kwargs)
+
+    def dispatch_forward(self):
+        # NOTE(woosuk): Here we assume that vLLM was built for only one
+        # specific backend. Currently, we do not support dynamic dispatching.
+        if is_hip():
+            return self.forward_hip
+        elif is_cpu():
+            return self.forward_cpu
+        else:
+            return self.forward_cuda
diff --git a/vllm/model_executor/layers/activation.py b/vllm/model_executor/layers/activation.py
index d101aa323b0e..4d076421f9d2 100644
--- a/vllm/model_executor/layers/activation.py
+++ b/vllm/model_executor/layers/activation.py
@@ -6,14 +6,14 @@
 import torch.nn as nn
 import torch.nn.functional as F
 
-from vllm import _custom_ops as ops
 from vllm.distributed import (divide, get_tensor_model_parallel_rank,
                               get_tensor_model_parallel_world_size)
+from vllm.model_executor.custom_op import CustomOp
 from vllm.model_executor.layers.quantization import QuantizationConfig
 from vllm.model_executor.utils import set_weight_attrs
 
 
-class SiluAndMul(nn.Module):
+class SiluAndMul(CustomOp):
     """An activation function for SwiGLU.
 
     The function computes x -> silu(x[:d]) * x[d:] where d = x.shape[-1] // 2.
@@ -23,12 +23,14 @@ class SiluAndMul(nn.Module):
         return: (num_tokens, d) or (batch_size, seq_len, d)
     """
 
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         d = x.shape[-1] // 2
         return F.silu(x[..., :d]) * x[..., d:]
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
+
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -36,7 +38,7 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
         return out
 
 
-class GeluAndMul(nn.Module):
+class GeluAndMul(CustomOp):
     """An activation function for GeGLU.
 
     The function computes x -> GELU(x[:d]) * x[d:] where d = x.shape[-1] // 2.
@@ -52,12 +54,14 @@ def __init__(self, approximate: str = "none"):
         if approximate not in ("none", "tanh"):
             raise ValueError(f"Unknown approximate mode: {approximate}")
 
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         d = x.shape[-1] // 2
         return F.gelu(x[..., :d], approximate=self.approximate) * x[..., d:]
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
+
         d = x.shape[-1] // 2
         output_shape = (x.shape[:-1] + (d, ))
         out = torch.empty(output_shape, dtype=x.dtype, device=x.device)
@@ -71,28 +75,32 @@ def extra_repr(self) -> str:
         return f'approximate={repr(self.approximate)}'
 
 
-class NewGELU(nn.Module):
+class NewGELU(CustomOp):
 
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         c = math.sqrt(2.0 / math.pi)
         return 0.5 * x * (1.0 + torch.tanh(c *
                                            (x + 0.044715 * torch.pow(x, 3.0))))
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
+
         out = torch.empty_like(x)
         ops.gelu_new(out, x)
         return out
 
 
-class FastGELU(nn.Module):
+class FastGELU(CustomOp):
 
-    def _forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_native(self, x: torch.Tensor) -> torch.Tensor:
         """PyTorch-native implementation equivalent to forward()."""
         return 0.5 * x * (1.0 + torch.tanh(x * 0.7978845608 *
                                            (1.0 + 0.044715 * x * x)))
 
-    def forward(self, x: torch.Tensor) -> torch.Tensor:
+    def forward_cuda(self, x: torch.Tensor) -> torch.Tensor:
+        from vllm import _custom_ops as ops
+
         out = torch.empty_like(x)
         ops.gelu_fast(out, x)
         return out
diff --git a/vllm/model_executor/layers/layernorm.py b/vllm/model_executor/layers/layernorm.py
index 8de079415898..4533adf8f83a 100644
--- a/vllm/model_executor/layers/layernorm.py
+++ b/vllm/model_executor/layers/layernorm.py
@@ -4,10 +4,10 @@
 import torch
 import torch.nn as nn
 
-from vllm import _custom_ops as ops
+from vllm.model_executor.custom_op import CustomOp
 
 
-class RMSNorm(nn.Module):
+class RMSNorm(CustomOp):
     """Root mean square normalization.
 
     Computes x -> w * x / sqrt(E[x^2] + eps) where w is the learned weight.
@@ -23,7 +23,7 @@ def __init__(
         self.weight = nn.Parameter(torch.ones(hidden_size))
         self.variance_epsilon = eps
 
-    def _forward(
+    def forward_native(
         self,
         x: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
@@ -43,11 +43,13 @@ def _forward(
         else:
             return x, residual
 
-    def forward(
+    def forward_cuda(
         self,
         x: torch.Tensor,
         residual: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        from vllm import _custom_ops as ops
+
         if residual is not None:
             ops.fused_add_rms_norm(
                 x,
diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py
index d03903d206d3..d2652106b844 100644
--- a/vllm/model_executor/layers/rotary_embedding.py
+++ b/vllm/model_executor/layers/rotary_embedding.py
@@ -27,7 +27,7 @@
 import torch
 import torch.nn as nn
 
-from vllm import _custom_ops as ops
+from vllm.model_executor.custom_op import CustomOp
 
 
 def _rotate_neox(x: torch.Tensor) -> torch.Tensor:
@@ -43,7 +43,7 @@ def _rotate_gptj(x: torch.Tensor) -> torch.Tensor:
     return x.flatten(-2)
 
 
-class RotaryEmbedding(nn.Module):
+class RotaryEmbedding(CustomOp):
     """Original rotary positional embedding."""
 
     def __init__(
@@ -93,7 +93,7 @@ def _compute_cos_sin_cache(self) -> torch.Tensor:
         cache = torch.cat((cos, sin), dim=-1)
         return cache
 
-    def _forward(
+    def forward_native(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
@@ -138,13 +138,15 @@ def _forward(
         key = key.flatten(-2)
         return query, key
 
-    def forward(
+    def forward_cuda(
         self,
         positions: torch.Tensor,
         query: torch.Tensor,
         key: torch.Tensor,
         offsets: Optional[torch.Tensor] = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        from vllm import _custom_ops as ops
+
         self.cos_sin_cache = self.cos_sin_cache.to(positions.device,
                                                    dtype=query.dtype)
         # ops.rotary_embedding()/batched_rotary_embedding()

From bbfee0c2befcb45173673bf196b8307481b7ac58 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 5 Jun 2024 09:18:59 -0700
Subject: [PATCH 36/93] [Misc] Fix docstring of get_attn_backend (#5271)

---
 vllm/attention/selector.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/vllm/attention/selector.py b/vllm/attention/selector.py
index 9ceda3431b89..7253483f9a0b 100644
--- a/vllm/attention/selector.py
+++ b/vllm/attention/selector.py
@@ -31,15 +31,14 @@ def get_attn_backend(
     block_size: int,
     is_blocksparse: bool = False,
 ) -> Type[AttentionBackend]:
+    """Selects which attention backend to use and lazily imports it."""
 
     if is_blocksparse:
         logger.info("Using BlocksparseFlashAttention backend.")
         from vllm.attention.backends.blocksparse_attn import (
             BlocksparseFlashAttentionBackend)
         return BlocksparseFlashAttentionBackend
-    """Determine which attention backend to use and only import
-    the selected backend module.
-    """
+
     backend = which_attn_to_use(num_heads, head_size, num_kv_heads,
                                 sliding_window, dtype, kv_cache_dtype,
                                 block_size)

From 47c12560a4b282b8de818c59b72e829a9f5b5ee5 Mon Sep 17 00:00:00 2001
From: tomeras91 <57313761+tomeras91@users.noreply.github.com>
Date: Wed, 5 Jun 2024 19:32:58 +0300
Subject: [PATCH 37/93] [Frontend] OpenAI API server: Add `add_special_tokens`
 to ChatCompletionRequest (default False) (#5278)

---
 vllm/entrypoints/openai/protocol.py       |  9 +++++++++
 vllm/entrypoints/openai/serving_chat.py   |  4 +++-
 vllm/entrypoints/openai/serving_engine.py | 14 ++++++++------
 3 files changed, 20 insertions(+), 7 deletions(-)

diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 15bdae38d1d4..11ac28e758c3 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -176,6 +176,15 @@ class ChatCompletionRequest(OpenAIBaseModel):
          "This is a parameter used by chat template in tokenizer config of the "
          "model."),
     )
+    add_special_tokens: Optional[bool] = Field(
+        default=False,
+        description=(
+            "If true, special tokens (e.g. BOS) will be added to the prompt "
+            "on top of what is added by the chat template. "
+            "For most models, the chat template takes care of adding the "
+            "special tokens so this should be set to False (as is the "
+            "default)."),
+    )
     include_stop_str_in_output: Optional[bool] = Field(
         default=False,
         description=(
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 7b52e1095246..afd87f49c1c4 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -163,7 +163,9 @@ async def create_chat_completion(
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
             prompt_ids, prompt_text = self._validate_prompt_and_tokenize(
-                request, prompt=prompt, add_special_tokens=False)
+                request,
+                prompt=prompt,
+                add_special_tokens=request.add_special_tokens)
             sampling_params = request.to_sampling_params()
             lora_request = self._maybe_get_lora(request)
             decoding_config = await self.engine.get_decoding_config()
diff --git a/vllm/entrypoints/openai/serving_engine.py b/vllm/entrypoints/openai/serving_engine.py
index ae659d19c878..6b5a62efc7f2 100644
--- a/vllm/entrypoints/openai/serving_engine.py
+++ b/vllm/entrypoints/openai/serving_engine.py
@@ -131,7 +131,8 @@ def _validate_prompt_and_tokenize(
             prompt_ids: Optional[List[int]] = None,
             truncate_prompt_tokens: Optional[Annotated[int,
                                                        Field(ge=1)]] = None,
-            add_special_tokens: bool = True) -> Tuple[List[int], str]:
+            add_special_tokens: Optional[bool] = True
+    ) -> Tuple[List[int], str]:
         if not (prompt or prompt_ids):
             raise ValueError("Either prompt or prompt_ids should be provided.")
         if (prompt and prompt_ids):
@@ -139,11 +140,12 @@ def _validate_prompt_and_tokenize(
                 "Only one of prompt or prompt_ids should be provided.")
 
         if prompt_ids is None:
-            # When using OpenAIServingChat for chat completions, the
-            # special tokens (e.g., BOS) have already been added by the
-            # chat template. Therefore, we do not need to add them again.
-            # Set add_special_tokens to False to avoid adding the BOS tokens
-            # again.
+            # When using OpenAIServingChat for chat completions, for
+            # most models the special tokens (e.g., BOS) have already
+            # been added by the chat template. Therefore, we do not
+            # need to add them again.
+            # Set add_special_tokens to False (by default) to avoid
+            # adding the BOS tokens again.
             tokenizer_kwargs: Dict[str, Any] = {
                 "add_special_tokens": add_special_tokens
             }

From d619bd95427d1f7103be3a7b0f6be14a9462c351 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 5 Jun 2024 11:42:08 -0500
Subject: [PATCH 38/93] [CI] Add nightly benchmarks (#5260)

---
 .../nightly-benchmarks/kickoff-pipeline.sh    | 26 +++++++++++++
 .buildkite/nightly-benchmarks/sample.yaml     | 39 +++++++++++++++++++
 2 files changed, 65 insertions(+)
 create mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh
 create mode 100644 .buildkite/nightly-benchmarks/sample.yaml

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
new file mode 100755
index 000000000000..d3bf3b72980a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -0,0 +1,26 @@
+#!/usr/bin/env bash
+
+set -euo pipefail
+
+# Install system packages
+apt update
+apt install -y curl jq
+
+# Install minijinja for templating
+curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
+source $HOME/.cargo/env
+
+# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
+if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
+  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
+
+  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
+    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
+  else
+    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
+    exit 0
+  fi
+fi
+
+# Upload sample.yaml
+buildkite-agent pipeline upload .buildkite/nightly-benchmarks/sample.yaml
diff --git a/.buildkite/nightly-benchmarks/sample.yaml b/.buildkite/nightly-benchmarks/sample.yaml
new file mode 100644
index 000000000000..50e6e8207218
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/sample.yaml
@@ -0,0 +1,39 @@
+steps:
+  # NOTE(simon): You can create separate blocks for different jobs
+  - label: "A100: NVIDIA SMI"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          # - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:$BUILDKITE_COMMIT
+          # TODO(simon): check latest main branch or use the PR image.
+          - image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+            command:
+            - bash -c 'nvidia-smi && nvidia-smi topo -m && pwd && ls'
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  # TODO(simon): bring H100 online
+  # - label: "H100: NVIDIA SMI"
+  #   agents:
+  #     queue: H100
+  #   plugins:
+  #   - docker#v5.11.0:
+  #       image: us-central1-docker.pkg.dev/vllm-405802/vllm-ci-test-repo/vllm-test:45c35f0d58f4508bf43bd6af1d3d0d0ec0c915e6
+  #       command:
+  #       - bash -c 'nvidia-smi && nvidia-smi topo -m'
+  #       propagate-environment: true
+  #       ipc: host
+  #       gpus: all
+

From 2cf59111a5eb963532e30c410181c137a6524b75 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 5 Jun 2024 13:17:51 -0400
Subject: [PATCH 39/93] [misc] benchmark_serving.py -- add ITL results and
 tweak TPOT results (#5263)

---
 .buildkite/run-benchmarks.sh    |  2 +-
 benchmarks/benchmark_serving.py | 23 ++++++++++++++++++++++-
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index 1efc96395933..6283cd106401 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -50,7 +50,7 @@ echo "### Serving Benchmarks" >> benchmark_results.md
 sed -n '1p' benchmark_serving.txt >> benchmark_results.md # first line
 echo "" >> benchmark_results.md
 echo '```' >> benchmark_results.md
-tail -n 20 benchmark_serving.txt >> benchmark_results.md # last 20 lines
+tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 
 # if the agent binary is not found, skip uploading the results, exit 0
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index f3d71de775f8..4112a3272518 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -56,6 +56,9 @@ class BenchmarkMetrics:
     mean_tpot_ms: float
     median_tpot_ms: float
     p99_tpot_ms: float
+    mean_itl_ms: float
+    median_itl_ms: float
+    p99_itl_ms: float
 
 
 def sample_sharegpt_requests(
@@ -200,16 +203,24 @@ def calculate_metrics(
     actual_output_lens = []
     total_input = 0
     completed = 0
+    itls = []
     tpots = []
     ttfts = []
     for i in range(len(outputs)):
         if outputs[i].success:
-            output_len = len(tokenizer(outputs[i].generated_text).input_ids)
+            # We use the tokenizer to count the number of output tokens for all
+            # serving backends instead of looking at len(outputs[i].itl) since
+            # multiple output tokens may be bundled together.
+            # Note: this may inflate the output token count slightly.
+            output_len = len(
+                tokenizer(outputs[i].generated_text,
+                          add_special_tokens=False).input_ids)
             actual_output_lens.append(output_len)
             total_input += input_requests[i][1]
             if output_len > 1:
                 tpots.append(
                     (outputs[i].latency - outputs[i].ttft) / (output_len - 1))
+            itls += outputs[i].itl
             ttfts.append(outputs[i].ttft)
             completed += 1
         else:
@@ -234,6 +245,9 @@ def calculate_metrics(
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
+        mean_itl_ms=np.mean(itls or 0) * 1000,
+        median_itl_ms=np.median(itls or 0) * 1000,
+        p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
     return metrics, actual_output_lens
@@ -333,6 +347,10 @@ async def benchmark(
     print("{:<40} {:<10.2f}".format("Median TPOT (ms):",
                                     metrics.median_tpot_ms))
     print("{:<40} {:<10.2f}".format("P99 TPOT (ms):", metrics.p99_tpot_ms))
+    print("{s:{c}^{n}}".format(s='Inter-token Latency', n=50, c='-'))
+    print("{:<40} {:<10.2f}".format("Mean ITL (ms):", metrics.mean_itl_ms))
+    print("{:<40} {:<10.2f}".format("Median ITL (ms):", metrics.median_itl_ms))
+    print("{:<40} {:<10.2f}".format("P99 ITL (ms):", metrics.p99_itl_ms))
     print("=" * 50)
 
     result = {
@@ -349,6 +367,9 @@ async def benchmark(
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
+        "mean_itl_ms": metrics.mean_itl_ms,
+        "median_itl_ms": metrics.median_itl_ms,
+        "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],

From 8f5fafa94cdc5ac0f9517f4dcc389e929238f02d Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Wed, 5 Jun 2024 13:44:15 -0400
Subject: [PATCH 40/93] [Kernel] Add GPU architecture guards to the CUTLASS
 w8a8 kernels to reduce binary size (#5157)

Co-authored-by: Cody Yu <hao.yu.cody@gmail.com>
---
 .../cutlass_w8a8/scaled_mm_dq_c2x.cu          | 105 ++++++++++++------
 .../cutlass_w8a8/scaled_mm_dq_c3x.cu          |  19 +++-
 2 files changed, 87 insertions(+), 37 deletions(-)

diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
index 65870df0e8fc..088fee4783fa 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
@@ -48,9 +48,44 @@ using namespace cute;
 
 namespace {
 
-template <typename Arch, typename ElementAB_, typename ElementD_,
-          typename TileShape, typename WarpShape, typename InstructionShape,
-          int32_t MainLoopStages>
+// Wrappers for the GEMM kernel that are used to guard against compilation on
+// architectures that will never use the kernel. The purpose of this is to
+// reduce the size of the compiled binary.
+// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+// into code that will be executed on the device where it is defined.
+template <typename Kernel>
+struct enable_sm75_to_sm80 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 750 && __CUDA_ARCH__ < 800
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm80_to_sm89 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 800 && __CUDA_ARCH__ < 890
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Kernel>
+struct enable_sm89_to_sm90 : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE static void invoke(Args&&... args) {
+#if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 890 && __CUDA_ARCH__ < 900
+    Kernel::invoke(std::forward<Args>(args)...);
+#endif
+  }
+};
+
+template <typename Arch, template <typename> typename ArchGuard,
+          typename ElementAB_, typename ElementD_, typename TileShape,
+          typename WarpShape, typename InstructionShape, int32_t MainLoopStages>
 struct cutlass_2x_gemm {
   using ElementAB = ElementAB_;
   using ElementD = ElementD_;
@@ -101,7 +136,7 @@ struct cutlass_2x_gemm {
   using RowMajor = typename cutlass::layout::RowMajor;
   using ColumnMajor = typename cutlass::layout::ColumnMajor;
   using KernelType = 
-    typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
+    ArchGuard<typename cutlass::gemm::kernel::DefaultGemmWithVisitor<
       ElementAB, RowMajor, cutlass::ComplexTransform::kNone, 16, 
       ElementAB, ColumnMajor, cutlass::ComplexTransform::kNone, 16, 
       float, cutlass::layout::RowMajor, 4,
@@ -112,7 +147,7 @@ struct cutlass_2x_gemm {
       cutlass::gemm::threadblock::ThreadblockSwizzleStreamK,
       MainLoopStages, Operator,
       1 /* epilogue stages */
-      >::GemmKernel;
+      >::GemmKernel>;
   // clang-format on
 
   using Op = cutlass::gemm::device::GemmUniversalAdapter<KernelType>;
@@ -208,16 +243,16 @@ void cutlass_scaled_mm_dq_sm75(torch::Tensor& out, torch::Tensor const& a,
   using InstructionShape = typename cutlass::gemm::GemmShape<8, 8, 16>;
 
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<
-        cutlass_2x_gemm<cutlass::arch::Sm75, int8_t, cutlass::bfloat16_t,
-                        TileShape, WarpShape, InstructionShape, 2>>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::bfloat16_t,
+        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
+                                                    b_scales);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<
-        cutlass_2x_gemm<cutlass::arch::Sm75, int8_t, cutlass::half_t, TileShape,
-                        WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
-                                                         b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+        cutlass::arch::Sm75, enable_sm75_to_sm80, int8_t, cutlass::half_t,
+        TileShape, WarpShape, InstructionShape, 2>>(out, a, b, a_scales,
+                                                    b_scales);
   }
 }
 
@@ -235,16 +270,16 @@ void cutlass_scaled_mm_dq_sm80(torch::Tensor& out, torch::Tensor const& a,
   using InstructionShape = typename cutlass::gemm::GemmShape<16, 8, 32>;
 
   if (out.dtype() == torch::kBFloat16) {
-    return cutlass_scaled_mm_dq_dispatcher<
-        cutlass_2x_gemm<cutlass::arch::Sm80, int8_t, cutlass::bfloat16_t,
-                        TileShape, WarpShape, InstructionShape, 5>>(
-        out, a, b, a_scales, b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::bfloat16_t,
+        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                    b_scales);
   } else {
     TORCH_CHECK(out.dtype() == torch::kFloat16);
-    return cutlass_scaled_mm_dq_dispatcher<
-        cutlass_2x_gemm<cutlass::arch::Sm80, int8_t, cutlass::half_t, TileShape,
-                        WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                         b_scales);
+    return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+        cutlass::arch::Sm80, enable_sm80_to_sm89, int8_t, cutlass::half_t,
+        TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                    b_scales);
   }
 }
 
@@ -263,16 +298,16 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
     TORCH_CHECK(b.dtype() == torch::kInt8);
 
     if (out.dtype() == torch::kBFloat16) {
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_2x_gemm<cutlass::arch::Sm89, int8_t, cutlass::bfloat16_t,
-                          TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::bfloat16_t,
+          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                      b_scales);
     } else {
       assert(out.dtype() == torch::kFloat16);
-      return cutlass_scaled_mm_dq_dispatcher<
-          cutlass_2x_gemm<cutlass::arch::Sm89, int8_t, cutlass::half_t,
-                          TileShape, WarpShape, InstructionShape, 5>>(
-          out, a, b, a_scales, b_scales);
+      return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
+          cutlass::arch::Sm89, enable_sm89_to_sm90, int8_t, cutlass::half_t,
+          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
+                                                      b_scales);
     }
   } else {
     TORCH_CHECK(a.dtype() == torch::kFloat8_e4m3fn);
@@ -280,15 +315,15 @@ void cutlass_scaled_mm_dq_sm89(torch::Tensor& out, torch::Tensor const& a,
 
     if (out.dtype() == torch::kBFloat16) {
       return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, cutlass::float_e4m3_t, cutlass::bfloat16_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
+          cutlass::bfloat16_t, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
     } else {
       TORCH_CHECK(out.dtype() == torch::kFloat16);
       return cutlass_scaled_mm_dq_dispatcher<cutlass_2x_gemm<
-          cutlass::arch::Sm89, cutlass::float_e4m3_t, cutlass::half_t,
-          TileShape, WarpShape, InstructionShape, 5>>(out, a, b, a_scales,
-                                                      b_scales);
+          cutlass::arch::Sm89, enable_sm89_to_sm90, cutlass::float_e4m3_t,
+          cutlass::half_t, TileShape, WarpShape, InstructionShape, 5>>(
+          out, a, b, a_scales, b_scales);
     }
   }
 }
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
index 4c1aec03a3ca..8fc4ba662ecd 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
@@ -56,6 +56,21 @@ uint32_t next_pow_2(uint32_t const num) {
   return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
 }
 
+// A wrapper for the GEMM kernel that is used to guard against compilation on
+// architectures that will never use the kernel. The purpose of this is to
+// reduce the size of the compiled binary.
+// __CUDA_ARCH__ is not defined in host code, so this lets us smuggle the ifdef
+// into code that will be executed on the device where it is defined.
+template <typename Kernel>
+struct enable_sm90_or_later : Kernel {
+  template <typename... Args>
+  CUTLASS_DEVICE void operator()(Args&&... args) {
+  #if defined __CUDA_ARCH__ && __CUDA_ARCH__ >= 900
+    Kernel::operator()(std::forward<Args>(args)...);
+  #endif
+  }
+};
+
 template <typename ElementAB_, typename ElementD_, typename TileShape,
           typename ClusterShape, typename KernelSchedule,
           typename EpilogueSchedule>
@@ -126,9 +141,9 @@ struct cutlass_3x_gemm {
           KernelSchedule>::CollectiveOp;
   // clang-format on
 
-  using KernelType = cutlass::gemm::kernel::GemmUniversal<
+  using KernelType = enable_sm90_or_later<cutlass::gemm::kernel::GemmUniversal<
       cute::Shape<int, int, int, int>, CollectiveMainloop, CollectiveEpilogue,
-      cutlass::gemm::PersistentScheduler>;
+      cutlass::gemm::PersistentScheduler>>;
 
   struct GemmKernel : public KernelType {};
 };

From 0770930c1cd506aaadf95c068bba0fa1bf6be60f Mon Sep 17 00:00:00 2001
From: Cody Yu <hao.yu.cody@gmail.com>
Date: Wed, 5 Jun 2024 10:58:50 -0700
Subject: [PATCH 41/93] [Model] Correct Mixtral FP8 checkpoint loading (#5231)

---
 .../model_executor/layers/quantization/fp8.py |  7 +-
 vllm/model_executor/models/mixtral.py         | 82 +++++++++++++------
 2 files changed, 62 insertions(+), 27 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 44161bde73f4..a4c067375d21 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -300,14 +300,15 @@ def all_close_1d(x: torch.Tensor) -> bool:
 
 
 def per_tensor_quantize(tensor: torch.Tensor,
-                        inv_scale: float) -> torch.Tensor:
+                        inv_scale: Union[float, torch.Tensor]) -> torch.Tensor:
     finfo = torch.finfo(torch.float8_e4m3fn)
     qweight = (tensor / inv_scale).clamp(min=finfo.min, max=finfo.max)
     return qweight.to(torch.float8_e4m3fn)
 
 
-def per_tensor_dequantize(tensor: torch.Tensor,
-                          inv_scale: float) -> torch.Tensor:
+def per_tensor_dequantize(
+        tensor: torch.Tensor, inv_scale: Union[float,
+                                               torch.Tensor]) -> torch.Tensor:
     fake_qweight = tensor.to(torch.float16)
     dq_weight = fake_qweight * inv_scale
     return dq_weight
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 10f823b229fd..234b213d54d1 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -41,7 +41,9 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.quantization.base_config import (
     QuantizationConfig)
-from vllm.model_executor.layers.quantization.fp8 import Fp8Config
+from vllm.model_executor.layers.quantization.fp8 import (Fp8Config,
+                                                         per_tensor_dequantize,
+                                                         per_tensor_quantize)
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.sampler import Sampler
 from vllm.model_executor.layers.vocab_parallel_embedding import (
@@ -98,16 +100,16 @@ def __init__(
         if self.use_fp8 and self.quant_config.is_checkpoint_fp8_serialized:
             params_dtype = torch.float8_e4m3fn
 
-        self.w13_weight = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        2 * self.intermediate_size,
-                        self.hidden_size,
-                        dtype=params_dtype))
-        self.w2_weight = nn.Parameter(
-            torch.empty(self.num_total_experts,
-                        self.hidden_size,
-                        self.intermediate_size,
-                        dtype=params_dtype))
+        self.w13_weight = nn.Parameter(torch.empty(self.num_total_experts,
+                                                   2 * self.intermediate_size,
+                                                   self.hidden_size,
+                                                   dtype=params_dtype),
+                                       requires_grad=False)
+        self.w2_weight = nn.Parameter(torch.empty(self.num_total_experts,
+                                                  self.hidden_size,
+                                                  self.intermediate_size,
+                                                  dtype=params_dtype),
+                                      requires_grad=False)
 
         set_weight_attrs(self.w13_weight, {
             "weight_loader": self.weight_loader,
@@ -124,7 +126,10 @@ def __init__(
 
         if self.use_fp8:
             # WEIGHT_SCALE (for fp8)
+            # Allocate 2 scales for w1 and w3 respectively.
+            # They will be combined into a single scale after weight loading.
             self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
+                                                     2,
                                                      dtype=torch.float32),
                                           requires_grad=False)
             self.w2_scale = nn.Parameter(torch.ones(self.num_total_experts,
@@ -148,11 +153,11 @@ def __init__(
                     raise ValueError(
                         "Found static activation scheme for checkpoint that "
                         "was not serialized fp8.")
-                self.a13_scale = nn.Parameter(torch.zeros(
+                self.a13_scale = nn.Parameter(torch.ones(
                     self.num_total_experts, dtype=torch.float32),
                                               requires_grad=False)
-                self.a2_scale = nn.Parameter(torch.zeros(
-                    self.num_total_experts, dtype=torch.float32),
+                self.a2_scale = nn.Parameter(torch.ones(self.num_total_experts,
+                                                        dtype=torch.float32),
                                              requires_grad=False)
 
                 set_weight_attrs(self.a13_scale, {
@@ -177,14 +182,20 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
             param_data[expert_id, :, :] = loaded_weight[:, shard]
 
         # Loading scales
-        if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
+        if "act_scale" in weight_name or "w2.weight_scale" in weight_name:
             if param_data[expert_id] != 1 and (param_data[expert_id] -
                                                loaded_weight).abs() > 1e-5:
                 raise ValueError(
-                    "input_scales of w1 and w3 of a layer "
+                    "act_scales of w1 and w3 of a layer "
                     f"must be equal. But got {param_data[expert_id]} "
                     f"vs. {loaded_weight}")
             param_data[expert_id] = loaded_weight
+        elif "weight_scale" in weight_name:
+            # We have to keep the weight scales of w1 and w3 because
+            # we need to re-quantize w1/w3 weights after weight loading.
+            assert "w1" in weight_name or "w3" in weight_name
+            shard_id = 0 if "w1" in weight_name else 1
+            param_data[expert_id][shard_id] = loaded_weight
 
     def process_weights_after_loading(self):
         # Fp8 is the only case where we need to process after loading.
@@ -197,6 +208,12 @@ def process_weights_after_loading(self):
                                           dtype=torch.float8_e4m3fn)
             w2_weight = torch.empty_like(self.w2_weight.data,
                                          dtype=torch.float8_e4m3fn)
+
+            # Re-initialize w13_scale because we directly quantize
+            # merged w13 weights and generate a single scaling factor.
+            self.w13_scale = nn.Parameter(torch.ones(self.num_total_experts,
+                                                     dtype=torch.float32),
+                                          requires_grad=False)
             for expert in range(self.num_total_experts):
                 w13_weight[expert, :, :], self.w13_scale[
                     expert] = ops.scaled_fp8_quant(
@@ -208,9 +225,9 @@ def process_weights_after_loading(self):
             self.w2_weight = nn.Parameter(w2_weight, requires_grad=False)
 
         else:
-            # If checkpoint is fp8 + static, cleanup input_scales.
-            #   Since state_dict has an input_scale per expert but our kernels
-            #   are passed one input_scale shared across all experts.
+            # If checkpoint is fp8 + static, clean up act_scales:
+            #   the state_dict has an act_scale per expert, but our kernels
+            #   are passed one act_scale shared across all experts.
             if self.quant_config.activation_scheme == "static":
                 if self.a13_scale is None or self.a2_scale is None:
                     raise ValueError(
@@ -220,14 +237,31 @@ def process_weights_after_loading(self):
                 if (not all_close_1d(self.a13_scale)
                         or not all_close_1d(self.a2_scale)):
                     print_warning_once(
-                        "Found input_scales that are not equal for "
+                        "Found act_scales that are not equal for "
                         "fp8 MoE layer. Using the maximum across experts "
                         "for each layer. ")
 
-            self.a13_scale = nn.Parameter(self.a13_scale.max(),
-                                          requires_grad=False)
-            self.a2_scale = nn.Parameter(self.a2_scale.max(),
-                                         requires_grad=False)
+                self.a13_scale = nn.Parameter(self.a13_scale.max(),
+                                              requires_grad=False)
+                self.a2_scale = nn.Parameter(self.a2_scale.max(),
+                                             requires_grad=False)
+
+            assert self.w13_scale is not None
+            shard_size = self.intermediate_size
+            max_w13_scales = self.w13_scale.max(dim=1).values
+            for expert_id in range(self.num_total_experts):
+                start = 0
+                for shard_id in range(2):
+                    dq_weight = per_tensor_dequantize(
+                        self.w13_weight[expert_id][start:start +
+                                                   shard_size, :],
+                        self.w13_scale[expert_id][shard_id])
+                    self.w13_weight[expert_id][
+                        start:start + shard_size, :] = per_tensor_quantize(
+                            dq_weight, max_w13_scales[expert_id])
+                    start += shard_size
+
+            self.w13_scale = nn.Parameter(max_w13_scales, requires_grad=False)
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         num_tokens, hidden_size = hidden_states.shape

From 8310e3492455fe3c21b905fe1595de13715278f7 Mon Sep 17 00:00:00 2001
From: DriverSong <31926998+DriverSong@users.noreply.github.com>
Date: Thu, 6 Jun 2024 01:59:02 +0800
Subject: [PATCH 42/93] [BugFix] Apply get_cached_tokenizer to the tokenizer
 setter of LLM (#5207)

Co-authored-by: qiujiawei9 <qiujiawei9@jd.com>
---
 vllm/entrypoints/llm.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index ad9404898d53..b6173a9362d8 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -14,6 +14,7 @@
 from vllm.outputs import EmbeddingRequestOutput, RequestOutput
 from vllm.pooling_params import PoolingParams
 from vllm.sampling_params import SamplingParams
+from vllm.transformers_utils.tokenizer import get_cached_tokenizer
 from vllm.usage.usage_lib import UsageContext
 from vllm.utils import Counter, deprecate_kwargs
 
@@ -161,7 +162,14 @@ def set_tokenizer(
         self,
         tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
     ) -> None:
-        self.llm_engine.tokenizer.tokenizer = tokenizer
+        # CachedTokenizer is a dynamically created class, so the only way
+        # to detect it is by class name. A user-defined tokenizer whose
+        # class name starts with 'Cached' will be misjudged as cached.
+        if tokenizer.__class__.__name__.startswith("Cached"):
+            self.llm_engine.tokenizer.tokenizer = tokenizer
+        else:
+            self.llm_engine.tokenizer.tokenizer = get_cached_tokenizer(
+                tokenizer)
 
     @overload  # LEGACY: single (prompt + optional token ids)
     def generate(

From 6e32dd4f4c7dbcd2298d0a43ceb214743d0165d6 Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Wed, 5 Jun 2024 10:59:14 -0700
Subject: [PATCH 43/93] [Kernel] Re-tune Mixtral MoE configurations for FP8 on
 H100 (#5238)

---
 benchmarks/kernels/benchmark_moe.py           |   3 +-
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 104 +++++++++---------
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json |  62 +++++------
 3 files changed, 85 insertions(+), 84 deletions(-)

diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py
index 2edc63142d02..be5dd32bd6f9 100644
--- a/benchmarks/kernels/benchmark_moe.py
+++ b/benchmarks/kernels/benchmark_moe.py
@@ -255,7 +255,8 @@ def main(args: argparse.Namespace):
 
     if args.batch_size is None:
         batch_sizes = [
-            1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 1536, 2048, 3072, 4096
+            1, 2, 4, 8, 16, 24, 32, 48, 64, 96, 128, 256, 512, 1024, 1536,
+            2048, 3072, 4096
         ]
     else:
         batch_sizes = [args.batch_size]
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
index 5b78c30f08b6..673bae2ba8ef 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=3584,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -1,113 +1,113 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 4
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 5
     },
     "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
+        "num_warps": 4,
+        "num_stages": 4
     },
     "4": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 4
     },
     "8": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 32,
-        "num_warps": 8,
-        "num_stages": 2
+        "num_warps": 4,
+        "num_stages": 4
     },
     "16": {
-        "BLOCK_SIZE_M": 256,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 5
+        "num_warps": 4,
+        "num_stages": 3
     },
     "24": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "32": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "64": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 3
     },
     "96": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 32,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 3
     },
     "128": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
-        "num_stages": 5
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "256": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
         "num_stages": 3
     },
     "512": {
         "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 8,
-        "num_stages": 3
+        "num_stages": 4
     },
     "1024": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 32,
         "num_warps": 8,
         "num_stages": 4
     },
@@ -115,7 +115,7 @@
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 16,
         "num_warps": 8,
         "num_stages": 4
     },
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
index 75f8b0017b9c..918f6839620c 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=7168,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -1,67 +1,67 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 4
     },
     "2": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
-        "num_stages": 4
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 5
     },
     "4": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 5
     },
     "8": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "16": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 5
     },
     "24": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 5
+        "num_stages": 3
     },
     "32": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 4
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "64": {
         "BLOCK_SIZE_M": 64,
@@ -81,19 +81,19 @@
     },
     "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 4
     },
     "256": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 5
+        "GROUP_SIZE_M": 64,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "512": {
         "BLOCK_SIZE_M": 128,
@@ -107,7 +107,7 @@
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 32,
         "num_warps": 8,
         "num_stages": 4
     },

From c2c62c8529bd5a6e67a113a27b8179dcf1458b0f Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 5 Jun 2024 13:02:56 -0500
Subject: [PATCH 44/93] [Docs] Add Sequoia as sponsors (#5287)

---
 docs/source/community/sponsors.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index d167b66267a4..17586125fd6f 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -15,6 +15,7 @@ vLLM is a community project. Our compute resources for development and testing a
 - Dropbox
 - Lambda Lab
 - NVIDIA
+- Sequoia Capital
 - Replicate
 - Roblox
 - RunPod

From ee3104be0c1cc20e18bcf1c59944a5cc3b778cd0 Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 5 Jun 2024 14:53:05 -0700
Subject: [PATCH 45/93] [Speculative Decoding] Add `ProposerWorkerBase`
 abstract class (#5252)

---
 tests/spec_decode/test_dynamic_spec_decode.py |  4 +-
 tests/spec_decode/test_multi_step_worker.py   | 21 +++++----
 tests/spec_decode/test_ngram_worker.py        | 21 +++++----
 vllm/spec_decode/interfaces.py                |  2 +-
 vllm/spec_decode/multi_step_worker.py         | 15 ++++---
 vllm/spec_decode/ngram_worker.py              | 33 +++-----------
 vllm/spec_decode/proposer_worker_base.py      | 44 +++++++++++++++++++
 vllm/spec_decode/spec_decode_worker.py        |  5 ++-
 vllm/spec_decode/top1_proposer.py             |  6 +--
 9 files changed, 91 insertions(+), 60 deletions(-)
 create mode 100644 vllm/spec_decode/proposer_worker_base.py

diff --git a/tests/spec_decode/test_dynamic_spec_decode.py b/tests/spec_decode/test_dynamic_spec_decode.py
index 48fa862b2e41..bb6d1c23a003 100644
--- a/tests/spec_decode/test_dynamic_spec_decode.py
+++ b/tests/spec_decode/test_dynamic_spec_decode.py
@@ -68,13 +68,13 @@ def test_disable_spec_tokens(queue_size: int, batch_size: int, k: int):
     if queue_size < disable_by_batch_size:
         # Should raise exception when executing the mocked draft model.
         with pytest.raises(ValueError, match=exception_secret):
-            proposer.get_proposals(execute_model_req=ExecuteModelRequest(
+            proposer.get_spec_proposals(execute_model_req=ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 num_lookahead_slots=k), )
     else:
         # Should not execute the draft model because spec decode is disabled
         # for all requests. Accordingly, the proposal length should be 0.
-        proposals = proposer.get_proposals(
+        proposals = proposer.get_spec_proposals(
             execute_model_req=ExecuteModelRequest(
                 seq_group_metadata_list=seq_group_metadata_list,
                 num_lookahead_slots=k), )
diff --git a/tests/spec_decode/test_multi_step_worker.py b/tests/spec_decode/test_multi_step_worker.py
index cb2de97a4af9..6cea6668acc9 100644
--- a/tests/spec_decode/test_multi_step_worker.py
+++ b/tests/spec_decode/test_multi_step_worker.py
@@ -307,9 +307,10 @@ def test_draft_proposals_full_speculation_len():
 
     seq_group_metadata_list, _, _ = create_batch(batch_size, k)
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=k), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -344,9 +345,10 @@ def test_draft_proposals_no_speculations():
                                                  k,
                                                  prompt_len=prompt_len)
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=k), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -415,9 +417,10 @@ def test_draft_proposals_mixed_k():
         prev_output_token_len=prev_output_token_len,
     )
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=k), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=k), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/tests/spec_decode/test_ngram_worker.py b/tests/spec_decode/test_ngram_worker.py
index 88b40d1eb467..b1537884f896 100644
--- a/tests/spec_decode/test_ngram_worker.py
+++ b/tests/spec_decode/test_ngram_worker.py
@@ -50,9 +50,10 @@ def test_ngram_algo_correctness_for_single_no_match():
         block_size,
         final_prompt_lens=final_prompt_lens)
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=proposal_len), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -117,9 +118,10 @@ def test_ngram_algo_correctness_for_batches_not_match_all():
         block_size,
         final_prompt_lens=final_prompt_lens)
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=proposal_len), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
@@ -188,9 +190,10 @@ def test_ngram_algo_correctness_for_batches_match_all():
         block_size,
         final_prompt_lens=final_prompt_lens)
 
-    proposals = proposer.get_proposals(execute_model_req=ExecuteModelRequest(
-        seq_group_metadata_list=seq_group_metadata_list,
-        num_lookahead_slots=proposal_len), )
+    proposals = proposer.get_spec_proposals(
+        execute_model_req=ExecuteModelRequest(
+            seq_group_metadata_list=seq_group_metadata_list,
+            num_lookahead_slots=proposal_len), )
 
     assert torch.is_tensor(proposals.proposal_token_ids)
     assert torch.is_tensor(proposals.proposal_probs)
diff --git a/vllm/spec_decode/interfaces.py b/vllm/spec_decode/interfaces.py
index d311bfe984cb..72d7818eb117 100644
--- a/vllm/spec_decode/interfaces.py
+++ b/vllm/spec_decode/interfaces.py
@@ -55,7 +55,7 @@ def __repr__(self):
 class SpeculativeProposer(ABC):
 
     @abstractmethod
-    def get_proposals(
+    def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:
diff --git a/vllm/spec_decode/multi_step_worker.py b/vllm/spec_decode/multi_step_worker.py
index b5a805278d27..fe15ea33b5f3 100644
--- a/vllm/spec_decode/multi_step_worker.py
+++ b/vllm/spec_decode/multi_step_worker.py
@@ -7,11 +7,12 @@
 from vllm.sequence import (ExecuteModelRequest, SamplerOutput,
                            SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker import Worker
 
 
-class MultiStepWorker(Worker):
+class MultiStepWorker(Worker, ProposerWorkerBase):
     """The MultiStepWorker is equivalent to a Worker except that it allows
     multiple forward passes in a single call, assuming the scheduler has
     allocated enough space to store the additional KV. This reduces overhead
@@ -33,7 +34,7 @@ def init_device(self):
         super().init_device()
 
         self._proposer = Top1Proposer(
-            weakref.proxy(self),
+            weakref.proxy(self),  # type: ignore[arg-type]
             self.device,
             self.vocab_size,
             max_proposal_len=self.max_model_len,
@@ -92,11 +93,12 @@ def get_spec_proposals(
         speculative tokens per sequence is determined by max_proposal_len.
         """
 
-        return self._proposer.get_proposals(execute_model_req)
+        return self._proposer.get_spec_proposals(execute_model_req)
 
+    @staticmethod
     def _append_new_tokens(
-            self, model_output: SamplerOutput,
-            seq_group_metadata_list: SequenceGroupMetadata) -> None:
+            model_output: List[SamplerOutput],
+            seq_group_metadata_list: List[SequenceGroupMetadata]) -> None:
         """Given model output from a single run, append the tokens to the
         sequences. This is normally done outside of the worker, but it is
         required if the worker is to perform multiple forward passes.
@@ -116,8 +118,9 @@ def _append_new_tokens(
                 seq.append_token_id(token_id, token_logprob.logprob)
                 seq.update_num_computed_tokens(1)
 
+    @staticmethod
     def _shallow_copy_inputs(
-        self, seq_group_metadata_list: List[SequenceGroupMetadata]
+        seq_group_metadata_list: List[SequenceGroupMetadata]
     ) -> List[SequenceGroupMetadata]:
         """Copy input data structures to remove side-effects when input data
         structures are shared with other modules.
diff --git a/vllm/spec_decode/ngram_worker.py b/vllm/spec_decode/ngram_worker.py
index c2b22f2acd7b..33af588d0ba2 100644
--- a/vllm/spec_decode/ngram_worker.py
+++ b/vllm/spec_decode/ngram_worker.py
@@ -5,15 +5,16 @@
 
 from vllm.sequence import ExecuteModelRequest, SamplerOutput
 from vllm.spec_decode.interfaces import SpeculativeProposals
+from vllm.spec_decode.proposer_worker_base import NonLLMProposerWorkerBase
 from vllm.spec_decode.top1_proposer import Top1Proposer
 from vllm.worker.worker_base import LoraNotSupportedWorkerBase
 
 
-class NGramWorker(LoraNotSupportedWorkerBase):
+class NGramWorker(NonLLMProposerWorkerBase, LoraNotSupportedWorkerBase):
     """NGramWorker provides a light drafter without need for model.
 
     Current NGramWorker only implement prompt lookup decoding,
-    and in future we may also do RAG type drafter and other scenerios
+    and in future we may also do RAG type drafter and other scenarios
     which don't rely on LLM model to give proposals.
     """
 
@@ -38,34 +39,11 @@ def init_device(self):
 
         # Current only support Top1Proposer
         self._proposer = Top1Proposer(
-            weakref.proxy(self),
+            weakref.proxy(self),  # type: ignore[arg-type]
             device=self.device,
             vocab_size=self.vocab_size,
         )
 
-    def set_include_gpu_probs_tensor(self):
-        # NGram don't need gpu sampler
-        pass
-
-    def execute_model(
-            self,
-            execute_model_req: Optional[ExecuteModelRequest] = None) -> None:
-        """NGram doesn't depend on model execution, just pass this function"""
-        pass
-
-    def determine_num_available_blocks(self) -> None:
-        """NGram doesn't depend on model execution, no need to check blocks"""
-        pass
-
-    def initialize_cache(self, num_gpu_blocks: int,
-                         num_cpu_blocks: int) -> None:
-        """As there is no cache need to handle, just pass this function"""
-        pass
-
-    def get_cache_block_size_bytes(self):
-        """Return the size of a cache block in bytes."""
-        return 0
-
     def sampler_output(
         self,
         execute_model_req: ExecuteModelRequest,
@@ -97,7 +75,6 @@ def sampler_output(
                     -1,
             ):
                 ngram_tensor = input_ids[-ngram_size:]
-                proposal_start_idx = None
                 if ngram_size == 1:
                     # Do not match itself and do not use unfold and all
                     matches = (input_ids[:-1] == ngram_tensor)
@@ -161,7 +138,7 @@ def get_spec_proposals(
         speculative tokens per sequence is determined by max_proposal_len.
         """
 
-        return self._proposer.get_proposals(execute_model_req)
+        return self._proposer.get_spec_proposals(execute_model_req)
 
     def _raise_if_unsupported(
         self,
diff --git a/vllm/spec_decode/proposer_worker_base.py b/vllm/spec_decode/proposer_worker_base.py
new file mode 100644
index 000000000000..fd67ceb912ee
--- /dev/null
+++ b/vllm/spec_decode/proposer_worker_base.py
@@ -0,0 +1,44 @@
+from abc import ABC, abstractmethod
+from typing import List, Optional, Tuple
+
+from vllm.sequence import ExecuteModelRequest, SamplerOutput
+from vllm.spec_decode.interfaces import SpeculativeProposer
+from vllm.worker.worker_base import WorkerBase
+
+
+class ProposerWorkerBase(WorkerBase, SpeculativeProposer):
+    """Interface for proposer workers"""
+
+    @abstractmethod
+    def sampler_output(
+        self,
+        execute_model_req: ExecuteModelRequest,
+        sample_len: int,
+    ) -> Tuple[Optional[List[SamplerOutput]], bool]:
+        raise NotImplementedError
+
+    def set_include_gpu_probs_tensor(self):
+        """Implementation optional"""
+        pass
+
+
+class NonLLMProposerWorkerBase(ProposerWorkerBase, ABC):
+    """Proposer worker which does not use a model with kvcache"""
+
+    def execute_model(
+        self,
+        execute_model_req: Optional[ExecuteModelRequest] = None
+    ) -> List[SamplerOutput]:
+        """get_spec_proposals is used to get the proposals"""
+        return []
+
+    def determine_num_available_blocks(self) -> Tuple[int, int]:
+        """This is never called on the proposer, only the target model"""
+        raise NotImplementedError
+
+    def initialize_cache(self, num_gpu_blocks: int,
+                         num_cpu_blocks: int) -> None:
+        pass
+
+    def get_cache_block_size_bytes(self) -> int:
+        return 0
diff --git a/vllm/spec_decode/spec_decode_worker.py b/vllm/spec_decode/spec_decode_worker.py
index 150e8db0c8aa..45d9d5735efc 100644
--- a/vllm/spec_decode/spec_decode_worker.py
+++ b/vllm/spec_decode/spec_decode_worker.py
@@ -14,6 +14,7 @@
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 from vllm.spec_decode.multi_step_worker import MultiStepWorker
 from vllm.spec_decode.ngram_worker import NGramWorker
+from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.util import (create_sequence_group_output,
                                    get_all_num_logprobs, get_all_seq_ids,
                                    get_sampled_token_logprobs, nvtx_range,
@@ -117,7 +118,7 @@ def create_worker(
 
     def __init__(
         self,
-        proposer_worker: WorkerBase,
+        proposer_worker: ProposerWorkerBase,
         scorer_worker: WorkerBase,
         rejection_sampler: RejectionSampler,
         metrics_collector: Optional[AsyncMetricsCollector] = None,
@@ -260,7 +261,7 @@ def execute_model(
 
         # This is required as if the number of draft model runs changes
         # dynamically, the non-driver workers won't know unless we perform a
-        # communication to inform then.
+        # communication to inform them.
         broadcast_dict = dict(
             num_lookahead_slots=num_lookahead_slots,
             disable_all_speculation=disable_all_speculation,
diff --git a/vllm/spec_decode/top1_proposer.py b/vllm/spec_decode/top1_proposer.py
index 6c7e22207f6b..fdef2833a399 100644
--- a/vllm/spec_decode/top1_proposer.py
+++ b/vllm/spec_decode/top1_proposer.py
@@ -6,8 +6,8 @@
                            SequenceGroupMetadata)
 from vllm.spec_decode.interfaces import (SpeculativeProposals,
                                          SpeculativeProposer)
+from vllm.spec_decode.proposer_worker_base import ProposerWorkerBase
 from vllm.spec_decode.util import sampler_output_to_torch
-from vllm.worker.worker_base import WorkerBase
 
 
 class Top1Proposer(SpeculativeProposer):
@@ -29,7 +29,7 @@ class Top1Proposer(SpeculativeProposer):
 
     def __init__(
         self,
-        worker: WorkerBase,
+        worker: ProposerWorkerBase,
         device: str,
         vocab_size: int,
         max_proposal_len: Optional[int] = None,
@@ -39,7 +39,7 @@ def __init__(
         self.max_proposal_len = max_proposal_len
         self._vocab_size = vocab_size
 
-    def get_proposals(
+    def get_spec_proposals(
         self,
         execute_model_req: ExecuteModelRequest,
     ) -> SpeculativeProposals:

From 1680d99fb0ade11c270eca323acdb31c4eb8277b Mon Sep 17 00:00:00 2001
From: Nick Hill <nickhill@us.ibm.com>
Date: Wed, 5 Jun 2024 14:53:16 -0700
Subject: [PATCH 46/93] [BugFix] Fix log message about default max model length
 (#5284)

---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index d4f938163ed8..87a71ec36b4f 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1255,7 +1255,7 @@ def _get_and_verify_max_len(
         logger.warning(
             "The model's config.json does not contain any of the following "
             "keys to determine the original maximum length of the model: "
-            "%d. Assuming the model's maximum length is %d.", possible_keys,
+            "%s. Assuming the model's maximum length is %d.", possible_keys,
             default_max_len)
         derived_max_model_len = default_max_len
 

From efb32e16e044aaab70e8dadf82aab2d07619c98c Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Wed, 5 Jun 2024 18:16:56 -0400
Subject: [PATCH 47/93] [Bugfix] Make EngineArgs use named arguments for config
 construction (#5285)

---
 vllm/engine/arg_utils.py | 67 ++++++++++++++++++++++++----------------
 1 file changed, 41 insertions(+), 26 deletions(-)

diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index 54911250cea9..5295c3db32a0 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -634,34 +634,49 @@ def create_engine_config(self, ) -> EngineConfig:
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
-        device_config = DeviceConfig(self.device)
+        device_config = DeviceConfig(device=self.device)
         model_config = ModelConfig(
-            self.model, self.tokenizer, self.tokenizer_mode,
-            self.trust_remote_code, self.dtype, self.seed, self.revision,
-            self.code_revision, self.rope_scaling, self.tokenizer_revision,
-            self.max_model_len, self.quantization,
-            self.quantization_param_path, self.sparsity, self.enforce_eager,
-            self.max_context_len_to_capture, self.max_seq_len_to_capture,
-            self.max_logprobs, self.disable_sliding_window,
-            self.skip_tokenizer_init, self.served_model_name)
-        cache_config = CacheConfig(self.block_size,
-                                   self.gpu_memory_utilization,
-                                   self.swap_space, self.kv_cache_dtype,
-                                   self.num_gpu_blocks_override,
-                                   model_config.get_sliding_window(),
-                                   self.enable_prefix_caching)
+            model=self.model,
+            tokenizer=self.tokenizer,
+            tokenizer_mode=self.tokenizer_mode,
+            trust_remote_code=self.trust_remote_code,
+            dtype=self.dtype,
+            seed=self.seed,
+            revision=self.revision,
+            code_revision=self.code_revision,
+            rope_scaling=self.rope_scaling,
+            tokenizer_revision=self.tokenizer_revision,
+            max_model_len=self.max_model_len,
+            quantization=self.quantization,
+            quantization_param_path=self.quantization_param_path,
+            sparsity=self.sparsity,
+            enforce_eager=self.enforce_eager,
+            max_context_len_to_capture=self.max_context_len_to_capture,
+            max_seq_len_to_capture=self.max_seq_len_to_capture,
+            max_logprobs=self.max_logprobs,
+            disable_sliding_window=self.disable_sliding_window,
+            skip_tokenizer_init=self.skip_tokenizer_init,
+            served_model_name=self.served_model_name)
+        cache_config = CacheConfig(
+            block_size=self.block_size,
+            gpu_memory_utilization=self.gpu_memory_utilization,
+            swap_space=self.swap_space,
+            cache_dtype=self.kv_cache_dtype,
+            num_gpu_blocks_override=self.num_gpu_blocks_override,
+            sliding_window=model_config.get_sliding_window(),
+            enable_prefix_caching=self.enable_prefix_caching)
         parallel_config = ParallelConfig(
-            self.pipeline_parallel_size,
-            self.tensor_parallel_size,
-            self.worker_use_ray,
-            self.max_parallel_loading_workers,
-            self.disable_custom_all_reduce,
-            TokenizerPoolConfig.create_config(
+            pipeline_parallel_size=self.pipeline_parallel_size,
+            tensor_parallel_size=self.tensor_parallel_size,
+            worker_use_ray=self.worker_use_ray,
+            max_parallel_loading_workers=self.max_parallel_loading_workers,
+            disable_custom_all_reduce=self.disable_custom_all_reduce,
+            tokenizer_pool_config=TokenizerPoolConfig.create_config(
                 self.tokenizer_pool_size,
                 self.tokenizer_pool_type,
                 self.tokenizer_pool_extra_config,
             ),
-            self.ray_workers_use_nsight,
+            ray_workers_use_nsight=self.ray_workers_use_nsight,
             distributed_executor_backend=self.distributed_executor_backend)
 
         speculative_config = SpeculativeConfig.maybe_create_spec_config(
@@ -680,10 +695,10 @@ def create_engine_config(self, ) -> EngineConfig:
         )
 
         scheduler_config = SchedulerConfig(
-            self.max_num_batched_tokens,
-            self.max_num_seqs,
-            model_config.max_model_len,
-            self.use_v2_block_manager,
+            max_num_batched_tokens=self.max_num_batched_tokens,
+            max_num_seqs=self.max_num_seqs,
+            max_model_len=model_config.max_model_len,
+            use_v2_block_manager=self.use_v2_block_manager,
             num_lookahead_slots=(self.num_lookahead_slots
                                  if speculative_config is None else
                                  speculative_config.num_lookahead_slots),

From 9a28c647bb7f2f086c404b1813a2117c0049f18f Mon Sep 17 00:00:00 2001
From: Alex Wu <alexanderwu@berkeley.edu>
Date: Wed, 5 Jun 2024 15:18:12 -0700
Subject: [PATCH 48/93] [Bugfix][Frontend/Core] Don't log exception when
 AsyncLLMEngine gracefully shuts down. (#5290)

---
 vllm/engine/async_llm_engine.py | 30 +++++++++++++++++++-----------
 1 file changed, 19 insertions(+), 11 deletions(-)

diff --git a/vllm/engine/async_llm_engine.py b/vllm/engine/async_llm_engine.py
index db4d2849b3f0..aa1f07b5bdc2 100644
--- a/vllm/engine/async_llm_engine.py
+++ b/vllm/engine/async_llm_engine.py
@@ -29,23 +29,32 @@ class AsyncEngineDeadError(RuntimeError):
     pass
 
 
-def _raise_exception_on_finish(
-        task: asyncio.Task, error_callback: Callable[[Exception],
-                                                     None]) -> None:
-    msg = ("Task finished unexpectedly. This should never happen! "
-           "Please open an issue on Github.")
+def _log_task_completion(task: asyncio.Task,
+                         error_callback: Callable[[Exception], None]) -> None:
+    """This function is only intended for the `engine.run_engine_loop()` task.
+
+    In particular, that task runs a `while True` loop that can only exit if
+    there is an exception.
+    """
 
     exception = None
     try:
-        task.result()
-        # NOTE: This will be thrown if task exits normally (which it should not)
-        raise AsyncEngineDeadError(msg)
+        return_value = task.result()
+        raise AssertionError(
+            f"The engine background task should never finish without an "
+            f"exception. {return_value}")
+    except asyncio.exceptions.CancelledError:
+        # We assume that if the task is cancelled, we are gracefully shutting
+        # down. This should only happen on program exit.
+        logger.info("Engine is gracefully shutting down.")
     except Exception as e:
         exception = e
         logger.error("Engine background task failed", exc_info=e)
         error_callback(exception)
         raise AsyncEngineDeadError(
-            msg + " See stack trace above for the actual cause.") from e
+            "Task finished unexpectedly. This should never happen! "
+            "Please open an issue on Github. See stack trace above for the "
+            "actual cause.") from e
 
 
 class AsyncStream:
@@ -438,8 +447,7 @@ def start_background_loop(self) -> None:
         self._background_loop_unshielded = asyncio.get_event_loop(
         ).create_task(self.run_engine_loop())
         self._background_loop_unshielded.add_done_callback(
-            partial(_raise_exception_on_finish,
-                    error_callback=self._error_callback))
+            partial(_log_task_completion, error_callback=self._error_callback))
         self.background_loop = asyncio.shield(self._background_loop_unshielded)
 
     def _init_engine(self, *args,

From 2b27f72d6a518f9c9607c1fb2cbef75f5e2b8d83 Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Wed, 5 Jun 2024 15:19:02 -0700
Subject: [PATCH 49/93] [Misc] Skip for logits_scale == 1.0 (#5291)

---
 vllm/model_executor/layers/logits_processor.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/logits_processor.py b/vllm/model_executor/layers/logits_processor.py
index d450c46455d4..7eee599473a1 100644
--- a/vllm/model_executor/layers/logits_processor.py
+++ b/vllm/model_executor/layers/logits_processor.py
@@ -21,7 +21,7 @@ class LogitsProcessor(nn.Module):
     def __init__(self,
                  vocab_size: int,
                  org_vocab_size: Optional[int] = None,
-                 scale: Optional[float] = 1.0,
+                 scale: float = 1.0,
                  logits_as_input: bool = False) -> None:
         """
         Args:
@@ -52,7 +52,8 @@ def forward(
             logits = self._get_logits(hidden_states, embedding, embedding_bias)
 
         if logits is not None:
-            logits *= self.scale
+            if self.scale != 1.0:
+                logits *= self.scale
 
             # Apply logits processors (if any).
             logits = _apply_logits_processors(logits, sampling_metadata)

From 54d26907e85f4c9c2f9b330a1281fcbab3f30d15 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 5 Jun 2024 17:25:18 -0500
Subject: [PATCH 50/93] [Docs] Add Ray Summit CFP (#5295)


From cc2aaba2843c453f3283a43a3e3bc146b569d563 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Wed, 5 Jun 2024 17:49:27 -0500
Subject: [PATCH 51/93] [CI] Disable flash_attn backend for spec decode (#5286)

---
 .buildkite/test-pipeline.yaml | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 4edd1cadfb2f..02a4364da3f1 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -45,7 +45,7 @@ steps:
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_basic_distributed_correctness.py
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
-  - pytest -v -s spec_decode/e2e/test_integration_dist.py 
+  - pytest -v -s spec_decode/e2e/test_integration_dist.py
 
 - label: Distributed Tests (Multiple Groups)
   #mirror_hardwares: [amd]
@@ -124,7 +124,10 @@ steps:
 
 - label: Speculative decoding tests
   #mirror_hardwares: [amd]
-  command: pytest -v -s spec_decode
+  commands:
+    # See https://github.com/vllm-project/vllm/issues/5152
+    - export VLLM_ATTENTION_BACKEND=XFORMERS
+    - pytest -v -s spec_decode
 
 - label: LoRA Test %N
   #mirror_hardwares: [amd]

From d72ae5b7363e2e00a04a83627843f019b5e1132b Mon Sep 17 00:00:00 2001
From: Breno Faria <breno@veltefaria.de>
Date: Thu, 6 Jun 2024 01:49:12 +0200
Subject: [PATCH 52/93] [Frontend][Core] Update Outlines Integration from `FSM`
 to `Guide` (#4109)

Co-authored-by: Simon Mo <simon.mo@hey.com>
Co-authored-by: Breno Faria <breno.faria@intrafind.com>
---
 requirements-common.txt                       |  2 +-
 tests/entrypoints/test_guided_processors.py   |  2 -
 .../guided_decoding/outlines_decoding.py      | 31 ++++------
 .../outlines_logits_processors.py             | 62 +++++++++++--------
 4 files changed, 49 insertions(+), 48 deletions(-)

diff --git a/requirements-common.txt b/requirements-common.txt
index f41873570aa6..bf9987e3af01 100644
--- a/requirements-common.txt
+++ b/requirements-common.txt
@@ -17,6 +17,6 @@ prometheus_client >= 0.18.0
 prometheus-fastapi-instrumentator >= 7.0.0
 tiktoken >= 0.6.0  # Required for DBRX tokenizer
 lm-format-enforcer == 0.10.1
-outlines == 0.0.34 # Requires torch >= 2.1.0
+outlines >= 0.0.43 # Requires torch >= 2.1.0
 typing_extensions
 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4
diff --git a/tests/entrypoints/test_guided_processors.py b/tests/entrypoints/test_guided_processors.py
index 5d4163e96fd8..fb32a9d155bc 100644
--- a/tests/entrypoints/test_guided_processors.py
+++ b/tests/entrypoints/test_guided_processors.py
@@ -63,7 +63,6 @@ def test_guided_logits_processors():
                                   tokenizer,
                                   whitespace_pattern=None)
 
-    regex_LP.init_state()
     token_ids = tokenizer.encode(
         f"Give an example IPv4 address with this regex: {TEST_REGEX}")
     tensor = torch.rand(32000)
@@ -72,7 +71,6 @@ def test_guided_logits_processors():
     assert tensor.shape == original_tensor.shape
     assert not torch.allclose(tensor, original_tensor)
 
-    json_LP.init_state()
     token_ids = tokenizer.encode(
         f"Give an employee profile that fits this schema: {TEST_SCHEMA}")
     tensor = torch.rand(32000)
diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py
index 840360428690..721f7e0530cb 100644
--- a/vllm/model_executor/guided_decoding/outlines_decoding.py
+++ b/vllm/model_executor/guided_decoding/outlines_decoding.py
@@ -1,8 +1,6 @@
 import asyncio
 import concurrent.futures
-from copy import copy
 from enum import Enum
-from functools import lru_cache
 from json import dumps as json_dumps
 from re import escape as regex_escape
 from typing import Tuple, Union
@@ -54,8 +52,10 @@ class GuidedDecodingMode(Enum):
 
 
 async def get_outlines_guided_decoding_logits_processor(
-        request: Union[CompletionRequest, ChatCompletionRequest],
-        tokenizer) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, None]:
+    request: Union[CompletionRequest,
+                   ChatCompletionRequest], tokenizer: PreTrainedTokenizerBase
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor,
+           None]:
     """
     Given an OpenAI-compatible request, check for guided decoding parameters
     and get the necessary logits processor for the given guide.
@@ -64,7 +64,7 @@ async def get_outlines_guided_decoding_logits_processor(
     """
     global global_thread_pool
     guide, mode = _get_guide_and_mode(request)
-    if not guide:
+    if not guide or not mode:
         return None
 
     if global_thread_pool is None:
@@ -72,15 +72,9 @@ async def get_outlines_guided_decoding_logits_processor(
             max_workers=2)
     loop = asyncio.get_running_loop()
 
-    result = await loop.run_in_executor(global_thread_pool,
-                                        _get_cached_logits_processor, guide,
-                                        tokenizer, mode,
-                                        request.guided_whitespace_pattern)
-
-    logits_processor = copy(result)
-    # reset logits processor's internal state
-    logits_processor.init_state()
-    return logits_processor
+    return await loop.run_in_executor(global_thread_pool,
+                                      _get_logits_processor, guide, tokenizer,
+                                      mode, request.guided_whitespace_pattern)
 
 
 def _get_guide_and_mode(
@@ -115,11 +109,10 @@ def _get_guide_and_mode(
         return None, None
 
 
-@lru_cache(maxsize=32)
-def _get_cached_logits_processor(guide: str,
-                                 tokenizer: PreTrainedTokenizerBase,
-                                 mode: GuidedDecodingMode,
-                                 whitespace_pattern: Union[str, None]):
+def _get_logits_processor(
+    guide: str, tokenizer: PreTrainedTokenizerBase, mode: GuidedDecodingMode,
+    whitespace_pattern: Union[str, None]
+) -> Union[JSONLogitsProcessor, RegexLogitsProcessor, CFGLogitsProcessor]:
     if mode == GuidedDecodingMode.JSON:
         return JSONLogitsProcessor(guide, tokenizer, whitespace_pattern)
     elif mode == GuidedDecodingMode.REGEX or mode == GuidedDecodingMode.CHOICE:
diff --git a/vllm/model_executor/guided_decoding/outlines_logits_processors.py b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
index a131c6a1b92b..1618705ff298 100644
--- a/vllm/model_executor/guided_decoding/outlines_logits_processors.py
+++ b/vllm/model_executor/guided_decoding/outlines_logits_processors.py
@@ -21,7 +21,7 @@
 from typing import Callable, DefaultDict, Dict, List, Union
 
 import torch
-from outlines.fsm.fsm import CFGFSM, FSM, RegexFSM
+from outlines.fsm.guide import CFGGuide, Generate, Guide, RegexGuide, Write
 from outlines.fsm.json_schema import build_regex_from_schema
 from pydantic import BaseModel
 from transformers import PreTrainedTokenizerBase
@@ -29,28 +29,32 @@
 
 class BaseLogitsProcessor:
 
-    def __init__(self):
-        # Child class should use initialize in their init.
-        self.fsm: FSM
-
-    def init_state(self):
-        """Initialize the FSM states."""
-        self.fsm_state: DefaultDict[int, int] = defaultdict(int)
+    def __init__(self, guide: Guide):
+        self._guide: Guide = guide
+        self._fsm_state: DefaultDict[int, int] = defaultdict(int)
 
     def __call__(self, input_ids: List[int],
                  scores: torch.Tensor) -> torch.Tensor:
         """Use the FSM to bias the logits before sampling the next token."""
         seq_id = hash(tuple(input_ids))
 
-        if len(input_ids) == 0:
-            self.init_state()
-        else:
+        if len(input_ids) > 0:
             last_token = input_ids[-1]
             last_seq_id = hash(tuple(input_ids[:-1]))
-            self.fsm_state[seq_id] = self.fsm.next_state(
-                self.fsm_state[last_seq_id], last_token)
+            self._fsm_state[seq_id] = self._guide.get_next_state(
+                state=self._fsm_state[last_seq_id], token_id=last_token)
+
+        instruction = self._guide.get_next_instruction(
+            state=self._fsm_state[seq_id])
 
-        allowed_tokens = self.fsm.allowed_token_ids(self.fsm_state[seq_id])
+        if type(instruction) == Generate:
+            allowed_tokens = instruction.tokens
+        elif type(instruction) == Write:
+            # TODO: support fast forward tokens
+            allowed_tokens = [instruction.tokens[0]]
+        else:
+            raise TypeError(
+                f"Unsupported instruction type {type(instruction)}")
 
         mask = torch.full((scores.shape[-1], ),
                           -math.inf,
@@ -62,6 +66,13 @@ def __call__(self, input_ids: List[int],
 
 class RegexLogitsProcessor(BaseLogitsProcessor):
 
+    @classmethod
+    @lru_cache(maxsize=32)
+    def _get_guide(cls, regex_string: str,
+                   tokenizer: PreTrainedTokenizerBase) -> Guide:
+        tokenizer = _adapt_tokenizer(tokenizer)
+        return RegexGuide(regex_string, tokenizer)
+
     def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
         """Compile the FSM that drives the regex-structured generation.
 
@@ -73,9 +84,8 @@ def __init__(self, regex_string: str, tokenizer: PreTrainedTokenizerBase):
             The model's tokenizer
 
         """
-        tokenizer = _adapt_tokenizer(tokenizer)
-        fsm = RegexFSM(regex_string, tokenizer)
-        self.fsm = fsm
+        super().__init__(
+            RegexLogitsProcessor._get_guide(regex_string, tokenizer))
 
 
 class JSONLogitsProcessor(RegexLogitsProcessor):
@@ -115,6 +125,12 @@ def __init__(self, schema: Union[str, Dict, BaseModel],
 
 class CFGLogitsProcessor(BaseLogitsProcessor):
 
+    @classmethod
+    @lru_cache(maxsize=32)
+    def _get_guide(cls, cfg: str, tokenizer: PreTrainedTokenizerBase) -> Guide:
+        tokenizer = _adapt_tokenizer(tokenizer)
+        return CFGGuide(cfg, tokenizer)
+
     def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
         """Compile the FSM that drives the context free grammar generation.
 
@@ -126,17 +142,11 @@ def __init__(self, cfg: str, tokenizer: PreTrainedTokenizerBase):
             The model's tokenizer
 
         """
-        tokenizer = _adapt_tokenizer(tokenizer)
-        fsm = CFGFSM(cfg, tokenizer)
-        self.fsm = fsm
-
-    def init_state(self):
-        """Initialize state with a CFGFSM copy."""
-        super().init_state()
-        self.fsm = self.fsm.copy()
+        super().__init__(CFGLogitsProcessor._get_guide(cfg, tokenizer))
+        self._guide = self._guide.copy()
 
 
-@lru_cache
+@lru_cache(maxsize=32)
 def _adapt_tokenizer(tokenizer: PreTrainedTokenizerBase):
     """Adapt vLLM's tokenizer to use to compile the FSM.
 

From 08fd78813dc718243b826f307c98dc8bd51c72fd Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Thu, 6 Jun 2024 18:17:18 +0800
Subject: [PATCH 53/93] [CI/Build] Update vision tests (#5307)

---
 .buildkite/test-pipeline.yaml |  5 +-
 pyproject.toml                |  1 +
 tests/conftest.py             | 35 +++----------
 tests/models/test_llava.py    | 97 +++++++++++++++--------------------
 vllm/config.py                | 22 +++++++-
 vllm/multimodal/image.py      | 18 ++++++-
 6 files changed, 90 insertions(+), 88 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index 02a4364da3f1..d8030ab219cc 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -93,14 +93,13 @@ steps:
 - label: Models Test
   #mirror_hardwares: [amd]
   commands:
-    - bash ../.buildkite/download-images.sh
-    - pytest -v -s models --ignore=models/test_llava.py
+    - pytest -v -s models -m \"not llava\"
 
 - label: Llava Test
   mirror_hardwares: [amd]
   commands:
     - bash ../.buildkite/download-images.sh
-    - pytest -v -s models/test_llava.py
+    - pytest -v -s models -m llava
 
 - label: Prefix Caching Test
   mirror_hardwares: [amd]
diff --git a/pyproject.toml b/pyproject.toml
index 06f150009aa8..eb691c29724c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -71,4 +71,5 @@ markers = [
     "skip_global_cleanup",
     "llm: run tests for vLLM API only",
     "openai: run tests for OpenAI API only",
+    "llava: run tests for LLaVA models only",
 ]
diff --git a/tests/conftest.py b/tests/conftest.py
index 93b161914b53..2e64f04468ee 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -31,24 +31,19 @@
 
 # Multi modal related
 # You can use `.buildkite/download-images.sh` to download the assets
-_PIXEL_VALUES_FILES = [
+PIXEL_VALUES_FILES = [
     os.path.join(_TEST_DIR, "images", filename) for filename in
     ["stop_sign_pixel_values.pt", "cherry_blossom_pixel_values.pt"]
 ]
-_IMAGE_FEATURES_FILES = [
+IMAGE_FEATURES_FILES = [
     os.path.join(_TEST_DIR, "images", filename) for filename in
     ["stop_sign_image_features.pt", "cherry_blossom_image_features.pt"]
 ]
-_IMAGE_FILES = [
+IMAGE_FILES = [
     os.path.join(_TEST_DIR, "images", filename)
     for filename in ["stop_sign.jpg", "cherry_blossom.jpg"]
 ]
-_IMAGE_PROMPTS = [
-    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
-    "<image>\nUSER: What is the season?\nASSISTANT:"
-]
-assert len(_PIXEL_VALUES_FILES) == len(_IMAGE_FEATURES_FILES) == len(
-    _IMAGE_FILES) == len(_IMAGE_PROMPTS)
+assert len(PIXEL_VALUES_FILES) == len(IMAGE_FEATURES_FILES) == len(IMAGE_FILES)
 
 
 def _read_prompts(filename: str) -> List[str]:
@@ -86,14 +81,9 @@ def cleanup_fixture(should_do_global_cleanup_after_test: bool):
         cleanup()
 
 
-@pytest.fixture(scope="session")
-def hf_image_prompts() -> List[str]:
-    return _IMAGE_PROMPTS
-
-
 @pytest.fixture(scope="session")
 def hf_images() -> List[Image.Image]:
-    return [Image.open(filename) for filename in _IMAGE_FILES]
+    return [Image.open(filename) for filename in IMAGE_FILES]
 
 
 @pytest.fixture()
@@ -103,26 +93,17 @@ def vllm_images(request) -> List[MultiModalData]:
             VisionLanguageConfig.ImageInputType.IMAGE_FEATURES):
         return [
             ImageFeatureData(torch.load(filename))
-            for filename in _IMAGE_FEATURES_FILES
+            for filename in IMAGE_FEATURES_FILES
         ]
     else:
         return [
-            ImagePixelData(Image.open(filename)) for filename in _IMAGE_FILES
+            ImagePixelData(Image.open(filename)) for filename in IMAGE_FILES
         ]
 
 
 @pytest.fixture()
 def vllm_image_tensors(request) -> List[torch.Tensor]:
-    return [torch.load(filename) for filename in _PIXEL_VALUES_FILES]
-
-
-@pytest.fixture()
-def vllm_image_prompts(request) -> List[str]:
-    vision_language_config = request.getfixturevalue("model_and_config")[1]
-    return [
-        "<image>" * (vision_language_config.image_feature_size - 1) + p
-        for p in _IMAGE_PROMPTS
-    ]
+    return [torch.load(filename) for filename in PIXEL_VALUES_FILES]
 
 
 @pytest.fixture
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 839a9f78d1bb..f03dbdbb770e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -1,14 +1,22 @@
-import gc
-from dataclasses import fields
-from enum import Enum
-from typing import Any, Dict, List, Tuple
+from typing import List, Tuple
 
 import pytest
-import torch
 from transformers import AutoTokenizer
 
 from vllm.config import VisionLanguageConfig
 
+from ..conftest import IMAGE_FILES
+
+pytestmark = pytest.mark.llava
+
+# The image token is placed before "USER" on purpose so that the test can pass
+HF_IMAGE_PROMPTS = [
+    "<image>\nUSER: What's the content of the image?\nASSISTANT:",
+    "<image>\nUSER: What is the season?\nASSISTANT:",
+]
+
+assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+
 
 def iter_llava_configs(model_name: str):
     image_hw_to_feature_size = {
@@ -36,53 +44,35 @@ def iter_llava_configs(model_name: str):
 ]
 
 
-def as_dict(vlm_config: VisionLanguageConfig) -> Dict[str, Any]:
-    """Flatten vision language config to pure args.
-
-    Compatible with what llm entrypoint expects.
-    """
-    result = {}
-    for field in fields(vlm_config):
-        value = getattr(vlm_config, field.name)
-        if isinstance(value, Enum):
-            result[field.name] = value.name.lower()
-        elif isinstance(value, tuple):
-            result[field.name] = ",".join([str(item) for item in value])
-        else:
-            result[field.name] = value
-
-    result["disable_image_processor"] = vlm_config.image_processor is None
-
-    return result
-
-
-def sanitize_vllm_output(vllm_output: Tuple[List[int], str],
-                         vision_language_config: VisionLanguageConfig,
-                         model_id: str):
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+                      vlm_config: VisionLanguageConfig, model_id: str):
     """Sanitize vllm output to be comparable with hf output.
     The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
     x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
     It also reduces `output_str` from "<image><image>bla" to "bla".
     """
-    tokenizer = AutoTokenizer.from_pretrained(model_id)
-    image_token_str = tokenizer.decode(vision_language_config.image_token_id)
-    image_token_str_len = len(image_token_str)
     input_ids, output_str = vllm_output
-    sanitized_input_ids = input_ids[0:2] + input_ids[2 + vision_language_config
-                                                     .image_feature_size - 1:]
-    sanitzied_output_str = output_str[vision_language_config.
-                                      image_feature_size *
-                                      image_token_str_len:]
-    return sanitized_input_ids, sanitzied_output_str
+    image_token_id = vlm_config.image_token_id
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    image_token_str = tokenizer.decode(image_token_id)
 
+    hf_input_ids = [
+        input_id for idx, input_id in enumerate(input_ids)
+        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    ]
+    hf_output_str = output_str \
+        .replace(image_token_str * vlm_config.image_feature_size, "")
 
-@pytest.mark.parametrize("worker_use_ray", [False])
+    return hf_input_ids, hf_output_str
+
+
+# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
 @pytest.mark.parametrize("model_and_config", model_and_vl_config)
 @pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
-def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
-                vllm_image_prompts, vllm_images, model_and_config, dtype: str,
-                max_tokens: int, worker_use_ray: bool) -> None:
+def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
+                model_and_config, dtype: str, max_tokens: int) -> None:
     """Inference result should be the same between hf and vllm.
 
     All the image fixtures for the test is under tests/images.
@@ -92,36 +82,33 @@ def test_models(hf_runner, vllm_runner, hf_image_prompts, hf_images,
     Note, the text input is also adjusted to abide by vllm contract.
     The text output is sanitized to be able to compare with hf.
     """
-    model_id, vision_language_config = model_and_config
+    model_id, vlm_config = model_and_config
 
     hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
-    hf_outputs = hf_model.generate_greedy(hf_image_prompts,
+    hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
                                           max_tokens,
                                           images=hf_images)
     del hf_model
 
+    vllm_image_prompts = [
+        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+        for p in HF_IMAGE_PROMPTS
+    ]
+
     vllm_model = vllm_runner(model_id,
                              dtype=dtype,
-                             worker_use_ray=worker_use_ray,
                              enforce_eager=True,
-                             **as_dict(vision_language_config))
+                             **vlm_config.as_cli_args_dict())
     vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
                                               max_tokens,
                                               images=vllm_images)
     del vllm_model
 
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    for i in range(len(hf_image_prompts)):
+    for i in range(len(HF_IMAGE_PROMPTS)):
         hf_output_ids, hf_output_str = hf_outputs[i]
-        vllm_output_ids, vllm_output_str = sanitize_vllm_output(
-            vllm_outputs[i], vision_language_config, model_id)
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+            vllm_outputs[i], vlm_config, model_id)
         assert hf_output_str == vllm_output_str, (
             f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
         assert hf_output_ids == vllm_output_ids, (
             f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
-
-
-# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
-# (Requires multiple GPUs)
diff --git a/vllm/config.py b/vllm/config.py
index 87a71ec36b4f..e6df38c0e874 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -1,7 +1,8 @@
 import enum
 import json
 from dataclasses import dataclass, field, fields
-from typing import TYPE_CHECKING, ClassVar, List, Optional, Tuple, Union
+from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Optional, Tuple,
+                    Union)
 
 import torch
 from transformers import PretrainedConfig
@@ -1146,6 +1147,25 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType:
                              f"Expecting to choose from "
                              f"{[x.name for x in cls.ImageInputType]}.") from e
 
+    def as_cli_args_dict(self) -> Dict[str, Any]:
+        """Flatten vision language config to pure args.
+
+        Compatible with what llm entrypoint expects.
+        """
+        result: Dict[str, Any] = {}
+        for f in fields(self):
+            value = getattr(self, f.name)
+            if isinstance(value, enum.Enum):
+                result[f.name] = value.name.lower()
+            elif isinstance(value, tuple):
+                result[f.name] = ",".join([str(item) for item in value])
+            else:
+                result[f.name] = value
+
+        result["disable_image_processor"] = self.image_processor is None
+
+        return result
+
 
 _STR_DTYPE_TO_TORCH_DTYPE = {
     "half": torch.float16,
diff --git a/vllm/multimodal/image.py b/vllm/multimodal/image.py
index b964e9ee4262..08fb09d11160 100644
--- a/vllm/multimodal/image.py
+++ b/vllm/multimodal/image.py
@@ -75,6 +75,14 @@ def __init__(self, image: Union[Image.Image, torch.Tensor]) -> None:
 
         self.image = image
 
+    def __repr__(self) -> str:
+        image = self.image
+        if isinstance(image, Image.Image):
+            return f"{type(self).__name__}(image={image})"
+
+        return (f"{type(self).__name__}(image=torch.Tensor(shape="
+                f"{image.shape}, dtype={image.dtype}))")
+
 
 class ImagePixelPlugin(MultiModalPlugin[ImagePixelData]):
 
@@ -96,10 +104,10 @@ def _default_input_processor(
             self, data: ImagePixelData, model_config: ModelConfig,
             vlm_config: VisionLanguageConfig) -> Dict[str, torch.Tensor]:
         image = data.image
-        image_processor = self._get_hf_image_processor(model_config,
-                                                       vlm_config)
 
         if isinstance(image, Image.Image):
+            image_processor = self._get_hf_image_processor(
+                model_config, vlm_config)
             if image_processor is None:
                 raise RuntimeError("No HuggingFace processor is available"
                                    "to process the image object")
@@ -127,6 +135,12 @@ class ImageFeatureData(MultiModalData):
     def __init__(self, image_features: torch.Tensor) -> None:
         self.image_features = image_features
 
+    def __repr__(self) -> str:
+        image_features = self.image_features
+
+        return (f"{type(self).__name__}(image_features=torch.Tensor(shape="
+                f"{image_features.shape}, dtype={image_features.dtype}))")
+
 
 class ImageFeaturePlugin(MultiModalPlugin[ImageFeatureData]):
 

From cbfd3d9c7c9b2bedee197aaf96655e5d4faf5e7d Mon Sep 17 00:00:00 2001
From: liuyhwangyh <liuyhwangyh@163.com>
Date: Fri, 7 Jun 2024 00:28:10 +0800
Subject: [PATCH 54/93] Bugfix: fix broken of download models from modelscope
 (#5233)

Co-authored-by: mulin.lyh <mulin.lyh@taobao.com>
---
 tests/test_regression.py          | 21 +++++++++++++++++++++
 vllm/config.py                    |  6 +++++-
 vllm/transformers_utils/config.py |  7 ++++++-
 3 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/tests/test_regression.py b/tests/test_regression.py
index cb68e9ecfc06..5d27d3579301 100644
--- a/tests/test_regression.py
+++ b/tests/test_regression.py
@@ -53,6 +53,27 @@ def test_gc():
     assert allocated < 50 * 1024 * 1024
 
 
+def test_model_from_modelscope(monkeypatch):
+    # model: https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary
+    MODELSCOPE_MODEL_NAME = "qwen/Qwen1.5-0.5B-Chat"
+    monkeypatch.setenv("VLLM_USE_MODELSCOPE", "True")
+    try:
+        llm = LLM(model=MODELSCOPE_MODEL_NAME)
+
+        prompts = [
+            "Hello, my name is",
+            "The president of the United States is",
+            "The capital of France is",
+            "The future of AI is",
+        ]
+        sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
+
+        outputs = llm.generate(prompts, sampling_params)
+        assert len(outputs) == 4
+    finally:
+        monkeypatch.delenv("VLLM_USE_MODELSCOPE", raising=False)
+
+
 if __name__ == "__main__":
     import pytest
     pytest.main([__file__])
diff --git a/vllm/config.py b/vllm/config.py
index e6df38c0e874..cca0496eeb32 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -115,7 +115,11 @@ def __init__(
         self.revision = revision
         self.code_revision = code_revision
         self.rope_scaling = rope_scaling
-        self.tokenizer_revision = tokenizer_revision
+        # Default the tokenizer revision to the model revision when unset.
+        if tokenizer_revision is None:
+            self.tokenizer_revision = revision
+        else:
+            self.tokenizer_revision = tokenizer_revision
         self.quantization = quantization
         self.quantization_param_path = quantization_param_path
         # UPSTREAM SYNC: keep sparsity
diff --git a/vllm/transformers_utils/config.py b/vllm/transformers_utils/config.py
index 044eec6410a5..970645987885 100644
--- a/vllm/transformers_utils/config.py
+++ b/vllm/transformers_utils/config.py
@@ -1,7 +1,8 @@
 from typing import Dict, Optional
 
-from transformers import AutoConfig, PretrainedConfig
+from transformers import PretrainedConfig
 
+from vllm.envs import VLLM_USE_MODELSCOPE
 from vllm.logger import init_logger
 from vllm.transformers_utils.configs import (ChatGLMConfig, DbrxConfig,
                                              JAISConfig, MPTConfig, RWConfig)
@@ -24,6 +25,10 @@ def get_config(model: str,
                code_revision: Optional[str] = None,
                rope_scaling: Optional[dict] = None) -> PretrainedConfig:
     try:
+        if VLLM_USE_MODELSCOPE:
+            from modelscope import AutoConfig
+        else:
+            from transformers import AutoConfig
         config = AutoConfig.from_pretrained(
             model,
             trust_remote_code=trust_remote_code,

From 7bb7e9b352a4f73bf8a6b7ac390b7f303e21ae2d Mon Sep 17 00:00:00 2001
From: Philipp Moritz <pcmoritz@gmail.com>
Date: Thu, 6 Jun 2024 09:29:29 -0700
Subject: [PATCH 55/93] [Kernel] Retune Mixtral 8x22b configs for FP8 on H100
 (#5294)

---
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json | 110 +++++++++---------
 ...me=NVIDIA_H100_80GB_HBM3,dtype=float8.json |  72 ++++++------
 2 files changed, 91 insertions(+), 91 deletions(-)

diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
index 0c495e7e290c..555718733954 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=2048,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -1,113 +1,113 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
     },
     "2": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
         "num_stages": 5
     },
     "4": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 5
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
+        "num_stages": 4
     },
     "8": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 5
     },
     "16": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "BLOCK_SIZE_K": 256,
+        "GROUP_SIZE_M": 32,
+        "num_warps": 4,
         "num_stages": 3
     },
     "24": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "32": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
         "num_stages": 3
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 4
+        "GROUP_SIZE_M": 1,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "64": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 3
     },
     "96": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 16,
         "num_warps": 4,
-        "num_stages": 4
+        "num_stages": 3
     },
     "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 32,
-        "num_warps": 8,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
         "num_stages": 3
     },
     "256": {
-        "BLOCK_SIZE_M": 128,
-        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 4
+        "num_warps": 4,
+        "num_stages": 3
     },
     "512": {
-        "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
-        "num_warps": 4,
-        "num_stages": 3
+        "num_warps": 8,
+        "num_stages": 5
     },
     "1024": {
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 64,
         "num_warps": 8,
         "num_stages": 4
     },
@@ -139,7 +139,7 @@
         "BLOCK_SIZE_M": 128,
         "BLOCK_SIZE_N": 256,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 16,
         "num_warps": 8,
         "num_stages": 4
     }
diff --git a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
index 60a65724d68b..cc614e635ea5 100644
--- a/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
+++ b/vllm/model_executor/layers/fused_moe/configs/E=8,N=4096,device_name=NVIDIA_H100_80GB_HBM3,dtype=float8.json
@@ -1,17 +1,17 @@
 {
     "1": {
-        "BLOCK_SIZE_M": 16,
-        "BLOCK_SIZE_N": 32,
-        "BLOCK_SIZE_K": 64,
+        "BLOCK_SIZE_M": 64,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 4,
         "num_stages": 5
     },
     "2": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 64,
-        "GROUP_SIZE_M": 16,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 64,
         "num_warps": 4,
         "num_stages": 4
     },
@@ -20,59 +20,59 @@
         "BLOCK_SIZE_N": 128,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 64,
-        "num_warps": 8,
-        "num_stages": 2
+        "num_warps": 4,
+        "num_stages": 3
     },
     "8": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 256,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 32,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 5
     },
     "16": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
+        "num_warps": 4,
+        "num_stages": 5
     },
     "24": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 1,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 16,
         "num_warps": 4,
-        "num_stages": 3
+        "num_stages": 5
     },
     "32": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 128,
+        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_K": 256,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 3
+        "num_warps": 4,
+        "num_stages": 4
     },
     "48": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
+        "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
-        "GROUP_SIZE_M": 64,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 3
     },
     "64": {
         "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
-        "BLOCK_SIZE_K": 256,
-        "GROUP_SIZE_M": 16,
+        "BLOCK_SIZE_K": 128,
+        "GROUP_SIZE_M": 1,
         "num_warps": 4,
-        "num_stages": 2
+        "num_stages": 3
     },
     "96": {
-        "BLOCK_SIZE_M": 128,
+        "BLOCK_SIZE_M": 64,
         "BLOCK_SIZE_N": 64,
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
@@ -81,11 +81,11 @@
     },
     "128": {
         "BLOCK_SIZE_M": 64,
-        "BLOCK_SIZE_N": 128,
-        "BLOCK_SIZE_K": 256,
+        "BLOCK_SIZE_N": 64,
+        "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
-        "num_warps": 8,
-        "num_stages": 2
+        "num_warps": 4,
+        "num_stages": 3
     },
     "256": {
         "BLOCK_SIZE_M": 128,
@@ -93,7 +93,7 @@
         "BLOCK_SIZE_K": 128,
         "GROUP_SIZE_M": 1,
         "num_warps": 8,
-        "num_stages": 5
+        "num_stages": 3
     },
     "512": {
         "BLOCK_SIZE_M": 128,

From fbd60f326322515f0ce6c90f3b91a58506cadca3 Mon Sep 17 00:00:00 2001
From: Matthew Goldey <matthew.goldey@gmail.com>
Date: Thu, 6 Jun 2024 16:48:13 -0400
Subject: [PATCH 56/93] [Frontend] enable passing multiple LoRA adapters at
 once to generate() (#5300)

---
 .../test_llm_generate_multiple_loras.py       | 69 +++++++++++++++++++
 vllm/entrypoints/llm.py                       | 39 ++++++-----
 2 files changed, 91 insertions(+), 17 deletions(-)
 create mode 100644 tests/entrypoints/test_llm_generate_multiple_loras.py

diff --git a/tests/entrypoints/test_llm_generate_multiple_loras.py b/tests/entrypoints/test_llm_generate_multiple_loras.py
new file mode 100644
index 000000000000..b429b904c7c3
--- /dev/null
+++ b/tests/entrypoints/test_llm_generate_multiple_loras.py
@@ -0,0 +1,69 @@
+import weakref
+
+import pytest
+# downloading lora to test lora requests
+from huggingface_hub import snapshot_download
+
+from vllm import LLM
+from vllm.lora.request import LoRARequest
+
+from ..conftest import cleanup
+
+MODEL_NAME = "HuggingFaceH4/zephyr-7b-beta"
+
+PROMPTS = [
+    "Hello, my name is",
+    "The president of the United States is",
+    "The capital of France is",
+    "The future of AI is",
+]
+
+LORA_NAME = "typeof/zephyr-7b-beta-lora"
+
+pytestmark = pytest.mark.llm
+
+
+@pytest.fixture(scope="module")
+def llm():
+    # pytest caches the fixture so we use weakref.proxy to
+    # enable garbage collection
+    llm = LLM(model=MODEL_NAME,
+              tensor_parallel_size=1,
+              max_model_len=8192,
+              enable_lora=True,
+              max_loras=4,
+              max_lora_rank=64,
+              max_num_seqs=128,
+              enforce_eager=True)
+
+    with llm.deprecate_legacy_api():
+        yield weakref.proxy(llm)
+
+        del llm
+
+    cleanup()
+
+
+@pytest.fixture(scope="session")
+def zephyr_lora_files():
+    return snapshot_download(repo_id=LORA_NAME)
+
+
+@pytest.mark.skip_global_cleanup
+def test_multiple_lora_requests(llm: LLM, zephyr_lora_files):
+    lora_request = [
+        LoRARequest(LORA_NAME, idx + 1, zephyr_lora_files)
+        for idx in range(len(PROMPTS))
+    ]
+    # Multiple LoRARequests should be matched with each prompt
+    outputs = llm.generate(PROMPTS, lora_request=lora_request)
+    assert len(PROMPTS) == len(outputs)
+
+    # ValueError is raised if the number of LoRARequests and prompts differ
+    with pytest.raises(ValueError):
+        outputs = llm.generate(PROMPTS, lora_request=lora_request[:1])
+
+    # Single LoRARequest should be applied to every prompt
+    single_lora_request = lora_request[0]
+    outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
+    assert len(PROMPTS) == len(outputs)
diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py
index b6173a9362d8..31dc5557e76f 100644
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -179,7 +179,7 @@ def generate(
                                         List[SamplingParams]]] = None,
         prompt_token_ids: Optional[List[int]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -191,7 +191,7 @@ def generate(
                                         List[SamplingParams]]] = None,
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -204,7 +204,7 @@ def generate(
         *,
         prompt_token_ids: List[int],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -217,7 +217,7 @@ def generate(
         *,
         prompt_token_ids: List[List[int]],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -228,7 +228,7 @@ def generate(
         sampling_params: None,
         prompt_token_ids: Union[List[int], List[List[int]]],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -241,7 +241,7 @@ def generate(
         sampling_params: Optional[Union[SamplingParams,
                                         Sequence[SamplingParams]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         ...
 
@@ -258,7 +258,7 @@ def generate(
                                         Sequence[SamplingParams]]] = None,
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[RequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -321,7 +321,7 @@ def encode(
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[List[int]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -333,7 +333,7 @@ def encode(
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[List[List[int]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -346,7 +346,7 @@ def encode(
         *,
         prompt_token_ids: List[int],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -359,7 +359,7 @@ def encode(
         *,
         prompt_token_ids: List[List[int]],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -370,7 +370,7 @@ def encode(
         pooling_params: None,
         prompt_token_ids: Union[List[int], List[List[int]]],
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -383,7 +383,7 @@ def encode(
         pooling_params: Optional[Union[PoolingParams,
                                        Sequence[PoolingParams]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         ...
 
@@ -400,7 +400,7 @@ def encode(
                                        Sequence[PoolingParams]]] = None,
         prompt_token_ids: Optional[Union[List[int], List[List[int]]]] = None,
         use_tqdm: bool = True,
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> List[EmbeddingRequestOutput]:
         """Generates the completions for the input prompts.
 
@@ -507,7 +507,7 @@ def _validate_and_add_requests(
         inputs: Union[PromptStrictInputs, Sequence[PromptStrictInputs]],
         params: Union[SamplingParams, Sequence[SamplingParams], PoolingParams,
                       Sequence[PoolingParams]],
-        lora_request: Optional[LoRARequest],
+        lora_request: Optional[Union[Sequence[LoRARequest], LoRARequest]],
     ) -> None:
         if isinstance(inputs, (str, dict)):
             # Convert a single prompt to a list.
@@ -518,20 +518,25 @@ def _validate_and_add_requests(
         if isinstance(params, list) and len(params) != num_requests:
             raise ValueError("The lengths of prompts and params "
                              "must be the same.")
+        if isinstance(lora_request,
+                      Sequence) and len(lora_request) != num_requests:
+            raise ValueError("The lengths of prompts and lora_request "
+                             "must be the same.")
 
         # Add requests to the engine.
         for i, request_inputs in enumerate(inputs):
             self._add_request(
                 request_inputs,
                 params[i] if isinstance(params, Sequence) else params,
-                lora_request=lora_request,
+                lora_request=lora_request[i] if isinstance(
+                    lora_request, Sequence) else lora_request,
             )
 
     def _add_request(
         self,
         inputs: PromptInputs,
         params: Union[SamplingParams, PoolingParams],
-        lora_request: Optional[LoRARequest] = None,
+        lora_request: Optional[Union[List[LoRARequest], LoRARequest]] = None,
     ) -> None:
         request_id = str(next(self.request_counter))
         self.llm_engine.add_request(request_id,

From 14a49c21297de6fd7e4966d28d0042b19484ec85 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Thu, 6 Jun 2024 18:12:00 -0700
Subject: [PATCH 57/93] [Core] Avoid copying prompt/output tokens if no
 penalties are used (#5289)

---
 vllm/model_executor/sampling_metadata.py | 80 +++++++++++++++---------
 1 file changed, 50 insertions(+), 30 deletions(-)

diff --git a/vllm/model_executor/sampling_metadata.py b/vllm/model_executor/sampling_metadata.py
index 0b3b41e69d6b..7ad84f51b7e4 100644
--- a/vllm/model_executor/sampling_metadata.py
+++ b/vllm/model_executor/sampling_metadata.py
@@ -386,16 +386,18 @@ def from_sampling_metadata(
                 presence_penalties += [0] * prefill_len
                 frequency_penalties += [0] * prefill_len
                 repetition_penalties += [1] * prefill_len
-                prompt_tokens.extend([] for _ in range(prefill_len))
-                output_tokens.extend([] for _ in range(prefill_len))
+                if do_penalties:
+                    prompt_tokens.extend([] for _ in range(prefill_len))
+                    output_tokens.extend([] for _ in range(prefill_len))
 
             if seq_group.do_sample:
                 sample_lens = len(seq_group.sample_indices)
                 assert sample_lens == len(seq_ids)
                 for seq_id in seq_ids:
                     seq_data = seq_group.seq_data[seq_id]
-                    prompt_tokens.append(seq_data.prompt_token_ids)
-                    output_tokens.append(seq_data.output_token_ids)
+                    if do_penalties:
+                        prompt_tokens.append(seq_data.prompt_token_ids)
+                        output_tokens.append(seq_data.output_token_ids)
                 temperatures += [temperature] * len(seq_ids)
                 top_ps += [top_p] * len(seq_ids)
                 top_ks += [top_k] * len(seq_ids)
@@ -443,18 +445,22 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
         # Note that the performance will be very bad without
         # pinned memory.
         pin_memory = is_pin_memory_available()
-        prompt_max_len = max([len(tokens) for tokens in prompt_tokens],
-                             default=0)
-        prompt_padded_tokens = [
-            tokens + [vocab_size] * (prompt_max_len - len(tokens))
-            for tokens in prompt_tokens
-        ]
-        output_max_len = max([len(tokens) for tokens in output_tokens],
-                             default=0)
-        output_padded_tokens = [
-            tokens + [vocab_size] * (output_max_len - len(tokens))
-            for tokens in output_tokens
-        ]
+
+        do_penalties = prompt_tokens or output_tokens
+
+        if do_penalties:
+            prompt_max_len = max([len(tokens) for tokens in prompt_tokens],
+                                 default=0)
+            prompt_padded_tokens = [
+                tokens + [vocab_size] * (prompt_max_len - len(tokens))
+                for tokens in prompt_tokens
+            ]
+            output_max_len = max([len(tokens) for tokens in output_tokens],
+                                 default=0)
+            output_padded_tokens = [
+                tokens + [vocab_size] * (output_max_len - len(tokens))
+                for tokens in output_tokens
+            ]
 
         temperatures_t = torch.tensor(
             temperatures,
@@ -504,18 +510,22 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
             dtype=torch.long,
             pin_memory=pin_memory,
         )
-        prompt_tensor = torch.tensor(
-            prompt_padded_tokens,
-            device="cpu",
-            dtype=torch.long,
-            pin_memory=pin_memory,
-        )
-        output_tensor = torch.tensor(
-            output_padded_tokens,
-            device="cpu",
-            dtype=torch.long,
-            pin_memory=pin_memory,
-        )
+        if do_penalties:
+            prompt_tensor = torch.tensor(
+                prompt_padded_tokens,
+                device="cpu",
+                dtype=torch.long,
+                pin_memory=pin_memory,
+            )
+            output_tensor = torch.tensor(
+                output_padded_tokens,
+                device="cpu",
+                dtype=torch.long,
+                pin_memory=pin_memory,
+            )
+        else:
+            prompt_tensor = None
+            output_tensor = None
         # need to transpose and make contiguous to
         # copy the tensor correctly.
         # [batch_size, n_seeds] -> [n_seeds, batch_size]
@@ -538,6 +548,16 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
             extra_seeds_gpu = None
         sampling_seeds_gpu = sampling_seeds_gpu[:num_base_seeds]
 
+        if do_penalties:
+            prompt_tokens_gpu = prompt_tensor.to(device=device,
+                                                 non_blocking=True)
+            output_tokens_gpu = output_tensor.to(device=device,
+                                                 non_blocking=True)
+        else:
+            empty_tensor = torch.empty(0, device=device, dtype=torch.long)
+            prompt_tokens_gpu = empty_tensor
+            output_tokens_gpu = empty_tensor
+
         return cls(
             temperatures=temperatures_t.to(device=device, non_blocking=True),
             top_ps=top_ps_t.to(device=device, non_blocking=True),
@@ -549,8 +569,8 @@ def from_lists(cls, temperatures: List[float], top_ps: List[float],
                                                          non_blocking=True),
             repetition_penalties=repetition_penalties_t.to(device=device,
                                                            non_blocking=True),
-            prompt_tokens=prompt_tensor.to(device=device, non_blocking=True),
-            output_tokens=output_tensor.to(device=device, non_blocking=True),
+            prompt_tokens=prompt_tokens_gpu,
+            output_tokens=output_tokens_gpu,
             sampling_seeds=sampling_seeds_gpu,
             sample_indices=sample_indices_t.to(device=device,
                                                non_blocking=True),

From a60515d0e72f80fc349cb9ecd016be48a481b70b Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Thu, 6 Jun 2024 19:07:57 -0700
Subject: [PATCH 58/93] [Core] Change LoRA embedding sharding to support
 loading methods (#5038)

---
 .buildkite/test-pipeline.yaml                 |  10 +-
 tests/conftest.py                             |  21 ++
 tests/lora/conftest.py                        |  18 +-
 tests/lora/test_layers.py                     | 219 ++++++++++++++-
 tests/lora/test_llama.py                      |  17 +-
 tests/lora/test_long_context.py               |  23 +-
 tests/test_sharded_state_loader.py            | 128 +++++----
 vllm/lora/layers.py                           |  76 +++--
 vllm/lora/utils.py                            |   3 +-
 .../layers/vocab_parallel_embedding.py        | 260 ++++++++++++++++--
 vllm/worker/model_runner.py                   |  19 +-
 11 files changed, 662 insertions(+), 132 deletions(-)

diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index d8030ab219cc..b48ef31bc416 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -46,6 +46,7 @@ steps:
   - TEST_DIST_MODEL=facebook/opt-125m DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - TEST_DIST_MODEL=meta-llama/Llama-2-7b-hf DISTRIBUTED_EXECUTOR_BACKEND=mp pytest -v -s distributed/test_chunked_prefill_distributed.py
   - pytest -v -s spec_decode/e2e/test_integration_dist.py
+  - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
 
 - label: Distributed Tests (Multiple Groups)
   #mirror_hardwares: [amd]
@@ -138,14 +139,7 @@ steps:
   num_gpus: 4
   # This test runs llama 13B, so it is required to run on 4 GPUs.
   commands:
-    # Temporarily run this way because we cannot clean up GPU mem usage
-    # for multi GPU tests.
-    # TODO(sang): Fix it.
-    - pytest -v -s lora/test_long_context.py::test_rotary_emb_replaced
-    - pytest -v -s lora/test_long_context.py::test_batched_rope_kernel
-    - pytest -v -s lora/test_long_context.py::test_self_consistency
-    - pytest -v -s lora/test_long_context.py::test_quality
-    - pytest -v -s lora/test_long_context.py::test_max_len
+    - pytest -v -s -x lora/test_long_context.py
 
 - label: Tensorizer Test
   #mirror_hardwares: [amd]
diff --git a/tests/conftest.py b/tests/conftest.py
index 2e64f04468ee..9343a5a83b30 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -2,6 +2,8 @@
 import gc
 import logging
 import os
+import subprocess
+import sys
 from typing import Any, Dict, List, Optional, Tuple, TypeVar
 
 import pytest
@@ -752,6 +754,25 @@ def caplog_vllm(temporary_enable_log_propagate, caplog):
     yield caplog
 
 
+@pytest.fixture(scope="session")
+def num_gpus_available(logger: logging.Logger) -> int:
+    """Return the CUDA device count (0 if the probe subprocess fails).
+
+    The count is obtained in a child interpreter so that the CUDA context
+    is never initialized in the current process.
+    """
+    # Take `logger` as a fixture argument: at module level that name is the
+    # fixture function defined below, not a logging.Logger.
+    probe = [sys.executable, "-c",
+             "import torch; print(torch.cuda.device_count())"]
+    try:
+        out = subprocess.run(probe, capture_output=True, check=True, text=True)
+    except subprocess.CalledProcessError as e:
+        logger.warning("Failed to get number of GPUs.", exc_info=e)
+        return 0
+    return int(out.stdout.strip())
+
+
 @pytest.fixture(scope="session")
 def logger() -> logging.Logger:
     return make_logger("vllm_test")
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index e5cf9cd48b65..400333066b9f 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -42,10 +42,24 @@ def cleanup():
     ray.shutdown()
 
 
+@pytest.fixture()
+def should_do_global_cleanup_after_test(request) -> bool:
+    """Tell ``cleanup_fixture`` whether to run the global cleanup.
+
+    Returns False for tests marked ``skip_global_cleanup`` and True for
+    all other tests. Subdirectories may override this fixture to skip the
+    cleanup, which can provide a ~10x speedup for non-GPU unit tests since
+    they don't need to initialize torch.
+    """
+    marker = request.node.get_closest_marker("skip_global_cleanup")
+    return marker is None
+
+
 @pytest.fixture(autouse=True)
-def cleanup_fixture():
+def cleanup_fixture(should_do_global_cleanup_after_test: bool):
     yield
-    cleanup()
+    if should_do_global_cleanup_after_test:
+        cleanup()
 
 
 @pytest.fixture
diff --git a/tests/lora/test_layers.py b/tests/lora/test_layers.py
index 3d868b0b1d5f..63fd2cd9e7fb 100644
--- a/tests/lora/test_layers.py
+++ b/tests/lora/test_layers.py
@@ -2,6 +2,7 @@
 from copy import deepcopy
 from dataclasses import dataclass
 from typing import Dict, List, Optional, Tuple
+from unittest.mock import patch
 
 import pytest
 import torch
@@ -32,7 +33,7 @@
 from vllm.model_executor.layers.logits_processor import LogitsProcessor
 from vllm.model_executor.layers.rotary_embedding import get_rope
 from vllm.model_executor.layers.vocab_parallel_embedding import (
-    ParallelLMHead, VocabParallelEmbedding)
+    ParallelLMHead, VocabParallelEmbedding, get_masked_input_and_mask)
 from vllm.model_executor.utils import set_random_seed
 
 from .utils import DummyLoRAManager
@@ -436,7 +437,8 @@ def _pretest():
         logits_processor = LogitsProcessor(
             vocab_size + lora_config.lora_extra_vocab_size, vocab_size)
         lora_logits_processor = LogitsProcessorWithLoRA(
-            logits_processor, 1024, linear.weight.dtype, linear.weight.device)
+            logits_processor, 1024, linear.weight.dtype, linear.weight.device,
+            None)
         lora_logits_processor.create_lora_weights(max_loras, lora_config)
 
         return linear, logits_processor, lora_logits_processor
@@ -882,3 +884,216 @@ def test_rotary_embedding_long_context(dist_init, num_loras, device,
 
     torch.allclose(ref_q, actual_q)
     torch.allclose(ref_k, actual_k)
+
+
+@pytest.mark.parametrize("tp_size", [1, 2, 4, 8])
+@pytest.mark.parametrize("seed", list(range(256)))
+def test_vocab_parallel_embedding_indices(tp_size, seed):
+    random.seed(seed)
+    vocab_size = random.randint(4000, 64000)
+    added_vocab_size = random.randint(0, 1024)
+    org_vocab_size = vocab_size - added_vocab_size
+    last_org_vocab_end_index = 0
+    last_added_vocab_end_index = org_vocab_size
+    computed_vocab_size = 0
+    computed_org_vocab_size = 0
+    computed_added_vocab_size = 0
+    vocab_size_padded = -1
+
+    all_org_tokens = []
+    all_added_tokens = []
+    token_ids = []
+
+    for tp_rank in range(tp_size):
+        with patch(
+                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_rank",
+                return_value=tp_rank
+        ), patch(
+                "vllm.model_executor.layers.vocab_parallel_embedding.get_tensor_model_parallel_world_size",
+                return_value=tp_size):
+            vocab_embedding = VocabParallelEmbedding(
+                vocab_size, 1, org_num_embeddings=org_vocab_size)
+        vocab_size_padded = vocab_embedding.num_embeddings_padded
+        shard_indices = vocab_embedding.shard_indices
+        # Assert that the ranges are contiguous
+        assert shard_indices.org_vocab_start_index == last_org_vocab_end_index
+        assert (shard_indices.added_vocab_start_index ==
+                last_added_vocab_end_index)
+
+        # Ensure that we are not exceeding the vocab size
+        computed_vocab_size += shard_indices.num_elements_padded
+        computed_org_vocab_size += shard_indices.num_org_elements
+        computed_added_vocab_size += shard_indices.num_added_elements
+
+        # Collect this rank's token ids; overlap is asserted after the loop
+        all_org_tokens.extend(
+            range(shard_indices.org_vocab_start_index,
+                  shard_indices.org_vocab_end_index))
+        all_added_tokens.extend(
+            range(shard_indices.added_vocab_start_index,
+                  shard_indices.added_vocab_end_index))
+
+        token_ids.extend(
+            range(shard_indices.org_vocab_start_index,
+                  shard_indices.org_vocab_end_index))
+        token_ids.extend([-1] * (shard_indices.num_org_elements_padded -
+                                 shard_indices.num_org_elements))
+        token_ids.extend(
+            range(shard_indices.added_vocab_start_index,
+                  shard_indices.added_vocab_end_index))
+        token_ids.extend([-1] * (shard_indices.num_added_elements_padded -
+                                 shard_indices.num_added_elements))
+
+        last_org_vocab_end_index = shard_indices.org_vocab_end_index
+        last_added_vocab_end_index = shard_indices.added_vocab_end_index
+
+    assert computed_vocab_size == vocab_size_padded
+    assert computed_org_vocab_size == org_vocab_size
+    assert computed_added_vocab_size == added_vocab_size
+
+    # Ensure that the ranges are not overlapping
+    assert len(all_org_tokens) == len(set(all_org_tokens))
+    assert len(all_added_tokens) == len(set(all_added_tokens))
+    assert not set(all_org_tokens).intersection(set(all_added_tokens))
+
+    token_ids_tensor = torch.tensor(token_ids, dtype=torch.long)
+    reindex_mapping = vocab_embedding.get_sharded_to_full_mapping()
+    assert reindex_mapping is not None or tp_size == 1
+    if reindex_mapping is not None:
+        reindexed_token_ids = token_ids_tensor[reindex_mapping]
+        expected = torch.tensor(list(range(0, vocab_size)))
+        assert reindexed_token_ids[:vocab_size].equal(expected)
+        assert torch.all(reindexed_token_ids[vocab_size:] == -1)
+
+
+def test_get_masked_input_and_mask():
+    x = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11])
+
+    # base tp 1 case, no padding
+    modified_x, _ = get_masked_input_and_mask(x,
+                                              org_vocab_start_index=0,
+                                              org_vocab_end_index=8,
+                                              added_vocab_start_index=8,
+                                              added_vocab_end_index=12,
+                                              num_org_vocab_padding=0)
+    assert torch.equal(x, modified_x)
+
+    # tp 2 case, no padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=8,
+        added_vocab_start_index=10,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=0)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 4, 5, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 4, 5]))
+
+    # tp 4 case, no padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=2,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=9,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_1, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=2,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=9,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=0)
+    modified_x_rank_2, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=6,
+        added_vocab_start_index=10,
+        added_vocab_end_index=11,
+        num_org_vocab_padding=0)
+    modified_x_rank_3, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=6,
+        org_vocab_end_index=8,
+        added_vocab_start_index=11,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=0)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 2, 0, 0]))
+    assert torch.equal(modified_x_rank_2,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 2, 0]))
+    assert torch.equal(modified_x_rank_3,
+                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2]))
+
+    # base tp 1 case, with padding
+    modified_x, _ = get_masked_input_and_mask(x,
+                                              org_vocab_start_index=0,
+                                              org_vocab_end_index=8,
+                                              added_vocab_start_index=8,
+                                              added_vocab_end_index=12,
+                                              num_org_vocab_padding=2)
+    assert torch.equal(modified_x,
+                       torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 10, 11, 12, 13]))
+
+    # tp 2 case, with padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_1, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=8,
+        added_vocab_start_index=10,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=2)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 2, 3, 0, 0, 0, 0, 6, 7, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 2, 3, 0, 0, 6, 7]))
+
+    # tp 4 case, with padding
+    modified_x_rank_0, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=0,
+                                                     org_vocab_end_index=2,
+                                                     added_vocab_start_index=8,
+                                                     added_vocab_end_index=9,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_1, _ = get_masked_input_and_mask(x,
+                                                     org_vocab_start_index=2,
+                                                     org_vocab_end_index=4,
+                                                     added_vocab_start_index=9,
+                                                     added_vocab_end_index=10,
+                                                     num_org_vocab_padding=2)
+    modified_x_rank_2, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=4,
+        org_vocab_end_index=6,
+        added_vocab_start_index=10,
+        added_vocab_end_index=11,
+        num_org_vocab_padding=2)
+    modified_x_rank_3, _ = get_masked_input_and_mask(
+        x,
+        org_vocab_start_index=6,
+        org_vocab_end_index=8,
+        added_vocab_start_index=11,
+        added_vocab_end_index=12,
+        num_org_vocab_padding=2)
+    assert torch.equal(modified_x_rank_0,
+                       torch.tensor([0, 1, 0, 0, 0, 0, 0, 0, 4, 0, 0, 0]))
+    assert torch.equal(modified_x_rank_1,
+                       torch.tensor([0, 0, 0, 1, 0, 0, 0, 0, 0, 4, 0, 0]))
+    assert torch.equal(modified_x_rank_2,
+                       torch.tensor([0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 4, 0]))
+    assert torch.equal(modified_x_rank_3,
+                       torch.tensor([0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4]))
diff --git a/tests/lora/test_llama.py b/tests/lora/test_llama.py
index f5a571e81acb..7143a99bea08 100644
--- a/tests/lora/test_llama.py
+++ b/tests/lora/test_llama.py
@@ -36,11 +36,10 @@ def do_sample(llm, lora_path: str, lora_id: int):
     return generated_texts
 
 
-@pytest.mark.parametrize("tp_size", [1])
-def test_llama_lora(sql_lora_files, tp_size):
-    # Cannot use as it will initialize torch.cuda too early...
-    # if torch.cuda.device_count() < tp_size:
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+@pytest.mark.parametrize("tp_size", [1, 2, 4])
+def test_llama_lora(sql_lora_files, tp_size, num_gpus_available):
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
 
     llm = vllm.LLM(MODEL_PATH,
                    enable_lora=True,
@@ -80,11 +79,9 @@ def test_llama_lora(sql_lora_files, tp_size):
     print("removing lora")
 
 
-@pytest.mark.skip("Requires multiple GPUs")
-def test_llama_tensor_parallel_equality(sql_lora_files):
-    # Cannot use as it will initialize torch.cuda too early...
-    # if torch.cuda.device_count() < 4:
-    #     pytest.skip(f"Not enough GPUs for tensor parallelism {4}")
+def test_llama_tensor_parallel_equality(sql_lora_files, num_gpus_available):
+    if num_gpus_available < 4:
+        pytest.skip("Not enough GPUs for tensor parallelism 4")
 
     llm_tp1 = vllm.LLM(MODEL_PATH,
                        enable_lora=True,
diff --git a/tests/lora/test_long_context.py b/tests/lora/test_long_context.py
index cc1d4d620ff8..feb58aa28bda 100644
--- a/tests/lora/test_long_context.py
+++ b/tests/lora/test_long_context.py
@@ -102,22 +102,21 @@ def batched_generate(
     return [outputs[i].outputs[0].text.strip() for i in range(len(outputs))]
 
 
-@pytest.fixture
+@pytest.fixture(scope="module")
 def lora_llm(long_context_infos):
     scaling_factors = [
         context_len_to_scaling_factor[info["context_length"]]
         for info in long_context_infos.values()
     ]
 
-    llm = vllm.LLM(
-        "meta-llama/Llama-2-13b-chat-hf",
-        enable_lora=True,
-        max_num_seqs=16,
-        max_loras=2,
-        long_lora_scaling_factors=tuple(scaling_factors),
-        max_num_batched_tokens=4096 * 8,
-        tensor_parallel_size=4,
-    )
+    llm = vllm.LLM("meta-llama/Llama-2-13b-chat-hf",
+                   enable_lora=True,
+                   max_num_seqs=16,
+                   max_loras=2,
+                   long_lora_scaling_factors=tuple(scaling_factors),
+                   max_num_batched_tokens=4096 * 8,
+                   tensor_parallel_size=4,
+                   distributed_executor_backend="mp")
     yield llm
     del llm
 
@@ -154,6 +153,7 @@ def test_rotary_emb_replaced(dist_init):
     assert rotary_emb_count == 32
 
 
+@pytest.mark.skip_global_cleanup
 @pytest.mark.skip(reason="Too many GPUs for NM Automation")
 def test_batched_rope_kernel(lora_llm, long_context_infos):
     """We test the batched kernel by comparing the results of batched an
@@ -189,6 +189,7 @@ def test_batched_rope_kernel(lora_llm, long_context_infos):
             f"same:\n{batched}\n{non_batched}")
 
 
+@pytest.mark.skip_global_cleanup
 @pytest.mark.skip(reason="Too many GPUs for NM Automation")
 def test_self_consistency(lora_llm, long_context_infos):
     """We test consistency of the batched kernel by permuting batched
@@ -229,6 +230,7 @@ def test_self_consistency(lora_llm, long_context_infos):
                 f"\n{permutated_batched_results[permutation[i]]}")
 
 
+@pytest.mark.skip_global_cleanup
 @pytest.mark.skip(reason="Too many GPUs for NM Automation")
 def test_quality(lora_llm, long_context_infos):
     """We test the quality of the answers given by the LoRA model by
@@ -260,6 +262,7 @@ def test_quality(lora_llm, long_context_infos):
     assert np.mean(scores) > 0.5
 
 
+@pytest.mark.skip_global_cleanup
 @pytest.mark.skip(reason="Too many GPUs for NM Automation")
 def test_max_len(lora_llm, long_context_infos):
     """Test that we raise an ValueError when the input of a given LoRA
diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 426110b9d7e3..022fb36b346f 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -1,3 +1,4 @@
+import multiprocessing as mp
 import os
 import shutil
 from tempfile import TemporaryDirectory
@@ -18,9 +19,7 @@
 
 # Create a sampling params object.
 sampling_params = SamplingParams(
-    temperature=0.8,
-    top_p=0.95,
-    seed=0,
+    temperature=0,
     max_tokens=256,
     ignore_eos=True,
 )
@@ -43,50 +42,85 @@ def test_filter_subtensors():
         assert tensor.equal(state_dict[key])
 
 
-# @pytest.mark.skip("OOM in NM Automation")
+@pytest.fixture(scope="module")
+def llama_2_7b_files():
+    """Yield a temporary snapshot of Llama-2-7b-hf without ``*.bin*`` files."""
+    with TemporaryDirectory() as cache_dir:
+        yield snapshot_download("meta-llama/Llama-2-7b-hf",
+                                cache_dir=cache_dir,
+                                ignore_patterns="*.bin*")
+
+
+def _run_writer(input_dir, output_dir, weights_patterns, **kwargs):
+    """Dump the sharded worker state and copy non-weight files to output_dir."""
+    from fnmatch import fnmatch  # patterns are globs, e.g. "*.safetensors"
+    llm_sharded_writer = LLM(model=input_dir, **kwargs)
+    llm_sharded_writer.llm_engine.model_executor.save_sharded_state(
+        path=output_dir)
+    # Copy metadata files (everything not matching a weights pattern)
+    for file in os.listdir(input_dir):
+        if not any(fnmatch(file, pattern) for pattern in weights_patterns):
+            shutil.copy(f"{input_dir}/{file}", output_dir)
+
+
+def _run_generate(input_dir, queue: mp.Queue, **kwargs):
+    llm = LLM(model=input_dir, **kwargs)
+    gen = llm.generate(prompts, sampling_params)
+    queue.put([g.outputs[0].__dict__ for g in gen])
+    queue.close()
+    queue.join_thread()
+
+
 @pytest.mark.parametrize("enable_lora", [False, True])
-def test_sharded_state_loader(enable_lora):
-    weights_patterns = ("*.bin", "*.pt", "*.safetensors")
-
-    with TemporaryDirectory() as cache_dir, TemporaryDirectory() as output_dir:
-        # input_dir = snapshot_download("meta-llama/Llama-2-7b-hf",
-        input_dir = snapshot_download("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
-                                      cache_dir=cache_dir)
-
-        llm = LLM(
-            model=input_dir,
-            worker_use_ray=True,
-            gpu_memory_utilization=0.3,
-        )
-
-        # Dump worker states to output directory
-        model_executor = llm.llm_engine.model_executor
-        model_executor.save_sharded_state(path=output_dir)
-        # Copy metadata files to output directory
-        for file in os.listdir(input_dir):
-            if not any(file.endswith(ext) for ext in weights_patterns):
-                shutil.copy(f"{input_dir}/{file}", output_dir)
-        del llm.llm_engine.model_executor
-
-        llm_before = LLM(
-            model=input_dir,
-            worker_use_ray=True,
-            enable_lora=enable_lora,
-            gpu_memory_utilization=0.3,
-        )
-        gen_before = llm_before.generate(prompts, sampling_params)
-        out_before = [gen.outputs[0].__dict__ for gen in gen_before]
-        del llm_before.llm_engine.model_executor
-
-        llm_after = LLM(
-            model=output_dir,
-            worker_use_ray=True,
-            enable_lora=enable_lora,
-            gpu_memory_utilization=0.3,
-            load_format="sharded_state",
-        )
-        gen_after = llm_after.generate(prompts, sampling_params)
-        out_after = [gen.outputs[0].__dict__ for gen in gen_after]
-        del llm_after.llm_engine.model_executor
+@pytest.mark.parametrize("tp_size", [1, 2])
+def test_sharded_state_loader(enable_lora, tp_size, num_gpus_available,
+                              llama_2_7b_files):
+    """Check generation is identical from original and sharded checkpoints."""
+    if num_gpus_available < tp_size:
+        pytest.skip(f"Not enough GPUs for tensor parallelism {tp_size}")
+
+    weights_patterns = ("*.safetensors", )
+    gpu_memory_utilization = 0.8
+    input_dir = llama_2_7b_files
+    ctx = mp.get_context("spawn")
+    # Engine arguments shared by both generation runs.
+    generate_kwargs = dict(
+        distributed_executor_backend="mp",
+        enable_lora=enable_lora,
+        gpu_memory_utilization=gpu_memory_utilization,
+        tensor_parallel_size=tp_size,
+    )
+
+    # Run in separate processes for memory & CUDA isolation
+    with TemporaryDirectory() as output_dir:
+        p = ctx.Process(target=_run_writer,
+                        args=(input_dir, output_dir, weights_patterns),
+                        kwargs=dict(
+                            tensor_parallel_size=tp_size,
+                            distributed_executor_backend="mp",
+                            gpu_memory_utilization=gpu_memory_utilization,
+                            enforce_eager=True,
+                        ))
+        p.start()
+        p.join()
+
+        queue = ctx.Queue()
+        # Drain the queue *before* joining: a child that has put data on a
+        # multiprocessing queue may not exit until that data is consumed,
+        # so join-then-get can deadlock.
+        p = ctx.Process(target=_run_generate,
+                        args=(input_dir, queue),
+                        kwargs=generate_kwargs)
+        p.start()
+        out_before = queue.get()
+        p.join()
+
+        p = ctx.Process(target=_run_generate,
+                        args=(output_dir, queue),
+                        kwargs=dict(generate_kwargs,
+                                    load_format="sharded_state"))
+        p.start()
+        out_after = queue.get()
+        p.join()
 
         assert out_before == out_after
diff --git a/vllm/lora/layers.py b/vllm/lora/layers.py
index 24b74476c3b8..e3ab1708c3fd 100644
--- a/vllm/lora/layers.py
+++ b/vllm/lora/layers.py
@@ -215,19 +215,19 @@ def create_lora_weights(
             lora_config: LoRAConfig,
             model_config: Optional[PretrainedConfig] = None) -> None:
 
-        lora_vocab_start_idx = self.base_layer.org_vocab_size
-        weights_idx = None
-        if self.base_layer.vocab_end_index > lora_vocab_start_idx:
+        if self.base_layer.num_added_embeddings_per_partition > 0:
             # We can start adding lora weights
-            weights_idx = max(
-                lora_vocab_start_idx - self.base_layer.vocab_start_index, 0)
-            self.embeddings_slice = (self.base_layer.vocab_start_index -
-                                     self.base_layer.org_vocab_size +
-                                     weights_idx,
-                                     self.base_layer.vocab_end_index -
-                                     self.base_layer.org_vocab_size)
-            self.embeddings_weights = self.base_layer.weight.data[weights_idx:]
-            self.embeddings_weights.fill_(0)
+            self.embeddings_weights = self.base_layer.weight.data[
+                self.base_layer.num_org_embeddings_per_partition:self.
+                base_layer.num_org_embeddings_per_partition +
+                self.base_layer.num_added_embeddings_per_partition]
+            self.embeddings_slice = (
+                self.base_layer.shard_indices.added_vocab_start_index -
+                self.base_layer.org_vocab_size,
+                self.base_layer.shard_indices.added_vocab_end_index -
+                self.base_layer.org_vocab_size)
+            self.base_layer.weight.data[
+                self.base_layer.num_org_embeddings_per_partition:].fill_(0)
         else:
             self.embeddings_slice = None
             self.embeddings_weights = None
@@ -1025,19 +1025,31 @@ def can_replace_layer(cls, source_layer: nn.Module,
 
 
 class LogitsProcessorWithLoRA(BaseLayerWithLoRA):
+    """
+    LoRA wrapper for LogitsProcessor, with extra logic to handle the
+    application of the LoRA adapter and added LoRA vocabulary.
+
+    Args:
+        base_layer: LogitsProcessor layer
+        hidden_size: hidden size of the model
+        dtype: data type of the model
+        device: device of the model
+        sharded_to_full_mapping: index mapping from sharded vocab to full vocab
+            received from base_layer.get_sharded_to_full_mapping(). If None,
+            no reindexing will be done.
+    """
 
-    def __init__(
-        self,
-        base_layer: LogitsProcessor,
-        hidden_size: int,
-        dtype: torch.dtype,
-        device: torch.device,
-    ) -> None:
+    def __init__(self, base_layer: LogitsProcessor, hidden_size: int,
+                 dtype: torch.dtype, device: torch.device,
+                 sharded_to_full_mapping: Optional[List[int]]) -> None:
         super().__init__()
         self.base_layer = base_layer
         self.hidden_size = hidden_size
         self.dtype = dtype
         self.device = device
+        self.tp_size = get_tensor_model_parallel_world_size()
+        self.tp_rank = get_tensor_model_parallel_rank()
+        self.sharded_to_full_mapping = sharded_to_full_mapping
 
     @property
     def logits_as_input(self):
@@ -1098,6 +1110,13 @@ def create_lora_weights(
             dtype=self.dtype,
             device=self.device,
         )
+        if self.sharded_to_full_mapping is not None:
+            self.sharded_to_full_mapping_gpu = torch.tensor(
+                self.sharded_to_full_mapping,
+                device=self.device,
+                dtype=torch.long)
+        else:
+            self.sharded_to_full_mapping_gpu = None
         # Lazily initialized.
         self.indices: torch.Tensor
         self.indices_len: List[int]
@@ -1154,6 +1173,25 @@ def _get_logits(
         if logits is None:
             return None
 
+        if self.sharded_to_full_mapping_gpu is not None:
+            # Reindex full logits tensor to ensure 1:1 mapping between
+            # index and token_id
+            # Example for:
+            #   org_vocab_size = 4
+            #   added_vocab_size = 2
+            #   pad_to_size = 8
+            #   tp_size = 2
+
+            # indices:  [0, 1, 2,  3, 4, 5, 6,  7]
+            # token_id: [0, 1, 4, -1, 2, 3, 5, -1]
+
+            # Therefore, the mapping is expected to be:
+            # [0, 1, 4, 6, 2, 3, 5, 7] so that when we reindex,
+            # we get:
+            # indices:  [0, 1, 2, 3, 4, 5,  6,  7]
+            # token_id: [0, 1, 2, 3, 4, 5, -1, -1]
+            logits = logits[:, self.sharded_to_full_mapping_gpu]
+
         lora_logits = torch.empty(
             self.embeddings_tensors.shape[0] + 1,
             self.embeddings_tensors.shape[1],
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index fcc7f2472193..b0198a50b1c5 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -67,7 +67,8 @@ def from_layer_logits_processor(
     model_config: Optional[PretrainedConfig] = None,
 ) -> LogitsProcessorWithLoRA:
     ret = LogitsProcessorWithLoRA(layer, lm_head.embedding_dim,
-                                  lm_head.weight.dtype, lm_head.weight.device)
+                                  lm_head.weight.dtype, lm_head.weight.device,
+                                  lm_head.get_sharded_to_full_mapping())
     ret.create_lora_weights(max_loras, lora_config, model_config)
     return ret
 
diff --git a/vllm/model_executor/layers/vocab_parallel_embedding.py b/vllm/model_executor/layers/vocab_parallel_embedding.py
index 4585b1679cb5..60eb5b404e2c 100644
--- a/vllm/model_executor/layers/vocab_parallel_embedding.py
+++ b/vllm/model_executor/layers/vocab_parallel_embedding.py
@@ -1,4 +1,5 @@
-from typing import Optional, Sequence
+from dataclasses import dataclass
+from typing import List, Optional, Sequence, Tuple
 
 import torch
 import torch.nn.functional as F
@@ -18,18 +19,107 @@ def pad_vocab_size(vocab_size: int,
     return ((vocab_size + pad_to - 1) // pad_to) * pad_to
 
 
-def vocab_range_from_per_partition_vocab_size(per_partition_vocab_size: int,
-                                              rank: int) -> Sequence[int]:
+def vocab_range_from_per_partition_vocab_size(
+        per_partition_vocab_size: int,
+        rank: int,
+        offset: int = 0) -> Sequence[int]:
     index_f = rank * per_partition_vocab_size
     index_l = index_f + per_partition_vocab_size
-    return index_f, index_l
+    return index_f + offset, index_l + offset
 
 
-def vocab_range_from_global_vocab_size(global_vocab_size: int, rank: int,
-                                       world_size: int) -> Sequence[int]:
+def vocab_range_from_global_vocab_size(global_vocab_size: int,
+                                       rank: int,
+                                       world_size: int,
+                                       offset: int = 0) -> Sequence[int]:
     per_partition_vocab_size = divide(global_vocab_size, world_size)
     return vocab_range_from_per_partition_vocab_size(per_partition_vocab_size,
-                                                     rank)
+                                                     rank,
+                                                     offset=offset)
+
+
+@dataclass
+class VocabParallelEmbeddingShardIndices:
+    """Indices for a shard of a vocab parallel embedding."""
+    padded_org_vocab_start_index: int
+    padded_org_vocab_end_index: int
+    padded_added_vocab_start_index: int
+    padded_added_vocab_end_index: int
+
+    org_vocab_start_index: int
+    org_vocab_end_index: int
+    added_vocab_start_index: int
+    added_vocab_end_index: int
+
+    @property
+    def num_org_elements(self) -> int:
+        return self.org_vocab_end_index - self.org_vocab_start_index
+
+    @property
+    def num_added_elements(self) -> int:
+        return self.added_vocab_end_index - self.added_vocab_start_index
+
+    @property
+    def num_org_elements_padded(self) -> int:
+        return (self.padded_org_vocab_end_index -
+                self.padded_org_vocab_start_index)
+
+    @property
+    def num_added_elements_padded(self) -> int:
+        return (self.padded_added_vocab_end_index -
+                self.padded_added_vocab_start_index)
+
+    @property
+    def num_org_vocab_padding(self) -> int:
+        return self.num_org_elements_padded - self.num_org_elements
+
+    @property
+    def num_added_vocab_padding(self) -> int:
+        return self.num_added_elements_padded - self.num_added_elements
+
+    @property
+    def num_elements_padded(self) -> int:
+        return self.num_org_elements_padded + self.num_added_elements_padded
+
+    def __post_init__(self):
+        # sanity checks
+        assert (self.padded_org_vocab_start_index <=
+                self.padded_org_vocab_end_index)
+        assert (self.padded_added_vocab_start_index <=
+                self.padded_added_vocab_end_index)
+
+        assert self.org_vocab_start_index <= self.org_vocab_end_index
+        assert self.added_vocab_start_index <= self.added_vocab_end_index
+
+        assert self.org_vocab_start_index <= self.padded_org_vocab_start_index
+        assert (self.added_vocab_start_index <=
+                self.padded_added_vocab_start_index)
+        assert self.org_vocab_end_index <= self.padded_org_vocab_end_index
+        assert self.added_vocab_end_index <= self.padded_added_vocab_end_index
+
+        assert self.num_org_elements <= self.num_org_elements_padded
+        assert self.num_added_elements <= self.num_added_elements_padded
+
+
+@torch.jit.script
+def get_masked_input_and_mask(
+        input_: torch.Tensor, org_vocab_start_index: int,
+        org_vocab_end_index: int, num_org_vocab_padding: int,
+        added_vocab_start_index: int,
+        added_vocab_end_index: int) -> Tuple[torch.Tensor, torch.Tensor]:
+    # torch.jit.script will fuse all of the pointwise ops below
+    # into a single kernel, making it very fast
+    org_vocab_mask = (input_ >= org_vocab_start_index) & (input_ <
+                                                          org_vocab_end_index)
+    added_vocab_mask = (input_ >= added_vocab_start_index) & (
+        input_ < added_vocab_end_index)
+    added_offset = added_vocab_start_index - (
+        org_vocab_end_index - org_vocab_start_index) - num_org_vocab_padding
+    valid_offset = (org_vocab_start_index *
+                    org_vocab_mask) + (added_offset * added_vocab_mask)
+    vocab_mask = org_vocab_mask | added_vocab_mask
+    input_ = vocab_mask * (input_ - valid_offset)
+    return input_, ~vocab_mask
 
 
 class VocabParallelEmbedding(torch.nn.Module):
@@ -38,13 +128,36 @@ class VocabParallelEmbedding(torch.nn.Module):
     Adapted from torch.nn.Embedding, note that we pad the vocabulary size to
     make sure it is divisible by the number of model parallel GPUs.
 
+    In order to support various loading methods, we ensure that LoRA-added
+    embeddings are always at the end of TP-sharded tensors. In other words,
+    we shard base embeddings and LoRA embeddings separately (both padded),
+    and place them in the same tensor.
+    In this example, we will have the original vocab size = 1010,
+    added vocab size = 16 and padding to 64. Therefore, the total
+    vocab size with padding will be 1088 (because we first pad 1010 to
+    1024, add 16, and then pad to 1088).
+    Therefore, the tensor format looks like the following:
+    TP1, rank 0 (no sharding):
+                            |< --------BASE-------- >|< -BASE PADDING-- >|< -----LORA------ >|< -LORA PADDING-- >|
+    corresponding token_id: |  0  |  1  | ... | 1009 |  -1  | ... |  -1  | 1010 | ... | 1025 |  -1  | ... |  -1  |
+                     index: |  0  |  1  | ... | 1009 | 1010 | ... | 1023 | 1024 | ... | 1039 | 1040 | ... | 1087 |
+
+    TP2, rank 0:
+                            |< --------------------BASE--------------------- >|< -----LORA------ >|< -LORA PADDING- >|
+    corresponding token_id: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 1010 | ... | 1025 |  -1  | ... |  -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 527  |  528 | ... | 543 |
+    TP2, rank 1:
+                            |< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
+    corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1  | ...  | -1  |  -1  | ... |  -1  | -1  | ... |   -1 |
+                     index: |  0  |  1  |  2  | ... | 497  | 498 | ...  | 511 | 512  | ... | 519  | 520 | ... |  543 |
+
     Args:
         num_embeddings: vocabulary size.
         embedding_dim: size of hidden state.
         params_dtype: type of the parameters.
         org_num_embeddings: original vocabulary size (without LoRA).
         padding_size: padding size for the vocabulary.
-    """
+    """  # noqa: E501
 
     def __init__(self,
                  num_embeddings: int,
@@ -55,21 +168,39 @@ def __init__(self,
         super().__init__()
 
         # Keep the input dimensions.
+        tp_rank = get_tensor_model_parallel_rank()
+        self.tp_size = get_tensor_model_parallel_world_size()
         self.num_embeddings = num_embeddings
+        self.padding_size = padding_size
         self.org_vocab_size = org_num_embeddings or num_embeddings
-        self.num_embeddings_padded = pad_vocab_size(num_embeddings,
-                                                    padding_size)
+        num_added_embeddings = num_embeddings - self.org_vocab_size
+        self.org_vocab_size_padded = pad_vocab_size(self.org_vocab_size,
+                                                    self.padding_size)
+        self.num_embeddings_padded = pad_vocab_size(
+            self.org_vocab_size_padded + num_added_embeddings,
+            self.padding_size)
+        assert self.org_vocab_size_padded <= self.num_embeddings_padded
+
+        self.shard_indices = self._get_indices(self.num_embeddings_padded,
+                                               self.org_vocab_size_padded,
+                                               self.num_embeddings,
+                                               self.org_vocab_size, tp_rank,
+                                               self.tp_size)
         self.embedding_dim = embedding_dim
         if params_dtype is None:
             params_dtype = torch.get_default_dtype()
-        self.tp_size = get_tensor_model_parallel_world_size()
         # Divide the weight matrix along the vocaburaly dimension.
-        self.vocab_start_index, self.vocab_end_index = (
-            vocab_range_from_global_vocab_size(
-                self.num_embeddings_padded, get_tensor_model_parallel_rank(),
-                self.tp_size))
-        self.num_embeddings_per_partition = (self.vocab_end_index -
-                                             self.vocab_start_index)
+        self.num_added_embeddings = self.num_embeddings - self.org_vocab_size
+        self.num_embeddings_per_partition = divide(self.num_embeddings_padded,
+                                                   self.tp_size)
+        assert (self.shard_indices.num_elements_padded ==
+                self.num_embeddings_per_partition)
+        self.num_org_embeddings_per_partition = (
+            self.shard_indices.org_vocab_end_index -
+            self.shard_indices.org_vocab_start_index)
+        self.num_added_embeddings_per_partition = (
+            self.shard_indices.added_vocab_end_index -
+            self.shard_indices.added_vocab_start_index)
         self.weight = Parameter(
             torch.empty(self.num_embeddings_per_partition,
                         self.embedding_dim,
@@ -79,28 +210,107 @@ def __init__(self,
             "weight_loader": self.weight_loader
         })
 
+    @classmethod
+    def _get_indices(cls, vocab_size_padded: int, org_vocab_size_padded: int,
+                     vocab_size: int, org_vocab_size: int, tp_rank: int,
+                     tp_size: int) -> VocabParallelEmbeddingShardIndices:
+        """Get start and end indices for vocab parallel embedding, following the
+        layout outlined in the class docstring, based on the given tp_rank and
+        tp_size."""
+        num_added_embeddings_padded = vocab_size_padded - org_vocab_size_padded
+        padded_org_vocab_start_index, padded_org_vocab_end_index = (
+            vocab_range_from_global_vocab_size(org_vocab_size_padded, tp_rank,
+                                               tp_size))
+        padded_added_vocab_start_index, padded_added_vocab_end_index = (
+            vocab_range_from_global_vocab_size(num_added_embeddings_padded,
+                                               tp_rank,
+                                               tp_size,
+                                               offset=org_vocab_size))
+        # remove padding
+        org_vocab_start_index = min(padded_org_vocab_start_index,
+                                    org_vocab_size)
+        org_vocab_end_index = min(padded_org_vocab_end_index, org_vocab_size)
+        added_vocab_start_index = min(padded_added_vocab_start_index,
+                                      vocab_size)
+        added_vocab_end_index = min(padded_added_vocab_end_index, vocab_size)
+        return VocabParallelEmbeddingShardIndices(
+            padded_org_vocab_start_index, padded_org_vocab_end_index,
+            padded_added_vocab_start_index, padded_added_vocab_end_index,
+            org_vocab_start_index, org_vocab_end_index,
+            added_vocab_start_index, added_vocab_end_index)
+
+    def get_sharded_to_full_mapping(self) -> Optional[List[int]]:
+        """Get a mapping that can be used to reindex the gathered
+        logits for sampling.
+        
+        During sampling, we gather logits from all ranks. The relationship
+        of index->token_id will follow the same format as outlined in the class
+        docstring. However, after the gather, we want to reindex the final
+        logits tensor to map index->token_id one-to-one (the index is always
+        equal to the token_id it corresponds to). The indices returned by this
+        method allow us to do that.
+        """
+        if self.tp_size < 2:
+            return None
+
+        base_embeddings: List[int] = []
+        added_embeddings: List[int] = []
+        padding: List[int] = []
+        for tp_rank in range(self.tp_size):
+            shard_indices = self._get_indices(self.num_embeddings_padded,
+                                              self.org_vocab_size_padded,
+                                              self.num_embeddings,
+                                              self.org_vocab_size, tp_rank,
+                                              self.tp_size)
+            range_start = self.num_embeddings_per_partition * tp_rank
+            range_end = self.num_embeddings_per_partition * (tp_rank + 1)
+            base_embeddings.extend(
+                range(range_start,
+                      range_start + shard_indices.num_org_elements))
+            padding.extend(
+                range(range_start + shard_indices.num_org_elements,
+                      range_start + shard_indices.num_org_elements_padded))
+            added_embeddings.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements))
+            padding.extend(
+                range(
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements,
+                    range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded))
+            assert (range_start + shard_indices.num_org_elements_padded +
+                    shard_indices.num_added_elements_padded == range_end)
+        ret = base_embeddings + added_embeddings + padding
+        assert len(ret) == self.num_embeddings_padded
+        return ret
+
     def weight_loader(self, param: Parameter, loaded_weight: torch.Tensor):
         parallel_dim = param.parallel_dim
         assert loaded_weight.shape[parallel_dim] == self.org_vocab_size
-        loaded_weight = loaded_weight[self.vocab_start_index:self.
-                                      vocab_end_index]
+        loaded_weight = loaded_weight[self.shard_indices.org_vocab_start_index:
+                                      self.shard_indices.org_vocab_end_index]
         param[:loaded_weight.shape[0]].data.copy_(loaded_weight)
+        param[loaded_weight.shape[0]:].data.fill_(0)
 
     def forward(self, input_):
         if self.tp_size > 1:
             # Build the mask.
-            input_mask = ((input_ < self.vocab_start_index) |
-                          (input_ >= self.vocab_end_index))
-            # Mask the input.
-            masked_input = input_.clone() - self.vocab_start_index
-            masked_input[input_mask] = 0
+            masked_input, input_mask = get_masked_input_and_mask(
+                input_, self.shard_indices.org_vocab_start_index,
+                self.shard_indices.org_vocab_end_index,
+                self.shard_indices.num_org_vocab_padding,
+                self.shard_indices.added_vocab_start_index,
+                self.shard_indices.added_vocab_end_index)
         else:
             masked_input = input_
             # Get the embeddings.
         output_parallel = F.embedding(masked_input, self.weight)
         # Mask the output embedding.
         if self.tp_size > 1:
-            output_parallel[input_mask, :] = 0.0
+            output_parallel.masked_fill_(input_mask.unsqueeze(1), 0)
         # Reduce across all the model parallel GPUs.
         output = tensor_model_parallel_all_reduce(output_parallel)
         return output
diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index 67c03ad60008..c59288b4f73c 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -35,6 +35,7 @@
 _BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [
     _BATCH_SIZE_ALIGNMENT * i for i in range(1, 33)
 ]
+_NUM_WARMUP_ITERS = 2
 
 
 class ModelInput(NamedTuple):
@@ -975,16 +976,18 @@ def capture(
         **kwargs,
     ) -> None:
         assert self._graph is None
-        # Run the model once without capturing the graph.
+        # Run the model a few times without capturing the graph.
         # This is to make sure that the captured graph does not include the
         # kernel launches for initial benchmarking (e.g., Triton autotune).
-        self.model(
-            input_ids,
-            positions,
-            kv_caches,
-            attn_metadata,
-            **kwargs,
-        )
+        # Note one iteration is not enough for torch.jit.script
+        for _ in range(_NUM_WARMUP_ITERS):
+            self.model(
+                input_ids,
+                positions,
+                kv_caches,
+                attn_metadata,
+                **kwargs,
+            )
         torch.cuda.synchronize()
 
         # Capture the graph.

From 653a080881663b220f3102cc466e663c01dfcfcf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Fri, 7 Jun 2024 11:17:21 +0800
Subject: [PATCH 59/93] [Misc] Missing error message for custom ops import
 (#5282)

---
 vllm/_custom_ops.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 8a6f6d96d81f..462ba8a75310 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -5,8 +5,10 @@
 try:
     from vllm._C import cache_ops as vllm_cache_ops
     from vllm._C import ops as vllm_ops
-except ImportError:
-    pass
+except ImportError as e:
+    from vllm.logger import init_logger
+    logger = init_logger(__name__)
+    logger.warning("Failed to import from vllm._C with %r", e)
 
 
 # activation ops

From 219a38515cc08af1a4596cf6c6b0a6a746a62297 Mon Sep 17 00:00:00 2001
From: Itay Etelis <92247226+Etelis@users.noreply.github.com>
Date: Fri, 7 Jun 2024 06:29:24 +0300
Subject: [PATCH 60/93] [Feature][Frontend]: Add support for `stream_options`
 in `ChatCompletionRequest` (#5135)

---
 tests/entrypoints/test_openai_server.py | 101 ++++++++++++++++++++++++
 vllm/entrypoints/openai/protocol.py     |  14 ++++
 vllm/entrypoints/openai/serving_chat.py |  44 ++++++++---
 3 files changed, 149 insertions(+), 10 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index edd457107d31..c6fc4769987f 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -1342,5 +1342,106 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 17
 
 
+@pytest.mark.parametrize(
+    "model_name",
+    [MODEL_NAME],
+)
+async def test_stream_options(server, client: openai.AsyncOpenAI,
+                              model_name: str):
+    prompt = "What is the capital of France?"
+
+    # Test stream=True, stream_options=None
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options=None,
+    )
+    chunks = []
+    async for chunk in stream:
+        chunks.append(chunk.choices[0].text)
+    assert len(chunks) > 0
+    assert "usage" not in chunk
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False},
+    )
+    chunks = []
+    async for chunk in stream:
+        chunks.append(chunk.choices[0].text)
+    assert len(chunks) > 0
+    assert "usage" not in chunk
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True},
+    )
+    chunks = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+            chunks.append(chunk.choices[0].text)
+        else:
+            assert chunk.usage is None
+            finish_reason_count += 1
+
+    # The last message should have usage and no choices
+    last_message = await stream.__anext__()
+    assert last_message.usage is not None
+    assert last_message.usage.prompt_tokens > 0
+    assert last_message.usage.completion_tokens > 0
+    assert last_message.usage.total_tokens == (
+        last_message.usage.prompt_tokens +
+        last_message.usage.completion_tokens)
+    assert last_message.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None},
+        )
+
+    # Test stream=False, stream_options={"include_usage": False}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": False},
+        )
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(
+            model=model_name,
+            prompt=prompt,
+            max_tokens=5,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True},
+        )
+
+
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index 11ac28e758c3..fa33318786b9 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -102,6 +102,10 @@ class ResponseFormat(OpenAIBaseModel):
     type: Literal["text", "json_object"]
 
 
+class StreamOptions(OpenAIBaseModel):
+    include_usage: Optional[bool]
+
+
 class FunctionDefinition(OpenAIBaseModel):
     name: str
     description: Optional[str] = None
@@ -140,6 +144,7 @@ class ChatCompletionRequest(OpenAIBaseModel):
                                 le=torch.iinfo(torch.long).max)
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
     temperature: Optional[float] = 0.7
     top_p: Optional[float] = 1.0
     tools: Optional[List[ChatCompletionToolsParam]] = None
@@ -269,6 +274,15 @@ def logit_bias_logits_processor(
             logits_processors=logits_processors,
         )
 
+    @model_validator(mode='before')
+    @classmethod
+    def validate_stream_options(cls, values):
+        if (values.get('stream_options') is not None
+                and not values.get('stream')):
+            raise ValueError(
+                "stream_options can only be set if stream is true")
+        return values
+
     @model_validator(mode="before")
     @classmethod
     def check_guided_decoding_count(cls, data):
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index afd87f49c1c4..883567abf415 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -247,6 +247,9 @@ async def chat_completion_stream_generator(
                             created=created_time,
                             choices=[choice_data],
                             model=model_name)
+                        if (request.stream_options
+                                and request.stream_options.include_usage):
+                            chunk.usage = None
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
 
@@ -274,6 +277,9 @@ async def chat_completion_stream_generator(
                                     choices=[choice_data],
                                     logprobs=None,
                                     model=model_name)
+                                if (request.stream_options and
+                                        request.stream_options.include_usage):
+                                    chunk.usage = None
                                 data = chunk.model_dump_json(
                                     exclude_unset=True)
                                 yield f"data: {data}\n\n"
@@ -327,17 +333,14 @@ async def chat_completion_stream_generator(
                             created=created_time,
                             choices=[choice_data],
                             model=model_name)
+                        if (request.stream_options
+                                and request.stream_options.include_usage):
+                            chunk.usage = None
                         data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
                     else:
                         # Send the finish response for each request.n only once
                         prompt_tokens = len(res.prompt_token_ids)
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=previous_num_tokens[i],
-                            total_tokens=prompt_tokens +
-                            previous_num_tokens[i],
-                        )
                         choice_data = ChatCompletionResponseStreamChoice(
                             index=i,
                             delta=delta_message,
@@ -350,12 +353,33 @@ async def chat_completion_stream_generator(
                             created=created_time,
                             choices=[choice_data],
                             model=model_name)
-                        if final_usage is not None:
-                            chunk.usage = final_usage
-                        data = chunk.model_dump_json(exclude_unset=True,
-                                                     exclude_none=True)
+                        if (request.stream_options
+                                and request.stream_options.include_usage):
+                            chunk.usage = None
+                        data = chunk.model_dump_json(exclude_unset=True)
                         yield f"data: {data}\n\n"
                         finish_reason_sent[i] = True
+
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        final_usage = UsageInfo(
+                            prompt_tokens=prompt_tokens,
+                            completion_tokens=previous_num_tokens[i],
+                            total_tokens=prompt_tokens +
+                            previous_num_tokens[i],
+                        )
+
+                        final_usage_chunk = ChatCompletionStreamResponse(
+                            id=request_id,
+                            object=chunk_object_type,
+                            created=created_time,
+                            choices=[],
+                            model=model_name,
+                            usage=final_usage)
+                        final_usage_data = (final_usage_chunk.model_dump_json(
+                            exclude_unset=True, exclude_none=True))
+                        yield f"data: {final_usage_data}\n\n"
+
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))

From bd666220867f3067f26ced95520e67714cb814e3 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Thu, 6 Jun 2024 22:15:11 -0700
Subject: [PATCH 61/93] [Misc][Utils] allow get_open_port to be called
 multiple times (#5333)

---
 tests/test_utils.py | 16 +++++++++++++++-
 vllm/envs.py        |  3 +++
 vllm/utils.py       | 10 +++++++++-
 3 files changed, 27 insertions(+), 2 deletions(-)

diff --git a/tests/test_utils.py b/tests/test_utils.py
index a6c3896fa43b..0b674ea6a85c 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -1,11 +1,13 @@
 import asyncio
+import os
+import socket
 import sys
 from typing import (TYPE_CHECKING, Any, AsyncIterator, Awaitable, Protocol,
                     Tuple, TypeVar)
 
 import pytest
 
-from vllm.utils import deprecate_kwargs, merge_async_iterators
+from vllm.utils import deprecate_kwargs, get_open_port, merge_async_iterators
 
 from .utils import error_on_warning
 
@@ -116,3 +118,15 @@ def dummy(*, old_arg: object = None, new_arg: object = None):
 
     with pytest.warns(DeprecationWarning, match="abcd"):
         dummy(old_arg=1)
+
+
+def test_get_open_port():
+    os.environ["VLLM_PORT"] = "5678"
+    # make sure we can get multiple ports, even if the env var is set
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s1:
+        s1.bind(("localhost", get_open_port()))
+        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s2:
+            s2.bind(("localhost", get_open_port()))
+            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s3:
+                s3.bind(("localhost", get_open_port()))
+    os.environ.pop("VLLM_PORT")
diff --git a/vllm/envs.py b/vllm/envs.py
index bef343d08429..7d5c7371b774 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -99,6 +99,9 @@
     lambda: os.getenv('VLLM_HOST_IP', "") or os.getenv("HOST_IP", ""),
 
     # used in distributed environment to manually set the communication port
+    # Note: if VLLM_PORT is set, and some code asks for multiple ports, the
+    # VLLM_PORT will be used as the first port, and the rest will be generated
+    # by incrementing the VLLM_PORT value.
     # '0' is used to make mypy happy
     'VLLM_PORT':
     lambda: int(os.getenv('VLLM_PORT', '0'))
diff --git a/vllm/utils.py b/vllm/utils.py
index f0ea434a0969..b6edf00d5388 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -289,7 +289,15 @@ def get_distributed_init_method(ip: str, port: int) -> str:
 def get_open_port() -> int:
     port = envs.VLLM_PORT
     if port is not None:
-        return port
+        while True:
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.bind(("", port))
+                    return port
+            except OSError:
+                port += 1  # Increment port number if already in use
+                logger.info("Port %d is already in use, trying port %d",
+                            port - 1, port)
     # try ipv4
     try:
         with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:

From ed99ec91de1228b27b9620dc2c394f439ab78418 Mon Sep 17 00:00:00 2001
From: Tyler Michael Smith <tyler@neuralmagic.com>
Date: Fri, 7 Jun 2024 04:42:35 -0400
Subject: [PATCH 62/93] [Kernel] Switch fp8 layers to use the CUTLASS kernels
 (#5183)

Switching from torch._scaled_mm to vLLM's cutlass fp8 kernels when supported as we are seeing 5-15% improvement in e2e performance on neuralmagic/Meta-Llama-3-8B-Instruct-FP8

see https://docs.google.com/spreadsheets/d/1GiAnmzyGHgZ6zL_LDSTm35Bdrt4A8AaFEurDlISYYA4/ for some quick e2e benchmarks and #5144 for comparisons across different GEMM sizes.
---
 vllm/_custom_ops.py                           |  4 +-
 .../model_executor/layers/quantization/fp8.py | 70 ++++++++++++++-----
 2 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 462ba8a75310..cae6822166b6 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -179,7 +179,7 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
 
 # cutlass
 def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
-                         a_scales: torch.Tensor, b_scales: torch.Tensor,
+                         scale_a: torch.Tensor, scale_b: torch.Tensor,
                          out_dtype: Type[torch.dtype]) -> torch.Tensor:
     assert (b.shape[0] % 16 == 0 and b.shape[1] % 16 == 0)
     assert (out_dtype is torch.bfloat16 or out_dtype is torch.float16)
@@ -188,7 +188,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
     n = b.shape[1]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    vllm_ops.cutlass_scaled_mm_dq(out, a, b, a_scales, b_scales)
+    vllm_ops.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b)
 
     return out
 
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index a4c067375d21..1323360df6c8 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -17,6 +17,24 @@
 logger = init_logger(__name__)
 
 
+def cutlass_fp8_supported() -> bool:
+    """Return True if the CUTLASS FP8 kernels can run on this GPU/CUDA."""
+    major, minor = torch.cuda.get_device_capability()
+    capability = major * 10 + minor  # e.g. (9, 0) -> 90
+    # torch.version.cuda is a string such as "12.4" (None on non-CUDA builds).
+    cuda_major, cuda_minor = (torch.version.cuda or "0.0").split(".")[:2]
+    version = int(cuda_major) * 10 + int(cuda_minor)  # e.g. "12.4" -> 124
+
+    # CUTLASS FP8 kernels need at least
+    #   CUDA 12.0 on SM90 systems (Hopper)
+    #   CUDA 12.4 on SM89 systems (Lovelace)
+    if capability >= 90:
+        return version >= 120
+    if capability >= 89:
+        return version >= 124
+    return False
+
+
 class Fp8Config(QuantizationConfig):
     """Config class for FP8."""
 
@@ -92,6 +110,7 @@ class Fp8LinearMethod(LinearMethodBase):
 
     def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
+        self.cutlass_fp8_supported = cutlass_fp8_supported()
 
     def _create_scale_param(
         self,
@@ -233,25 +252,40 @@ def apply(self,
               layer: torch.nn.Module,
               x: torch.Tensor,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
+
         # ops.scaled_fp8_quant supports both dynamic and static quant.
-        #   If dynamic, layer.input_scale is None and x_scale computed from x.
-        #   If static,  layer.input_scale is scalar and x_scale uses it.
-        qinput, x_scale = ops.scaled_fp8_quant(x,
-                                               layer.input_scale,
-                                               batch_dim_padding=17)
-
-        # Fused GEMM_DQ -- note we padded the input above because
-        # torch._scaled_mm is more performant for matrices with
-        # batch dimension > 16. Note that this could change
-        # in the future.
-        output, _ = torch._scaled_mm(
-            qinput,
-            layer.weight,
-            out_dtype=x.dtype,
-            scale_a=x_scale,
-            scale_b=layer.weight_scale,
-            bias=bias,
-        )
+        #   If dynamic, layer.act_scale is None and x_scale computed from x.
+        #   If static,  layer.act_scale is scalar and x_scale set to act_scale.
+
+        if bias is None and self.cutlass_fp8_supported:
+            qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale)
+
+            # Fused GEMM_DQ
+            output = ops.cutlass_scaled_mm_dq(
+                qinput,
+                layer.weight,
+                out_dtype=x.dtype,
+                scale_a=x_scale,
+                scale_b=layer.weight_scale,
+            )
+
+        else:
+            qinput, x_scale = ops.scaled_fp8_quant(x,
+                                                   layer.act_scale,
+                                                   batch_dim_padding=17)
+
+            # Fused GEMM_DQ -- note we padded the input above because
+            # torch._scaled_mm is more performant for matrices with
+            # batch dimension > 16. Note that this could change
+            # in the future.
+            output, _ = torch._scaled_mm(
+                qinput,
+                layer.weight,
+                out_dtype=x.dtype,
+                scale_a=x_scale,
+                scale_b=layer.weight_scale,
+                bias=bias,
+            )
 
         return torch.narrow(output, 0, 0, x.shape[0])
 

From 50520b4ed63079e92726e17b93c2211368852f88 Mon Sep 17 00:00:00 2001
From: Antoni Baum <antoni.baum@protonmail.com>
Date: Fri, 7 Jun 2024 03:01:56 -0700
Subject: [PATCH 63/93] Remove Ray health check (#4693)

---
 vllm/executor/ray_gpu_executor.py | 17 -----------------
 1 file changed, 17 deletions(-)

diff --git a/vllm/executor/ray_gpu_executor.py b/vllm/executor/ray_gpu_executor.py
index bed356d1b6e5..89d1c4ac7cbc 100644
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -293,23 +293,6 @@ def _compiled_ray_dag(self):
             ])
         return forward_dag.experimental_compile()
 
-    def check_health(self) -> None:
-        """Raises an error if engine is unhealthy."""
-        self._check_if_any_actor_is_dead()
-
-    def _check_if_any_actor_is_dead(self):
-        if not self.workers:
-            return
-
-        dead_actors = []
-        for actor in self.workers:
-            actor_state = ray.state.actors(actor._ray_actor_id.hex())  # pylint: disable=protected-access
-            if actor_state["State"] == "DEAD":
-                dead_actors.append(actor)
-        if dead_actors:
-            raise RuntimeError("At least one Worker is dead. "
-                               f"Dead Workers: {dead_actors}. ")
-
 
 class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
 

From 98744f934bc5a10b3200dc51691d14dd46698af3 Mon Sep 17 00:00:00 2001
From: limingshu <61349199+JamesLim-sy@users.noreply.github.com>
Date: Fri, 7 Jun 2024 21:35:42 +0800
Subject: [PATCH 64/93] Add missing ignored_seq_groups in
 _schedule_chunked_prefill (#5296)

---
 vllm/core/scheduler.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 399665082f83..0159053b4dc6 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -905,7 +905,8 @@ def _schedule_chunked_prefill(self):
             blocks_to_swap_out=running_scheduled.blocks_to_swap_out,
             blocks_to_copy=running_scheduled.blocks_to_copy +
             swapped_in.blocks_to_copy,
-            ignored_seq_groups=prefills.ignored_seq_groups,
+            ignored_seq_groups=prefills.ignored_seq_groups +
+            swapped_in.infeasible_seq_groups,
             num_lookahead_slots=running_scheduled.num_lookahead_slots,
             running_queue_size=len(self.running),
             preempted=(len(running_scheduled.preempted) +

From 334e0a7c9d35684a30eb7500bf6b24091a0d8f06 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Fri, 7 Jun 2024 12:36:26 -0400
Subject: [PATCH 65/93] [Kernel] Dynamic Per-Token Activation Quantization
 (#5037)

Co-authored-by: Varun Sundar Rabindranath <varunsundar08@gmail.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
---
 csrc/ops.h                                    |   3 +
 csrc/pybind.cpp                               |   3 +
 .../compressed_tensors/int8_quant_kernels.cu  |  75 ++++++++++--
 csrc/reduction_utils.cuh                      |  54 +++++++--
 tests/kernels/test_int8_quant.py              |  44 ++++++-
 tests/quantization/test_compressed_tensors.py |  19 ++-
 vllm/_custom_ops.py                           |  28 +++--
 .../compressed_tensors/compressed_tensors.py  |  87 +++++++------
 .../compressed_tensors/schemes/__init__.py    |   2 +
 .../compressed_tensors_w8a8_dynamictoken.py   |  85 +++++++++++++
 .../compressed_tensors_w8a8_statictensor.py   |   2 +-
 .../quantization/compressed_tensors/utils.py  | 114 ++++++++++++++++++
 12 files changed, 440 insertions(+), 76 deletions(-)
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
 create mode 100644 vllm/model_executor/layers/quantization/compressed_tensors/utils.py

diff --git a/csrc/ops.h b/csrc/ops.h
index 4952e826ec8a..06b60e748886 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -97,6 +97,9 @@ int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
 void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
                               torch::Tensor const& scale);
 
+void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input,
+                               torch::Tensor& scales);
+
 void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
                      torch::Tensor lookup_table);
 
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
index cdbec4a34d77..547823aa1b04 100644
--- a/csrc/pybind.cpp
+++ b/csrc/pybind.cpp
@@ -70,6 +70,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
           "Compute int8 quantized tensor for given scaling factor");
 
+  ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant,
+          "Compute int8 quantized tensor and scaling factor");
+
   // Cache ops
   pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
   cache_ops.def("swap_blocks", &swap_blocks,
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 11baa5d414c1..280b0327111d 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -3,6 +3,7 @@
 #include <cmath>
 
 #include "../../dispatch_utils.h"
+#include "../../reduction_utils.cuh"
 
 static inline __device__ int8_t float_to_int8_rn(float x) {
 #ifdef USE_ROCM
@@ -27,17 +28,48 @@ namespace vllm {
 
 template <typename scalar_t, typename scale_type>
 __global__ void static_scaled_int8_quant_kernel(
-    const scalar_t* __restrict__ input, int8_t* __restrict__ out,
-    const scale_type* scale_ptr, const int hidden_size) {
-  const int tid = threadIdx.x;
-  const int token_idx = blockIdx.x;
-  scale_type scale = *scale_ptr;
+    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
+    scale_type const* scale_ptr, const int hidden_size) {
+  int const tid = threadIdx.x;
+  int const token_idx = blockIdx.x;
+  scale_type const scale = *scale_ptr;
 
   for (int i = tid; i < hidden_size; i += blockDim.x) {
-    out[token_idx * hidden_size + i] =
-        float_to_int8_rn(((float)input[token_idx * hidden_size + i]) / scale);
+    out[token_idx * hidden_size + i] = float_to_int8_rn(
+        static_cast<float>(input[token_idx * hidden_size + i]) / scale);
   }
 }
+
+template <typename scalar_t, typename scale_type>
+__global__ void dynamic_scaled_int8_quant_kernel(
+    scalar_t const* __restrict__ input, int8_t* __restrict__ out,
+    scale_type* scale, const int hidden_size) {
+  int const tid = threadIdx.x;
+  int const token_idx = blockIdx.x;
+  float absmax_val = 0.0f;
+  float const zero = 0.0f;
+
+  for (int i = tid; i < hidden_size; i += blockDim.x) {
+    float val = static_cast<float>(input[token_idx * hidden_size + i]);
+    val = val > zero ? val : -val;
+    absmax_val = val > absmax_val ? val : absmax_val;
+  }
+
+  float const block_absmax_val_maybe = blockReduceMax(absmax_val);
+  __shared__ float block_absmax_val;
+  if (tid == 0) {
+    block_absmax_val = block_absmax_val_maybe;
+    scale[token_idx] = block_absmax_val / 127.0f;
+  }
+  __syncthreads();
+
+  float const tmp_scale = 127.0f / block_absmax_val;
+  for (int i = tid; i < hidden_size; i += blockDim.x) {
+    out[token_idx * hidden_size + i] = float_to_int8_rn(
+        static_cast<float>(input[token_idx * hidden_size + i]) * tmp_scale);
+  }
+}
+
 }  // namespace vllm
 
 void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
@@ -47,10 +79,10 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
   TORCH_CHECK(out.is_contiguous());
   TORCH_CHECK(scale.numel() == 1);
 
-  int hidden_size = input.size(-1);
-  int num_tokens = input.numel() / hidden_size;
-  dim3 grid(num_tokens);
-  dim3 block(std::min(hidden_size, 1024));
+  int const hidden_size = input.size(-1);
+  int const num_tokens = input.numel() / hidden_size;
+  dim3 const grid(num_tokens);
+  dim3 const block(std::min(hidden_size, 1024));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(
       input.scalar_type(), "static_scaled_int8_quant_kernel", [&] {
@@ -60,3 +92,24 @@ void static_scaled_int8_quant(torch::Tensor& out,          // [..., hidden_size]
                                          scale.data_ptr<float>(), hidden_size);
       });
 }
+
+void dynamic_scaled_int8_quant(
+    torch::Tensor& out,          // [..., hidden_size]
+    torch::Tensor const& input,  // [..., hidden_size]
+    torch::Tensor& scales) {
+  TORCH_CHECK(input.is_contiguous());
+  TORCH_CHECK(out.is_contiguous());
+
+  int const hidden_size = input.size(-1);
+  int const num_tokens = input.numel() / hidden_size;
+  dim3 const grid(num_tokens);
+  dim3 const block(std::min(hidden_size, 1024));
+  const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
+  VLLM_DISPATCH_FLOATING_TYPES(
+      input.scalar_type(), "dynamic_scaled_int8_quant_kernel", [&] {
+        vllm::dynamic_scaled_int8_quant_kernel<scalar_t, float>
+            <<<grid, block, 0, stream>>>(input.data_ptr<scalar_t>(),
+                                         out.data_ptr<int8_t>(),
+                                         scales.data_ptr<float>(), hidden_size);
+      });
+}
diff --git a/csrc/reduction_utils.cuh b/csrc/reduction_utils.cuh
index 9af4aae51615..08063356012b 100644
--- a/csrc/reduction_utils.cuh
+++ b/csrc/reduction_utils.cuh
@@ -21,29 +21,47 @@
 #include "cuda_compat.h"
 
 namespace vllm {
+
+namespace detail {
+
+template <typename T>
+__inline__ __device__ T _max(T a, T b) {
+  return max(a, b);
+}
+
+template <typename T>
+__inline__ __device__ T _sum(T a, T b) {
+  return a + b;
+}
+
+}  // namespace detail
+
+template <typename T>
+using ReduceFnType = T (*)(T, T);
+
+// Helper function to return the next largest power of 2
+static constexpr int _nextPow2(unsigned int num) {
+  if (num <= 1) return num;
+  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+}
+
 template <typename T, int numLanes = WARP_SIZE>
-__inline__ __device__ T warpReduceSum(T val) {
+__inline__ __device__ T warpReduce(T val, ReduceFnType<T> fn) {
   static_assert(numLanes > 0 && (numLanes & (numLanes - 1)) == 0,
                 "numLanes is not a positive power of 2!");
   static_assert(numLanes <= WARP_SIZE);
 #pragma unroll
   for (int mask = numLanes >> 1; mask > 0; mask >>= 1)
-    val += VLLM_SHFL_XOR_SYNC(val, mask);
-  return val;
-}
+    val = fn(val, VLLM_SHFL_XOR_SYNC(val, mask));
 
-// Helper function to return the next largest power of 2
-static constexpr int _nextPow2(unsigned int num) {
-  if (num <= 1) return num;
-  return 1 << (CHAR_BIT * sizeof(num) - __builtin_clz(num - 1));
+  return val;
 }
 
-/* Calculate the sum of all elements in a block */
 template <typename T, int maxBlockSize = 1024>
-__inline__ __device__ T blockReduceSum(T val) {
+__inline__ __device__ T blockReduce(T val, ReduceFnType<T> fn) {
   static_assert(maxBlockSize <= 1024);
   if constexpr (maxBlockSize > WARP_SIZE) {
-    val = warpReduceSum<T>(val);
+    val = warpReduce<T>(val, fn);
     // Calculates max number of lanes that need to participate in the last
     // warpReduce
     constexpr int maxActiveLanes = (maxBlockSize + WARP_SIZE - 1) / WARP_SIZE;
@@ -56,12 +74,22 @@ __inline__ __device__ T blockReduceSum(T val) {
 
     val = (threadIdx.x < blockDim.x / float(WARP_SIZE)) ? shared[lane]
                                                         : (T)(0.0f);
-    val = warpReduceSum<T, _nextPow2(maxActiveLanes)>(val);
+    val = warpReduce<T, _nextPow2(maxActiveLanes)>(val, fn);
   } else {
     // A single warpReduce is equal to blockReduce
-    val = warpReduceSum<T, _nextPow2(maxBlockSize)>(val);
+    val = warpReduce<T, _nextPow2(maxBlockSize)>(val, fn);
   }
   return val;
 }
 
+template <typename T, int maxBlockSize = 1024>
+__inline__ __device__ T blockReduceMax(T val) {
+  return blockReduce<T, maxBlockSize>(val, detail::_max<T>);
+}
+
+template <typename T, int maxBlockSize = 1024>
+__inline__ __device__ T blockReduceSum(T val) {
+  return blockReduce<T, maxBlockSize>(val, detail::_sum<T>);
+}
+
 }  // namespace vllm
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index 29890118c93d..aab7af9d2cbf 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -4,27 +4,59 @@
 from vllm._C import ops
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
-HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 8192]  # Arbitrary values for testing
+HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
+                8193]  # Arbitrary values for testing
 NUM_TOKENS = [1, 7, 83, 4096]  # Arbitrary values for testing
 SEEDS = [0]
 SCALE = [0.1, 0.5, 0.8, 1.2, 2.1]
 
 
+@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
+@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
+@pytest.mark.parametrize("dtype", DTYPES)
+@pytest.mark.parametrize("seed", SEEDS)
+@torch.inference_mode()
+def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
+                                   dtype: torch.dtype, seed: int) -> None:
+    torch.random.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    int8_traits = torch.iinfo(torch.int8)
+
+    x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
+
+    x_token_max, _ = x.max(dim=1)
+    x_token_max = x_token_max.to(dtype=torch.float32)
+    scales = (x_token_max / float(127.0))[:, None].to(device="cuda",
+                                                      dtype=torch.float32)
+    torch_out = (x / scales).round().clamp(int8_traits.min,
+                                           int8_traits.max).to(torch.int8)
+
+    ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
+    scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
+    ops.dynamic_scaled_int8_quant(ops_out, x, scales_out)
+
+    assert torch.allclose(scales_out, scales)
+    assert torch.allclose(torch_out, ops_out,
+                          atol=1)  # big atol to account for rounding errors
+
+
 @pytest.mark.parametrize("num_tokens", NUM_TOKENS)
 @pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)
 @pytest.mark.parametrize("dtype", DTYPES)
 @pytest.mark.parametrize("seed", SEEDS)
 @pytest.mark.parametrize("scale", SCALE)
 @torch.inference_mode()
-def test_quant(num_tokens: int, hidden_size: int, dtype: torch.dtype,
-               seed: int, scale: float) -> None:
+def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
+                                  dtype: torch.dtype, seed: int,
+                                  scale: float) -> None:
     torch.random.manual_seed(seed)
     torch.cuda.manual_seed(seed)
+    int8_traits = torch.iinfo(torch.int8)
+
     x = torch.rand(num_tokens, hidden_size, dtype=dtype, device="cuda") * 1000
 
-    out1 = (x / scale).round().clamp(
-        torch.iinfo(torch.int8).min,
-        torch.iinfo(torch.int8).max).to(torch.int8)
+    out1 = (x / scale).round().clamp(int8_traits.min,
+                                     int8_traits.max).to(torch.int8)
     out2 = torch.empty_like(x, dtype=torch.int8)
     scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
 
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index b83286992da3..8b48f418fe49 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -6,7 +6,8 @@
 import torch
 
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
-    CompressedTensorsLinearMethod, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
@@ -34,3 +35,19 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
     assert qkv_proj.weight_scale.shard_splitter is not None
     assert qkv_proj.weight_scale.logical_widths is not None
     assert qkv_proj.input_scale.dtype is torch.float32
+
+
+def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
+    model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
+    llm = vllm_runner(model_path,
+                      quantization="sparseml",
+                      enforce_eager=True,
+                      dtype=torch.float16)
+    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
+    layer = model.model.layers[0]
+
+    qkv_proj = layer.self_attn.qkv_proj
+
+    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+    assert qkv_proj.weight.dtype is torch.int8
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index cae6822166b6..7e12f1ba14cd 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -266,21 +266,33 @@ def scaled_fp8_quant(
 
 
 # int8
-def static_scaled_int8_quant(input: torch.Tensor,
-                             scale: torch.Tensor) -> torch.Tensor:
+def scaled_int8_quant(
+        input: torch.Tensor,
+        scale: Optional[torch.Tensor] = None
+) -> Tuple[torch.Tensor, torch.Tensor]:
     """
-    Quantize the input tensor to int8 and return the quantized tensor.
+    Quantize the input tensor to int8 and return the quantized tensor and scale.
 
     Args:
         input: The input tensor to be quantized to int8.
-        scale: Scaling factor for the int8 quantization.
+        scale: Optional scaling factor for the int8 quantization.
+            When not provided, we invoke dynamic-per-token quantization.
 
     Returns:
-        torch.Tensor: Output tensor in int8.
+        Tuple[torch.Tensor, torch.Tensor]: Output int8 tensor and scales.
     """
-    q = torch.empty_like(input, dtype=torch.int8)
-    vllm_ops.static_scaled_int8_quant(q, input, scale)
-    return q
+    output = torch.empty_like(input, dtype=torch.int8)
+    if scale is not None:
+        # static-per-tensor quantization.
+        vllm_ops.static_scaled_int8_quant(output, input, scale)
+        return output, scale
+
+    # dynamic-per-token quantization.
+    input_scales = torch.empty((input.numel() // input.shape[-1], 1),
+                               device=input.device,
+                               dtype=torch.float32)
+    vllm_ops.dynamic_scaled_int8_quant(output, input, input_scales)
+    return output, input_scales
 
 
 # moe
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
index 19e464bd6432..d2b0ce0dbbf0 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors.py
@@ -1,12 +1,16 @@
 from typing import Any, Dict, List, Optional
 
 import torch
+from pydantic import BaseModel
 
 from vllm.model_executor.layers.linear import LinearBase, LinearMethodBase
 from vllm.model_executor.layers.quantization.base_config import (  # noqa: E501
     QuantizationConfig)
 from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
-    CompressedTensorsScheme, CompressedTensorsW8A8StaticTensor)
+    CompressedTensorsScheme, CompressedTensorsW8A8DynamicToken,
+    CompressedTensorsW8A8StaticTensor)
+from vllm.model_executor.layers.quantization.compressed_tensors.utils import (
+    QuantizationArgs, QuantizationStrategy, find_first_name_or_class_match)
 
 
 class CompressedTensorsConfig(QuantizationConfig):
@@ -47,10 +51,12 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
             targets = quant_config.get("targets")
             for target in targets:
                 layer_quant_details[target] = {}
-                layer_quant_details[target]["weight"] = quant_config.get(
-                    "weights")
-                layer_quant_details[target]["input"] = quant_config.get(
-                    "input_activations")
+                layer_quant_details[target][
+                    "weight"] = QuantizationArgs.parse_obj(
+                        quant_config.get("weights"))
+                layer_quant_details[target][
+                    "input"] = QuantizationArgs.parse_obj(
+                        quant_config.get("input_activations"))
 
         return cls(layer_quant_details=layer_quant_details, ignore=ignore)
 
@@ -58,40 +64,46 @@ def from_config(cls, config: Dict[str, Any]) -> "CompressedTensorsConfig":
     def get_config_filenames(cls) -> List[str]:
         return []
 
-    def _get_schema(self, weight_quant: Dict, input_quant: Dict):
-        # TODO: Refactor as additional cases are supported
-
-        weight_bit = weight_quant.get("num_bits")
-        input_bit = input_quant.get("num_bits")
-
-        weight_strategy = weight_quant.get("strategy")
-        input_strategy = input_quant.get("strategy")
-
-        weight_symmetric = weight_quant.get("symmetric")
-        input_symmetric = input_quant.get("symmetric")
+    def _is_static_tensor_w8a8(self, weight_quant: BaseModel,
+                               input_quant: BaseModel) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        is_tensor = (weight_quant.strategy == input_quant.strategy ==
+                     QuantizationStrategy.TENSOR.value)
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+        is_static = not weight_quant.dynamic and not input_quant.dynamic
+
+        return is_8_bits and is_tensor and is_symmetric and is_static
+
+    def _is_dynamic_token_w8a8(self, weight_quant: BaseModel,
+                               input_quant: BaseModel) -> bool:
+        is_8_bits = weight_quant.num_bits == input_quant.num_bits == 8
+        is_token_tensor = (weight_quant.strategy
+                           == QuantizationStrategy.TENSOR.value) and (
+                               input_quant.strategy
+                               == QuantizationStrategy.TOKEN.value)
+        is_symmetric = weight_quant.symmetric and input_quant.symmetric
+        is_dynamic = not weight_quant.dynamic and input_quant.dynamic
+
+        return is_8_bits and is_token_tensor and is_symmetric and is_dynamic
+
+    def _get_schema(self, weight_quant: BaseModel,
+                    input_quant: BaseModel) -> "CompressedTensorsScheme":
+        if self._is_static_tensor_w8a8(weight_quant, input_quant):
+            return CompressedTensorsW8A8StaticTensor()
 
-        is_8_bits = weight_bit == input_bit == 8
-        is_tensor = weight_strategy == input_strategy == "tensor"
-        is_symmetric = weight_symmetric and input_symmetric
+        if self._is_dynamic_token_w8a8(weight_quant, input_quant):
+            return CompressedTensorsW8A8DynamicToken()
 
-        if is_8_bits and is_tensor and is_symmetric and \
-                torch.cuda.is_available():
-            # CompressedTensorsW8A8StaticTensor only supports CUDA path for
-            # now.
-            return CompressedTensorsW8A8StaticTensor()
-        raise NotImplementedError(
-            "Scheme not supported. Only CUDA, 8-bit static symmtetric "
-            "per tensor quantization is currently supported")
+        raise NotImplementedError("Scheme not supported.")
 
     def get_scheme(self, layer: torch.nn.Module) -> "CompressedTensorsScheme":
 
-        # TODO: update with matching function from `compressed_tensors`
-        layer_type_name = None
-        layer_name_class = type(layer).__name__.lower()
-        for target in self.layer_quant_details:
-            if target.lower() in layer_name_class:
-                layer_type_name = target
-                break
+        layer_type_name = find_first_name_or_class_match(
+            name="",
+            module=layer,
+            targets=self.layer_quant_details.keys(),
+            check_contains=True)
+
         if layer_type_name is None:
             raise ValueError(f"Could not matching target for layer {layer}")
 
@@ -117,7 +129,9 @@ def create_weights(self, layer: torch.nn.Module,
                        **extra_weight_attrs):
         """
         Use the CompressedTensorsScheme associated with each layer to create 
-        the necessary parameters for the layer.
+        the necessary parameters for the layer. See LinearMethodBase for param
+        details
+
         """
         weight_loader = extra_weight_attrs.get("weight_loader")
 
@@ -139,7 +153,8 @@ def apply(self,
         """
         Use the output of create_weights and the CompressedTensorsScheme 
         associated with the layer to apply the forward pass with the 
-        layer input.
+        layer input.  See LinearMethodBase for param details
+
         """
 
         if bias is not None:
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
index 831905b63e2c..9a910f061f58 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/__init__.py
@@ -1,5 +1,7 @@
 from .compressed_tensors_scheme import CompressedTensorsScheme  # noqa: F401
 from .compressed_tensors_unquantized import (  # noqa: F401
     CompressedTensorsUnquantized)
+from .compressed_tensors_w8a8_dynamictoken import (  # noqa: F401, E501
+    CompressedTensorsW8A8DynamicToken)
 from .compressed_tensors_w8a8_statictensor import (  # noqa: F401, E501
     CompressedTensorsW8A8StaticTensor)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
new file mode 100644
index 000000000000..25b707caeef3
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_dynamictoken.py
@@ -0,0 +1,85 @@
+from typing import Callable, List, Tuple, Union
+
+import torch
+from torch.nn import Parameter
+
+from vllm import _custom_ops as custom_ops
+from vllm.model_executor.layers.quantization.compressed_tensors.schemes import (
+    CompressedTensorsScheme)
+from vllm.model_executor.utils import set_weight_attrs
+
+__all__ = ["CompressedTensorsW8A8DynamicToken"]
+
+
+class CompressedTensorsW8A8DynamicToken(CompressedTensorsScheme):
+
+    def _shard_id_as_int(self, shard_id: Union[str, int]) -> int:
+        if isinstance(shard_id, int):
+            return shard_id
+
+        assert isinstance(shard_id, str)
+        qkv_idxs = {"q": 0, "k": 1, "v": 2}
+        assert shard_id in qkv_idxs
+        return qkv_idxs[shard_id]
+
+    def scales_shard_splitter(
+            self, param: torch.Tensor, loaded_weight: torch.Tensor,
+            shard_id: Union[str, int],
+            logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Slice param for the shard; repeat loaded_weight to that width."""
+        shard_id = self._shard_id_as_int(shard_id)
+        offset = sum(logical_widths[:shard_id])
+        size = logical_widths[shard_id]
+        # update loaded weight with copies for broadcast.
+        return param[offset:offset + size], loaded_weight.repeat(size)
+
+    def create_weights(self, layer: torch.nn.Module,
+                       output_partition_sizes: List[int],
+                       input_size_per_partition: int,
+                       params_dtype: torch.dtype, weight_loader: Callable,
+                       **kwargs):
+
+        # When the scales have a single value, it is required that they be
+        # on the CPU for performance and CUDA Graphs compatibility. Please
+        # refer to the comment in
+        # CompressedTensorsW8A8StaticTensor::create_weights for further
+        # information.
+        is_tensor_partitioned = len(output_partition_sizes) != 1
+        weight_scale_dim = sum(
+            output_partition_sizes) if is_tensor_partitioned else 1
+
+        weight_zero_point = Parameter(torch.empty(1, dtype=torch.int8),
+                                      requires_grad=False)
+
+        weight_scale = Parameter(torch.empty(weight_scale_dim,
+                                             dtype=torch.float32),
+                                 requires_grad=False)
+
+        weight = Parameter(torch.empty(sum(output_partition_sizes),
+                                       input_size_per_partition,
+                                       dtype=torch.int8),
+                           requires_grad=False)
+
+        layer.register_parameter("weight", weight)
+        set_weight_attrs(weight, {"input_dim": 1, "output_dim": 0})
+        set_weight_attrs(weight, {"weight_loader": weight_loader})
+        set_weight_attrs(weight, {"logical_widths": output_partition_sizes})
+
+        layer.register_parameter("weight_scale", weight_scale)
+        set_weight_attrs(weight_scale, {"weight_loader": weight_loader})
+        set_weight_attrs(
+            weight_scale, {
+                "shard_splitter": self.scales_shard_splitter,
+                "logical_widths": output_partition_sizes
+            })
+
+        layer.register_parameter("weight_zero_point", weight_zero_point)
+        set_weight_attrs(weight_zero_point, {"weight_loader": weight_loader})
+
+    def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
+        weight = layer.weight
+        weight_scale = layer.weight_scale
+
+        x_q, input_scales = custom_ops.scaled_int8_quant(x)
+        return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), input_scales,
+                                               weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
index 2dfc6e2b0778..7559fc0f95b2 100644
--- a/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/schemes/compressed_tensors_w8a8_statictensor.py
@@ -97,7 +97,7 @@ def apply_weights(self, layer: torch.nn.Module, x: torch.Tensor):
         act_scale = layer.input_scale
 
         # Input quantize
-        x_q = custom_ops.static_scaled_int8_quant(x, act_scale)
+        x_q, _ = custom_ops.scaled_int8_quant(x, act_scale)
 
         return custom_ops.cutlass_scaled_mm_dq(x_q, weight.t(), act_scale,
                                                weight_scale, x.dtype)
diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/utils.py b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
new file mode 100644
index 000000000000..fcc664910184
--- /dev/null
+++ b/vllm/model_executor/layers/quantization/compressed_tensors/utils.py
@@ -0,0 +1,114 @@
+import re
+from enum import Enum
+from typing import Any, Dict, Iterable, Optional
+
+from pydantic import BaseModel, Field
+from torch.nn import Module
+
+
+class QuantizationType(str, Enum):
+    """
+    Enum storing quantization type options
+    """
+
+    INT = "int"
+    FLOAT = "float"
+
+
+class QuantizationStrategy(str, Enum):
+    """
+    Enum storing quantization strategy options
+    """
+
+    TENSOR = "tensor"
+    CHANNEL = "channel"
+    GROUP = "group"
+    BLOCK = "block"
+    TOKEN = "token"
+
+
+class QuantizationArgs(BaseModel):
+    """
+    User facing arguments used to define a quantization config
+    for weights or activations
+
+    :param num_bits: quantization bit depth
+    :param type: dtype to quantize to, either int or float
+    :param symmetric: whether or not quantization scale is symmetric
+    :param strategy: string determining the scope of scale/zero-point to apply
+    :param group_size: group length to use for the group strategy
+    :param block_structure: 2d block structure to use for the block
+        strategy, must be of the format "2x4", "8x16", etc.
+    :param dynamic: set True to perform dynamic quantization - values will
+        not be calibrated during calibration phase, instead new quantization
+        ranges will be observed with every sample during inference. Defaults
+        to False (static). Enabling it makes the default observer memoryless
+    :param observer: class used to compute the scale and zero-point
+    :param observer_kwargs: optional kwargs passed to the observer constructor
+    """
+
+    num_bits: int = 8
+    type: QuantizationType = QuantizationType.INT
+    symmetric: bool = True
+    group_size: Optional[int] = None
+    strategy: Optional[QuantizationStrategy] = None
+    block_structure: Optional[str] = None
+    dynamic: bool = False
+    observer: str = Field(
+        default="minmax",
+        description=("The class to use to compute the quantization param - "
+                     "scale and zero-point'"),
+    )
+    observer_kwargs: Dict[str, Any] = Field(
+        default_factory=dict,
+        description=
+        ("optional dict of kwargs to be passed directly to torch quantization "
+         "Observers constructor excluding quantization range or symmetry"),
+    )
+
+
+def find_first_name_or_class_match(
+        name: str,
+        module: Module,
+        targets: Iterable[str],
+        check_contains: bool = False) -> Optional[str]:
+    """
+    Helper function to map the quantization details listed in the config 
+    for a given list of targets against each model layer. First uses the
+    layer name to try and find a match. If no name match is found, uses
+    the layer class name. Returns None otherwise.
+
+    :param name: layer name
+    :param module: torch.nn.Module
+    :param targets: list of targets to match the layer against
+    :param check_contains: whether or not to do a substring match
+    """
+
+    return _find_first_match(name, targets) or _find_first_match(
+        module.__class__.__name__, targets, check_contains)
+
+
+def _find_first_match(value: str,
+                      targets: Iterable[str],
+                      check_contains: bool = False) -> Optional[str]:
+    """
+    Return the first entry of ``targets`` that matches ``value``, else None.
+
+    An entry prefixed with 're:' is treated as a regex and tested with
+    ``re.match``; otherwise it must equal ``value`` exactly, or, when
+    ``check_contains`` is True, occur in it as a case-insensitive substring.
+
+    :param value: string to compare the list of targets against
+    :param targets: candidate targets, tested in iteration order
+    :param check_contains: whether or not to do a substring match
+    """
+    for candidate in targets:
+        if candidate.startswith("re:"):
+            matched = re.match(candidate[3:], value) is not None
+        elif check_contains:
+            matched = candidate.lower() in value.lower()
+        else:
+            matched = candidate == value
+        if matched:
+            return candidate
+    return None

From 17984a71203c3bce4bd8964bc36469f22acad776 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Fri, 7 Jun 2024 11:23:32 -0700
Subject: [PATCH 66/93] [Frontend] Add OpenAI Vision API Support (#5237)

Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
---
 docs/source/models/vlm.rst                    |  68 ++++-
 .../serving/openai_compatible_server.md       |   4 +-
 examples/template_llava.jinja                 |  23 ++
 tests/entrypoints/test_openai_vision.py       | 286 ++++++++++++++++++
 tests/multimodal/test_utils.py                |  75 +++++
 vllm/config.py                                |  12 +-
 vllm/entrypoints/openai/serving_chat.py       | 113 ++++++-
 vllm/envs.py                                  |   6 +
 vllm/multimodal/utils.py                      |  85 ++++++
 9 files changed, 653 insertions(+), 19 deletions(-)
 create mode 100644 examples/template_llava.jinja
 create mode 100644 tests/entrypoints/test_openai_vision.py
 create mode 100644 tests/multimodal/test_utils.py
 create mode 100644 vllm/multimodal/utils.py

diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index 52afda747aab..b917688a529d 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -3,7 +3,7 @@
 Using VLMs
 ==========
 
-This document shows you how to run and serve Vision Language Models (VLMs) using vLLM.
+vLLM provides experimental support for Vision Language Models (VLMs). This document shows you how to run and serve these models using vLLM.
 
 Engine Arguments
 ----------------
@@ -54,3 +54,69 @@ For now, we only support a single image per text prompt. To pass an image to the
         print(generated_text)
 
 A code example can be found in `examples/llava_example.py <https://github.com/vllm-project/vllm/blob/main/examples/llava_example.py>`_.
+
+Online OpenAI Vision API Compatible Inference
+----------------------------------------------
+
+You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API <https://platform.openai.com/docs/guides/vision>`_.
+
+.. note::
+    Currently, vLLM supports only a **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be
+    added in the future.
+
+Below is an example of how to launch the same ``llava-hf/llava-1.5-7b-hf`` model with the vLLM API server.
+
+.. important::
+    Since OpenAI Vision API is based on `Chat <https://platform.openai.com/docs/api-reference/chat>`_ API, a chat template 
+    is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the 
+    HuggingFace Llava chat template that you can find in the example folder `here <https://github.com/vllm-project/vllm/blob/main/examples/template_llava.jinja>`_.
+
+.. code-block:: bash
+
+    python -m vllm.entrypoints.openai.api_server \
+        --model llava-hf/llava-1.5-7b-hf \
+        --image-input-type pixel_values \
+        --image-token-id 32000 \
+        --image-input-shape 1,3,336,336 \
+        --image-feature-size 576 \
+        --chat-template template_llava.jinja
+
+To consume the server, you can use the OpenAI client like in the example below:
+
+.. code-block:: python
+
+    from openai import OpenAI
+    openai_api_key = "EMPTY"
+    openai_api_base = "http://localhost:8000/v1"
+    client = OpenAI(
+        api_key=openai_api_key,
+        base_url=openai_api_base,
+    )
+    chat_response = client.chat.completions.create(
+        model="llava-hf/llava-1.5-7b-hf",
+        messages=[{
+            "role": "user",
+            "content": [
+                {"type": "text", "text": "What's in this image?"},
+                {
+                    "type": "image_url",
+                    "image_url": {
+                        "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+                    },
+                },
+            ],
+        }],
+    )
+    print("Chat response:", chat_response)
+
+.. note::
+
+    By default, the timeout for fetching images through an HTTP URL is ``5`` seconds. You can override this by setting the environment variable:
+
+    .. code-block:: shell
+
+        export VLLM_IMAGE_FETCH_TIMEOUT=<timeout>
+
+.. note::
+    The prompt formatting with the image token ``<image>`` is not needed when serving VLMs with the API server since the prompt will be 
+    processed automatically by the server.
diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md
index a912949352b8..6248d8468375 100644
--- a/docs/source/serving/openai_compatible_server.md
+++ b/docs/source/serving/openai_compatible_server.md
@@ -30,6 +30,8 @@ Please see the [OpenAI API Reference](https://platform.openai.com/docs/api-refer
 - Chat: `tools`, and `tool_choice`.
 - Completions: `suffix`.
 
+vLLM also provides experimental support for OpenAI Vision API compatible inference. See more details in [Using VLMs](../models/vlm.rst).
+
 ## Extra Parameters
 vLLM supports a set of parameters that are not part of the OpenAI API.
 In order to use them, you can pass them as extra parameters in the OpenAI client.
@@ -120,4 +122,4 @@ It is the callers responsibility to prompt the model with the tool information,
 
 vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter.
 
-Please refer to the OpenAI API reference documentation for more information.
\ No newline at end of file
+Please refer to the OpenAI API reference documentation for more information.
diff --git a/examples/template_llava.jinja b/examples/template_llava.jinja
new file mode 100644
index 000000000000..6a902ee16772
--- /dev/null
+++ b/examples/template_llava.jinja
@@ -0,0 +1,23 @@
+{%- if messages[0]['role'] == 'system' -%}
+    {%- set system_message = messages[0]['content'] -%}
+    {%- set messages = messages[1:] -%}
+{%- else -%}
+    {% set system_message = '' -%}
+{%- endif -%}
+
+{{ bos_token + system_message }}
+{%- for message in messages -%}
+    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
+        {{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}
+    {%- endif -%}
+
+    {%- if message['role'] == 'user' -%}
+        {{ 'USER: ' + message['content'] + '\n' }}
+    {%- elif message['role'] == 'assistant' -%}
+        {{ 'ASSISTANT: ' + message['content'] + eos_token + '\n' }}
+    {%- endif -%}
+{%- endfor -%}
+
+{%- if add_generation_prompt -%}
+    {{ 'ASSISTANT:' }}
+{% endif %}
diff --git a/tests/entrypoints/test_openai_vision.py b/tests/entrypoints/test_openai_vision.py
new file mode 100644
index 000000000000..cc03b04e0b0e
--- /dev/null
+++ b/tests/entrypoints/test_openai_vision.py
@@ -0,0 +1,286 @@
+from pathlib import Path
+from typing import Dict
+
+import openai
+import pytest
+import pytest_asyncio
+import ray
+
+from vllm.multimodal.utils import ImageFetchAiohttp, encode_image_base64
+
+from ..utils import ServerRunner
+
+MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
+LLAVA_CHAT_TEMPLATE = (Path(__file__).parent.parent.parent /
+                       "examples/template_llava.jinja")
+assert LLAVA_CHAT_TEMPLATE.exists()
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+
+pytestmark = pytest.mark.openai
+
+
+@pytest.fixture(scope="module")
+def server():
+    ray.init()
+    server_runner = ServerRunner.remote([
+        "--model",
+        MODEL_NAME,
+        "--dtype",
+        "bfloat16",
+        "--max-model-len",
+        "4096",
+        "--enforce-eager",
+        "--image-input-type",
+        "pixel_values",
+        "--image-token-id",
+        "32000",
+        "--image-input-shape",
+        "1,3,336,336",
+        "--image-feature-size",
+        "576",
+        "--chat-template",
+        str(LLAVA_CHAT_TEMPLATE),
+    ])
+    ray.get(server_runner.ready.remote())
+    yield server_runner
+    ray.shutdown()
+
+
+@pytest.fixture(scope="session")
+def client():
+    client = openai.AsyncOpenAI(
+        base_url="http://localhost:8000/v1",
+        api_key="token-abc123",
+    )
+    yield client
+
+
+@pytest_asyncio.fixture(scope="session")
+async def base64_encoded_image() -> Dict[str, str]:
+    return {
+        image_url:
+        encode_image_base64(await ImageFetchAiohttp.fetch_image(image_url))
+        for image_url in TEST_IMAGE_URLS
+    }
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image(server, client: openai.AsyncOpenAI,
+                                         model_name: str, image_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_single_chat_session_image_base64encoded(
+        server, client: openai.AsyncOpenAI, model_name: str, image_url: str,
+        base64_encoded_image: Dict[str, str]):
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url":
+                    f"data:image/jpeg;base64,{base64_encoded_image[image_url]}"
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(model=model_name,
+                                                           messages=messages,
+                                                           max_tokens=10,
+                                                           logprobs=True,
+                                                           top_logprobs=5)
+    assert len(chat_completion.choices) == 1
+
+    choice = chat_completion.choices[0]
+    assert choice.finish_reason == "length"
+    assert chat_completion.usage == openai.types.CompletionUsage(
+        completion_tokens=10, prompt_tokens=596, total_tokens=606)
+
+    message = choice.message
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 10
+    assert message.role == "assistant"
+    messages.append({"role": "assistant", "content": message.content})
+
+    # test multi-turn dialogue
+    messages.append({"role": "user", "content": "express your result in json"})
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    message = chat_completion.choices[0].message
+    assert message.content is not None and len(message.content) >= 0
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_chat_streaming_image(server, client: openai.AsyncOpenAI,
+                                    model_name: str, image_url: str):
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+
+    # test single completion
+    chat_completion = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+    )
+    output = chat_completion.choices[0].message.content
+    stop_reason = chat_completion.choices[0].finish_reason
+
+    # test streaming
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+    )
+    chunks = []
+    finish_reason_count = 0
+    async for chunk in stream:
+        delta = chunk.choices[0].delta
+        if delta.role:
+            assert delta.role == "assistant"
+        if delta.content:
+            chunks.append(delta.content)
+        if chunk.choices[0].finish_reason is not None:
+            finish_reason_count += 1
+    # finish reason should only return in last block
+    assert finish_reason_count == 1
+    assert chunk.choices[0].finish_reason == stop_reason
+    assert delta.content
+    assert "".join(chunks) == output
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("model_name", [MODEL_NAME])
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+async def test_multi_image_input(server, client: openai.AsyncOpenAI,
+                                 model_name: str, image_url: str):
+
+    messages = [{
+        "role":
+        "user",
+        "content": [
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "image_url",
+                "image_url": {
+                    "url": image_url
+                }
+            },
+            {
+                "type": "text",
+                "text": "What's in this image?"
+            },
+        ],
+    }]
+
+    with pytest.raises(openai.BadRequestError):  # test multi-image input
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+        )
+
+    # the server should still work afterwards
+    completion = await client.completions.create(
+        model=model_name,
+        prompt=[0, 0, 0, 0, 0],
+        max_tokens=5,
+        temperature=0.0,
+    )
+    completion = completion.choices[0].text
+    assert completion is not None and len(completion) >= 0
+
+
+if __name__ == "__main__":
+    pytest.main([__file__])
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
new file mode 100644
index 000000000000..5a6395ac9e42
--- /dev/null
+++ b/tests/multimodal/test_utils.py
@@ -0,0 +1,75 @@
+import base64
+import mimetypes
+from tempfile import NamedTemporaryFile
+from typing import Dict, Tuple
+
+import numpy as np
+import pytest
+import pytest_asyncio
+from PIL import Image
+
+from vllm.multimodal.utils import ImageFetchAiohttp
+
+# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
+TEST_IMAGE_URLS = [
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",
+    "https://upload.wikimedia.org/wikipedia/commons/f/fa/Grayscale_8bits_palette_sample_image.png",
+    "https://upload.wikimedia.org/wikipedia/commons/thumb/9/91/Venn_diagram_rgb.svg/1280px-Venn_diagram_rgb.svg.png",
+    "https://upload.wikimedia.org/wikipedia/commons/0/0b/RGBA_comp.png",
+]
+
+
+@pytest_asyncio.fixture(scope="session")
+async def url_images() -> Dict[str, Image.Image]:
+    """Fetch each test image once per session, keyed by its URL."""
+    fetch = ImageFetchAiohttp.fetch_image
+    images = {url: await fetch(url) for url in TEST_IMAGE_URLS}
+    return images
+
+
+def get_supported_suffixes() -> Tuple[str, ...]:
+    """Return the image file suffixes exercised by the fetch tests."""
+    return (
+        # File types mentioned in GPT-4 with Vision
+        '.png', '.jpeg', '.jpg', '.webp', '.gif',
+        # Additional file types that are supported by us
+        '.bmp', '.tiff',
+    )
+
+
+def _image_equals(a: Image.Image, b: Image.Image) -> bool:
+    return np.array_equal(np.asarray(a), np.asarray(b.convert(a.mode)))
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("image_url", TEST_IMAGE_URLS)
+@pytest.mark.parametrize("suffix", get_supported_suffixes())
+async def test_fetch_image_base64(url_images: Dict[str, Image.Image],
+                                  image_url: str, suffix: str):
+    """Round-trip each test image through a base64 data URL."""
+    url_image = url_images[image_url]
+
+    try:
+        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
+    except KeyError:
+        try:
+            mime_type = mimetypes.types_map[suffix]
+        except KeyError:
+            pytest.skip('No MIME type')
+
+    with NamedTemporaryFile(suffix=suffix) as f:
+        try:
+            url_image.save(f.name)
+        except Exception as e:
+            # str(e) stays safe even when the exception carries no args.
+            if str(e) == 'cannot write mode RGBA as JPEG':
+                pytest.skip('Conversion not supported')
+            raise
+
+        base64_image = base64.b64encode(f.read()).decode("utf-8")
+        data_url = f"data:{mime_type};base64,{base64_image}"
+
+        data_image = await ImageFetchAiohttp.fetch_image(data_url)
+        # Lossy format: only check that the image can be opened.
+        if _image_equals(url_image, Image.open(f)):
+            assert _image_equals(url_image, data_image)
diff --git a/vllm/config.py b/vllm/config.py
index cca0496eeb32..a90b9aa83ad7 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -5,7 +5,7 @@
                     Union)
 
 import torch
-from transformers import PretrainedConfig
+from transformers import PretrainedConfig, PreTrainedTokenizerBase
 
 from vllm.logger import init_logger
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
@@ -1151,6 +1151,16 @@ def get_image_input_enum_type(cls, value: str) -> ImageInputType:
                              f"Expecting to choose from "
                              f"{[x.name for x in cls.ImageInputType]}.") from e
 
+    #TODO(ywang96): make this a cached property once we refactor the
+    # VisionLanguageConfig class.
+    def get_image_token_text(
+            self, tokenizer: PreTrainedTokenizerBase) -> Tuple[str, str]:
+        """Return ``(placeholder, token_str)``: ``token_str`` is the decoded
+        image token id and ``placeholder`` repeats it ``image_feature_size``
+        times for insertion into the text prompt."""
+        token_str = tokenizer.decode(self.image_token_id)
+        return token_str * self.image_feature_size, token_str
+
     def as_cli_args_dict(self) -> Dict[str, Any]:
         """Flatten vision language config to pure args.
 
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index 883567abf415..c025e7e96826 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -1,15 +1,16 @@
 import codecs
 import time
-from dataclasses import dataclass
-from typing import (AsyncGenerator, AsyncIterator, Dict, Iterable, List,
-                    Optional)
+from dataclasses import dataclass, field
+from typing import (AsyncGenerator, AsyncIterator, Awaitable, Dict, Iterable,
+                    List, Optional)
 from typing import Sequence as GenericSequence
 from typing import TypedDict, Union, cast, final
 
 from fastapi import Request
-from openai.types.chat import ChatCompletionContentPartTextParam
+from openai.types.chat import (ChatCompletionContentPartImageParam,
+                               ChatCompletionContentPartTextParam)
 
-from vllm.config import ModelConfig
+from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.engine.async_llm_engine import AsyncLLMEngine
 from vllm.entrypoints.openai.protocol import (
     ChatCompletionContentPartParam, ChatCompletionLogProb,
@@ -21,9 +22,13 @@
     FunctionCall, ToolCall, UsageInfo)
 from vllm.entrypoints.openai.serving_engine import (LoRAModulePath,
                                                     OpenAIServing)
+from vllm.inputs import PromptInputs
 from vllm.logger import init_logger
 from vllm.model_executor.guided_decoding import (
     get_guided_decoding_logits_processor)
+from vllm.multimodal.image import ImagePixelData
+from vllm.multimodal.utils import (async_get_and_parse_image,
+                                   get_full_image_text_prompt)
 from vllm.outputs import RequestOutput
 from vllm.sequence import Logprob
 from vllm.utils import random_uuid
@@ -40,6 +45,8 @@ class ConversationMessage(TypedDict):
 @dataclass(frozen=True)
 class ChatMessageParseResult:
     messages: List[ConversationMessage]
+    image_futures: List[Awaitable[ImagePixelData]] = field(
+        default_factory=list)
 
 
 class OpenAIServingChat(OpenAIServing):
@@ -94,19 +101,76 @@ def _parse_chat_message_content_parts(
         parts: Iterable[ChatCompletionContentPartParam],
     ) -> ChatMessageParseResult:
         texts: List[str] = []
+        image_futures: List[Awaitable[ImagePixelData]] = []
 
-        for _, part in enumerate(parts):
+        vlm_config: Optional[VisionLanguageConfig] = getattr(
+            self.engine.engine, "vision_language_config", None)
+        model_config = getattr(self.engine.engine, "model_config", None)
+
+        for part in parts:
             part_type = part["type"]
             if part_type == "text":
                 text = cast(ChatCompletionContentPartTextParam, part)["text"]
 
                 texts.append(text)
+            elif part_type == "image_url":
+                if vlm_config is None:
+                    raise ValueError(
+                        "'image_url' input is not supported as the loaded "
+                        "model is not multimodal.")
+
+                elif len(image_futures) == 0:
+                    assert self.tokenizer is not None
+                    image_url = cast(ChatCompletionContentPartImageParam,
+                                     part)["image_url"]
+
+                    if image_url.get("detail", "auto") != "auto":
+                        logger.warning(
+                            "'image_url.detail' is currently not supported and "
+                            "will be ignored.")
+
+                    image_future = async_get_and_parse_image(image_url["url"])
+                    image_futures.append(image_future)
+
+                else:
+                    raise NotImplementedError(
+                        "Multiple 'image_url' input is currently not supported."
+                    )
+
             else:
                 raise NotImplementedError(f"Unknown part type: {part_type}")
 
-        messages = [ConversationMessage(role=role, content="\n".join(texts))]
+        text_prompt = "\n".join(texts)
+
+        if vlm_config is not None and len(image_futures):
+
+            (image_token_prompt,
+             image_token_str) = vlm_config.get_image_token_text(self.tokenizer)
 
-        return ChatMessageParseResult(messages=messages)
+            # NOTE: If image token string (e.g, <image>) is already present
+            # in the text prompt, we assume it follows the same format required
+            # by the engine.
+            if image_token_str in text_prompt:
+                logger.warning(
+                    "Detected image token string in the text prompt. "
+                    "Skipping prompt formatting.")
+                messages = [
+                    ConversationMessage(role=role, content=text_prompt)
+                ]
+
+            else:
+                full_prompt = get_full_image_text_prompt(
+                    image_prompt=image_token_prompt,
+                    text_prompt=text_prompt,
+                    config=model_config)
+                messages = [
+                    ConversationMessage(role=role, content=full_prompt)
+                ]
+        else:
+            messages = [ConversationMessage(role=role, content=text_prompt)]
+
+        return ChatMessageParseResult(messages=messages,
+                                      image_futures=image_futures)
 
     def _parse_chat_message_content(
         self,
@@ -116,10 +180,10 @@ def _parse_chat_message_content(
         content = message.get("content")
 
         if content is None:
-            return ChatMessageParseResult(messages=[])
+            return ChatMessageParseResult(messages=[], image_futures=[])
         if isinstance(content, str):
             messages = [ConversationMessage(role=role, content=content)]
-            return ChatMessageParseResult(messages=messages)
+            return ChatMessageParseResult(messages=messages, image_futures=[])
 
         return self._parse_chat_message_content_parts(role, content)
 
@@ -144,11 +208,13 @@ async def create_chat_completion(
 
         try:
             conversation: List[ConversationMessage] = []
+            image_futures: List[Awaitable[ImagePixelData]] = []
 
             for msg in request.messages:
-                parsed_msg = self._parse_chat_message_content(msg)
+                chat_parsed_result = self._parse_chat_message_content(msg)
 
-                conversation.extend(parsed_msg.messages)
+                conversation.extend(chat_parsed_result.messages)
+                image_futures.extend(chat_parsed_result.image_futures)
 
             prompt = self.tokenizer.apply_chat_template(
                 conversation=conversation,
@@ -159,6 +225,17 @@ async def create_chat_completion(
             logger.error("Error in applying chat template from request: %s", e)
             return self.create_error_response(str(e))
 
+        # Fetch image data
+        image_data: Optional[ImagePixelData] = None
+        try:
+            if len(image_futures):
+                # since we support only single image currently
+                assert len(image_futures) == 1
+                image_data = await image_futures[0]
+        except Exception as e:
+            logger.error("Error in loading image data: %s", e)
+            return self.create_error_response(str(e))
+
         request_id = f"cmpl-{random_uuid()}"
         try:
             # Tokenize/detokenize depending on prompt format (string/token list)
@@ -183,11 +260,15 @@ async def create_chat_completion(
         except ValueError as e:
             return self.create_error_response(str(e))
 
+        inputs: PromptInputs = {
+            "prompt": prompt_text,
+            "prompt_token_ids": prompt_ids,
+        }
+        if image_data is not None:
+            inputs["multi_modal_data"] = image_data
+
         result_generator = self.engine.generate(
-            {
-                "prompt": prompt_text,
-                "prompt_token_ids": prompt_ids
-            },
+            inputs,
             sampling_params,
             request_id,
             lora_request,
diff --git a/vllm/envs.py b/vllm/envs.py
index 7d5c7371b774..b140aa6d658e 100644
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -29,6 +29,7 @@
     VLLM_CPU_KVCACHE_SPACE: int = 0
     VLLM_USE_RAY_COMPILED_DAG: bool = False
     VLLM_WORKER_MULTIPROC_METHOD: str = "spawn"
+    VLLM_IMAGE_FETCH_TIMEOUT: int = 5
     VLLM_TARGET_DEVICE: str = "cuda"
     MAX_JOBS: Optional[str] = None
     NVCC_THREADS: Optional[str] = None
@@ -216,6 +217,11 @@
     # Both spawn and fork work
     "VLLM_WORKER_MULTIPROC_METHOD":
     lambda: os.getenv("VLLM_WORKER_MULTIPROC_METHOD", "spawn"),
+
+    # Timeout for fetching images when serving multimodal models
+    # Default is 5 seconds
+    "VLLM_IMAGE_FETCH_TIMEOUT":
+    lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),
 }
 
 # end-env-vars-definition
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
new file mode 100644
index 000000000000..b8ad6f8f78e2
--- /dev/null
+++ b/vllm/multimodal/utils.py
@@ -0,0 +1,85 @@
+import base64
+from io import BytesIO
+from typing import Optional, Union
+
+import aiohttp
+from PIL import Image
+
+from vllm.config import ModelConfig
+from vllm.envs import VLLM_IMAGE_FETCH_TIMEOUT
+from vllm.multimodal.image import ImagePixelData
+
+
+class ImageFetchAiohttp:
+    """Fetch images from urls starting with 'http' or 'data:image'."""
+    # Shared client session; created on first use and then reused.
+    aiohttp_client: Optional[aiohttp.ClientSession] = None
+
+    @classmethod
+    def get_aiohttp_client(cls) -> aiohttp.ClientSession:
+        """Return the shared session, creating it on the first call."""
+        if cls.aiohttp_client is None:
+            timeout = aiohttp.ClientTimeout(total=VLLM_IMAGE_FETCH_TIMEOUT)
+            cls.aiohttp_client = aiohttp.ClientSession(
+                timeout=timeout, connector=aiohttp.TCPConnector())
+        return cls.aiohttp_client
+
+    @classmethod
+    async def fetch_image(cls, image_url: str) -> Image.Image:
+        """Load PIL image from a url or base64 encoded openai GPT4V format;
+        raise ValueError for any other url or a data url without a ','."""
+        if image_url.startswith('http'):
+            # Avoid circular import
+            from vllm import __version__ as VLLM_VERSION
+
+            client = cls.get_aiohttp_client()
+            headers = {"User-Agent": f"vLLM/{VLLM_VERSION}"}
+            async with client.get(url=image_url, headers=headers) as response:
+                response.raise_for_status()
+                image_raw = await response.read()
+            return Image.open(BytesIO(image_raw))
+
+        if image_url.startswith('data:image'):
+            # Everything after the first comma is the base64 payload.
+            _, comma, payload = image_url.partition(',')
+            if not comma:
+                raise ValueError("Invalid image url: A 'data:image' url must "
+                                 "contain a ',' before the base64 payload.")
+            return load_image_from_base64(payload)
+        raise ValueError("Invalid image url: A valid image url must start "
+                         "with either 'data:image' or 'http'.")
+
+
+async def async_get_and_parse_image(image_url: str) -> ImagePixelData:
+    with await ImageFetchAiohttp.fetch_image(image_url) as image:
+        return ImagePixelData(image)
+
+
+def encode_image_base64(image: Image.Image, format: str = 'JPEG') -> str:
+    """Serialize ``image`` in ``format`` and return it as base64 text."""
+    # The image is converted to RGB first when the target format is JPEG.
+    source = image.convert('RGB') if format == 'JPEG' else image
+    with BytesIO() as buffer:
+        source.save(buffer, format)
+        encoded = base64.b64encode(buffer.getvalue())
+    return encoded.decode('utf-8')
+
+
+def load_image_from_base64(image: Union[bytes, str]) -> Image.Image:
+    """Decode base64-encoded image data and open it as a PIL image."""
+    return Image.open(BytesIO(base64.b64decode(image)))
+
+
+# TODO(ywang96): move this to a model registry for preprocessing vision
+# language prompts based on the model type.
+def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
+                               config: ModelConfig) -> str:
+    """Join the image and text prompts in the layout used for the model
+    architecture named by ``config.hf_config.model_type``.
+
+    Raises ValueError for any model type other than "llava".
+    """
+    model_type = config.hf_config.model_type
+    if model_type != "llava":
+        raise ValueError(f"Unsupported model type: {model_type}")
+    return f"{image_prompt}\n{text_prompt}"

From 3da01195807f8d8a924f19c793b7caaf58f1fa10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jie=20Fu=20=28=E5=82=85=E6=9D=B0=29?= <jiefu@tencent.com>
Date: Sat, 8 Jun 2024 05:09:13 +0800
Subject: [PATCH 67/93] [Misc] Remove unused cuda_utils.h in CPU backend
 (#5345)

---
 csrc/cpu/pybind.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/csrc/cpu/pybind.cpp b/csrc/cpu/pybind.cpp
index 63082393c810..e5b2ce4f3011 100644
--- a/csrc/cpu/pybind.cpp
+++ b/csrc/cpu/pybind.cpp
@@ -1,5 +1,4 @@
 #include "cache.h"
-#include "cuda_utils.h"
 #include "ops.h"
 #include <torch/extension.h>
 

From d65c3abbaa26930c9b431003f7a3bdce48e1b308 Mon Sep 17 00:00:00 2001
From: Calvinn Ng <39899397+Calvinnncy97@users.noreply.github.com>
Date: Sat, 8 Jun 2024 05:10:21 +0800
Subject: [PATCH 68/93] fix DbrxFusedNormAttention missing cache_config (#5340)

Co-authored-by: team <calvinn.ng@ahrefs.com>
---
 vllm/model_executor/models/dbrx.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py
index 8ff19a2015e0..59af42445f32 100644
--- a/vllm/model_executor/models/dbrx.py
+++ b/vllm/model_executor/models/dbrx.py
@@ -247,11 +247,12 @@ class DbrxFusedNormAttention(nn.Module):
     def __init__(
         self,
         config: DbrxConfig,
+        cache_config: Optional[CacheConfig] = None,
         quant_config: Optional[QuantizationConfig] = None,
     ):
         super().__init__()
         self.d_model = config.d_model
-        self.attn = DbrxAttention(config, quant_config)
+        self.attn = DbrxAttention(config, cache_config, quant_config)
         self.norm_1 = nn.LayerNorm(self.d_model)
         self.norm_2 = nn.LayerNorm(self.d_model)
 

From e349c2d596dccba7c44059ae3aec9b1cb53c623e Mon Sep 17 00:00:00 2001
From: Cheng Li <pistasable@gmail.com>
Date: Fri, 7 Jun 2024 17:42:05 -0700
Subject: [PATCH 69/93] [Bug Fix] Fix the support check for FP8 CUTLASS 
 (#5352)

Bug description:
With torch 2.4.0.dev20240603+cu121,
cutlass_fp8_supported outputs False, and the (capability, version) before the comparison is (90, 11111111112)

This PR fixes the support check for FP8 CUTLASS ( cutlass_fp8_supported) which was introduced in https://github.com/vllm-project/vllm/pull/5183.
---
 vllm/model_executor/layers/quantization/fp8.py | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 1323360df6c8..c70183dea7d3 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -20,16 +20,16 @@
 def cutlass_fp8_supported() -> bool:
     capability = torch.cuda.get_device_capability()
     capability = capability[0] * 10 + capability[1]
-    version = torch.version.cuda
-    version = version[0] * 10 + version[1]
+    major, minor = torch.version.cuda.split(".")
+    version = int(major) * 10 + int(minor)
 
     # CUTLASS FP8 kernels need at least
     #   CUDA 12.0 on SM90 systems (Hopper)
     #   CUDA 12.4 on SM89 systems (Lovelace)
     gpu_is_supported = False
-    if capability >= 900:
-        gpu_is_supported = version > 120
-    elif capability >= 890:
-        gpu_is_supported = version > 124
+    if capability >= 90:
+        gpu_is_supported = version >= 120
+    elif capability >= 89:
+        gpu_is_supported = version >= 124
 
     return gpu_is_supported
@@ -103,7 +103,7 @@ class Fp8LinearMethod(LinearMethodBase):
     1. Only support per-tensor quantization due to torch._scaled_mm support.
     2. Only support float8_e4m3fn data type due to the limitation of
        torch._scaled_mm (https://github.com/pytorch/pytorch/blob/2e48b39603411a41c5025efbe52f89560b827825/aten/src/ATen/native/cuda/Blas.cpp#L854-L856)
-       
+
     Args:
         quant_config: The quantization config.
     """
@@ -298,8 +298,8 @@ def __init__(self, quant_config: Fp8Config):
         self.quant_config = quant_config
 
     def create_weights(self, layer: torch.nn.Module):
-        """Create "weight" (aka kv_scale) for an attention layer. 
-        
+        """Create "weight" (aka kv_scale) for an attention layer.
+
         Args:
             layer: The layer that is using the QuantizeMethodBase factory.
         """

From 4d5b6995a9ae668da9e16a83a77758091a46b4ad Mon Sep 17 00:00:00 2001
From: Benjamin Kitor <bkitor@gmail.com>
Date: Fri, 7 Jun 2024 18:20:16 -0700
Subject: [PATCH 70/93] [Misc] Add args for selecting distributed executor to
 benchmarks (#5335)

---
 benchmarks/benchmark_latency.py    | 10 +++++++++-
 benchmarks/benchmark_throughput.py | 13 +++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
index f69d91a086a9..1a41b66b3882 100644
--- a/benchmarks/benchmark_latency.py
+++ b/benchmarks/benchmark_latency.py
@@ -36,7 +36,8 @@ def main(args: argparse.Namespace):
               enable_chunked_prefill=args.enable_chunked_prefill,
               download_dir=args.download_dir,
               block_size=args.block_size,
-              gpu_memory_utilization=args.gpu_memory_utilization)
+              gpu_memory_utilization=args.gpu_memory_utilization,
+              distributed_executor_backend=args.distributed_executor_backend)
 
     sampling_params = SamplingParams(
         n=args.n,
@@ -221,5 +222,12 @@ def run_to_completion(profile_dir: Optional[str] = None):
                         help='the fraction of GPU memory to be used for '
                         'the model executor, which can range from 0 to 1.'
                         'If unspecified, will use the default value of 0.9.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     main(args)
diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
index 7c8cb5ee8cea..90f7433e0ae2 100644
--- a/benchmarks/benchmark_throughput.py
+++ b/benchmarks/benchmark_throughput.py
@@ -78,6 +78,7 @@ def run_vllm(
     enable_prefix_caching: bool,
     enable_chunked_prefill: bool,
     max_num_batched_tokens: int,
+    distributed_executor_backend: Optional[str],
     gpu_memory_utilization: float = 0.9,
     download_dir: Optional[str] = None,
 ) -> float:
@@ -100,6 +101,7 @@ def run_vllm(
         download_dir=download_dir,
         enable_chunked_prefill=enable_chunked_prefill,
         max_num_batched_tokens=max_num_batched_tokens,
+        distributed_executor_backend=distributed_executor_backend,
     )
 
     # Add the requests to the engine.
@@ -225,8 +227,8 @@ def main(args: argparse.Namespace):
             args.enforce_eager, args.kv_cache_dtype,
             args.quantization_param_path, args.device,
             args.enable_prefix_caching, args.enable_chunked_prefill,
-            args.max_num_batched_tokens, args.gpu_memory_utilization,
-            args.download_dir)
+            args.max_num_batched_tokens, args.distributed_executor_backend,
+            args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
         assert args.tensor_parallel_size == 1
         elapsed_time = run_hf(requests, args.model, tokenizer, args.n,
@@ -368,6 +370,13 @@ def main(args: argparse.Namespace):
         type=str,
         default=None,
         help='Path to save the throughput results in JSON format.')
+    parser.add_argument(
+        '--distributed-executor-backend',
+        choices=['ray', 'mp'],
+        default=None,
+        help='Backend to use for distributed serving. When more than 1 GPU '
+        'is used, will be automatically set to "ray" if installed '
+        'or "mp" (multiprocessing) otherwise.')
     args = parser.parse_args()
     if args.tokenizer is None:
         args.tokenizer = args.model

From f12b6365853d2e0245cc4ed4e63db5d97ebaceeb Mon Sep 17 00:00:00 2001
From: Hongxia Yang <62075498+hongxiayang@users.noreply.github.com>
Date: Fri, 7 Jun 2024 22:13:12 -0400
Subject: [PATCH 71/93] [ROCm][AMD] Use pytorch sdpa math backend to do naive
 attention (#4965)

---
 vllm/attention/backends/rocm_flash_attn.py | 62 ++++++++++------------
 1 file changed, 29 insertions(+), 33 deletions(-)

diff --git a/vllm/attention/backends/rocm_flash_attn.py b/vllm/attention/backends/rocm_flash_attn.py
index e92e6c5e2dc8..9294068c64d1 100644
--- a/vllm/attention/backends/rocm_flash_attn.py
+++ b/vllm/attention/backends/rocm_flash_attn.py
@@ -247,7 +247,7 @@ def __init__(
                     self.use_naive_attn = True
 
             if self.use_naive_attn:
-                self.attn_func = _naive_attention
+                self.attn_func = _sdpa_attention
                 logger.debug("Using naive attention in ROCmBackend")
 
     def repeat_kv(self, x: torch.Tensor, n_rep: int) -> torch.Tensor:
@@ -342,11 +342,18 @@ def forward(
                         # Interleave for MQA workaround.
                         key = self.repeat_kv(key, self.num_queries_per_kv)
                         value = self.repeat_kv(value, self.num_queries_per_kv)
+                    query = query.movedim(0, query.dim() - 2)
+                    key = key.movedim(0, key.dim() - 2)
+                    value = value.movedim(0, value.dim() - 2)
+                    # sdpa math backend attention
                     out = self.attn_func(
                         query,
                         key,
                         value,
                         prefill_meta.seq_lens,
+                        num_tokens,
+                        self.num_heads,
+                        self.head_size,
                         self.scale,
                     )
                 else:
@@ -402,45 +409,34 @@ def forward(
         return output.view(num_tokens, hidden_size)
 
 
-def _naive_attention(
+def _sdpa_attention(
     query: torch.Tensor,
     key: torch.Tensor,
     value: torch.Tensor,
     seq_lens: List[int],
+    num_tokens: int,
+    num_heads: int,
+    head_size: int,
     scale: float,
 ) -> torch.Tensor:
-    output = torch.empty_like(query)
     start = 0
-    for _, seq_len in enumerate(seq_lens):
+    output = torch.empty((num_tokens, num_heads, head_size),
+                         dtype=query.dtype,
+                         device=query.device)
+
+    for seq_len in seq_lens:
         end = start + seq_len
-        out = _naive_masked_attention(
-            query[start:end],
-            key[start:end],
-            value[start:end],
-            scale,
-        )
-        # TODO(woosuk): Unnecessary copy. Optimize.
-        output[start:end].copy_(out)
-        start += seq_len
+        with torch.backends.cuda.sdp_kernel(enable_math=True,
+                                            enable_flash=False,
+                                            enable_mem_efficient=False):
+            sub_out = torch.nn.functional.scaled_dot_product_attention(
+                query[:, start:end, :],
+                key[:, start:end, :],
+                value[:, start:end, :],
+                dropout_p=0.0,
+                is_causal=True,
+                scale=scale).movedim(query.dim() - 2, 0)
+            output[start:end, :, :] = sub_out
+            start = end
 
     return output
-
-
-def _naive_masked_attention(
-    query: torch.Tensor,
-    key: torch.Tensor,
-    value: torch.Tensor,
-    scale: float,
-) -> torch.Tensor:
-    seq_len, head_size, head_dim = query.shape
-    attn_mask = torch.triu(torch.ones(seq_len,
-                                      seq_len,
-                                      dtype=query.dtype,
-                                      device=query.device),
-                           diagonal=1)
-    attn_mask = attn_mask * torch.finfo(query.dtype).min
-    attn_weights = scale * torch.einsum("qhd,khd->hqk", query, key).float()
-    attn_weights = attn_weights + attn_mask.float()
-    attn_weights = torch.softmax(attn_weights, dim=-1).to(value.dtype)
-    out = torch.einsum("hqk,khd->qhd", attn_weights, value)
-    return out

From 842974cd088d3c0f2645a56af0502b26d463000b Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Fri, 7 Jun 2024 22:31:32 -0700
Subject: [PATCH 72/93] [CI/Test] improve robustness of test (hf_runner)
 (#5347)

[CI/Test] improve robustness of test by replacing del with context manager (hf_runner) (#5347)
---
 .../test_basic_correctness.py                  |  5 ++---
 .../basic_correctness/test_chunked_prefill.py  |  5 ++---
 tests/basic_correctness/test_preemption.py     | 17 +++++++----------
 tests/conftest.py                              |  5 ++++-
 .../test_basic_distributed_correctness.py      |  5 ++---
 .../test_chunked_prefill_distributed.py        |  5 ++---
 tests/models/test_big_models.py                |  5 ++---
 tests/models/test_embedding.py                 |  5 ++---
 tests/models/test_llava.py                     |  9 ++++-----
 tests/models/test_mistral.py                   |  7 +++----
 tests/models/test_models.py                    |  5 ++---
 tests/samplers/test_beam_search.py             |  7 +++----
 tests/samplers/test_logprobs.py                | 11 +++++------
 tests/tensorizer_loader/test_tensorizer.py     | 18 ++++++++----------
 14 files changed, 48 insertions(+), 61 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 7d8117447ca0..4561c8b1e143 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -43,9 +43,8 @@ def test_models(
     if backend_by_env_var == "FLASHINFER" and enforce_eager is False:
         pytest.skip("Skipping non-eager test for FlashInferBackend.")
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model,
                              dtype=dtype,
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 8d7e88d15136..44e561130bf3 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -43,9 +43,8 @@ def test_models(
         enable_chunked_prefill = True
         max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 29a4c39cd25a..58610e9e1016 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -43,9 +43,8 @@ def test_chunked_prefill_recompute(
         enable_chunked_prefill = True
         max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,
@@ -82,9 +81,8 @@ def test_preemption(
 ) -> None:
     """By default, recompute preemption is enabled"""
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,
@@ -137,10 +135,9 @@ def test_swap(
 ) -> None:
     """Use beam search enables swapping."""
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
 
     vllm_model = vllm_runner(
         model,
diff --git a/tests/conftest.py b/tests/conftest.py
index 9343a5a83b30..1f7868aef07a 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -356,7 +356,10 @@ def generate_greedy_logprobs_limit(
     def encode(self, prompts: List[str]) -> List[List[torch.Tensor]]:
         return self.model.encode(prompts)
 
-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.model
         cleanup()
 
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 5178bc5dae56..3b2192a8ab26 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -46,9 +46,8 @@ def test_models(
     backend_by_env_var = os.getenv(VLLM_ATTENTION_BACKEND)
     enforce_eager = backend_by_env_var == "FLASHINFER"
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index a15d0f876655..b469ad4cfc0e 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -49,9 +49,8 @@ def test_models(
     enable_chunked_prefill = True
     max_num_batched_tokens = chunked_prefill_token_size
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(
         model,
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index fd1253f73c93..f2fc0555b136 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -65,9 +65,8 @@ def test_models(
         pytest.skip(reason="This model has custom code that does not "
                     "support Python 3.8")
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py
index 668ed3a520a3..8ad9ac2d4f59 100644
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@@ -28,9 +28,8 @@ def test_models(
     model: str,
     dtype: str,
 ) -> None:
-    hf_model = hf_runner(model, dtype=dtype, is_embedding_model=True)
-    hf_outputs = hf_model.encode(example_prompts)
-    del hf_model
+    with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
+        hf_outputs = hf_model.encode(example_prompts)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.encode(example_prompts)
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index f03dbdbb770e..1e7ee52832b1 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -84,11 +84,10 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
     """
     model_id, vlm_config = model_and_config
 
-    hf_model = hf_runner(model_id, dtype=dtype, is_vision_model=True)
-    hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
-                                          max_tokens,
-                                          images=hf_images)
-    del hf_model
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
 
     vllm_image_prompts = [
         p.replace("<image>", "<image>" * vlm_config.image_feature_size)
diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py
index 290d68501bc5..f1554ce2180c 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -28,10 +28,9 @@ def test_models(
     num_logprobs: int,
 ) -> None:
     # TODO(sang): Sliding window should be tested separately.
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy_logprobs_limit(
-        example_prompts, max_tokens, num_logprobs)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy_logprobs_limit(
+            example_prompts, max_tokens, num_logprobs)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index 934749625d08..b92c3ebb6b6f 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -38,9 +38,8 @@ def test_models(
     # To pass the small model tests, we need full precision.
     assert dtype == "float"
 
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 2682f284505b..2e373cb87cd1 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -30,10 +30,9 @@ def test_beam_search_single_input(
     beam_width: int,
 ) -> None:
     example_prompts = example_prompts[:1]
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
-                                               max_tokens)
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
+                                                   max_tokens)
 
     vllm_model = vllm_runner(model, dtype=dtype)
     vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 61720cccf50b..25d59391ca98 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -32,12 +32,11 @@ def test_get_prompt_logprobs(
         max_num_batched_tokens = chunked_prefill_token_size
 
     max_tokens = 5
-    hf_model = hf_runner(model, dtype=dtype)
-    hf_logprobs = hf_model.generate_greedy_logprobs(
-        example_prompts,
-        max_tokens=max_tokens,
-    )
-    del hf_model
+    with hf_runner(model, dtype=dtype) as hf_model:
+        hf_logprobs = hf_model.generate_greedy_logprobs(
+            example_prompts,
+            max_tokens=max_tokens,
+        )
 
     vllm_model = vllm_runner(
         model,
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index b63fcf23af09..949aaea0080a 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -121,16 +121,14 @@ def test_deserialized_encrypted_vllm_model_has_same_outputs(
 
 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
                                                 tmp_path):
-    hf_model = hf_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    max_tokens = 50
-    outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
-    with open_stream(model_path, "wb+") as stream:
-        serializer = TensorSerializer(stream)
-        serializer.write_module(hf_model.model)
-    del hf_model
-    gc.collect()
-    torch.cuda.empty_cache()
+    with hf_runner(model_ref) as hf_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        max_tokens = 50
+        outputs = hf_model.generate_greedy(prompts, max_tokens=max_tokens)
+        with open_stream(model_path, "wb+") as stream:
+            serializer = TensorSerializer(stream)
+            serializer.write_module(hf_model.model)
+
     loaded_hf_model = vllm_runner(model_ref,
                                   load_format="tensorizer",
                                   model_loader_extra_config=TensorizerConfig(

From 2a16c038ab480fa17965a154d2e4ebefd3a632d7 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 8 Jun 2024 01:59:20 -0700
Subject: [PATCH 73/93] [CI/Test] improve robustness of test (vllm_runner)
 (#5357)

[CI/Test] improve robustness of test by replacing del with context manager (vllm_runner) (#5357)
---
 .../test_basic_correctness.py                 |  11 +-
 .../basic_correctness/test_chunked_prefill.py |  21 ++-
 tests/basic_correctness/test_preemption.py    | 148 +++++++++---------
 tests/conftest.py                             |   5 +-
 .../test_basic_distributed_correctness.py     |  15 +-
 .../test_chunked_prefill_distributed.py       |  21 ++-
 tests/engine/test_stop_reason.py              |   5 +-
 tests/engine/test_stop_strings.py             |   3 +-
 tests/metrics/test_metrics.py                 |  84 +++++-----
 tests/models/test_aqlm.py                     |   7 +-
 tests/models/test_big_models.py               |  16 +-
 tests/models/test_embedding.py                |   5 +-
 tests/models/test_gptq_marlin.py              |  37 ++---
 tests/models/test_gptq_marlin_24.py           |  22 ++-
 tests/models/test_llava.py                    |  15 +-
 tests/models/test_marlin.py                   |  24 ++-
 tests/models/test_mistral.py                  |   8 +-
 tests/models/test_models.py                   |  16 +-
 tests/quantization/test_bitsandbytes.py       | 124 +++++++--------
 tests/quantization/test_compressed_tensors.py |  63 ++++----
 tests/quantization/test_fp8.py                |  10 +-
 tests/samplers/test_beam_search.py            |  13 +-
 tests/samplers/test_ignore_eos.py             |  15 +-
 tests/samplers/test_logits_processor.py       |  82 +++++-----
 tests/samplers/test_logprobs.py               |  30 ++--
 tests/samplers/test_ranks.py                  |  38 +++--
 tests/samplers/test_seeded_generate.py        |   5 +-
 tests/tensorizer_loader/test_tensorizer.py    | 106 ++++++-------
 28 files changed, 455 insertions(+), 494 deletions(-)

diff --git a/tests/basic_correctness/test_basic_correctness.py b/tests/basic_correctness/test_basic_correctness.py
index 4561c8b1e143..805b8883b9d9 100644
--- a/tests/basic_correctness/test_basic_correctness.py
+++ b/tests/basic_correctness/test_basic_correctness.py
@@ -46,12 +46,11 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             enforce_eager=enforce_eager,
-                             gpu_memory_utilization=0.7)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     enforce_eager=enforce_eager,
+                     gpu_memory_utilization=0.7) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py
index 44e561130bf3..357bff61ef01 100644
--- a/tests/basic_correctness/test_chunked_prefill.py
+++ b/tests/basic_correctness/test_chunked_prefill.py
@@ -46,17 +46,16 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        tensor_parallel_size=tensor_parallel_size,
-        enforce_eager=enforce_eager,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            tensor_parallel_size=tensor_parallel_size,
+            enforce_eager=enforce_eager,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/basic_correctness/test_preemption.py b/tests/basic_correctness/test_preemption.py
index 58610e9e1016..7f20b2d93494 100644
--- a/tests/basic_correctness/test_preemption.py
+++ b/tests/basic_correctness/test_preemption.py
@@ -46,17 +46,16 @@ def test_chunked_prefill_recompute(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_num_batched_tokens=max_num_batched_tokens,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_num_batched_tokens=max_num_batched_tokens,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -84,17 +83,16 @@ def test_preemption(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -139,19 +137,18 @@ def test_swap(
         hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                    max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        disable_log_stats=False,
-    )
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    total_preemption = (
-        vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            disable_log_stats=False,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+        total_preemption = (
+            vllm_model.model.llm_engine.scheduler.num_cumulative_preemption)
 
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]
@@ -196,28 +193,28 @@ def test_swap_infeasible(
     decode_blocks = max_tokens // BLOCK_SIZE
     example_prompts = example_prompts[:1]
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        swap_space=10,
-        block_size=BLOCK_SIZE,
-        # Since beam search have more than 1 sequence, prefill + decode blocks
-        # are not enough to finish.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks,
-        max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
-    )
-    sampling_params = SamplingParams(n=beam_width,
-                                     use_beam_search=True,
-                                     temperature=0.0,
-                                     max_tokens=max_tokens,
-                                     ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            swap_space=10,
+            block_size=BLOCK_SIZE,
+            # Since beam search has more than 1 sequence, prefill +
+            # decode blocks are not enough to finish.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks,
+            max_model_len=(prefill_blocks + decode_blocks) * BLOCK_SIZE,
+    ) as vllm_model:
+        sampling_params = SamplingParams(n=beam_width,
+                                         use_beam_search=True,
+                                         temperature=0.0,
+                                         max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
+
     # Verify the request is ignored and not hang.
     assert req_outputs[0].outputs[0].finish_reason == "length"
 
@@ -236,25 +233,26 @@ def test_preemption_infeasible(
     BLOCK_SIZE = 16
     prefill_blocks = 2
     decode_blocks = max_tokens // BLOCK_SIZE
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        block_size=BLOCK_SIZE,
-        # Not enough gpu blocks to complete a single sequence.
-        # preemption should happen, and the sequence should be
-        # ignored instead of hanging forever.
-        num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
-        max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
-    )
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
-    req_outputs = vllm_model.model.generate(
-        example_prompts,
-        sampling_params=sampling_params,
-    )
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            block_size=BLOCK_SIZE,
+            # Not enough GPU blocks to complete a single sequence.
+            # Preemption should happen, and the sequence should be
+            # ignored instead of hanging forever.
+            num_gpu_blocks_override=prefill_blocks + decode_blocks // 2,
+            max_model_len=((prefill_blocks + decode_blocks // 2) * BLOCK_SIZE),
+    ) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
+        req_outputs = vllm_model.model.generate(
+            example_prompts,
+            sampling_params=sampling_params,
+        )
+
+        assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
+                ARTIFICIAL_PREEMPTION_MAX_CNT)
 
-    assert (vllm_model.model.llm_engine.scheduler.artificial_preempt_cnt <
-            ARTIFICIAL_PREEMPTION_MAX_CNT)
-    del vllm_model
     # Verify the request is ignored and not hang.
     for req_output in req_outputs:
         outputs = req_output.outputs
diff --git a/tests/conftest.py b/tests/conftest.py
index 1f7868aef07a..48c7f8c095f0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -676,7 +676,10 @@ def encode(self, prompts: List[str]) -> List[List[float]]:
             outputs.append(embedding)
         return outputs
 
-    def __del__(self):
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback):
         del self.model
         cleanup()
 
diff --git a/tests/distributed/test_basic_distributed_correctness.py b/tests/distributed/test_basic_distributed_correctness.py
index 3b2192a8ab26..b0576e20e9e1 100644
--- a/tests/distributed/test_basic_distributed_correctness.py
+++ b/tests/distributed/test_basic_distributed_correctness.py
@@ -49,14 +49,13 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        enforce_eager=enforce_eager,
-        distributed_executor_backend=distributed_executor_backend)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     tensor_parallel_size=2,
+                     enforce_eager=enforce_eager,
+                     distributed_executor_backend=distributed_executor_backend
+                     ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/distributed/test_chunked_prefill_distributed.py b/tests/distributed/test_chunked_prefill_distributed.py
index b469ad4cfc0e..204e79e26d51 100644
--- a/tests/distributed/test_chunked_prefill_distributed.py
+++ b/tests/distributed/test_chunked_prefill_distributed.py
@@ -52,17 +52,16 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        tensor_parallel_size=2,
-        max_num_seqs=max_num_seqs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        distributed_executor_backend=distributed_executor_backend,
-    )
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            tensor_parallel_size=2,
+            max_num_seqs=max_num_seqs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            distributed_executor_backend=distributed_executor_backend,
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/engine/test_stop_reason.py b/tests/engine/test_stop_reason.py
index 7b886507c04f..b0bd6c4aa95d 100644
--- a/tests/engine/test_stop_reason.py
+++ b/tests/engine/test_stop_reason.py
@@ -19,9 +19,8 @@
 
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL)
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model
 
 
 def test_stop_reason(vllm_model, example_prompts):
diff --git a/tests/engine/test_stop_strings.py b/tests/engine/test_stop_strings.py
index 6b747beb4b54..1584b85aeb06 100644
--- a/tests/engine/test_stop_strings.py
+++ b/tests/engine/test_stop_strings.py
@@ -10,7 +10,8 @@
 
 @pytest.fixture(scope="session")
 def vllm_model(vllm_runner):
-    return vllm_runner(MODEL)
+    with vllm_runner(MODEL) as vllm_model:
+        yield vllm_model
 
 
 @pytest.mark.skip_global_cleanup
diff --git a/tests/metrics/test_metrics.py b/tests/metrics/test_metrics.py
index e0aa14f165c2..c1164739eee3 100644
--- a/tests/metrics/test_metrics.py
+++ b/tests/metrics/test_metrics.py
@@ -23,23 +23,25 @@ def test_metric_counter_prompt_tokens(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
-    tokenizer = vllm_model.model.get_tokenizer()
-    prompt_token_counts = [len(tokenizer.encode(p)) for p in example_prompts]
-    # This test needs at least 2 prompts in a batch of different lengths to
-    # verify their token count is correct despite padding.
-    assert len(example_prompts) > 1, "at least 2 prompts are required"
-    assert prompt_token_counts[0] != prompt_token_counts[1], (
-        "prompts of different lengths are required")
-    vllm_prompt_token_count = sum(prompt_token_counts)
-
-    _ = vllm_model.generate_greedy(example_prompts, max_tokens)
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
-        **stat_logger.labels)._value.get()
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.4) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        prompt_token_counts = [
+            len(tokenizer.encode(p)) for p in example_prompts
+        ]
+        # This test needs at least 2 prompts in a batch of different lengths to
+        # verify their token count is correct despite padding.
+        assert len(example_prompts) > 1, "at least 2 prompts are required"
+        assert prompt_token_counts[0] != prompt_token_counts[1], (
+            "prompts of different lengths are required")
+        vllm_prompt_token_count = sum(prompt_token_counts)
+
+        _ = vllm_model.generate_greedy(example_prompts, max_tokens)
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metric_count = stat_logger.metrics.counter_prompt_tokens.labels(
+            **stat_logger.labels)._value.get()
 
     assert vllm_prompt_token_count == metric_count, (
         f"prompt token count: {vllm_prompt_token_count!r}\n"
@@ -56,22 +58,22 @@ def test_metric_counter_generation_tokens(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.4)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    tokenizer = vllm_model.model.get_tokenizer()
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metric_count = stat_logger.metrics.counter_generation_tokens.labels(
-        **stat_logger.labels)._value.get()
-    vllm_generation_count = 0
-    for i in range(len(example_prompts)):
-        vllm_output_ids, vllm_output_str = vllm_outputs[i]
-        prompt_ids = tokenizer.encode(example_prompts[i])
-        # vllm_output_ids contains both prompt tokens and generation tokens.
-        # We're interested only in the count of the generation tokens.
-        vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.4) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
+        tokenizer = vllm_model.model.get_tokenizer()
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metric_count = stat_logger.metrics.counter_generation_tokens.labels(
+            **stat_logger.labels)._value.get()
+        vllm_generation_count = 0
+        for i in range(len(example_prompts)):
+            vllm_output_ids, vllm_output_str = vllm_outputs[i]
+            prompt_ids = tokenizer.encode(example_prompts[i])
+            # vllm_output_ids contains both prompt tokens and generation tokens.
+            # We're interested only in the count of the generation tokens.
+            vllm_generation_count += len(vllm_output_ids) - len(prompt_ids)
 
     assert vllm_generation_count == metric_count, (
         f"generation token count: {vllm_generation_count!r}\n"
@@ -85,15 +87,13 @@ def test_metric_counter_generation_tokens(
     [None, [], ["ModelName0"], ["ModelName0", "ModelName1", "ModelName2"]])
 def test_metric_set_tag_model_name(vllm_runner, model: str, dtype: str,
                                    served_model_name: List[str]) -> None:
-    vllm_model = vllm_runner(model,
-                             dtype=dtype,
-                             disable_log_stats=False,
-                             gpu_memory_utilization=0.3,
-                             served_model_name=served_model_name)
-    stat_logger = vllm_model.model.llm_engine.stat_logger
-    metrics_tag_content = stat_logger.labels["model_name"]
-
-    del vllm_model
+    with vllm_runner(model,
+                     dtype=dtype,
+                     disable_log_stats=False,
+                     gpu_memory_utilization=0.3,
+                     served_model_name=served_model_name) as vllm_model:
+        stat_logger = vllm_model.model.llm_engine.stat_logger
+        metrics_tag_content = stat_logger.labels["model_name"]
 
     if served_model_name is None or served_model_name == []:
         assert metrics_tag_content == model, (
diff --git a/tests/models/test_aqlm.py b/tests/models/test_aqlm.py
index 85d74f7f5b03..c4ecf846e633 100644
--- a/tests/models/test_aqlm.py
+++ b/tests/models/test_aqlm.py
@@ -82,10 +82,9 @@ def test_models(
     num_logprobs: int,
 ) -> None:
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
     # loop through the prompts to compare against the ground truth generations
     for prompt_idx in range(len(example_prompts)):
diff --git a/tests/models/test_big_models.py b/tests/models/test_big_models.py
index f2fc0555b136..48b655e58d60 100644
--- a/tests/models/test_big_models.py
+++ b/tests/models/test_big_models.py
@@ -68,9 +68,8 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -89,9 +88,8 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
diff --git a/tests/models/test_embedding.py b/tests/models/test_embedding.py
index 8ad9ac2d4f59..6556998b68a7 100644
--- a/tests/models/test_embedding.py
+++ b/tests/models/test_embedding.py
@@ -31,9 +31,8 @@ def test_models(
     with hf_runner(model, dtype=dtype, is_embedding_model=True) as hf_model:
         hf_outputs = hf_model.encode(example_prompts)
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.encode(example_prompts)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.encode(example_prompts)
 
     similarities = compare_embeddings(hf_outputs, vllm_outputs)
     all_similarities = torch.stack(similarities)
diff --git a/tests/models/test_gptq_marlin.py b/tests/models/test_gptq_marlin.py
index da549cae0054..1ecd27c5ce51 100644
--- a/tests/models/test_gptq_marlin.py
+++ b/tests/models/test_gptq_marlin.py
@@ -69,32 +69,29 @@ def test_models(
     model_name, revision = model
 
     # Run marlin.
-    gptq_marlin_model = vllm_runner(model_name=model_name,
-                                    revision=revision,
-                                    dtype=dtype,
-                                    quantization="marlin",
-                                    max_model_len=MAX_MODEL_LEN,
-                                    tensor_parallel_size=1)
-
-    gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
-        example_prompts[:-1], max_tokens, num_logprobs)
-    del gptq_marlin_model
+    with vllm_runner(model_name=model_name,
+                     revision=revision,
+                     dtype=dtype,
+                     quantization="marlin",
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=1) as gptq_marlin_model:
+
+        gptq_marlin_outputs = gptq_marlin_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
     _ROPE_DICT.clear()  # clear rope cache to avoid rope dtype error
 
     # Run gptq.
     # The naive gptq kernel doesn't support bf16 yet.
     # Here we always compare fp16/bf16 gpt marlin kernel
     # to fp16 gptq kernel.
-    gptq_model = vllm_runner(model_name=model_name,
-                             revision=revision,
-                             dtype="half",
-                             quantization="gptq",
-                             max_model_len=MAX_MODEL_LEN,
-                             tensor_parallel_size=1)
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts[:-1],
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_name=model_name,
+                     revision=revision,
+                     dtype="half",
+                     quantization="gptq",
+                     max_model_len=MAX_MODEL_LEN,
+                     tensor_parallel_size=1) as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts[:-1], max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/test_gptq_marlin_24.py b/tests/models/test_gptq_marlin_24.py
index cc35ee803ff0..195c3e5b5863 100644
--- a/tests/models/test_gptq_marlin_24.py
+++ b/tests/models/test_gptq_marlin_24.py
@@ -61,20 +61,16 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    marlin_24_model = vllm_runner(model_pair.model_marlin,
-                                  dtype=dtype,
-                                  quantization="gptq_marlin_24")
-    marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
-    del marlin_24_model
+    with vllm_runner(model_pair.model_marlin,
+                     dtype=dtype,
+                     quantization="gptq_marlin_24") as marlin_24_model:
+        marlin_24_outputs = marlin_24_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 1e7ee52832b1..1f446362167a 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -94,14 +94,13 @@ def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
         for p in HF_IMAGE_PROMPTS
     ]
 
-    vllm_model = vllm_runner(model_id,
-                             dtype=dtype,
-                             enforce_eager=True,
-                             **vlm_config.as_cli_args_dict())
-    vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
-                                              max_tokens,
-                                              images=vllm_images)
-    del vllm_model
+    with vllm_runner(model_id,
+                     dtype=dtype,
+                     enforce_eager=True,
+                     **vlm_config.as_cli_args_dict()) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
 
     for i in range(len(HF_IMAGE_PROMPTS)):
         hf_output_ids, hf_output_str = hf_outputs[i]
diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index 585c5ad686d1..dee086733e4a 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -65,20 +65,16 @@ def test_models(
     max_tokens: int,
     num_logprobs: int,
 ) -> None:
-    marlin_model = vllm_runner(model_pair.model_marlin,
-                               dtype=dtype,
-                               quantization="marlin")
-    marlin_outputs = marlin_model.generate_greedy_logprobs(
-        example_prompts, max_tokens, num_logprobs)
-    del marlin_model
-
-    gptq_model = vllm_runner(model_pair.model_gptq,
-                             dtype=dtype,
-                             quantization="gptq")
-    gptq_outputs = gptq_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del gptq_model
+    with vllm_runner(model_pair.model_marlin,
+                     dtype=dtype,
+                     quantization="marlin") as marlin_model:
+        marlin_outputs = marlin_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
+
+    with vllm_runner(model_pair.model_gptq, dtype=dtype,
+                     quantization="gptq") as gptq_model:
+        gptq_outputs = gptq_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
 
     check_logprobs_close(
         outputs_0_lst=gptq_outputs,
diff --git a/tests/models/test_mistral.py b/tests/models/test_mistral.py
index f1554ce2180c..88f2e97fb897 100644
--- a/tests/models/test_mistral.py
+++ b/tests/models/test_mistral.py
@@ -32,11 +32,9 @@ def test_models(
         hf_outputs = hf_model.generate_greedy_logprobs_limit(
             example_prompts, max_tokens, num_logprobs)
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
-                                                       max_tokens,
-                                                       num_logprobs)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy_logprobs(
+            example_prompts, max_tokens, num_logprobs)
     check_logprobs_close(
         outputs_0_lst=hf_outputs,
         outputs_1_lst=vllm_outputs,
diff --git a/tests/models/test_models.py b/tests/models/test_models.py
index b92c3ebb6b6f..c838cfcb6913 100644
--- a/tests/models/test_models.py
+++ b/tests/models/test_models.py
@@ -41,9 +41,8 @@ def test_models(
     with hf_runner(model, dtype=dtype) as hf_model:
         hf_outputs = hf_model.generate_greedy(example_prompts, max_tokens)
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(example_prompts, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, hf_output_str = hf_outputs[i]
@@ -62,9 +61,8 @@ def test_model_print(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    # This test is for verifying whether the model's extra_repr
-    # can be printed correctly.
-    print(vllm_model.model.llm_engine.model_executor.driver_worker.
-          model_runner.model)
-    del vllm_model
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        # This test is for verifying whether the model's extra_repr
+        # can be printed correctly.
+        print(vllm_model.model.llm_engine.model_executor.driver_worker.
+              model_runner.model)
diff --git a/tests/quantization/test_bitsandbytes.py b/tests/quantization/test_bitsandbytes.py
index 4e9feb3c4814..31e938d15a1f 100644
--- a/tests/quantization/test_bitsandbytes.py
+++ b/tests/quantization/test_bitsandbytes.py
@@ -16,65 +16,65 @@
     capability < QUANTIZATION_METHODS['bitsandbytes'].get_min_capability(),
     reason='bitsandbytes is not supported on this GPU type.')
 def test_load_bnb_model(vllm_runner) -> None:
-    llm = vllm_runner('huggyllama/llama-7b',
-                      quantization='bitsandbytes',
-                      load_format='bitsandbytes',
-                      enforce_eager=True)
-
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-
-    # check the weights in MLP & SelfAttention are quantized to torch.uint8
-    qweight = model.model.layers[0].mlp.gate_up_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
-
-    qweight = model.model.layers[0].mlp.down_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
-
-    qweight = model.model.layers[0].self_attn.o_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
-
-    qweight = model.model.layers[0].self_attn.qkv_proj.qweight
-    assert qweight.dtype == torch.uint8, (
-        f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
-
-    # some weights should not be quantized
-    weight = model.lm_head.weight
-    assert weight.dtype != torch.uint8, (
-        'lm_head weight dtype should not be torch.uint8')
-
-    weight = model.model.embed_tokens.weight
-    assert weight.dtype != torch.uint8, (
-        'embed_tokens weight dtype should not be torch.uint8')
-
-    weight = model.model.layers[0].input_layernorm.weight
-    assert weight.dtype != torch.uint8, (
-        'input_layernorm weight dtype should not be torch.uint8')
-
-    weight = model.model.layers[0].post_attention_layernorm.weight
-    assert weight.dtype != torch.uint8, (
-        'input_layernorm weight dtype should not be torch.uint8')
-
-    # check the output of the model is expected
-    sampling_params = SamplingParams(temperature=0.0,
-                                     logprobs=1,
-                                     prompt_logprobs=1,
-                                     max_tokens=8)
-
-    prompts = ['That which does not kill us', 'To be or not to be,']
-    expected_outputs = [
-        'That which does not kill us makes us stronger.',
-        'To be or not to be, that is the question.'
-    ]
-    outputs = llm.generate(prompts, sampling_params=sampling_params)
-
-    assert len(outputs) == len(prompts)
-
-    for index in range(len(outputs)):
-        # compare the first line of the output
-        actual_output = outputs[index][1][0].split('\n', 1)[0]
-        expected_output = expected_outputs[index].split('\n', 1)[0]
-        assert actual_output == expected_output, (
-            f'Expected: {expected_output}, but got: {actual_output}')
+    with vllm_runner('huggyllama/llama-7b',
+                     quantization='bitsandbytes',
+                     load_format='bitsandbytes',
+                     enforce_eager=True) as llm:
+
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+
+        # check the weights in MLP & SelfAttention are quantized to torch.uint8
+        qweight = model.model.layers[0].mlp.gate_up_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected gate_up_proj dtype torch.uint8 but got {qweight.dtype}')
+
+        qweight = model.model.layers[0].mlp.down_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected down_proj dtype torch.uint8 but got {qweight.dtype}')
+
+        qweight = model.model.layers[0].self_attn.o_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected o_proj dtype torch.uint8 but got {qweight.dtype}')
+
+        qweight = model.model.layers[0].self_attn.qkv_proj.qweight
+        assert qweight.dtype == torch.uint8, (
+            f'Expected qkv_proj dtype torch.uint8 but got {qweight.dtype}')
+
+        # some weights should not be quantized
+        weight = model.lm_head.weight
+        assert weight.dtype != torch.uint8, (
+            'lm_head weight dtype should not be torch.uint8')
+
+        weight = model.model.embed_tokens.weight
+        assert weight.dtype != torch.uint8, (
+            'embed_tokens weight dtype should not be torch.uint8')
+
+        weight = model.model.layers[0].input_layernorm.weight
+        assert weight.dtype != torch.uint8, (
+            'input_layernorm weight dtype should not be torch.uint8')
+
+        weight = model.model.layers[0].post_attention_layernorm.weight
+        assert weight.dtype != torch.uint8, (
+            'input_layernorm weight dtype should not be torch.uint8')
+
+        # check the output of the model is expected
+        sampling_params = SamplingParams(temperature=0.0,
+                                         logprobs=1,
+                                         prompt_logprobs=1,
+                                         max_tokens=8)
+
+        prompts = ['That which does not kill us', 'To be or not to be,']
+        expected_outputs = [
+            'That which does not kill us makes us stronger.',
+            'To be or not to be, that is the question.'
+        ]
+        outputs = llm.generate(prompts, sampling_params=sampling_params)
+
+        assert len(outputs) == len(prompts)
+
+        for index in range(len(outputs)):
+            # compare the first line of the output
+            actual_output = outputs[index][1][0].split('\n', 1)[0]
+            expected_output = expected_outputs[index].split('\n', 1)[0]
+            assert actual_output == expected_output, (
+                f'Expected: {expected_output}, but got: {actual_output}')
diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 8b48f418fe49..9d94d2ecfb22 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -12,42 +12,45 @@
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
     model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
-    llm = vllm_runner(model_path, quantization="sparseml", enforce_eager=True)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    layer = model.model.layers[0]
+    with vllm_runner(model_path, quantization="sparseml",
+                     enforce_eager=True) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
 
-    qkv_proj = layer.self_attn.qkv_proj
-    o_proj = layer.self_attn.o_proj
-    gate_up_proj = layer.mlp.gate_up_proj
-    down_proj = layer.mlp.down_proj
+        qkv_proj = layer.self_attn.qkv_proj
+        o_proj = layer.self_attn.o_proj
+        gate_up_proj = layer.mlp.gate_up_proj
+        down_proj = layer.mlp.down_proj
 
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(gate_up_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(down_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(o_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(gate_up_proj.quant_method,
+                          CompressedTensorsLinearMethod)
+        assert isinstance(down_proj.quant_method,
+                          CompressedTensorsLinearMethod)
 
-    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8StaticTensor)
 
-    assert qkv_proj.weight.dtype is torch.int8
-    assert o_proj.weight.dtype is torch.int8
-    assert gate_up_proj.weight.dtype is torch.int8
+        assert qkv_proj.weight.dtype is torch.int8
+        assert o_proj.weight.dtype is torch.int8
+        assert gate_up_proj.weight.dtype is torch.int8
 
-    assert qkv_proj.weight_scale.shard_splitter is not None
-    assert qkv_proj.weight_scale.logical_widths is not None
-    assert qkv_proj.input_scale.dtype is torch.float32
+        assert qkv_proj.weight_scale.shard_splitter is not None
+        assert qkv_proj.weight_scale.logical_widths is not None
+        assert qkv_proj.input_scale.dtype is torch.float32
 
 
 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
     model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
-    llm = vllm_runner(model_path,
-                      quantization="sparseml",
-                      enforce_eager=True,
-                      dtype=torch.float16)
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    layer = model.model.layers[0]
-
-    qkv_proj = layer.self_attn.qkv_proj
-
-    assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
-    assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
-    assert qkv_proj.weight.dtype is torch.int8
+    with vllm_runner(model_path,
+                     quantization="sparseml",
+                     enforce_eager=True,
+                     dtype=torch.float16) as llm:
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        layer = model.model.layers[0]
+
+        qkv_proj = layer.self_attn.qkv_proj
+
+        assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
+        assert isinstance(qkv_proj.scheme, CompressedTensorsW8A8DynamicToken)
+        assert qkv_proj.weight.dtype is torch.int8
diff --git a/tests/quantization/test_fp8.py b/tests/quantization/test_fp8.py
index 607544a1c839..fccce7f7b59a 100644
--- a/tests/quantization/test_fp8.py
+++ b/tests/quantization/test_fp8.py
@@ -16,9 +16,9 @@
     capability < QUANTIZATION_METHODS["fp8"].get_min_capability(),
     reason="FP8 is not supported on this GPU type.")
 def test_load_fp16_model(vllm_runner) -> None:
-    llm = vllm_runner("facebook/opt-125m", quantization="fp8")
+    with vllm_runner("facebook/opt-125m", quantization="fp8") as llm:
 
-    model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model
-    fc1 = model.model.decoder.layers[0].fc1
-    assert isinstance(fc1.quant_method, Fp8LinearMethod)
-    assert fc1.weight.dtype == torch.float8_e4m3fn
+        model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
+        fc1 = model.model.decoder.layers[0].fc1
+        assert isinstance(fc1.quant_method, Fp8LinearMethod)
+        assert fc1.weight.dtype == torch.float8_e4m3fn
diff --git a/tests/samplers/test_beam_search.py b/tests/samplers/test_beam_search.py
index 2e373cb87cd1..64f3ce94b7a8 100644
--- a/tests/samplers/test_beam_search.py
+++ b/tests/samplers/test_beam_search.py
@@ -2,10 +2,8 @@
 
 Run `pytest tests/samplers/test_beam_search.py`.
 """
-import gc
 
 import pytest
-import torch
 
 # FIXME(zhuohan): The test can not pass if we:
 #   1. Increase max_tokens to 256.
@@ -34,14 +32,9 @@ def test_beam_search_single_input(
         hf_outputs = hf_model.generate_beam_search(example_prompts, beam_width,
                                                    max_tokens)
 
-    vllm_model = vllm_runner(model, dtype=dtype)
-    vllm_outputs = vllm_model.generate_beam_search(example_prompts, beam_width,
-                                                   max_tokens)
-    del vllm_model
-    # NOTE(woosuk): For some reason, the following GC is required to avoid
-    # GPU OOM errors in the following tests using `vllm_runner`.
-    gc.collect()
-    torch.cuda.empty_cache()
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        vllm_outputs = vllm_model.generate_beam_search(example_prompts,
+                                                       beam_width, max_tokens)
 
     for i in range(len(example_prompts)):
         hf_output_ids, _ = hf_outputs[i]
diff --git a/tests/samplers/test_ignore_eos.py b/tests/samplers/test_ignore_eos.py
index 67b5168bea0e..dc2482d85a91 100644
--- a/tests/samplers/test_ignore_eos.py
+++ b/tests/samplers/test_ignore_eos.py
@@ -22,11 +22,12 @@ def test_ignore_eos(
     dtype: str,
     max_tokens: int,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    sampling_params = SamplingParams(max_tokens=max_tokens, ignore_eos=True)
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        sampling_params = SamplingParams(max_tokens=max_tokens,
+                                         ignore_eos=True)
 
-    for prompt in example_prompts:
-        ignore_eos_output = vllm_model.model.generate(
-            prompt, sampling_params=sampling_params)
-        output_length = len(ignore_eos_output[0].outputs[0].token_ids)
-        assert output_length == max_tokens
+        for prompt in example_prompts:
+            ignore_eos_output = vllm_model.model.generate(
+                prompt, sampling_params=sampling_params)
+            output_length = len(ignore_eos_output[0].outputs[0].token_ids)
+            assert output_length == max_tokens
diff --git a/tests/samplers/test_logits_processor.py b/tests/samplers/test_logits_processor.py
index 0ccbabfff640..297947012071 100644
--- a/tests/samplers/test_logits_processor.py
+++ b/tests/samplers/test_logits_processor.py
@@ -14,46 +14,46 @@ def test_logits_processor_force_generate(
     model: str,
     dtype: str,
 ) -> None:
-    vllm_model = vllm_runner(model, dtype=dtype)
-    tokenizer = vllm_model.model.get_tokenizer()
-    repeat_times = 2
-    enforced_answers = " vLLM"
-    vllm_token_ids = tokenizer.encode(enforced_answers,
-                                      add_special_tokens=False)
-    max_tokens = len(vllm_token_ids) * repeat_times
-
-    def pick_vllm(token_ids, logits):
-        token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
-        logits[token_id] = torch.finfo(logits.dtype).max
-        return logits
-
-    params_with_logprobs = SamplingParams(
-        logits_processors=[pick_vllm],
-        prompt_logprobs=3,
-        max_tokens=max_tokens,
-    )
-
-    # test logits_processors when prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[0],
-        params=params_with_logprobs,
-    )
-
-    # test prompt_logprobs is not None
-    vllm_model.model._add_request(
-        example_prompts[1],
-        params=SamplingParams(
+    with vllm_runner(model, dtype=dtype) as vllm_model:
+        tokenizer = vllm_model.model.get_tokenizer()
+        repeat_times = 2
+        enforced_answers = " vLLM"
+        vllm_token_ids = tokenizer.encode(enforced_answers,
+                                          add_special_tokens=False)
+        max_tokens = len(vllm_token_ids) * repeat_times
+
+        def pick_vllm(token_ids, logits):
+            token_id = vllm_token_ids[len(token_ids) % len(vllm_token_ids)]
+            logits[token_id] = torch.finfo(logits.dtype).max
+            return logits
+
+        params_with_logprobs = SamplingParams(
+            logits_processors=[pick_vllm],
             prompt_logprobs=3,
             max_tokens=max_tokens,
-        ),
-    )
-
-    # test grouped requests
-    vllm_model.model._add_request(
-        example_prompts[2],
-        params=SamplingParams(max_tokens=max_tokens),
-    )
-
-    outputs = vllm_model.model._run_engine(use_tqdm=False)
-
-    assert outputs[0].outputs[0].text == enforced_answers * repeat_times
+        )
+
+        # test logits_processors when prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[0],
+            params=params_with_logprobs,
+        )
+
+        # test prompt_logprobs is not None
+        vllm_model.model._add_request(
+            example_prompts[1],
+            params=SamplingParams(
+                prompt_logprobs=3,
+                max_tokens=max_tokens,
+            ),
+        )
+
+        # test grouped requests
+        vllm_model.model._add_request(
+            example_prompts[2],
+            params=SamplingParams(max_tokens=max_tokens),
+        )
+
+        outputs = vllm_model.model._run_engine(use_tqdm=False)
+
+        assert outputs[0].outputs[0].text == enforced_answers * repeat_times
diff --git a/tests/samplers/test_logprobs.py b/tests/samplers/test_logprobs.py
index 25d59391ca98..233540cdc391 100644
--- a/tests/samplers/test_logprobs.py
+++ b/tests/samplers/test_logprobs.py
@@ -38,21 +38,21 @@ def test_get_prompt_logprobs(
             max_tokens=max_tokens,
         )
 
-    vllm_model = vllm_runner(
-        model,
-        dtype=dtype,
-        max_logprobs=num_top_logprobs,
-        enable_chunked_prefill=enable_chunked_prefill,
-        max_num_batched_tokens=max_num_batched_tokens,
-        max_num_seqs=max_num_seqs,
-    )
-    vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_top_logprobs,
-                                          temperature=0.0,
-                                          detokenize=detokenize)
-    vllm_results = vllm_model.model.generate(
-        example_prompts, sampling_params=vllm_sampling_params)
+    with vllm_runner(
+            model,
+            dtype=dtype,
+            max_logprobs=num_top_logprobs,
+            enable_chunked_prefill=enable_chunked_prefill,
+            max_num_batched_tokens=max_num_batched_tokens,
+            max_num_seqs=max_num_seqs,
+    ) as vllm_model:
+        vllm_sampling_params = SamplingParams(max_tokens=max_tokens,
+                                              logprobs=num_top_logprobs,
+                                              prompt_logprobs=num_top_logprobs,
+                                              temperature=0.0,
+                                              detokenize=detokenize)
+        vllm_results = vllm_model.model.generate(
+            example_prompts, sampling_params=vllm_sampling_params)
 
     # Test whether logprobs are included in the results.
     for result in vllm_results:
diff --git a/tests/samplers/test_ranks.py b/tests/samplers/test_ranks.py
index 5e93238d709e..ed2fee1ae252 100644
--- a/tests/samplers/test_ranks.py
+++ b/tests/samplers/test_ranks.py
@@ -17,16 +17,27 @@ def test_ranks(
     num_top_logprobs = 5
     num_prompt_logprobs = 5
 
-    vllm_model = vllm_runner(model, dtype=dtype, max_logprobs=num_top_logprobs)
-
-    ## Test greedy logprobs ranks
-    vllm_sampling_params = SamplingParams(temperature=0.0,
-                                          top_p=1.0,
-                                          max_tokens=max_tokens,
-                                          logprobs=num_top_logprobs,
-                                          prompt_logprobs=num_prompt_logprobs)
-    vllm_results = vllm_model.generate_w_logprobs(example_prompts,
-                                                  vllm_sampling_params)
+    with vllm_runner(model, dtype=dtype,
+                     max_logprobs=num_top_logprobs) as vllm_model:
+
+        ## Test greedy logprobs ranks
+        vllm_sampling_params = SamplingParams(
+            temperature=0.0,
+            top_p=1.0,
+            max_tokens=max_tokens,
+            logprobs=num_top_logprobs,
+            prompt_logprobs=num_prompt_logprobs)
+        vllm_results = vllm_model.generate_w_logprobs(example_prompts,
+                                                      vllm_sampling_params)
+
+        ## Test non-greedy logprobs ranks
+        sampling_params = SamplingParams(temperature=1.0,
+                                         top_p=1.0,
+                                         max_tokens=max_tokens,
+                                         logprobs=num_top_logprobs,
+                                         prompt_logprobs=num_prompt_logprobs)
+        res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
+
     for result in vllm_results:
         assert result[2] is not None
         assert len(result[2]) == len(result[0])
@@ -35,13 +46,6 @@ def test_ranks(
             assert token in logprobs
             assert logprobs[token].rank == 1
 
-    ## Test non-greedy logprobs ranks
-    sampling_params = SamplingParams(temperature=1.0,
-                                     top_p=1.0,
-                                     max_tokens=max_tokens,
-                                     logprobs=num_top_logprobs,
-                                     prompt_logprobs=num_prompt_logprobs)
-    res = vllm_model.generate_w_logprobs(example_prompts, sampling_params)
     for result in res:
         assert result[2] is not None
         assert len(result[2]) == len(result[0])
diff --git a/tests/samplers/test_seeded_generate.py b/tests/samplers/test_seeded_generate.py
index fef5ff3fb9e8..88067f19c8f0 100644
--- a/tests/samplers/test_seeded_generate.py
+++ b/tests/samplers/test_seeded_generate.py
@@ -17,9 +17,8 @@
 
 @pytest.fixture
 def vllm_model(vllm_runner):
-    vllm_model = vllm_runner(MODEL, dtype="half")
-    yield vllm_model
-    del vllm_model
+    with vllm_runner(MODEL, dtype="half") as vllm_model:
+        yield vllm_model
 
 
 @pytest.mark.parametrize("seed", RANDOM_SEEDS)
diff --git a/tests/tensorizer_loader/test_tensorizer.py b/tests/tensorizer_loader/test_tensorizer.py
index 949aaea0080a..a95aa84978a4 100644
--- a/tests/tensorizer_loader/test_tensorizer.py
+++ b/tests/tensorizer_loader/test_tensorizer.py
@@ -1,4 +1,3 @@
-import gc
 import json
 import os
 import subprocess
@@ -7,7 +6,6 @@
 import openai
 import pytest
 import ray
-import torch
 
 from tests.utils import ServerRunner
 from vllm import SamplingParams
@@ -70,17 +68,17 @@ def test_can_deserialize_s3(vllm_runner):
     model_ref = "EleutherAI/pythia-1.4b"
     tensorized_path = f"s3://tensorized/{model_ref}/fp16/model.tensors"
 
-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                   load_format="tensorizer",
                                   model_loader_extra_config=TensorizerConfig(
                                       tensorizer_uri=tensorized_path,
                                       num_readers=1,
                                       s3_endpoint="object.ord1.coreweave.com",
-                                  ))
+                                  )) as loaded_hf_model:
 
-    deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_hf_model.generate(prompts, sampling_params) # noqa: E501
 
-    assert deserialized_outputs
+        assert deserialized_outputs
 
 
 # UPSTREAM SYNC: breaks NM automation.
@@ -92,31 +90,27 @@ def test_can_deserialize_s3(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_deserialized_encrypted_vllm_model_has_same_outputs(
         vllm_runner, tmp_path):
-    vllm_model = vllm_runner(model_ref)
-    model_path = tmp_path / (model_ref + ".tensors")
-    key_path = tmp_path / (model_ref + ".key")
-    outputs = vllm_model.generate(prompts, sampling_params)
-
-    config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         config_for_serializing,
-                         encryption_key_path=key_path)
+    with vllm_runner(model_ref) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
+        key_path = tmp_path / (model_ref + ".key")
+        outputs = vllm_model.generate(prompts, sampling_params)
 
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        config_for_serializing = TensorizerConfig(tensorizer_uri=model_path)
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            config_for_serializing,
+                            encryption_key_path=key_path)
 
     config_for_deserializing = TensorizerConfig(tensorizer_uri=model_path,
                                                 encryption_keyfile=key_path)
 
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
         model_ref,
         load_format="tensorizer",
-        model_loader_extra_config=config_for_deserializing)
+        model_loader_extra_config=config_for_deserializing) as loaded_vllm_model: # noqa: E501
 
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501
 
-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs
 
 
 def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
@@ -129,17 +123,17 @@ def test_deserialized_hf_model_has_same_outputs(hf_runner, vllm_runner,
             serializer = TensorSerializer(stream)
             serializer.write_module(hf_model.model)
 
-    loaded_hf_model = vllm_runner(model_ref,
+    with vllm_runner(model_ref,
                                   load_format="tensorizer",
                                   model_loader_extra_config=TensorizerConfig(
                                       tensorizer_uri=model_path,
                                       num_readers=1,
-                                  ))
+                                  )) as loaded_hf_model:
 
-    deserialized_outputs = loaded_hf_model.generate_greedy(
-        prompts, max_tokens=max_tokens)
+        deserialized_outputs = loaded_hf_model.generate_greedy(
+            prompts, max_tokens=max_tokens)
 
-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs
 
 
 def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
@@ -153,16 +147,13 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
     test_prompts = create_test_prompts(lora_path)
 
     # Serialize model before deserializing and binding LoRA adapters
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
 
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            TensorizerConfig(tensorizer_uri=model_path))
 
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
-    loaded_vllm_model = vllm_runner(
+    with vllm_runner(
         model_ref,
         load_format="tensorizer",
         model_loader_extra_config=TensorizerConfig(
@@ -175,10 +166,10 @@ def test_vllm_model_can_load_with_lora(vllm_runner, tmp_path):
         max_cpu_loras=2,
         max_num_seqs=50,
         max_model_len=1000,
-    )
-    process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
+    ) as loaded_vllm_model:
+        process_requests(loaded_vllm_model.model.llm_engine, test_prompts)
 
-    assert loaded_vllm_model
+        assert loaded_vllm_model
 
 
 def test_load_without_tensorizer_load_format(vllm_runner):
@@ -191,19 +182,15 @@ def test_load_without_tensorizer_load_format(vllm_runner):
 @pytest.mark.skipif(not is_curl_installed(), reason="cURL is not installed")
 def test_openai_apiserver_with_tensorizer(vllm_runner, tmp_path):
     ## Serialize model
-    vllm_model = vllm_runner(model_ref, )
-    model_path = tmp_path / (model_ref + ".tensors")
-
-    serialize_vllm_model(vllm_model.model.llm_engine,
-                         TensorizerConfig(tensorizer_uri=model_path))
+    with vllm_runner(model_ref, ) as vllm_model:
+        model_path = tmp_path / (model_ref + ".tensors")
 
-    model_loader_extra_config = {
-        "tensorizer_uri": str(model_path),
-    }
+        serialize_vllm_model(vllm_model.model.llm_engine,
+                            TensorizerConfig(tensorizer_uri=model_path))
 
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        model_loader_extra_config = {
+            "tensorizer_uri": str(model_path),
+        }
 
     ## Start OpenAI API server
     openai_args = [
@@ -267,18 +254,15 @@ def test_vllm_tensorized_model_has_same_outputs(vllm_runner, tmp_path):
     model_path = tmp_path / (model_ref + ".tensors")
     config = TensorizerConfig(tensorizer_uri=str(model_path))
 
-    vllm_model = vllm_runner(model_ref)
-    outputs = vllm_model.generate(prompts, sampling_params)
-    serialize_vllm_model(vllm_model.model.llm_engine, config)
+    with vllm_runner(model_ref) as vllm_model:
+        outputs = vllm_model.generate(prompts, sampling_params)
+        serialize_vllm_model(vllm_model.model.llm_engine, config)
 
-    assert is_vllm_tensorized(config)
-    del vllm_model
-    gc.collect()
-    torch.cuda.empty_cache()
+        assert is_vllm_tensorized(config)
 
-    loaded_vllm_model = vllm_runner(model_ref,
-                                    load_format="tensorizer",
-                                    model_loader_extra_config=config)
-    deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params)
+    with vllm_runner(model_ref,
+                    load_format="tensorizer",
+                    model_loader_extra_config=config) as loaded_vllm_model:
+        deserialized_outputs = loaded_vllm_model.generate(prompts, sampling_params) # noqa: E501
 
-    assert outputs == deserialized_outputs
+        assert outputs == deserialized_outputs

From f8fe9561cf5686677b5bdcf14e7ac8e116a1b636 Mon Sep 17 00:00:00 2001
From: Michael Goin <michael@neuralmagic.com>
Date: Sat, 8 Jun 2024 13:54:05 -0400
Subject: [PATCH 74/93] [Misc][Breaking] Change FP8 checkpoint format from
 act_scale -> input_scale (#5353)

---
 vllm/model_executor/layers/quantization/fp8.py |  8 ++++----
 vllm/model_executor/models/mixtral.py          | 12 ++++++------
 2 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index c70183dea7d3..0cf2bd927a80 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -254,11 +254,11 @@ def apply(self,
               bias: Optional[torch.Tensor] = None) -> torch.Tensor:
 
         # ops.scaled_fp8_quant supports both dynamic and static quant.
-        #   If dynamic, layer.act_scale is None and x_scale computed from x.
-        #   If static,  layer.act_scale is scalar and x_scale set to act_scale.
+        #   If dynamic, layer.input_scale is None and x_scale computed from x.
+        #   If static, layer.input_scale is scalar and x_scale is input_scale.
 
         if bias is None and self.cutlass_fp8_supported:
-            qinput, x_scale = ops.scaled_fp8_quant(x, layer.act_scale)
+            qinput, x_scale = ops.scaled_fp8_quant(x, layer.input_scale)
 
             # Fused GEMM_DQ
             output = ops.cutlass_scaled_mm_dq(
@@ -271,7 +271,7 @@ def apply(self,
 
         else:
             qinput, x_scale = ops.scaled_fp8_quant(x,
-                                                   layer.act_scale,
+                                                   layer.input_scale,
                                                    batch_dim_padding=17)
 
             # Fused GEMM_DQ -- note we padded the input above because
diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py
index 234b213d54d1..3faf54d292b9 100644
--- a/vllm/model_executor/models/mixtral.py
+++ b/vllm/model_executor/models/mixtral.py
@@ -182,11 +182,11 @@ def weight_loader(self, param: nn.Parameter, loaded_weight: torch.Tensor,
             param_data[expert_id, :, :] = loaded_weight[:, shard]
 
         # Loading scales
-        if "act_scale" in weight_name or "w2.weight_scale" in weight_name:
+        if "input_scale" in weight_name or "w2.weight_scale" in weight_name:
             if param_data[expert_id] != 1 and (param_data[expert_id] -
                                                loaded_weight).abs() > 1e-5:
                 raise ValueError(
-                    "act_scales of w1 and w3 of a layer "
+                    "input_scales of w1 and w3 of a layer "
                     f"must be equal. But got {param_data[expert_id]} "
                     f"vs. {loaded_weight}")
             param_data[expert_id] = loaded_weight
@@ -225,9 +225,9 @@ def process_weights_after_loading(self):
             self.w2_weight = nn.Parameter(w2_weight, requires_grad=False)
 
         else:
-            # If checkpoint is fp8 + static, cleanup act_scales.
-            #   Since state_dict has an act_scale per expert but our kernels
-            #   are passed one act_scale shared across all experts.
+            # If checkpoint is fp8 + static, cleanup input_scales.
+            #   Since state_dict has an input_scale per expert but our kernels
+            #   are passed one input_scale shared across all experts.
             if self.quant_config.activation_scheme == "static":
                 if self.a13_scale is None or self.a2_scale is None:
                     raise ValueError(
@@ -237,7 +237,7 @@ def process_weights_after_loading(self):
                 if (not all_close_1d(self.a13_scale)
                         or not all_close_1d(self.a2_scale)):
                     print_warning_once(
-                        "Found act_scales that are not equal for "
+                        "Found input_scales that are not equal for "
                         "fp8 MoE layer. Using the maximum across experts "
                         "for each layer. ")
 

From 550ed83c7bf1d30d53776790b7155cb4f3869f99 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 8 Jun 2024 19:14:43 -0700
Subject: [PATCH 75/93] [Core][CUDA Graph] add output buffer for cudagraph
 (#5074)

[Core][CUDA Graph] add output buffer for cudagraph to reduce memory footprint (#5074)
---
 vllm/worker/model_runner.py | 24 ++++++++++++++++++++----
 1 file changed, 20 insertions(+), 4 deletions(-)

diff --git a/vllm/worker/model_runner.py b/vllm/worker/model_runner.py
index c59288b4f73c..7879a5de5b7b 100644
--- a/vllm/worker/model_runner.py
+++ b/vllm/worker/model_runner.py
@@ -1,3 +1,4 @@
+import gc
 import time
 import warnings
 from collections import defaultdict
@@ -894,6 +895,10 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
         seq_lens = torch.ones(max_batch_size, dtype=torch.int32).cuda()
         block_tables = torch.from_numpy(self.graph_block_tables).cuda()
 
+        # Prepare buffer for outputs. These will be reused for all batch sizes.
+        # It will be filled after the first graph capture.
+        hidden_states: Optional[torch.Tensor] = None
+
         graph_batch_size = _get_graph_batch_size(
             self.scheduler_config.max_num_seqs)
         batch_size_capture_list = [
@@ -930,9 +935,11 @@ def capture_model(self, kv_caches: List[torch.Tensor]) -> None:
                     self.set_active_loras(set(), lora_mapping)
 
                 graph_runner = CUDAGraphRunner(self.model)
-                graph_runner.capture(
+                hidden_states = graph_runner.capture(
                     input_tokens[:batch_size],
                     input_positions[:batch_size],
+                    hidden_states[:batch_size]
+                    if hidden_states is not None else None,
                     kv_caches,
                     attn_metadata,
                     memory_pool=self.graph_memory_pool,
@@ -969,12 +976,13 @@ def capture(
         self,
         input_ids: torch.Tensor,
         positions: torch.Tensor,
+        hidden_states: Optional[torch.Tensor],
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
         memory_pool: Optional[Tuple[int, int]],
         stream: torch.cuda.Stream,
         **kwargs,
-    ) -> None:
+    ) -> torch.Tensor:
         assert self._graph is None
         # Run the model a few times without capturing the graph.
         # This is to make sure that the captured graph does not include the
@@ -993,13 +1001,21 @@ def capture(
         # Capture the graph.
         self._graph = torch.cuda.CUDAGraph()
         with torch.cuda.graph(self._graph, pool=memory_pool, stream=stream):
-            hidden_states = self.model(
+            output_hidden_states = self.model(
                 input_ids,
                 positions,
                 kv_caches,
                 attn_metadata,
                 **kwargs,
             )
+            if hidden_states is not None:
+                hidden_states.copy_(output_hidden_states)
+            else:
+                hidden_states = output_hidden_states
+            del output_hidden_states
+            # make sure `output_hidden_states` is deleted
+            # in the graph's memory pool
+            gc.collect()
         torch.cuda.synchronize()
 
         # Save the input and output buffers.
@@ -1012,7 +1028,7 @@ def capture(
             "block_tables": attn_metadata.decode_metadata.block_tables,
         }
         self.output_buffers = {"hidden_states": hidden_states}
-        return
+        return hidden_states
 
     def forward(
         self,

From 52a90dd8326c7cbb24942fe32b18493a3751bfb5 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Sat, 8 Jun 2024 20:50:14 -0700
Subject: [PATCH 76/93] [mis][ci/test] fix flaky test in
 test_sharded_state_loader.py (#5361)

[mis][ci/test] fix flaky test in tests/test_sharded_state_loader.py (#5361)
---
 tests/test_sharded_state_loader.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tests/test_sharded_state_loader.py b/tests/test_sharded_state_loader.py
index 022fb36b346f..de79c3b945d4 100644
--- a/tests/test_sharded_state_loader.py
+++ b/tests/test_sharded_state_loader.py
@@ -39,7 +39,8 @@ def test_filter_subtensors():
     filtered_state_dict = ShardedStateLoader._filter_subtensors(state_dict)
     assert tuple(filtered_state_dict.keys()) == ("a", "b", "c")
     for key, tensor in filtered_state_dict.items():
-        assert tensor.equal(state_dict[key])
+        # NOTE: don't use `equal` here, as the tensor might contain NaNs
+        assert tensor is state_dict[key]
 
 
 @pytest.fixture(scope="module")

From d20586a0972a781e174f5ac3d56ef0094394a32b Mon Sep 17 00:00:00 2001
From: bnellnm <49004751+bnellnm@users.noreply.github.com>
Date: Sun, 9 Jun 2024 16:23:30 -0400
Subject: [PATCH 77/93] [Kernel][Misc] Use TORCH_LIBRARY instead of
 PYBIND11_MODULE for custom ops (#5047)

---
 CMakeLists.txt                                |  22 +-
 Dockerfile.rocm                               |   6 +-
 cmake/cpu_extension.cmake                     |  12 +-
 cmake/utils.cmake                             |  11 +-
 csrc/activation_kernels.cu                    |   2 +-
 csrc/attention/attention_kernels.cu           |  34 ++-
 csrc/cache.h                                  |  14 +-
 csrc/cache_kernels.cu                         |  13 +-
 csrc/cpu/attention.cpp                        |  26 +-
 csrc/cpu/cache.cpp                            |  13 +-
 csrc/cpu/cpu_types.hpp                        |   2 +-
 csrc/cpu/layernorm.cpp                        |   4 +-
 csrc/cpu/pos_encoding.cpp                     |   2 +-
 csrc/cpu/pybind.cpp                           |  43 ---
 csrc/cpu/torch_bindings.cpp                   | 106 +++++++
 csrc/cuda_utils.h                             |   6 +-
 csrc/cuda_utils_kernels.cu                    |   6 +-
 csrc/custom_all_reduce.cu                     |  22 +-
 csrc/dispatch_utils.h                         |   2 +-
 csrc/layernorm_kernels.cu                     |   6 +-
 csrc/moe/moe_ops.cpp                          |   8 -
 csrc/moe/moe_ops.h                            |   2 +-
 csrc/moe/topk_softmax_kernels.cu              |   2 +-
 csrc/moe/torch_bindings.cpp                   |  12 +
 csrc/moe_align_block_size_kernels.cu          |   6 +-
 csrc/ops.h                                    |  68 +++--
 csrc/pos_encoding_kernels.cu                  |  12 +-
 csrc/punica/punica_ops.cu                     |   6 +-
 csrc/punica/punica_ops.h                      |   6 +-
 csrc/punica/punica_pybind.cpp                 |  13 -
 csrc/punica/torch_bindings.cpp                |  18 ++
 csrc/pybind.cpp                               | 114 -------
 csrc/quantization/aqlm/gemm_kernels.cu        |   2 +-
 csrc/quantization/awq/gemm_kernels.cu         |   8 +-
 .../compressed_tensors/int8_quant_kernels.cu  |   2 +-
 .../cutlass_w8a8/scaled_mm_dq_c2x.cu          |   2 +-
 .../cutlass_w8a8/scaled_mm_dq_c3x.cu          |   2 +-
 .../cutlass_w8a8/scaled_mm_dq_entry.cu        |   2 +-
 csrc/quantization/fp8/common.cu               |   2 +-
 csrc/quantization/gptq/q_gemm.cu              |   6 +-
 csrc/quantization/gptq_marlin/gptq_marlin.cu  |   2 +-
 csrc/quantization/gptq_marlin/gptq_marlin.cuh |   2 +-
 .../marlin/dense/marlin_cuda_kernel.cu        |   2 +-
 .../marlin/sparse/marlin_24_cuda_kernel.cu    |   2 +-
 .../squeezellm/quant_cuda_kernel.cu           |   1 -
 csrc/registration.h                           |  22 ++
 csrc/torch_bindings.cpp                       | 283 ++++++++++++++++++
 setup.py                                      |   2 +-
 tests/kernels/test_int8_quant.py              |   7 +-
 vllm/_custom_ops.py                           | 217 ++++++++++----
 vllm/attention/backends/flash_attn.py         |  10 +-
 .../device_communicators/custom_all_reduce.py |  34 ++-
 vllm/lora/punica.py                           |  45 ++-
 .../layers/fused_moe/fused_moe.py             |   3 +-
 vllm/utils.py                                 |   7 +-
 55 files changed, 833 insertions(+), 451 deletions(-)
 delete mode 100644 csrc/cpu/pybind.cpp
 create mode 100644 csrc/cpu/torch_bindings.cpp
 delete mode 100644 csrc/moe/moe_ops.cpp
 create mode 100644 csrc/moe/torch_bindings.cpp
 delete mode 100644 csrc/punica/punica_pybind.cpp
 create mode 100644 csrc/punica/torch_bindings.cpp
 delete mode 100644 csrc/pybind.cpp
 create mode 100644 csrc/registration.h
 create mode 100644 csrc/torch_bindings.cpp

diff --git a/CMakeLists.txt b/CMakeLists.txt
index a197063f3360..ad6736c47f45 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -66,19 +66,6 @@ endif()
 #
 find_package(Torch REQUIRED)
 
-#
-# Normally `torch.utils.cpp_extension.CUDAExtension` would add
-# `libtorch_python.so` for linking against an extension. Torch's cmake
-# configuration does not include this library (presumably since the cmake
-# config is used for standalone C++ binaries that link against torch).
-# The `libtorch_python.so` library defines some of the glue code between
-# torch/python via pybind and is required by VLLM extensions for this
-# reason. So, add it by manually with `find_library` using torch's
-# installed library path.
-#
-find_library(torch_python_LIBRARY torch_python PATHS
-  "${TORCH_INSTALL_PREFIX}/lib")
-
 #
 # Forward the non-CUDA device extensions to external CMake scripts.
 #
@@ -171,7 +158,7 @@ set(VLLM_EXT_SRC
   "csrc/quantization/fp8/common.cu"
   "csrc/cuda_utils_kernels.cu"
   "csrc/moe_align_block_size_kernels.cu"
-  "csrc/pybind.cpp")
+  "csrc/torch_bindings.cpp")
 
 if(VLLM_GPU_LANG STREQUAL "CUDA")
   include(FetchContent)
@@ -218,6 +205,7 @@ define_gpu_extension_target(
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
   INCLUDE_DIRECTORIES ${CUTLASS_INCLUDE_DIR};${CUTLASS_TOOLS_UTIL_INCLUDE_DIR}
+  USE_SABI 3
   WITH_SOABI)
 
 #
@@ -225,7 +213,7 @@ define_gpu_extension_target(
 #
 
 set(VLLM_MOE_EXT_SRC
-  "csrc/moe/moe_ops.cpp"
+  "csrc/moe/torch_bindings.cpp"
   "csrc/moe/topk_softmax_kernels.cu")
 
 define_gpu_extension_target(
@@ -235,6 +223,7 @@ define_gpu_extension_target(
   SOURCES ${VLLM_MOE_EXT_SRC}
   COMPILE_FLAGS ${VLLM_GPU_FLAGS}
   ARCHITECTURES ${VLLM_GPU_ARCHES}
+  USE_SABI 3
   WITH_SOABI)
 
 #
@@ -249,7 +238,7 @@ set(VLLM_PUNICA_EXT_SRC
   "csrc/punica/bgmv/bgmv_fp32_bf16_bf16.cu"
   "csrc/punica/bgmv/bgmv_fp32_fp16_fp16.cu"
   "csrc/punica/punica_ops.cu"
-  "csrc/punica/punica_pybind.cpp")
+  "csrc/punica/torch_bindings.cpp")
 
 #
 # Copy GPU compilation flags+update for punica
@@ -286,6 +275,7 @@ if (VLLM_PUNICA_GPU_ARCHES)
     SOURCES ${VLLM_PUNICA_EXT_SRC}
     COMPILE_FLAGS ${VLLM_PUNICA_GPU_FLAGS}
     ARCHITECTURES ${VLLM_PUNICA_GPU_ARCHES}
+    USE_SABI 3
     WITH_SOABI)
 else()
   message(WARNING "Unable to create _punica_C target because none of the "
diff --git a/Dockerfile.rocm b/Dockerfile.rocm
index e30a2aaf3020..954958df88fc 100644
--- a/Dockerfile.rocm
+++ b/Dockerfile.rocm
@@ -106,9 +106,9 @@ RUN --mount=type=cache,target=/root/.cache/pip \
     pip install -U -r requirements-rocm.txt \
     && patch /opt/rocm/include/hip/amd_detail/amd_hip_bf16.h ./rocm_patch/rocm_bf16.patch \
     && python3 setup.py install \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.cpython-39-x86_64-linux-gnu.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.cpython-39-x86_64-linux-gnu.so vllm/ \
-    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.cpython-39-x86_64-linux-gnu.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_C.abi3.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_punica_C.abi3.so vllm/ \
+    && cp build/lib.linux-x86_64-cpython-39/vllm/_moe_C.abi3.so vllm/ \
     && cd ..
 
 
diff --git a/cmake/cpu_extension.cmake b/cmake/cpu_extension.cmake
index 0cf37769a696..61d4843838ba 100644
--- a/cmake/cpu_extension.cmake
+++ b/cmake/cpu_extension.cmake
@@ -12,7 +12,7 @@ include_directories("${CMAKE_SOURCE_DIR}/csrc")
 #
 # Check the compile flags
 #
-list(APPEND CXX_COMPILE_FLAGS 
+list(APPEND CXX_COMPILE_FLAGS
     "-fopenmp"
     "-DVLLM_CPU_EXTENSION")
 
@@ -44,8 +44,8 @@ if (AVX512_FOUND)
 
     find_isa(${CPUINFO} "avx512_bf16" AVX512BF16_FOUND)
     if (AVX512BF16_FOUND OR ENABLE_AVX512BF16)
-        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND 
-            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3) 
+        if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU" AND
+            CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.3)
             list(APPEND CXX_COMPILE_FLAGS "-mavx512bf16")
         else()
             message(WARNING "Disable AVX512-BF16 ISA support, requires gcc/g++ >= 12.3")
@@ -73,7 +73,7 @@ set(VLLM_EXT_SRC
     "csrc/cpu/cache.cpp"
     "csrc/cpu/layernorm.cpp"
     "csrc/cpu/pos_encoding.cpp"
-    "csrc/cpu/pybind.cpp")
+    "csrc/cpu/torch_bindings.cpp")
 
 define_gpu_extension_target(
     _C
@@ -81,10 +81,10 @@ define_gpu_extension_target(
     LANGUAGE CXX
     SOURCES ${VLLM_EXT_SRC}
     COMPILE_FLAGS ${CXX_COMPILE_FLAGS}
-    WITH_SOABI 
+    USE_SABI 3
+    WITH_SOABI
 )
 
 add_custom_target(default)
 message(STATUS "Enabling C extension.")
 add_dependencies(default _C)
-
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 00c81e4d00ad..f3c1286dd849 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -5,7 +5,7 @@
 macro (find_python_from_executable EXECUTABLE SUPPORTED_VERSIONS)
   file(REAL_PATH ${EXECUTABLE} EXECUTABLE)
   set(Python_EXECUTABLE ${EXECUTABLE})
-  find_package(Python COMPONENTS Interpreter Development.Module)
+  find_package(Python COMPONENTS Interpreter Development.Module Development.SABIModule)
   if (NOT Python_FOUND)
     message(FATAL_ERROR "Unable to find python matching: ${EXECUTABLE}.")
   endif()
@@ -294,6 +294,7 @@ endmacro()
 # INCLUDE_DIRECTORIES <dirs> - Extra include directories.
 # LIBRARIES <libraries>      - Extra link libraries.
 # WITH_SOABI                 - Generate library with python SOABI suffix name.
+# USE_SABI <version>         - Use python stable ABI <version>.
 #
 # Note: optimization level/debug info is set via cmake build type.
 #
@@ -301,7 +302,7 @@ function (define_gpu_extension_target GPU_MOD_NAME)
   cmake_parse_arguments(PARSE_ARGV 1
     GPU
     "WITH_SOABI"
-    "DESTINATION;LANGUAGE"
+    "DESTINATION;LANGUAGE;USE_SABI"
     "SOURCES;ARCHITECTURES;COMPILE_FLAGS;INCLUDE_DIRECTORIES;LIBRARIES")
 
   # Add hipify preprocessing step when building with HIP/ROCm.
@@ -315,7 +316,11 @@ function (define_gpu_extension_target GPU_MOD_NAME)
     set(GPU_WITH_SOABI)
   endif()
 
-  Python_add_library(${GPU_MOD_NAME} MODULE "${GPU_SOURCES}" ${GPU_WITH_SOABI})
+  if (GPU_USE_SABI)
+    Python_add_library(${GPU_MOD_NAME} MODULE USE_SABI ${GPU_USE_SABI} ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  else()
+    Python_add_library(${GPU_MOD_NAME} MODULE ${GPU_WITH_SOABI} "${GPU_SOURCES}")
+  endif()
 
   if (GPU_LANGUAGE STREQUAL "HIP")
     # Make this target dependent on the hipify preprocessor step.
diff --git a/csrc/activation_kernels.cu b/csrc/activation_kernels.cu
index 867f63f12de4..86ac2e75e78e 100644
--- a/csrc/activation_kernels.cu
+++ b/csrc/activation_kernels.cu
@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 
 #include <cmath>
diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 8f89f89786c3..91083481705c 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -17,7 +17,7 @@
  * limitations under the License.
  */
 
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <algorithm>
@@ -808,16 +808,17 @@ void paged_attention_v1(
     torch::Tensor&
         key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
     torch::Tensor&
-        value_cache,   // [num_blocks, num_heads, head_size, block_size]
-    int num_kv_heads,  // [num_heads]
-    float scale,
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& seq_lens,      // [num_seqs]
-    int block_size, int max_seq_len,
+    int64_t block_size, int64_t max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
 
   DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
@@ -972,16 +973,17 @@ void paged_attention_v2(
     torch::Tensor&
         key_cache,  // [num_blocks, num_heads, head_size/x, block_size, x]
     torch::Tensor&
-        value_cache,   // [num_blocks, num_heads, head_size, block_size]
-    int num_kv_heads,  // [num_heads]
-    float scale,
+        value_cache,       // [num_blocks, num_heads, head_size, block_size]
+    int64_t num_kv_heads,  // [num_heads]
+    double scale,
     torch::Tensor& block_tables,  // [num_seqs, max_num_blocks_per_seq]
     torch::Tensor& seq_lens,      // [num_seqs]
-    int block_size, int max_seq_len,
+    int64_t block_size, int64_t max_seq_len,
     const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
   const bool is_block_sparse = (blocksparse_vert_stride > 1);
   DISPATCH_BY_KV_CACHE_DTYPE(query.dtype(), kv_cache_dtype,
                              CALL_V2_LAUNCHER_BLOCK_SIZE)
@@ -990,4 +992,4 @@ void paged_attention_v2(
 #undef WARP_SIZE
 #undef MAX
 #undef MIN
-#undef DIVIDE_ROUND_UP
\ No newline at end of file
+#undef DIVIDE_ROUND_UP
diff --git a/csrc/cache.h b/csrc/cache.h
index 435ae3e57f55..86caa9345361 100644
--- a/csrc/cache.h
+++ b/csrc/cache.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <map>
 #include <vector>
@@ -8,14 +8,18 @@
 void swap_blocks(torch::Tensor& src, torch::Tensor& dst,
                  const torch::Tensor& block_mapping);
 
-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping);
 
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, const float kv_scale);
+                       const std::string& kv_cache_dtype,
+                       const double kv_scale);
 
 void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
                              torch::Tensor& key_cache,
@@ -25,4 +29,4 @@ void reshape_and_cache_flash(torch::Tensor& key, torch::Tensor& value,
 
 // Just for unittest
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const float scale, const std::string& kv_cache_dtype);
+                 const double scale, const std::string& kv_cache_dtype);
diff --git a/csrc/cache_kernels.cu b/csrc/cache_kernels.cu
index d924ac39b89c..72041076ae00 100644
--- a/csrc/cache_kernels.cu
+++ b/csrc/cache_kernels.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 
@@ -95,8 +95,11 @@ __global__ void copy_blocks_kernel(int64_t* key_cache_ptrs,
 
 }  // namespace vllm
 
-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping) {
   int num_layers = key_caches.size();
   TORCH_CHECK(num_layers == value_caches.size());
@@ -255,7 +258,7 @@ void reshape_and_cache(
     torch::Tensor&
         value_cache,  // [num_blocks, num_heads, head_size, block_size]
     torch::Tensor& slot_mapping,  // [num_tokens]
-    const std::string& kv_cache_dtype, const float kv_scale) {
+    const std::string& kv_cache_dtype, const double kv_scale) {
   int num_tokens = key.size(0);
   int num_heads = key.size(1);
   int head_size = key.size(2);
@@ -334,7 +337,7 @@ __global__ void convert_fp8_kernel(const Tin* __restrict__ src_cache,
 
 // Only for testing.
 void convert_fp8(torch::Tensor& dst_cache, torch::Tensor& src_cache,
-                 const float kv_scale, const std::string& kv_cache_dtype) {
+                 const double kv_scale, const std::string& kv_cache_dtype) {
   torch::Device src_device = src_cache.device();
   torch::Device dst_device = dst_cache.device();
   TORCH_CHECK(src_device.is_cuda(), "src must be on a GPU")
diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp
index ed8cfbd421f0..836709332531 100644
--- a/csrc/cpu/attention.cpp
+++ b/csrc/cpu/attention.cpp
@@ -420,12 +420,13 @@ void paged_attention_v1_impl_launcher(
 
 void paged_attention_v1(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
   TORCH_CHECK(kv_scale == 1.0f);
   TORCH_CHECK(blocksparse_vert_stride <= 1,
               "CPU backend does not support blocksparse attention yet.");
@@ -738,12 +739,13 @@ void paged_attention_v2_impl_launcher(
 void paged_attention_v2(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step) {
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step) {
   TORCH_CHECK(kv_scale == 1.0f);
   TORCH_CHECK(blocksparse_vert_stride <= 1,
               "CPU backend does not support blocksparse attention yet.");
diff --git a/csrc/cpu/cache.cpp b/csrc/cpu/cache.cpp
index 2890ba6e2bb3..2b5c3bd6ee70 100644
--- a/csrc/cpu/cache.cpp
+++ b/csrc/cpu/cache.cpp
@@ -5,8 +5,8 @@
 
 namespace {
 template <typename scalar_t>
-void copy_blocks_cpu_impl(std::vector<torch::Tensor>& key_caches,
-                          std::vector<torch::Tensor>& value_caches,
+void copy_blocks_cpu_impl(std::vector<torch::Tensor> const& key_caches,
+                          std::vector<torch::Tensor> const& value_caches,
                           const torch::Tensor& mapping_pairs,
                           const int element_num_per_block,
                           const int layer_num) {
@@ -82,8 +82,11 @@ void reshape_and_cache_cpu_impl(
 }
 };  // namespace
 
-void copy_blocks(std::vector<torch::Tensor>& key_caches,
-                 std::vector<torch::Tensor>& value_caches,
+// Note: the key_caches and value_caches vectors are constant but
+// not the Tensors they contain. The vectors need to be const refs
+// in order to satisfy pytorch's C++ operator registration code.
+void copy_blocks(std::vector<torch::Tensor> const& key_caches,
+                 std::vector<torch::Tensor> const& value_caches,
                  const torch::Tensor& block_mapping) {
   unsigned num_layers = key_caches.size();
   TORCH_CHECK(num_layers == value_caches.size());
@@ -104,7 +107,7 @@ void copy_blocks(std::vector<torch::Tensor>& key_caches,
 void reshape_and_cache(torch::Tensor& key, torch::Tensor& value,
                        torch::Tensor& key_cache, torch::Tensor& value_cache,
                        torch::Tensor& slot_mapping,
-                       const std::string& kv_cache_dtype, float kv_scale) {
+                       const std::string& kv_cache_dtype, double kv_scale) {
   TORCH_CHECK(kv_scale == 1.0f);
 
   int num_tokens = key.size(0);
diff --git a/csrc/cpu/cpu_types.hpp b/csrc/cpu/cpu_types.hpp
index c1d3ec058b99..034c406a532d 100644
--- a/csrc/cpu/cpu_types.hpp
+++ b/csrc/cpu/cpu_types.hpp
@@ -3,7 +3,7 @@
 #define CPU_TYPES_HPP
 
 #include <immintrin.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 
 namespace vec_op {
 
diff --git a/csrc/cpu/layernorm.cpp b/csrc/cpu/layernorm.cpp
index 65d3ddcec570..a76ad08928a2 100644
--- a/csrc/cpu/layernorm.cpp
+++ b/csrc/cpu/layernorm.cpp
@@ -88,7 +88,7 @@ void fused_add_rms_norm_impl(scalar_t* __restrict__ input,
 }  // namespace
 
 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              float epsilon) {
+              double epsilon) {
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
 
@@ -102,7 +102,7 @@ void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
 }
 
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, float epsilon) {
+                        torch::Tensor& weight, double epsilon) {
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
 
diff --git a/csrc/cpu/pos_encoding.cpp b/csrc/cpu/pos_encoding.cpp
index e8aead17ae5a..96bce7dda013 100644
--- a/csrc/cpu/pos_encoding.cpp
+++ b/csrc/cpu/pos_encoding.cpp
@@ -168,7 +168,7 @@ void rotary_embedding_gptj_impl(
 };  // namespace
 
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int head_size,
+                      torch::Tensor& key, int64_t head_size,
                       torch::Tensor& cos_sin_cache, bool is_neox) {
   int num_tokens = query.numel() / query.size(-1);
   int rot_dim = cos_sin_cache.size(1);
diff --git a/csrc/cpu/pybind.cpp b/csrc/cpu/pybind.cpp
deleted file mode 100644
index e5b2ce4f3011..000000000000
--- a/csrc/cpu/pybind.cpp
+++ /dev/null
@@ -1,43 +0,0 @@
-#include "cache.h"
-#include "ops.h"
-#include <torch/extension.h>
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  // vLLM custom ops
-  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
-
-  // Attention ops
-  ops.def("paged_attention_v1", &paged_attention_v1,
-          "Compute the attention between an input query and the cached "
-          "keys/values using PagedAttention.");
-  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
-
-  // Activation ops
-  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
-  ops.def("gelu_and_mul", &gelu_and_mul,
-          "Activation function used in GeGLU with `none` approximation.");
-  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
-          "Activation function used in GeGLU with `tanh` approximation.");
-  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
-  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
-
-  // Layernorm
-  ops.def("rms_norm", &rms_norm,
-          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-
-  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
-          "In-place fused Add and RMS Normalization");
-
-  // Rotary embedding
-  ops.def("rotary_embedding", &rotary_embedding,
-          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-
-  // Cache ops
-  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
-  cache_ops.def("swap_blocks", &swap_blocks,
-                "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def("copy_blocks", &copy_blocks,
-                "Copy the cache blocks from src to dst");
-  cache_ops.def("reshape_and_cache", &reshape_and_cache,
-                "Reshape the key and value tensors and cache them");
-}
diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp
new file mode 100644
index 000000000000..a2bf0d49adba
--- /dev/null
+++ b/csrc/cpu/torch_bindings.cpp
@@ -0,0 +1,106 @@
+#include "cache.h"
+#include "ops.h"
+#include "registration.h"
+
+#include <torch/library.h>
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  // vLLM custom ops
+
+  // Attention ops
+  // Compute the attention between an input query and the cached keys/values
+  // using PagedAttention.
+  ops.def(
+      "paged_attention_v1("
+      "    Tensor! out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v1", torch::kCPU, &paged_attention_v1);
+
+  // PagedAttention V2.
+  ops.def(
+      "paged_attention_v2("
+      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v2", torch::kCPU, &paged_attention_v2);
+
+  // Activation ops
+
+  // Activation function used in SwiGLU.
+  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("silu_and_mul", torch::kCPU, &silu_and_mul);
+
+  // Activation function used in GeGLU with `none` approximation.
+  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_and_mul", torch::kCPU, &gelu_and_mul);
+
+  // Activation function used in GeGLU with `tanh` approximation.
+  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_tanh_and_mul", torch::kCPU, &gelu_tanh_and_mul);
+
+  // GELU implementation used in GPT-2.
+  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_new", torch::kCPU, &gelu_new);
+
+  // Approximate GELU implementation.
+  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_fast", torch::kCPU, &gelu_fast);
+
+  // Layernorm
+  // Apply Root Mean Square (RMS) Normalization to the input tensor.
+  ops.def(
+      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
+      "()");
+  ops.impl("rms_norm", torch::kCPU, &rms_norm);
+
+  // In-place fused Add and RMS Normalization.
+  ops.def(
+      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
+      "float epsilon) -> ()");
+  ops.impl("fused_add_rms_norm", torch::kCPU, &fused_add_rms_norm);
+
+  // Rotary embedding
+  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
+  ops.def(
+      "rotary_embedding(Tensor positions, Tensor! query,"
+      "                 Tensor! key, int head_size,"
+      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
+  ops.impl("rotary_embedding", torch::kCPU, &rotary_embedding);
+}
+
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
+  // Cache ops
+  // Swap in (out) the cache blocks from src to dst.
+  cache_ops.def(
+      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
+  cache_ops.impl("swap_blocks", torch::kCPU, &swap_blocks);
+
+  // Copy the cache blocks from src to dst.
+  cache_ops.def(
+      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "block_mapping) -> ()");
+  cache_ops.impl("copy_blocks", torch::kCPU, &copy_blocks);
+
+  // Reshape the key and value tensors and cache them.
+  cache_ops.def(
+      "reshape_and_cache(Tensor key, Tensor value,"
+      "                  Tensor! key_cache, Tensor! value_cache,"
+      "                  Tensor slot_mapping,"
+      "                  str kv_cache_dtype,"
+      "                  float kv_scale) -> ()");
+  cache_ops.impl("reshape_and_cache", torch::kCPU, &reshape_and_cache);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/cuda_utils.h b/csrc/cuda_utils.h
index 2ba49b339e14..73944f4c1489 100644
--- a/csrc/cuda_utils.h
+++ b/csrc/cuda_utils.h
@@ -1,7 +1,5 @@
 #pragma once
 
-#include <torch/extension.h>
+int64_t get_device_attribute(int64_t attribute, int64_t device_id);
 
-int get_device_attribute(int attribute, int device_id);
-
-int get_max_shared_memory_per_block_device_attribute(int device_id);
+int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id);
diff --git a/csrc/cuda_utils_kernels.cu b/csrc/cuda_utils_kernels.cu
index 7d8e2e19720f..d6f9eb646fad 100644
--- a/csrc/cuda_utils_kernels.cu
+++ b/csrc/cuda_utils_kernels.cu
@@ -2,7 +2,7 @@
   #include <hip/hip_runtime.h>
   #include <hip/hip_runtime_api.h>
 #endif
-int get_device_attribute(int attribute, int device_id) {
+int64_t get_device_attribute(int64_t attribute, int64_t device_id) {
   int device, value;
   if (device_id < 0) {
     cudaGetDevice(&device);
@@ -14,8 +14,8 @@ int get_device_attribute(int attribute, int device_id) {
   return value;
 }
 
-int get_max_shared_memory_per_block_device_attribute(int device_id) {
-  int attribute;
+int64_t get_max_shared_memory_per_block_device_attribute(int64_t device_id) {
+  int64_t attribute;
   // https://docs.nvidia.com/cuda/cuda-runtime-api/group__CUDART__TYPES.html
   // cudaDevAttrMaxSharedMemoryPerBlockOptin = 97 if not is_hip() else 74
 
diff --git a/csrc/custom_all_reduce.cu b/csrc/custom_all_reduce.cu
index 0b1d95848525..82a3563979f1 100644
--- a/csrc/custom_all_reduce.cu
+++ b/csrc/custom_all_reduce.cu
@@ -1,17 +1,17 @@
 #include <ATen/cuda/Exceptions.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <c10/cuda/CUDAStream.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include "custom_all_reduce.cuh"
 
-// fake pointer type
-using fptr_t = uint64_t;
+// fake pointer type, must match fptr_t type in ops.h
+using fptr_t = int64_t;
 static_assert(sizeof(void*) == sizeof(fptr_t));
 
 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                       const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int rank,
+                      const std::vector<int64_t>& offsets, int64_t rank,
                       bool full_nvlink) {
   int world_size = offsets.size();
   if (world_size > 8)
@@ -55,7 +55,7 @@ bool _is_weak_contiguous(torch::Tensor& t) {
           t.numel() * t.element_size());
 }
 
-bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
+bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
                       bool full_nvlink) {
   auto inp_size = inp.numel() * inp.element_size();
   // custom allreduce requires input byte size to be multiples of 16
@@ -125,7 +125,7 @@ void dispose(fptr_t _fa) {
   delete fa;
 }
 
-int meta_size() { return sizeof(vllm::Signal); }
+int64_t meta_size() { return sizeof(vllm::Signal); }
 
 void register_buffer(fptr_t _fa, torch::Tensor& t,
                      const std::vector<std::string>& handles,
@@ -134,10 +134,16 @@ void register_buffer(fptr_t _fa, torch::Tensor& t,
   fa->register_buffer(handles, offsets, t.data_ptr());
 }
 
-std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
     fptr_t _fa) {
   auto fa = reinterpret_cast<vllm::CustomAllreduce*>(_fa);
-  return fa->get_graph_buffer_ipc_meta();
+  auto [handle_bytes, offsets] = fa->get_graph_buffer_ipc_meta();
+  auto options =
+      torch::TensorOptions().dtype(torch::kUInt8).device(torch::kCPU);
+  auto handles =
+      torch::empty({static_cast<int64_t>(handle_bytes.size())}, options);
+  std::memcpy(handles.data_ptr(), handle_bytes.data(), handle_bytes.size());
+  return {handles, std::move(offsets)};
 }
 
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
diff --git a/csrc/dispatch_utils.h b/csrc/dispatch_utils.h
index 3ecea03242f0..a634e1c3d488 100644
--- a/csrc/dispatch_utils.h
+++ b/csrc/dispatch_utils.h
@@ -4,7 +4,7 @@
  */
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #define VLLM_DISPATCH_CASE_FLOATING_TYPES(...)         \
   AT_DISPATCH_CASE(at::ScalarType::Float, __VA_ARGS__) \
diff --git a/csrc/layernorm_kernels.cu b/csrc/layernorm_kernels.cu
index 70a2b3b0a07b..ca1c04bd880d 100644
--- a/csrc/layernorm_kernels.cu
+++ b/csrc/layernorm_kernels.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 
@@ -291,7 +291,7 @@ fused_add_rms_norm_kernel(
 void rms_norm(torch::Tensor& out,     // [..., hidden_size]
               torch::Tensor& input,   // [..., hidden_size]
               torch::Tensor& weight,  // [hidden_size]
-              float epsilon) {
+              double epsilon) {
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
 
@@ -319,7 +319,7 @@ void rms_norm(torch::Tensor& out,     // [..., hidden_size]
 void fused_add_rms_norm(torch::Tensor& input,     // [..., hidden_size]
                         torch::Tensor& residual,  // [..., hidden_size]
                         torch::Tensor& weight,    // [hidden_size]
-                        float epsilon) {
+                        double epsilon) {
   int hidden_size = input.size(-1);
   int num_tokens = input.numel() / hidden_size;
 
diff --git a/csrc/moe/moe_ops.cpp b/csrc/moe/moe_ops.cpp
deleted file mode 100644
index 4122f7630d7c..000000000000
--- a/csrc/moe/moe_ops.cpp
+++ /dev/null
@@ -1,8 +0,0 @@
-#include "moe_ops.h"
-
-#include <torch/extension.h>
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("topk_softmax", &topk_softmax,
-        "Apply topk softmax to the gating outputs.");
-}
diff --git a/csrc/moe/moe_ops.h b/csrc/moe/moe_ops.h
index 93e7844ac199..a251730aa765 100644
--- a/csrc/moe/moe_ops.h
+++ b/csrc/moe/moe_ops.h
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 void topk_softmax(torch::Tensor& topk_weights, torch::Tensor& topk_indices,
                   torch::Tensor& token_expert_indices,
diff --git a/csrc/moe/topk_softmax_kernels.cu b/csrc/moe/topk_softmax_kernels.cu
index 6ba4fcdb3a3f..de9747b60252 100644
--- a/csrc/moe/topk_softmax_kernels.cu
+++ b/csrc/moe/topk_softmax_kernels.cu
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 #include "../cuda_compat.h"
diff --git a/csrc/moe/torch_bindings.cpp b/csrc/moe/torch_bindings.cpp
new file mode 100644
index 000000000000..243752b9a9e8
--- /dev/null
+++ b/csrc/moe/torch_bindings.cpp
@@ -0,0 +1,12 @@
+#include "registration.h"
+#include "moe_ops.h"
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
+  // Apply topk softmax to the gating outputs.
+  m.def(
+      "topk_softmax(Tensor! topk_weights, Tensor! topk_indices, Tensor! "
+      "token_expert_indices, Tensor gating_output) -> ()");
+  m.impl("topk_softmax", torch::kCUDA, &topk_softmax);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/moe_align_block_size_kernels.cu b/csrc/moe_align_block_size_kernels.cu
index edc441d12102..1f8d75da83bb 100644
--- a/csrc/moe_align_block_size_kernels.cu
+++ b/csrc/moe_align_block_size_kernels.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 
 #include <ATen/ATen.h>
@@ -108,8 +108,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
 }
 }  // namespace vllm
 
-void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
-                          int block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
+                          int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
diff --git a/csrc/ops.h b/csrc/ops.h
index 06b60e748886..0c270a78c331 100644
--- a/csrc/ops.h
+++ b/csrc/ops.h
@@ -1,40 +1,42 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/library.h>
 
 void paged_attention_v1(
     torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
 
 void paged_attention_v2(
     torch::Tensor& out, torch::Tensor& exp_sums, torch::Tensor& max_logits,
     torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache,
-    torch::Tensor& value_cache, int num_kv_heads, float scale,
-    torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size,
-    int max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
-    const std::string& kv_cache_dtype, float kv_scale, const int tp_rank,
-    const int blocksparse_local_blocks, const int blocksparse_vert_stride,
-    const int blocksparse_block_size, const int blocksparse_head_sliding_step);
+    torch::Tensor& value_cache, int64_t num_kv_heads, double scale,
+    torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size,
+    int64_t max_seq_len, const c10::optional<torch::Tensor>& alibi_slopes,
+    const std::string& kv_cache_dtype, double kv_scale, const int64_t tp_rank,
+    const int64_t blocksparse_local_blocks,
+    const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size,
+    const int64_t blocksparse_head_sliding_step);
 
 void rms_norm(torch::Tensor& out, torch::Tensor& input, torch::Tensor& weight,
-              float epsilon);
+              double epsilon);
 
 void fused_add_rms_norm(torch::Tensor& input, torch::Tensor& residual,
-                        torch::Tensor& weight, float epsilon);
+                        torch::Tensor& weight, double epsilon);
 
 void rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                      torch::Tensor& key, int head_size,
+                      torch::Tensor& key, int64_t head_size,
                       torch::Tensor& cos_sin_cache, bool is_neox);
 
 void batched_rotary_embedding(torch::Tensor& positions, torch::Tensor& query,
-                              torch::Tensor& key, int head_size,
+                              torch::Tensor& key, int64_t head_size,
                               torch::Tensor& cos_sin_cache, bool is_neox,
-                              int rot_dim,
+                              int64_t rot_dim,
                               torch::Tensor& cos_sin_cache_offsets);
 
 void silu_and_mul(torch::Tensor& out, torch::Tensor& input);
@@ -60,12 +62,12 @@ torch::Tensor aqlm_dequant(const torch::Tensor& codes,
 
 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                        torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int split_k_iters);
+                       int64_t split_k_iters);
 
 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                              torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int split_k_iters, int thx,
-                             int thy);
+                             torch::Tensor _zeros, int64_t split_k_iters,
+                             int64_t thx, int64_t thy);
 
 torch::Tensor marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
                           torch::Tensor& b_scales, torch::Tensor& workspace,
@@ -88,9 +90,9 @@ torch::Tensor gptq_marlin_repack(torch::Tensor& b_q_weight, torch::Tensor& perm,
                                  int64_t size_k, int64_t size_n,
                                  int64_t num_bits);
 
-int cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
-                         torch::Tensor const& b, torch::Tensor const& a_scales,
-                         torch::Tensor const& b_scales);
+void cutlass_scaled_mm_dq(torch::Tensor& out, torch::Tensor const& a,
+                          torch::Tensor const& b, torch::Tensor const& a_scales,
+                          torch::Tensor const& b_scales);
 
 #endif
 
@@ -106,9 +108,9 @@ void squeezellm_gemm(torch::Tensor vec, torch::Tensor mat, torch::Tensor mul,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int bit);
+                        bool use_exllama, int64_t bit);
 
-void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit);
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit);
 
 void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                              torch::Tensor& scale);
@@ -116,28 +118,28 @@ void static_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
 void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor& input,
                               torch::Tensor& scale);
 
-void moe_align_block_size(torch::Tensor topk_ids, int num_experts,
-                          int block_size, torch::Tensor sorted_token_ids,
+void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
+                          int64_t block_size, torch::Tensor sorted_token_ids,
                           torch::Tensor experts_ids,
                           torch::Tensor num_tokens_post_pad);
 
 #ifndef USE_ROCM
-using fptr_t = uint64_t;
+using fptr_t = int64_t;
 fptr_t init_custom_ar(torch::Tensor& meta, torch::Tensor& rank_data,
                       const std::vector<std::string>& handles,
-                      const std::vector<int64_t>& offsets, int rank,
+                      const std::vector<int64_t>& offsets, int64_t rank,
                       bool full_nvlink);
-bool should_custom_ar(torch::Tensor& inp, int max_size, int world_size,
+bool should_custom_ar(torch::Tensor& inp, int64_t max_size, int64_t world_size,
                       bool full_nvlink);
 void all_reduce_reg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& out);
 void all_reduce_unreg(fptr_t _fa, torch::Tensor& inp, torch::Tensor& reg_buffer,
                       torch::Tensor& out);
 void dispose(fptr_t _fa);
-int meta_size();
+int64_t meta_size();
 void register_buffer(fptr_t _fa, torch::Tensor& t,
                      const std::vector<std::string>& handles,
                      const std::vector<int64_t>& offsets);
-std::pair<std::vector<uint8_t>, std::vector<int64_t>> get_graph_buffer_ipc_meta(
+std::tuple<torch::Tensor, std::vector<int64_t>> get_graph_buffer_ipc_meta(
     fptr_t _fa);
 void register_graph_buffers(fptr_t _fa, const std::vector<std::string>& handles,
                             const std::vector<std::vector<int64_t>>& offsets);
diff --git a/csrc/pos_encoding_kernels.cu b/csrc/pos_encoding_kernels.cu
index 69d6dae1c26b..97184a873559 100644
--- a/csrc/pos_encoding_kernels.cu
+++ b/csrc/pos_encoding_kernels.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
 
@@ -127,7 +127,7 @@ void rotary_embedding(
                            // [num_tokens, num_heads * head_size]
     torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                            // [num_tokens, num_kv_heads * head_size]
-    int head_size,
+    int64_t head_size,
     torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
     bool is_neox) {
   int64_t num_tokens = query.numel() / query.size(-1);
@@ -138,7 +138,7 @@ void rotary_embedding(
   int64_t key_stride = key.stride(-2);
 
   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
@@ -168,9 +168,9 @@ void batched_rotary_embedding(
                            // [num_tokens, num_heads * head_size]
     torch::Tensor& key,    // [batch_size, seq_len, num_kv_heads * head_size] or
                            // [num_tokens, num_kv_heads * head_size]
-    int head_size,
+    int64_t head_size,
     torch::Tensor& cos_sin_cache,  // [max_position, rot_dim]
-    bool is_neox, int rot_dim,
+    bool is_neox, int64_t rot_dim,
     torch::Tensor& cos_sin_cache_offsets  // [num_tokens]
 ) {
   int64_t num_tokens = cos_sin_cache_offsets.size(0);
@@ -180,7 +180,7 @@ void batched_rotary_embedding(
   int64_t key_stride = key.stride(-2);
 
   dim3 grid(num_tokens);
-  dim3 block(std::min(num_heads * rot_dim / 2, 512));
+  dim3 block(std::min<int64_t>(num_heads * rot_dim / 2, 512));
   const at::cuda::OptionalCUDAGuard device_guard(device_of(query));
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();
   VLLM_DISPATCH_FLOATING_TYPES(query.scalar_type(), "rotary_embedding", [&] {
diff --git a/csrc/punica/punica_ops.cu b/csrc/punica/punica_ops.cu
index 61de3b37937c..dd29820144b3 100644
--- a/csrc/punica/punica_ops.cu
+++ b/csrc/punica/punica_ops.cu
@@ -1,4 +1,4 @@
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <cstdint>
 
@@ -88,7 +88,7 @@ inline bool launch_bgmv_kernel(out_T *Y, const in_T *X, const W_T *W,
 }
 
 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, float scale) {
+                   torch::Tensor indicies, int64_t layer_idx, double scale) {
   CHECK_INPUT(y);
   CHECK_INPUT(x);
   CHECK_INPUT(w);
@@ -320,7 +320,7 @@ void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
 
 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                              torch::Tensor indicies, int64_t layer_idx,
-                             float scale, int64_t h_in, int64_t h_out,
+                             double scale, int64_t h_in, int64_t h_out,
                              int64_t y_offset) {
   CHECK_INPUT(y);
   CHECK_INPUT(x);
diff --git a/csrc/punica/punica_ops.h b/csrc/punica/punica_ops.h
index 937e2d1d25d4..5d625d0564f7 100644
--- a/csrc/punica/punica_ops.h
+++ b/csrc/punica/punica_ops.h
@@ -1,11 +1,11 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 void dispatch_bgmv(torch::Tensor y, torch::Tensor x, torch::Tensor w,
-                   torch::Tensor indicies, int64_t layer_idx, float scale);
+                   torch::Tensor indicies, int64_t layer_idx, double scale);
 
 void dispatch_bgmv_low_level(torch::Tensor y, torch::Tensor x, torch::Tensor w,
                              torch::Tensor indicies, int64_t layer_idx,
-                             float scale, int64_t h_in, int64_t h_out,
+                             double scale, int64_t h_in, int64_t h_out,
                              int64_t y_offset);
diff --git a/csrc/punica/punica_pybind.cpp b/csrc/punica/punica_pybind.cpp
deleted file mode 100644
index 9490ad59cdd5..000000000000
--- a/csrc/punica/punica_pybind.cpp
+++ /dev/null
@@ -1,13 +0,0 @@
-#include <torch/extension.h>
-
-#include "punica_ops.h"
-
-//====== pybind ======
-
-#define DEFINE_pybind(name) m.def(#name, &name, #name);
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  m.def("dispatch_bgmv", &dispatch_bgmv, "dispatch_bgmv");
-  m.def("dispatch_bgmv_low_level", &dispatch_bgmv_low_level,
-        "dispatch_bgmv_low_level");
-}
diff --git a/csrc/punica/torch_bindings.cpp b/csrc/punica/torch_bindings.cpp
new file mode 100644
index 000000000000..894e229b6d9d
--- /dev/null
+++ b/csrc/punica/torch_bindings.cpp
@@ -0,0 +1,18 @@
+#include "registration.h"
+#include "punica_ops.h"
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, m) {
+  m.def(
+      "dispatch_bgmv(Tensor! y, Tensor x, Tensor w, Tensor indicies, int "
+      "layer_idx, float scale) -> ()");
+  m.impl("dispatch_bgmv", torch::kCUDA, &dispatch_bgmv);
+
+  m.def(
+      "dispatch_bgmv_low_level(Tensor! y, Tensor x, Tensor w,"
+      "Tensor indicies, int layer_idx,"
+      "float scale, int h_in, int h_out,"
+      "int y_offset) -> ()");
+  m.impl("dispatch_bgmv_low_level", torch::kCUDA, &dispatch_bgmv_low_level);
+}
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/csrc/pybind.cpp b/csrc/pybind.cpp
deleted file mode 100644
index 547823aa1b04..000000000000
--- a/csrc/pybind.cpp
+++ /dev/null
@@ -1,114 +0,0 @@
-#include "cache.h"
-#include "cuda_utils.h"
-#include "ops.h"
-#include <torch/extension.h>
-
-PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
-  // vLLM custom ops
-  pybind11::module ops = m.def_submodule("ops", "vLLM custom operators");
-
-  // Attention ops
-  ops.def("paged_attention_v1", &paged_attention_v1,
-          "Compute the attention between an input query and the cached "
-          "keys/values using PagedAttention.");
-  ops.def("paged_attention_v2", &paged_attention_v2, "PagedAttention V2.");
-
-  // Activation ops
-  ops.def("silu_and_mul", &silu_and_mul, "Activation function used in SwiGLU.");
-  ops.def("gelu_and_mul", &gelu_and_mul,
-          "Activation function used in GeGLU with `none` approximation.");
-  ops.def("gelu_tanh_and_mul", &gelu_tanh_and_mul,
-          "Activation function used in GeGLU with `tanh` approximation.");
-  ops.def("gelu_new", &gelu_new, "GELU implementation used in GPT-2.");
-  ops.def("gelu_fast", &gelu_fast, "Approximate GELU implementation.");
-
-  // Layernorm
-  ops.def("rms_norm", &rms_norm,
-          "Apply Root Mean Square (RMS) Normalization to the input tensor.");
-
-  ops.def("fused_add_rms_norm", &fused_add_rms_norm,
-          "In-place fused Add and RMS Normalization");
-
-  // Rotary embedding
-  ops.def("rotary_embedding", &rotary_embedding,
-          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key");
-
-  ops.def("batched_rotary_embedding", &batched_rotary_embedding,
-          "Apply GPT-NeoX or GPT-J style rotary embedding to query and key "
-          "(supports multiple loras)");
-
-// Quantization ops
-#ifndef USE_ROCM
-  ops.def("aqlm_gemm", &aqlm_gemm, "Quantized GEMM for AQLM");
-  ops.def("aqlm_dequant", &aqlm_dequant, "Decompression method for AQLM");
-  ops.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
-  ops.def("marlin_gemm", &marlin_gemm,
-          "Marlin (Dense) Optimized Quantized GEMM for GPTQ");
-  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm,
-          "Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ");
-  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm,
-          "gptq_marlin Optimized Quantized GEMM for GPTQ");
-  ops.def("gptq_marlin_repack", &gptq_marlin_repack,
-          "gptq_marlin repack from GPTQ");
-  ops.def("awq_dequantize", &awq_dequantize, "Dequantization for AWQ");
-  ops.def("cutlass_scaled_mm_dq", &cutlass_scaled_mm_dq,
-          "CUTLASS w8a8 GEMM, supporting symmetric per-tensor or "
-          "per-row/column quantization.");
-#endif
-
-  ops.def("gptq_gemm", &gptq_gemm, "Quantized GEMM for GPTQ");
-  ops.def("gptq_shuffle", &gptq_shuffle, "Post processing for GPTQ");
-  ops.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
-  ops.def("static_scaled_fp8_quant", &static_scaled_fp8_quant,
-          "Compute FP8 quantized tensor for given scaling factor");
-  ops.def("dynamic_scaled_fp8_quant", &dynamic_scaled_fp8_quant,
-          "Compute FP8 quantized tensor and scaling factor");
-  ops.def("moe_align_block_size", &moe_align_block_size,
-          "Aligning the number of tokens to be processed by each expert such "
-          "that it is divisible by the block size.");
-
-  ops.def("static_scaled_int8_quant", &static_scaled_int8_quant,
-          "Compute int8 quantized tensor for given scaling factor");
-
-  ops.def("dynamic_scaled_int8_quant", &dynamic_scaled_int8_quant,
-          "Compute int8 quantized tensor and scaling factor");
-
-  // Cache ops
-  pybind11::module cache_ops = m.def_submodule("cache_ops", "vLLM cache ops");
-  cache_ops.def("swap_blocks", &swap_blocks,
-                "Swap in (out) the cache blocks from src to dst");
-  cache_ops.def("copy_blocks", &copy_blocks,
-                "Copy the cache blocks from src to dst");
-  cache_ops.def("reshape_and_cache", &reshape_and_cache,
-                "Reshape the key and value tensors and cache them");
-  cache_ops.def("reshape_and_cache_flash", &reshape_and_cache_flash,
-                "Reshape the key and value tensors and cache them");
-  cache_ops.def("convert_fp8", &convert_fp8,
-                "Convert the key and value cache to fp8 data type");
-
-  // Cuda utils
-  pybind11::module cuda_utils =
-      m.def_submodule("cuda_utils", "vLLM cuda utils");
-  cuda_utils.def("get_device_attribute", &get_device_attribute,
-                 "Gets the specified device attribute.");
-
-  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
-                 &get_max_shared_memory_per_block_device_attribute,
-                 "Gets the maximum shared memory per block device attribute.");
-
-#ifndef USE_ROCM
-  // Custom all-reduce kernels
-  pybind11::module custom_ar = m.def_submodule("custom_ar", "custom allreduce");
-  custom_ar.def("init_custom_ar", &init_custom_ar, "init_custom_ar");
-  custom_ar.def("should_custom_ar", &should_custom_ar, "should_custom_ar");
-  custom_ar.def("all_reduce_reg", &all_reduce_reg, "all_reduce_reg");
-  custom_ar.def("all_reduce_unreg", &all_reduce_unreg, "all_reduce_unreg");
-  custom_ar.def("dispose", &dispose, "dispose");
-  custom_ar.def("meta_size", &meta_size, "meta_size");
-  custom_ar.def("register_buffer", &register_buffer, "register_buffer");
-  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta,
-                "get_graph_buffer_ipc_meta");
-  custom_ar.def("register_graph_buffers", &register_graph_buffers,
-                "register_graph_buffers");
-#endif
-}
diff --git a/csrc/quantization/aqlm/gemm_kernels.cu b/csrc/quantization/aqlm/gemm_kernels.cu
index 255844eec56d..8fb985680086 100644
--- a/csrc/quantization/aqlm/gemm_kernels.cu
+++ b/csrc/quantization/aqlm/gemm_kernels.cu
@@ -18,7 +18,7 @@
 #include <cuda.h>
 #include <cuda_fp16.h>
 #include <cuda_runtime.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAStream.h>
 #include <c10/cuda/CUDAGuard.h>
 
diff --git a/csrc/quantization/awq/gemm_kernels.cu b/csrc/quantization/awq/gemm_kernels.cu
index bb8e5bbb23d7..6d6da5f3d874 100644
--- a/csrc/quantization/awq/gemm_kernels.cu
+++ b/csrc/quantization/awq/gemm_kernels.cu
@@ -7,7 +7,7 @@ Shang and Dang, Xingyu and Han, Song}, journal={arXiv}, year={2023}
 }
  */
 
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 
 #include "dequantize.cuh"
@@ -435,8 +435,8 @@ __global__ void __launch_bounds__(64)
 
 torch::Tensor awq_dequantize(torch::Tensor _kernel,
                              torch::Tensor _scaling_factors,
-                             torch::Tensor _zeros, int split_k_iters, int thx,
-                             int thy) {
+                             torch::Tensor _zeros, int64_t split_k_iters,
+                             int64_t thx, int64_t thy) {
   int in_c = _kernel.size(0);
   int qout_c = _kernel.size(1);
   int out_c = qout_c * 8;
@@ -491,7 +491,7 @@ torch::Tensor awq_dequantize(torch::Tensor _kernel,
 
 torch::Tensor awq_gemm(torch::Tensor _in_feats, torch::Tensor _kernel,
                        torch::Tensor _scaling_factors, torch::Tensor _zeros,
-                       int split_k_iters) {
+                       int64_t split_k_iters) {
   int num_in_feats = _in_feats.size(0);
   int num_in_channels = _in_feats.size(1);
   const at::cuda::OptionalCUDAGuard device_guard(device_of(_in_feats));
diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
index 280b0327111d..aa9511daa277 100644
--- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
+++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu
@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <cmath>
 
 #include "../../dispatch_utils.h"
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
index 088fee4783fa..23a8b4070b70 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c2x.cu
@@ -1,5 +1,5 @@
 #include <stddef.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
index 8fc4ba662ecd..a99802153643 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_c3x.cu
@@ -4,7 +4,7 @@
 
 #if defined CUDA_VERSION && CUDA_VERSION >= 12000
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <ATen/cuda/CUDAContext.h>
 
diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
index eb532f2ac7a9..423e64a4932e 100644
--- a/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
+++ b/csrc/quantization/cutlass_w8a8/scaled_mm_dq_entry.cu
@@ -1,7 +1,7 @@
 #include <cudaTypedefs.h>
 
 #include <c10/cuda/CUDAGuard.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 
 void cutlass_scaled_mm_dq_sm75(torch::Tensor& c, torch::Tensor const& a,
                                torch::Tensor const& b,
diff --git a/csrc/quantization/fp8/common.cu b/csrc/quantization/fp8/common.cu
index 55be3305a9b8..8c5b693bf6ed 100644
--- a/csrc/quantization/fp8/common.cu
+++ b/csrc/quantization/fp8/common.cu
@@ -1,5 +1,5 @@
 #include <ATen/cuda/CUDAContext.h>
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 
 #include <cmath>
diff --git a/csrc/quantization/gptq/q_gemm.cu b/csrc/quantization/gptq/q_gemm.cu
index 480c4986c382..785f1a09c190 100644
--- a/csrc/quantization/gptq/q_gemm.cu
+++ b/csrc/quantization/gptq/q_gemm.cu
@@ -6,7 +6,7 @@ https://github.com/qwopqwop200/GPTQ-for-LLaMa
 #include <cstdint>
 #include <cstdio>
 
-#include <torch/extension.h>
+#include <torch/all.h>
 #include <c10/cuda/CUDAGuard.h>
 #include <ATen/cuda/CUDAContext.h>
 #include <cuda_runtime.h>
@@ -1823,7 +1823,7 @@ void shuffle_exllama_weight(uint32_t* q_weight, int* q_perm, int height,
 torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
                         torch::Tensor b_gptq_qzeros,
                         torch::Tensor b_gptq_scales, torch::Tensor b_g_idx,
-                        bool use_exllama, int bit) {
+                        bool use_exllama, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(a));
   auto options = torch::TensorOptions().dtype(a.dtype()).device(a.device());
   at::Tensor c = torch::empty({a.size(0), b_q_weight.size(1)}, options);
@@ -1845,7 +1845,7 @@ torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight,
   return c;
 }
 
-void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int bit) {
+void gptq_shuffle(torch::Tensor q_weight, torch::Tensor q_perm, int64_t bit) {
   const at::cuda::OptionalCUDAGuard device_guard(device_of(q_weight));
   vllm::gptq::shuffle_exllama_weight(
       (uint32_t*)q_weight.data_ptr(),
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cu b/csrc/quantization/gptq_marlin/gptq_marlin.cu
index c573b9041065..0beb9de14c68 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cu
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cu
@@ -1867,4 +1867,4 @@ torch::Tensor gptq_marlin_gemm(torch::Tensor& a, torch::Tensor& b_q_weight,
   return c;
 }
 
-#endif
\ No newline at end of file
+#endif
diff --git a/csrc/quantization/gptq_marlin/gptq_marlin.cuh b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
index ba5368ea8835..42af44951efd 100644
--- a/csrc/quantization/gptq_marlin/gptq_marlin.cuh
+++ b/csrc/quantization/gptq_marlin/gptq_marlin.cuh
@@ -1,6 +1,6 @@
 #pragma once
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
diff --git a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
index 03d66cecedf1..d124c0149912 100644
--- a/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
+++ b/csrc/quantization/marlin/dense/marlin_cuda_kernel.cu
@@ -15,7 +15,7 @@
  * limitations under the License.
  */
 
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
diff --git a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
index 686dd7851e6a..b5effc305544 100644
--- a/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
+++ b/csrc/quantization/marlin/sparse/marlin_24_cuda_kernel.cu
@@ -16,7 +16,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <torch/extension.h>
+#include <torch/all.h>
 
 #include <ATen/cuda/CUDAContext.h>
 #include <c10/cuda/CUDAGuard.h>
diff --git a/csrc/quantization/squeezellm/quant_cuda_kernel.cu b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
index 1b339fa4b392..40baac610869 100644
--- a/csrc/quantization/squeezellm/quant_cuda_kernel.cu
+++ b/csrc/quantization/squeezellm/quant_cuda_kernel.cu
@@ -1,5 +1,4 @@
 #include <torch/all.h>
-#include <torch/python.h>
 #include <cuda.h>
 #include <cuda_runtime.h>
 #include <cuda_fp16.h>
diff --git a/csrc/registration.h b/csrc/registration.h
new file mode 100644
index 000000000000..e5396e9a8b13
--- /dev/null
+++ b/csrc/registration.h
@@ -0,0 +1,22 @@
+#pragma once
+
+#include <Python.h>
+
+#define _CONCAT(A, B) A##B
+#define CONCAT(A, B) _CONCAT(A, B)
+
+#define _STRINGIFY(A) #A
+#define STRINGIFY(A) _STRINGIFY(A)
+
+// A version of the TORCH_LIBRARY macro that expands NAME before use, so NAME
+// may itself be a macro rather than a literal token.
+#define TORCH_LIBRARY_EXPAND(NAME, MODULE) TORCH_LIBRARY(NAME, MODULE)
+
+// REGISTER_EXTENSION defines PyInit_<NAME> for an otherwise empty module, so
+// the shared library can be loaded and initialized via Python's `import`.
+#define REGISTER_EXTENSION(NAME)                                               \
+  PyMODINIT_FUNC CONCAT(PyInit_, NAME)() {                                     \
+    static struct PyModuleDef module = {PyModuleDef_HEAD_INIT,                 \
+                                        STRINGIFY(NAME), nullptr, 0, nullptr}; \
+    return PyModule_Create(&module);                                           \
+  }
diff --git a/csrc/torch_bindings.cpp b/csrc/torch_bindings.cpp
new file mode 100644
index 000000000000..df2603544c85
--- /dev/null
+++ b/csrc/torch_bindings.cpp
@@ -0,0 +1,283 @@
+#include "cache.h"
+#include "cuda_utils.h"
+#include "ops.h"
+#include "registration.h"
+
+#include <torch/library.h>
+
+// Note on op signatures:
+// The X_meta signatures are for the meta functions corresponding to op X.
+// They must be kept in sync with the signature for X. Generally, only
+// functions that return Tensors require a meta function.
+//
+// See the following links for detailed docs on op registration and function
+// schemas.
+// https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit#heading=h.ptttacy8y1u9
+// https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/README.md#annotations
+
+TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
+  // vLLM custom ops
+
+  // Attention ops
+  // Compute the attention between an input query and the cached
+  // keys/values using PagedAttention.
+  ops.def(
+      "paged_attention_v1("
+      "    Tensor! out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v1", torch::kCUDA, &paged_attention_v1);
+
+  // PagedAttention V2.
+  ops.def(
+      "paged_attention_v2("
+      "    Tensor! out, Tensor exp_sums, Tensor max_logits,"
+      "    Tensor tmp_out, Tensor query, Tensor key_cache,"
+      "    Tensor value_cache, int num_kv_heads, float scale,"
+      "    Tensor block_tables, Tensor seq_lens, int block_size,"
+      "    int max_seq_len, Tensor? alibi_slopes,"
+      "    str kv_cache_dtype, float kv_scale, int tp_rank,"
+      "    int blocksparse_local_blocks,"
+      "    int blocksparse_vert_stride, int blocksparse_block_size,"
+      "    int blocksparse_head_sliding_step) -> ()");
+  ops.impl("paged_attention_v2", torch::kCUDA, &paged_attention_v2);
+
+  // Activation ops
+  // Activation function used in SwiGLU.
+  ops.def("silu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("silu_and_mul", torch::kCUDA, &silu_and_mul);
+
+  // Activation function used in GeGLU with `none` approximation.
+  ops.def("gelu_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_and_mul", torch::kCUDA, &gelu_and_mul);
+
+  // Activation function used in GeGLU with `tanh` approximation.
+  ops.def("gelu_tanh_and_mul(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_tanh_and_mul", torch::kCUDA, &gelu_tanh_and_mul);
+
+  // GELU implementation used in GPT-2.
+  ops.def("gelu_new(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_new", torch::kCUDA, &gelu_new);
+
+  // Approximate GELU implementation.
+  ops.def("gelu_fast(Tensor! out, Tensor input) -> ()");
+  ops.impl("gelu_fast", torch::kCUDA, &gelu_fast);
+
+  // Layernorm
+  // Apply Root Mean Square (RMS) Normalization to the input tensor.
+  ops.def(
+      "rms_norm(Tensor! out, Tensor input, Tensor weight, float epsilon) -> "
+      "()");
+  ops.impl("rms_norm", torch::kCUDA, &rms_norm);
+
+  // In-place fused Add and RMS Normalization.
+  ops.def(
+      "fused_add_rms_norm(Tensor! input, Tensor! residual, Tensor weight, "
+      "float epsilon) -> ()");
+  ops.impl("fused_add_rms_norm", torch::kCUDA, &fused_add_rms_norm);
+
+  // Rotary embedding
+  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key.
+  ops.def(
+      "rotary_embedding(Tensor positions, Tensor! query,"
+      "                 Tensor! key, int head_size,"
+      "                 Tensor cos_sin_cache, bool is_neox) -> ()");
+  ops.impl("rotary_embedding", torch::kCUDA, &rotary_embedding);
+
+  // Apply GPT-NeoX or GPT-J style rotary embedding to query and key
+  // (supports multiple loras).
+  ops.def(
+      "batched_rotary_embedding(Tensor positions, Tensor! query,"
+      "                         Tensor! key, int head_size,"
+      "                         Tensor cos_sin_cache, bool is_neox,"
+      "                         int rot_dim,"
+      "                         Tensor cos_sin_cache_offsets) -> ()");
+  ops.impl("batched_rotary_embedding", torch::kCUDA, &batched_rotary_embedding);
+
+  // Quantization ops
+#ifndef USE_ROCM
+  // Quantized GEMM for AQLM.
+  ops.def("aqlm_gemm", &aqlm_gemm);
+  ops.impl("aqlm_gemm", torch::kCUDA, &aqlm_gemm);
+
+  // Decompression method for AQLM.
+  ops.def("aqlm_dequant", &aqlm_dequant);
+  ops.impl("aqlm_dequant", torch::kCUDA, &aqlm_dequant);
+
+  // Quantized GEMM for AWQ.
+  ops.def("awq_gemm", &awq_gemm);
+  ops.impl("awq_gemm", torch::kCUDA, &awq_gemm);
+
+  // Dequantization for AWQ.
+  ops.def("awq_dequantize", &awq_dequantize);
+  ops.impl("awq_dequantize", torch::kCUDA, &awq_dequantize);
+
+  // Marlin (Dense) Optimized Quantized GEMM for GPTQ.
+  ops.def("marlin_gemm", &marlin_gemm);
+  ops.impl("marlin_gemm", torch::kCUDA, &marlin_gemm);
+
+  // Marlin_24 (Sparse) Optimized Quantized GEMM for GPTQ.
+  ops.def("gptq_marlin_24_gemm", &gptq_marlin_24_gemm);
+  ops.impl("gptq_marlin_24_gemm", torch::kCUDA, &gptq_marlin_24_gemm);
+
+  // gptq_marlin Optimized Quantized GEMM for GPTQ.
+  ops.def("gptq_marlin_gemm", &gptq_marlin_gemm);
+  ops.impl("gptq_marlin_gemm", torch::kCUDA, &gptq_marlin_gemm);
+
+  // gptq_marlin repack from GPTQ.
+  ops.def("gptq_marlin_repack", &gptq_marlin_repack);
+  ops.impl("gptq_marlin_repack", torch::kCUDA, &gptq_marlin_repack);
+
+  // CUTLASS w8a8 GEMM, supporting symmetric per-tensor or per-row/column
+  // quantization.
+  ops.def(
+      "cutlass_scaled_mm_dq(Tensor! out, Tensor a,"
+      "                     Tensor b, Tensor a_scales,"
+      "                     Tensor b_scales) -> ()");
+  ops.impl("cutlass_scaled_mm_dq", torch::kCUDA, &cutlass_scaled_mm_dq);
+#endif
+
+  // Quantized GEMM for GPTQ.
+  ops.def("gptq_gemm", &gptq_gemm);
+  ops.impl("gptq_gemm", torch::kCUDA, &gptq_gemm);
+
+  // Post processing for GPTQ.
+  ops.def("gptq_shuffle(Tensor! q_weight, Tensor q_perm, int bit) -> ()");
+  ops.impl("gptq_shuffle", torch::kCUDA, &gptq_shuffle);
+
+  // Quantized GEMM for SqueezeLLM.
+  ops.def(
+      "squeezellm_gemm(Tensor vec, Tensor mat, Tensor! mul, Tensor "
+      "lookup_table) -> ()");
+  ops.impl("squeezellm_gemm", torch::kCUDA, &squeezellm_gemm);
+
+  // Compute FP8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_fp8_quant(Tensor! out, Tensor input, Tensor scale) -> ()");
+  ops.impl("static_scaled_fp8_quant", torch::kCUDA, &static_scaled_fp8_quant);
+
+  // Compute FP8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_scaled_fp8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+      "()");
+  ops.impl("dynamic_scaled_fp8_quant", torch::kCUDA, &dynamic_scaled_fp8_quant);
+
+  // Aligning the number of tokens to be processed by each expert such
+  // that it is divisible by the block size.
+  ops.def(
+      "moe_align_block_size(Tensor topk_ids, int num_experts,"
+      "                     int block_size, Tensor! sorted_token_ids,"
+      "                     Tensor! experts_ids,"
+      "                     Tensor! num_tokens_post_pad) -> ()");
+  ops.impl("moe_align_block_size", torch::kCUDA, &moe_align_block_size);
+
+  // Compute int8 quantized tensor for given scaling factor.
+  ops.def(
+      "static_scaled_int8_quant(Tensor! out, Tensor input, Tensor scale) -> "
+      "()");
+  ops.impl("static_scaled_int8_quant", torch::kCUDA, &static_scaled_int8_quant);
+
+  // Compute int8 quantized tensor and scaling factor.
+  ops.def(
+      "dynamic_scaled_int8_quant(Tensor! out, Tensor input, Tensor! scale) -> "
+      "()");
+  ops.impl("dynamic_scaled_int8_quant", torch::kCUDA,
+           &dynamic_scaled_int8_quant);
+}
+
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cache_ops), cache_ops) {
+  // Cache ops
+  // Swap in (out) the cache blocks from src to dst.
+  cache_ops.def(
+      "swap_blocks(Tensor src, Tensor! dst, Tensor block_mapping) -> ()");
+  cache_ops.impl("swap_blocks", torch::kCUDA, &swap_blocks);
+
+  // Copy the cache blocks from src to dst.
+  cache_ops.def(
+      "copy_blocks(Tensor[]! key_caches, Tensor[]! value_caches, Tensor "
+      "block_mapping) -> ()");
+  cache_ops.impl("copy_blocks", torch::kCUDA, &copy_blocks);
+
+  // Reshape the key and value tensors and cache them.
+  cache_ops.def(
+      "reshape_and_cache(Tensor key, Tensor value,"
+      "                  Tensor! key_cache, Tensor! value_cache,"
+      "                  Tensor slot_mapping,"
+      "                  str kv_cache_dtype,"
+      "                  float kv_scale) -> ()");
+  cache_ops.impl("reshape_and_cache", torch::kCUDA, &reshape_and_cache);
+
+  // Reshape the key and value tensors and cache them.
+  cache_ops.def(
+      "reshape_and_cache_flash(Tensor key, Tensor value,"
+      "                        Tensor! key_cache,"
+      "                        Tensor! value_cache,"
+      "                        Tensor slot_mapping,"
+      "                        str kv_cache_dtype) -> ()");
+  cache_ops.impl("reshape_and_cache_flash", torch::kCUDA,
+                 &reshape_and_cache_flash);
+
+  // Convert the key and value cache to fp8 data type.
+  cache_ops.def(
+      "convert_fp8(Tensor! dst_cache, Tensor src_cache, float scale, str "
+      "kv_cache_dtype) -> ()");
+  cache_ops.impl("convert_fp8", torch::kCUDA, &convert_fp8);
+}
+
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _cuda_utils), cuda_utils) {
+  // Cuda utils
+
+  // Gets the specified device attribute.
+  cuda_utils.def("get_device_attribute", &get_device_attribute);
+  cuda_utils.impl("get_device_attribute", torch::kCUDA, &get_device_attribute);
+
+  // Gets the maximum shared memory per block device attribute.
+  cuda_utils.def("get_max_shared_memory_per_block_device_attribute",
+                 &get_max_shared_memory_per_block_device_attribute);
+  cuda_utils.impl("get_max_shared_memory_per_block_device_attribute",
+                  torch::kCUDA,
+                  &get_max_shared_memory_per_block_device_attribute);
+}
+
+#ifndef USE_ROCM
+TORCH_LIBRARY_EXPAND(CONCAT(TORCH_EXTENSION_NAME, _custom_ar), custom_ar) {
+  // Custom all-reduce kernels
+  custom_ar.def("init_custom_ar", &init_custom_ar);
+  custom_ar.impl("init_custom_ar", torch::kCUDA, &init_custom_ar);
+
+  custom_ar.def("should_custom_ar", &should_custom_ar);
+  custom_ar.impl("should_custom_ar", torch::kCUDA, &should_custom_ar);
+
+  custom_ar.def("all_reduce_reg(int fa, Tensor inp, Tensor! out) -> ()");
+  custom_ar.impl("all_reduce_reg", torch::kCUDA, &all_reduce_reg);
+
+  custom_ar.def(
+      "all_reduce_unreg(int fa, Tensor inp, Tensor reg_buffer, Tensor! out) -> "
+      "()");
+  custom_ar.impl("all_reduce_unreg", torch::kCUDA, &all_reduce_unreg);
+
+  custom_ar.def("dispose", &dispose);
+  custom_ar.impl("dispose", torch::kCPU, &dispose);
+
+  custom_ar.def("meta_size", &meta_size);
+  custom_ar.impl("meta_size", torch::kCPU, &meta_size);
+
+  custom_ar.def("register_buffer", &register_buffer);
+  custom_ar.impl("register_buffer", torch::kCUDA, &register_buffer);
+
+  custom_ar.def("get_graph_buffer_ipc_meta", &get_graph_buffer_ipc_meta);
+  custom_ar.impl("get_graph_buffer_ipc_meta", torch::kCPU,
+                 &get_graph_buffer_ipc_meta);
+
+  custom_ar.def("register_graph_buffers", &register_graph_buffers);
+  custom_ar.impl("register_graph_buffers", torch::kCPU,
+                 &register_graph_buffers);
+}
+#endif
+
+REGISTER_EXTENSION(TORCH_EXTENSION_NAME)
diff --git a/setup.py b/setup.py
index cadc89379bec..7a1a0d37ebc3 100644
--- a/setup.py
+++ b/setup.py
@@ -61,7 +61,7 @@ def remove_prefix(text, prefix):
 class CMakeExtension(Extension):
 
     def __init__(self, name: str, cmake_lists_dir: str = '.', **kwa) -> None:
-        super().__init__(name, sources=[], **kwa)
+        super().__init__(name, sources=[], py_limited_api=True, **kwa)
         self.cmake_lists_dir = os.path.abspath(cmake_lists_dir)
 
 
diff --git a/tests/kernels/test_int8_quant.py b/tests/kernels/test_int8_quant.py
index aab7af9d2cbf..0daf7439468a 100644
--- a/tests/kernels/test_int8_quant.py
+++ b/tests/kernels/test_int8_quant.py
@@ -1,7 +1,8 @@
 import pytest
 import torch
 
-from vllm._C import ops
+# ruff: noqa: F401
+import vllm._C
 
 DTYPES = [torch.half, torch.bfloat16, torch.float]
 HIDDEN_SIZES = [16, 67, 768, 2048, 5120, 5137, 8192,
@@ -33,7 +34,7 @@ def test_dynamic_scaled_int8_quant(num_tokens: int, hidden_size: int,
 
     ops_out = torch.empty_like(x, dtype=torch.int8, device="cuda")
     scales_out = torch.empty_like(scales, dtype=torch.float32, device="cuda")
-    ops.dynamic_scaled_int8_quant(ops_out, x, scales_out)
+    torch.ops._C.dynamic_scaled_int8_quant(ops_out, x, scales_out)
 
     assert torch.allclose(scales_out, scales)
     assert torch.allclose(torch_out, ops_out,
@@ -60,6 +61,6 @@ def test_static_scaled_int8_quant(num_tokens: int, hidden_size: int,
     out2 = torch.empty_like(x, dtype=torch.int8)
     scale_argument = torch.tensor([scale], dtype=torch.float32, device="cuda")
 
-    ops.static_scaled_int8_quant(out2, x, scale_argument)
+    torch.ops._C.static_scaled_int8_quant(out2, x, scale_argument)
     assert torch.allclose(out1, out2,
                           atol=1)  # big atol to account for rounding errors
diff --git a/vllm/_custom_ops.py b/vllm/_custom_ops.py
index 7e12f1ba14cd..440b0e8afa99 100644
--- a/vllm/_custom_ops.py
+++ b/vllm/_custom_ops.py
@@ -1,35 +1,47 @@
-from typing import Optional, Tuple, Type
+import contextlib
+from typing import List, Optional, Tuple, Type
 
 import torch
 
 try:
-    from vllm._C import cache_ops as vllm_cache_ops
-    from vllm._C import ops as vllm_ops
+    import vllm._C
 except ImportError as e:
     from vllm.logger import init_logger
     logger = init_logger(__name__)
     logger.warning("Failed to import from vllm._C with %r", e)
 
+with contextlib.suppress(ImportError):
+    import vllm._moe_C
+
+with contextlib.suppress(ImportError):
+    # ruff: noqa: F401
+    import vllm._punica_C
+
+
+def is_custom_op_supported(op_name: str) -> bool:
+    op, overloads = torch._C._jit_get_operation(op_name)
+    return op is not None
+
 
 # activation ops
 def silu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    vllm_ops.silu_and_mul(out, x)
+    torch.ops._C.silu_and_mul(out, x)
 
 
 def gelu_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    vllm_ops.gelu_and_mul(out, x)
+    torch.ops._C.gelu_and_mul(out, x)
 
 
 def gelu_tanh_and_mul(out: torch.Tensor, x: torch.Tensor) -> None:
-    vllm_ops.gelu_tanh_and_mul(out, x)
+    torch.ops._C.gelu_tanh_and_mul(out, x)
 
 
 def gelu_fast(out: torch.Tensor, x: torch.Tensor) -> None:
-    vllm_ops.gelu_fast(out, x)
+    torch.ops._C.gelu_fast(out, x)
 
 
 def gelu_new(out: torch.Tensor, x: torch.Tensor) -> None:
-    vllm_ops.gelu_new(out, x)
+    torch.ops._C.gelu_new(out, x)
 
 
 # page attention ops
@@ -53,7 +65,7 @@ def paged_attention_v1(
     blocksparse_block_size: int = 64,
     blocksparse_head_sliding_step: int = 0,
 ) -> None:
-    vllm_ops.paged_attention_v1(
+    torch.ops._C.paged_attention_v1(
         out, query, key_cache, value_cache, num_kv_heads, scale, block_tables,
         seq_lens, block_size, max_seq_len, alibi_slopes, kv_cache_dtype,
         kv_scale, tp_rank, blocksparse_local_blocks, blocksparse_vert_stride,
@@ -83,7 +95,7 @@ def paged_attention_v2(
     blocksparse_block_size: int = 64,
     blocksparse_head_sliding_step: int = 0,
 ) -> None:
-    vllm_ops.paged_attention_v2(
+    torch.ops._C.paged_attention_v2(
         out, exp_sum, max_logits, tmp_out, query, key_cache, value_cache,
         num_kv_heads, scale, block_tables, seq_lens, block_size, max_seq_len,
         alibi_slopes, kv_cache_dtype, kv_scale, tp_rank,
@@ -100,8 +112,8 @@ def rotary_embedding(
     cos_sin_cache: torch.Tensor,
     is_neox: bool,
 ) -> None:
-    vllm_ops.rotary_embedding(positions, query, key, head_size, cos_sin_cache,
-                              is_neox)
+    torch.ops._C.rotary_embedding(positions, query, key, head_size,
+                                  cos_sin_cache, is_neox)
 
 
 def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
@@ -109,20 +121,20 @@ def batched_rotary_embedding(positions: torch.Tensor, query: torch.Tensor,
                              cos_sin_cache: torch.Tensor, is_neox: bool,
                              rot_dim: int,
                              cos_sin_cache_offsets: torch.Tensor) -> None:
-    vllm_ops.batched_rotary_embedding(positions, query, key, head_size,
-                                      cos_sin_cache, is_neox, rot_dim,
-                                      cos_sin_cache_offsets)
+    torch.ops._C.batched_rotary_embedding(positions, query, key, head_size,
+                                          cos_sin_cache, is_neox, rot_dim,
+                                          cos_sin_cache_offsets)
 
 
 # layer norm ops
 def rms_norm(out: torch.Tensor, input: torch.Tensor, weight: torch.Tensor,
              epsilon: float) -> None:
-    vllm_ops.rms_norm(out, input, weight, epsilon)
+    torch.ops._C.rms_norm(out, input, weight, epsilon)
 
 
 def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
                        weight: torch.Tensor, epsilon: float) -> None:
-    vllm_ops.fused_add_rms_norm(input, residual, weight, epsilon)
+    torch.ops._C.fused_add_rms_norm(input, residual, weight, epsilon)
 
 
 # quantization ops
@@ -130,13 +142,13 @@ def fused_add_rms_norm(input: torch.Tensor, residual: torch.Tensor,
 def awq_dequantize(qweight: torch.Tensor, scales: torch.Tensor,
                    zeros: torch.Tensor, split_k_iters: int, thx: int,
                    thy: int) -> torch.Tensor:
-    return vllm_ops.awq_dequantize(qweight, scales, zeros, split_k_iters, thx,
-                                   thy)
+    return torch.ops._C.awq_dequantize(qweight, scales, zeros, split_k_iters,
+                                       thx, thy)
 
 
 def awq_gemm(input: torch.Tensor, qweight: torch.Tensor, qzeros: torch.Tensor,
              scales: torch.Tensor, split_k_iters: int) -> torch.Tensor:
-    return vllm_ops.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
+    return torch.ops._C.awq_gemm(input, qweight, qzeros, scales, split_k_iters)
 
 
 # gptq
@@ -144,27 +156,27 @@ def gptq_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
               b_gptq_qzeros: torch.Tensor, b_gptq_scales: torch.Tensor,
               b_g_idx: torch.Tensor, use_exllama: bool,
               bit: int) -> torch.Tensor:
-    return vllm_ops.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
-                              b_g_idx, use_exllama, bit)
+    return torch.ops._C.gptq_gemm(a, b_q_weight, b_gptq_qzeros, b_gptq_scales,
+                                  b_g_idx, use_exllama, bit)
 
 
 def gptq_shuffle(q_weight: torch.Tensor, q_perm: torch.Tensor,
                  bit: int) -> None:
-    vllm_ops.gptq_shuffle(q_weight, q_perm, bit)
+    torch.ops._C.gptq_shuffle(q_weight, q_perm, bit)
 
 
 # squeezellm
 def squeezellm_gemm(vec: torch.Tensor, mat: torch.Tensor, mul: torch.Tensor,
                     lookup_table: torch.Tensor) -> None:
-    vllm_ops.squeezellm_gemm(vec, mat, mul, lookup_table)
+    torch.ops._C.squeezellm_gemm(vec, mat, mul, lookup_table)
 
 
 # marlin
 def marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                 b_scales: torch.Tensor, workspace: torch.Tensor, size_m: int,
                 size_n: int, size_k: int) -> torch.Tensor:
-    return vllm_ops.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
-                                size_n, size_k)
+    return torch.ops._C.marlin_gemm(a, b_q_weight, b_scales, workspace, size_m,
+                                    size_n, size_k)
 
 
 # marlin_24
@@ -172,9 +184,9 @@ def gptq_marlin_24_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                         b_meta: torch.Tensor, b_scales: torch.Tensor,
                         workspace: torch.Tensor, num_bits: int, size_m: int,
                         size_n: int, size_k: int) -> torch.Tensor:
-    return vllm_ops.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
-                                        workspace, num_bits, size_m, size_n,
-                                        size_k)
+    return torch.ops._C.gptq_marlin_24_gemm(a, b_q_weight, b_meta, b_scales,
+                                            workspace, num_bits, size_m,
+                                            size_n, size_k)
 
 
 # cutlass
@@ -188,7 +200,7 @@ def cutlass_scaled_mm_dq(a: torch.Tensor, b: torch.Tensor,
     n = b.shape[1]
     out = torch.empty((m, n), dtype=out_dtype, device=a.device)
 
-    vllm_ops.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b)
+    torch.ops._C.cutlass_scaled_mm_dq(out, a, b, scale_a, scale_b)
 
     return out
 
@@ -198,21 +210,22 @@ def aqlm_gemm(input: torch.Tensor, codes: torch.Tensor,
               codebooks: torch.Tensor, scales: torch.Tensor,
               codebook_partition_sizes: torch.Tensor,
               bias: Optional[torch.Tensor]) -> torch.Tensor:
-    return vllm_ops.aqlm_gemm(input, codes, codebooks, scales,
-                              codebook_partition_sizes, bias)
+    return torch.ops._C.aqlm_gemm(input, codes, codebooks, scales,
+                                  codebook_partition_sizes, bias)
 
 
 def aqlm_dequant(codes: torch.Tensor, codebooks: torch.Tensor,
                  codebook_partition_sizes: torch.Tensor) -> torch.Tensor:
-    return vllm_ops.aqlm_dequant(codes, codebooks, codebook_partition_sizes)
+    return torch.ops._C.aqlm_dequant(codes, codebooks,
+                                     codebook_partition_sizes)
 
 
 # gptq_marlin
 def gptq_marlin_repack(b_q_weight: torch.Tensor, perm: torch.Tensor,
                        size_k: int, size_n: int,
                        num_bits: int) -> torch.Tensor:
-    return vllm_ops.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
-                                       num_bits)
+    return torch.ops._C.gptq_marlin_repack(b_q_weight, perm, size_k, size_n,
+                                           num_bits)
 
 
 def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
@@ -220,9 +233,9 @@ def gptq_marlin_gemm(a: torch.Tensor, b_q_weight: torch.Tensor,
                      perm: torch.Tensor, workspace: torch.Tensor,
                      num_bits: int, size_m: int, size_n: int, size_k: int,
                      is_k_full: bool) -> torch.Tensor:
-    return vllm_ops.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm,
-                                     workspace, num_bits, size_m, size_n,
-                                     size_k, is_k_full)
+    return torch.ops._C.gptq_marlin_gemm(a, b_q_weight, b_scales, g_idx, perm,
+                                         workspace, num_bits, size_m, size_n,
+                                         size_k, is_k_full)
 
 
 # fp8
@@ -259,9 +272,9 @@ def scaled_fp8_quant(
         output = torch.empty_like(input, dtype=torch.float8_e4m3fn)
     if scale is None:
         scale = torch.zeros(1, device=input.device, dtype=torch.float32)
-        vllm_ops.dynamic_scaled_fp8_quant(output, input, scale)
+        torch.ops._C.dynamic_scaled_fp8_quant(output, input, scale)
     else:
-        vllm_ops.static_scaled_fp8_quant(output, input, scale)
+        torch.ops._C.static_scaled_fp8_quant(output, input, scale)
     return output, scale
 
 
@@ -284,14 +297,14 @@ def scaled_int8_quant(
     output = torch.empty_like(input, dtype=torch.int8)
     if scale is not None:
         # static-per-tensor quantization.
-        vllm_ops.static_scaled_int8_quant(output, input, scale)
+        torch.ops._C.static_scaled_int8_quant(output, input, scale)
         return output, scale
 
     # dynamic-per-token quantization.
     input_scales = torch.empty((input.numel() // input.shape[-1], 1),
                                device=input.device,
                                dtype=torch.float32)
-    vllm_ops.dynamic_scaled_int8_quant(output, input, input_scales)
+    torch.ops._C.dynamic_scaled_int8_quant(output, input, input_scales)
     return output, input_scales
 
 
@@ -300,9 +313,16 @@ def moe_align_block_size(topk_ids: torch.Tensor, num_experts: int,
                          block_size: int, sorted_token_ids: torch.Tensor,
                          experts_ids: torch.Tensor,
                          num_tokens_post_pad: torch.Tensor) -> None:
-    vllm_ops.moe_align_block_size(topk_ids, num_experts, block_size,
-                                  sorted_token_ids, experts_ids,
-                                  num_tokens_post_pad)
+    torch.ops._C.moe_align_block_size(topk_ids, num_experts, block_size,
+                                      sorted_token_ids, experts_ids,
+                                      num_tokens_post_pad)
+
+
+def topk_softmax(topk_weights: torch.Tensor, topk_ids: torch.Tensor,
+                 token_expert_indicies: torch.Tensor,
+                 gating_output: float) -> None:
+    torch.ops._moe_C.topk_softmax(topk_weights, topk_ids,
+                                  token_expert_indicies, gating_output)
 
 
 def reshape_and_cache(
@@ -314,8 +334,9 @@ def reshape_and_cache(
     kv_cache_dtype: str,
     kv_scale: float,
 ) -> None:
-    vllm_cache_ops.reshape_and_cache(key, value, key_cache, value_cache,
-                                     slot_mapping, kv_cache_dtype, kv_scale)
+    torch.ops._C_cache_ops.reshape_and_cache(key, value, key_cache,
+                                             value_cache, slot_mapping,
+                                             kv_cache_dtype, kv_scale)
 
 
 def reshape_and_cache_flash(
@@ -326,25 +347,115 @@ def reshape_and_cache_flash(
     slot_mapping: torch.Tensor,
     kv_cache_dtype: str,
 ) -> None:
-    vllm_cache_ops.reshape_and_cache_flash(key, value, key_cache, value_cache,
-                                           slot_mapping, kv_cache_dtype)
+    torch.ops._C_cache_ops.reshape_and_cache_flash(key, value, key_cache,
+                                                   value_cache, slot_mapping,
+                                                   kv_cache_dtype)
 
 
 def copy_blocks(key_caches: torch.Tensor, value_caches: torch.Tensor,
                 block_mapping: torch.Tensor) -> None:
-    vllm_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
+    torch.ops._C_cache_ops.copy_blocks(key_caches, value_caches, block_mapping)
 
 
 def swap_blocks(src: torch.Tensor, dst: torch.Tensor,
                 block_mapping: torch.Tensor) -> None:
-    vllm_cache_ops.swap_blocks(src, dst, block_mapping)
+    torch.ops._C_cache_ops.swap_blocks(src, dst, block_mapping)
 
 
 def convert_fp8(output: torch.Tensor,
                 input: torch.Tensor,
                 scale: float = 1.0,
                 kv_dtype: str = "fp8") -> None:
-    vllm_cache_ops.convert_fp8(output, input, scale, kv_dtype)
+    torch.ops._C_cache_ops.convert_fp8(output, input, scale, kv_dtype)
+
+
+def get_device_attribute(attribute: int, device: int) -> int:
+    return torch.ops._C_cuda_utils.get_device_attribute(attribute, device)
+
+
+def get_max_shared_memory_per_block_device_attribute(device: int) -> int:
+    # ruff: noqa: E501
+    return torch.ops._C_cuda_utils.get_max_shared_memory_per_block_device_attribute(
+        device)
+
+
+# custom ar
+def init_custom_ar(meta: torch.Tensor, rank_data: torch.Tensor,
+                   handles: List[str], offsets: List[int], rank: int,
+                   full_nvlink: bool) -> int:
+    return torch.ops._C_custom_ar.init_custom_ar(meta, rank_data, handles,
+                                                 offsets, rank, full_nvlink)
+
+
+def should_custom_ar(inp: torch.Tensor, max_size: int, world_size: int,
+                     full_nvlink: bool) -> bool:
+    return torch.ops._C_custom_ar.should_custom_ar(inp, max_size, world_size,
+                                                   full_nvlink)
+
+
+def all_reduce_reg(fa: int, inp: torch.Tensor, out: torch.Tensor) -> None:
+    torch.ops._C_custom_ar.all_reduce_reg(fa, inp, out)
+
 
+def all_reduce_unreg(fa: int, inp: torch.Tensor, reg_buffer: torch.Tensor,
+                     out: torch.Tensor) -> None:
+    torch.ops._C_custom_ar.all_reduce_unreg(fa, inp, reg_buffer, out)
 
-#TODO: cuda_utils, custom_ar
+
+def dispose(fa: int) -> None:
+    torch.ops._C_custom_ar.dispose(fa)
+
+
+def meta_size() -> int:
+    return torch.ops._C_custom_ar.meta_size()
+
+
+def register_buffer(fa: int, t: torch.Tensor, handles: List[str],
+                    offsets: List[int]) -> None:
+    return torch.ops._C_custom_ar.register_buffer(fa, t, handles, offsets)
+
+
+def get_graph_buffer_ipc_meta(fa: int) -> Tuple[List[str], List[int]]:
+    return torch.ops._C_custom_ar.get_graph_buffer_ipc_meta(fa)
+
+
+def register_graph_buffers(fa: int, handles: List[str],
+                           offsets: List[List[int]]) -> None:
+    torch.ops._C_custom_ar.register_graph_buffers(fa, handles, offsets)
+
+
+# punica
+def dispatch_bgmv(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    w_t_all: torch.Tensor,
+    indicies: torch.Tensor,
+    layer_idx: int,
+    scale: float,
+) -> None:
+    torch.ops._punica_C.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx,
+                                      scale)
+
+
+def dispatch_bgmv_low_level(
+    y: torch.Tensor,
+    x: torch.Tensor,
+    w_t_all: torch.Tensor,
+    indicies: torch.Tensor,
+    layer_idx: int,
+    scale: float,
+    h_in: int,
+    h_out: int,
+    y_offset: int,
+) -> None:
+    torch.ops._punica_C.dispatch_bgmv_low_level(
+        y,
+        x,
+        w_t_all,
+        indicies,
+        layer_idx,
+        scale,
+        h_in,
+        h_out,
+        y_offset,
+    )
diff --git a/vllm/attention/backends/flash_attn.py b/vllm/attention/backends/flash_attn.py
index 070c074e511b..8c64c2bfdeb8 100644
--- a/vllm/attention/backends/flash_attn.py
+++ b/vllm/attention/backends/flash_attn.py
@@ -5,7 +5,7 @@
 import torch
 from vllm_flash_attn import flash_attn_varlen_func, flash_attn_with_kvcache
 
-from vllm._C import cache_ops
+from vllm import _custom_ops as ops
 from vllm.attention.backends.abstract import (AttentionBackend, AttentionImpl,
                                               AttentionMetadata)
 
@@ -47,11 +47,11 @@ def swap_blocks(
     ) -> None:
         src_key_cache = src_kv_cache[0]
         dst_key_cache = dst_kv_cache[0]
-        cache_ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
+        ops.swap_blocks(src_key_cache, dst_key_cache, src_to_dst)
 
         src_value_cache = src_kv_cache[1]
         dst_value_cache = dst_kv_cache[1]
-        cache_ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
+        ops.swap_blocks(src_value_cache, dst_value_cache, src_to_dst)
 
     @staticmethod
     def copy_blocks(
@@ -60,7 +60,7 @@ def copy_blocks(
     ) -> None:
         key_caches = [kv_cache[0] for kv_cache in kv_caches]
         value_caches = [kv_cache[1] for kv_cache in kv_caches]
-        cache_ops.copy_blocks(key_caches, value_caches, src_to_dists)
+        ops.copy_blocks(key_caches, value_caches, src_to_dists)
 
 
 @dataclass
@@ -285,7 +285,7 @@ def forward(
             # Reshape the input keys and values and store them in the cache.
             # If kv_cache is not provided, the new key and value tensors are
             # not cached. This happens during the initial memory profiling run.
-            cache_ops.reshape_and_cache_flash(
+            ops.reshape_and_cache_flash(
                 key,
                 value,
                 key_cache,
diff --git a/vllm/distributed/device_communicators/custom_all_reduce.py b/vllm/distributed/device_communicators/custom_all_reduce.py
index a3902aecb379..4a0e19bc0c15 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce.py
@@ -6,6 +6,7 @@
 from torch.distributed import ProcessGroup
 
 import vllm.envs as envs
+from vllm import _custom_ops as ops
 from vllm.distributed.device_communicators.custom_all_reduce_utils import (
     gpu_p2p_access_check)
 from vllm.distributed.parallel_state import (
@@ -15,7 +16,11 @@
 try:
     import pynvml
 
-    from vllm._C import custom_ar
+    # Simulate ImportError if custom_ar ops are not supported.
+    if not ops.is_custom_op_supported("_C_custom_ar::meta_size"):
+        raise ImportError("custom_ar", __file__)
+
+    custom_ar = True
 
     @contextmanager
     def _nvml():
@@ -27,7 +32,7 @@ def _nvml():
 
 except ImportError:
     # For AMD GPUs
-    custom_ar = None
+    custom_ar = False
     pynvml = None
 
     @contextmanager
@@ -97,7 +102,7 @@ def __init__(self,
         self._IS_CAPTURING = False
         self.disabled = True
 
-        if custom_ar is None:
+        if not custom_ar:
             # disable because of missing custom allreduce library
             # e.g. in a non-cuda environment
             return
@@ -175,7 +180,7 @@ def __init__(self,
         # meta data composes of two parts: meta data for synchronization
         # (256 bytes) and a temporary buffer for storing intermediate
         # allreduce results.
-        self.meta = torch.zeros(custom_ar.meta_size() + max_size,
+        self.meta = torch.zeros(ops.meta_size() + max_size,
                                 dtype=torch.uint8,
                                 device=self.device)
         # This is a pre-registered IPC buffer. In eager mode, input tensors
@@ -196,9 +201,8 @@ def __init__(self,
         self.world_size = world_size
         handles, offsets = self._get_ipc_meta(self.meta)
         self.full_nvlink = full_nvlink
-        self._ptr = custom_ar.init_custom_ar(self.meta, self.rank_data,
-                                             handles, offsets, rank,
-                                             self.full_nvlink)
+        self._ptr = ops.init_custom_ar(self.meta, self.rank_data, handles,
+                                       offsets, rank, self.full_nvlink)
         self.register_buffer(self.buffer)
 
     @contextmanager
@@ -252,31 +256,31 @@ def _gather_ipc_meta(self, shard_data):
 
     def register_buffer(self, inp: torch.Tensor):
         handles, offsets = self._get_ipc_meta(inp)
-        custom_ar.register_buffer(self._ptr, inp, handles, offsets)
+        ops.register_buffer(self._ptr, inp, handles, offsets)
 
     def register_graph_buffers(self):
-        handle, offset = custom_ar.get_graph_buffer_ipc_meta(self._ptr)
+        handle, offset = ops.get_graph_buffer_ipc_meta(self._ptr)
         handles, offsets = self._gather_ipc_meta((bytes(handle), offset))
         logger.info("Registering %d cuda graph addresses", len(offset))
-        custom_ar.register_graph_buffers(self._ptr, handles, offsets)
+        ops.register_graph_buffers(self._ptr, handles, offsets)
 
     def should_custom_ar(self, inp: torch.Tensor):
-        return custom_ar.should_custom_ar(inp, self.max_size, self.world_size,
-                                          self.full_nvlink)
+        return ops.should_custom_ar(inp, self.max_size, self.world_size,
+                                    self.full_nvlink)
 
     # all reduce, assuming inp tensor is IPC registered with register_buffer,
     # or, in the context of cuda graphs, register_graph_buffers
     def all_reduce_reg(self, inp: torch.Tensor, out: torch.Tensor = None):
         if out is None:
             out = torch.empty_like(inp)
-        custom_ar.all_reduce_reg(self._ptr, inp, out)
+        ops.all_reduce_reg(self._ptr, inp, out)
         return out
 
     # all reduce, assuming inp tensor is NOT IPC registered
     def all_reduce_unreg(self, inp: torch.Tensor, out: torch.Tensor = None):
         if out is None:
             out = torch.empty_like(inp)
-        custom_ar.all_reduce_unreg(self._ptr, inp, self.buffer, out)
+        ops.all_reduce_unreg(self._ptr, inp, self.buffer, out)
         return out
 
     def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
@@ -304,7 +308,7 @@ def custom_all_reduce(self, input: torch.Tensor) -> Optional[torch.Tensor]:
 
     def close(self):
         if not self.disabled and self._ptr:
-            custom_ar.dispose(self._ptr)
+            ops.dispose(self._ptr)
             self._ptr = 0
 
     def __del__(self):
diff --git a/vllm/lora/punica.py b/vllm/lora/punica.py
index c87bed54726f..7ecaa450f175 100644
--- a/vllm/lora/punica.py
+++ b/vllm/lora/punica.py
@@ -4,16 +4,21 @@
 
 import torch
 
+from vllm import _custom_ops as ops
+
+
+def _check_punica_support():
+    if ops.is_custom_op_supported("_punica_C::dispatch_bgmv"):
+        return
 
-def _raise_import_error(e):
     if torch.cuda.get_device_capability() < (8, 0):
         raise ImportError(
-            "punica LoRA kernels require compute capability >= 8.0") from e
+            "punica LoRA kernels require compute capability >= 8.0")
     else:
         raise ImportError(
             "punica LoRA kernels could not be imported. If you built vLLM "
             "from source, make sure VLLM_INSTALL_PUNICA_KERNELS=1 env var "
-            "was set.") from e
+            "was set.")
 
 
 def bgmv(
@@ -41,12 +46,9 @@ def bgmv(
       layer_idx: Layer index of the weight matrices.
       scale: Scaling factor.
     """
-    try:
-        import vllm._punica_C as punica_kernels
-    except ImportError as e:
-        _raise_import_error(e)
+    _check_punica_support()
 
-    punica_kernels.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
+    ops.dispatch_bgmv(y, x, w_t_all, indicies, layer_idx, scale)
 
 
 def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor,
@@ -75,11 +77,9 @@ def dispatch_bgmv_low_level(y: torch.Tensor, x: torch.Tensor,
       y_offset: Offset to apply to the starting column of y.
       y_slice_size: Size of the y column slice.
     """
-    try:
-        import vllm._punica_C as punica_kernels
-    except ImportError as e:
-        _raise_import_error(e)
-    punica_kernels.dispatch_bgmv_low_level(
+    _check_punica_support()
+
+    ops.dispatch_bgmv_low_level(
         y,
         x,
         w_t_all,
@@ -122,10 +122,7 @@ def add_lora(y: torch.Tensor,
       scale: Scaling factor.
       buffer: Optional. Shape: `[B, R]`. Temporary buffer.
     """
-    try:
-        import vllm._punica_C as punica_kernels
-    except ImportError as e:
-        _raise_import_error(e)
+    _check_punica_support()
 
     r = wb_t_all.size(-1)
     if buffer is None:
@@ -135,9 +132,8 @@ def add_lora(y: torch.Tensor,
         buffer = torch.zeros((x.size(0), r),
                              dtype=torch.float32,
                              device=x.device)
-    punica_kernels.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0)
-    punica_kernels.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx,
-                                 scale)
+    ops.dispatch_bgmv(buffer, x, wa_t_all, indicies, layer_idx, 1.0)
+    ops.dispatch_bgmv(y, buffer, wb_t_all, indicies, layer_idx, scale)
 
 
 def add_lora_slice(y: torch.Tensor,
@@ -176,10 +172,7 @@ def add_lora_slice(y: torch.Tensor,
       y_offset: Offset to apply to the starting column of y.
       y_slice_size: Size of the y column slice.
     """
-    try:
-        import vllm._punica_C as punica_kernels
-    except ImportError as e:
-        _raise_import_error(e)
+    _check_punica_support()
 
     r = wb_t_all.size(-1)
     if buffer is None:
@@ -189,7 +182,7 @@ def add_lora_slice(y: torch.Tensor,
         buffer = torch.zeros((x.size(0), r),
                              dtype=torch.float32,
                              device=x.device)
-    punica_kernels.dispatch_bgmv_low_level(
+    ops.dispatch_bgmv_low_level(
         buffer,
         x,
         wa_t_all,
@@ -200,7 +193,7 @@ def add_lora_slice(y: torch.Tensor,
         buffer.size(1),
         0,
     )
-    punica_kernels.dispatch_bgmv_low_level(
+    ops.dispatch_bgmv_low_level(
         y,
         buffer,
         wb_t_all,
diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index 1c6947137a1c..4d0160ff296a 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -8,7 +8,6 @@
 import triton
 import triton.language as tl
 
-import vllm._moe_C as moe_kernels
 from vllm import _custom_ops as ops
 from vllm.logger import init_logger
 
@@ -355,7 +354,7 @@ def fused_topk(
                                         topk,
                                         dtype=torch.int32,
                                         device=hidden_states.device)
-    moe_kernels.topk_softmax(
+    ops.topk_softmax(
         topk_weights,
         topk_ids,
         token_expert_indicies,
diff --git a/vllm/utils.py b/vllm/utils.py
index b6edf00d5388..56dc922aaaeb 100644
--- a/vllm/utils.py
+++ b/vllm/utils.py
@@ -22,6 +22,7 @@
 import torch
 
 import vllm.envs as envs
+from vllm import _custom_ops as ops
 from vllm.logger import enable_trace_function_call, init_logger
 
 T = TypeVar("T")
@@ -148,12 +149,8 @@ def is_neuron() -> bool:
 @lru_cache(maxsize=None)
 def get_max_shared_memory_bytes(gpu: int = 0) -> int:
     """Returns the maximum shared memory per thread block in bytes."""
-    # NOTE: This import statement should be executed lazily since
-    # the Neuron-X backend does not have the `cuda_utils` module.
-    from vllm._C import cuda_utils
-
     max_shared_mem = (
-        cuda_utils.get_max_shared_memory_per_block_device_attribute(gpu))
+        ops.get_max_shared_memory_per_block_device_attribute(gpu))
     # value 0 will cause MAX_SEQ_LEN become negative and test_attention.py
     # will fail
     assert max_shared_mem > 0, "max_shared_mem can not be zero"

From 27e68e928454d9295ccd0cd9a7525783176be999 Mon Sep 17 00:00:00 2001
From: Bla_ckB <50193121+BlackBird-Coding@users.noreply.github.com>
Date: Mon, 10 Jun 2024 06:23:14 +0700
Subject: [PATCH 78/93] [Bugfix] Fix KeyError: 1 When Using LoRA adapters
 (#5164)

---
 vllm/core/scheduler.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/core/scheduler.py b/vllm/core/scheduler.py
index 0159053b4dc6..bb37c5f31361 100644
--- a/vllm/core/scheduler.py
+++ b/vllm/core/scheduler.py
@@ -423,7 +423,9 @@ def _schedule_running(
                 num_running_seqs = seq_group.get_max_num_running_seqs()
                 budget.subtract_num_seqs(seq_group.request_id,
                                          num_running_seqs)
-                if curr_loras is not None and seq_group.lora_int_id > 0:
+
+                if (curr_loras is not None and seq_group.lora_int_id > 0
+                        and seq_group.lora_int_id in curr_loras):
                     curr_loras.remove(seq_group.lora_int_id)
 
                 if running_queue:

From 8f865f633d38be79a999be0cfea95774354885c3 Mon Sep 17 00:00:00 2001
From: Dipika Sikka <dipikasikka1@gmail.com>
Date: Sun, 9 Jun 2024 23:49:46 -0400
Subject: [PATCH 79/93] [Misc] Update to comply with the new
 `compressed-tensors` config (#5350)

Co-authored-by: Michael Goin <michael@neuralmagic.com>
---
 tests/quantization/test_compressed_tensors.py | 20 ++++++++++++-------
 vllm/config.py                                |  8 ++------
 .../layers/quantization/__init__.py           |  2 +-
 .../model_loader/weight_utils.py              |  9 +++------
 4 files changed, 19 insertions(+), 20 deletions(-)

diff --git a/tests/quantization/test_compressed_tensors.py b/tests/quantization/test_compressed_tensors.py
index 9d94d2ecfb22..e6d8218b4137 100644
--- a/tests/quantization/test_compressed_tensors.py
+++ b/tests/quantization/test_compressed_tensors.py
@@ -5,15 +5,15 @@
 
 import torch
 
+from vllm import SamplingParams
 from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import (  # noqa: E501
     CompressedTensorsLinearMethod, CompressedTensorsW8A8DynamicToken,
     CompressedTensorsW8A8StaticTensor)
 
 
 def test_compressed_tensors_w8a8_static_setup(vllm_runner):
-    model_path = "nm-testing/tinyllama-one-shot-static-quant-test-compressed"
-    with vllm_runner(model_path, quantization="sparseml",
-                     enforce_eager=True) as llm:
+    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    with vllm_runner(model_path, enforce_eager=True) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
 
@@ -40,11 +40,17 @@ def test_compressed_tensors_w8a8_static_setup(vllm_runner):
         assert qkv_proj.input_scale.dtype is torch.float32
 
 
+def test_compressed_tensors_no_enforce_eager(vllm_runner):
+    model_path = "nm-testing/tinyllama-oneshot-w8a8-static-v2"
+    with vllm_runner(model_path) as llm:
+        sampling_params = SamplingParams()
+        output = llm.generate("Hello world!", sampling_params=sampling_params)
+        assert output
+
+
 def test_compressed_tensors_w8a8_dynanmic_per_token(vllm_runner):
-    model_path = "nm-testing/tinyllama-one-shot-dynamic-test"
-    with vllm_runner(model_path,
-                     quantization="sparseml",
-                     enforce_eager=True,
+    model_path = "nm-testing/tinyllama-oneshot-w8a8-dynamic-token-v2"
+    with vllm_runner(model_path, enforce_eager=True,
                      dtype=torch.float16) as llm:
         model = llm.model.llm_engine.model_executor.driver_worker.model_runner.model  # noqa: E501
         layer = model.model.layers[0]
diff --git a/vllm/config.py b/vllm/config.py
index a90b9aa83ad7..72ddc8fda805 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -196,12 +196,8 @@ def _verify_embedding_mode(self) -> None:
     def _parse_quant_hf_config(self):
         quant_cfg = getattr(self.hf_config, "quantization_config", None)
         if quant_cfg is None:
-            # SparseML uses a "compression_config" with a "quantization_config".
-            compression_cfg = getattr(self.hf_config, "compression_config",
-                                      None)
-            if compression_cfg is not None:
-                quant_cfg = compression_cfg.get("quantization_config", None)
-
+            # compressed-tensors uses a "compression_config" key
+            quant_cfg = getattr(self.hf_config, "compression_config", None)
         return quant_cfg
 
     def _verify_quantization(self) -> None:
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 0bc42beb6625..40b0df75a69a 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -31,7 +31,7 @@
     "gptq_marlin": GPTQMarlinConfig,
     "gptq": GPTQConfig,
     "squeezellm": SqueezeLLMConfig,
-    "sparseml": CompressedTensorsConfig,
+    "compressed-tensors": CompressedTensorsConfig,
     "bitsandbytes": BitsAndBytesConfig,
 }
 
diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py
index e76d5798fd2e..15675b8fb4c3 100644
--- a/vllm/model_executor/model_loader/weight_utils.py
+++ b/vllm/model_executor/model_loader/weight_utils.py
@@ -137,12 +137,9 @@ def get_quant_config(model_config: ModelConfig,
     hf_quant_config = getattr(model_config.hf_config, "quantization_config",
                               None)
     if hf_quant_config is None:
-        compression_config = getattr(model_config.hf_config,
-                                     "compression_config", None)
-        if compression_config is not None:
-            hf_quant_config = compression_config.get("quantization_config",
-                                                     None)
-
+        # compressed-tensors uses a "compression_config" key
+        hf_quant_config = getattr(model_config.hf_config, "compression_config",
+                                  None)
     if hf_quant_config is not None:
         return quant_cls.from_config(hf_quant_config)
     # In case of bitsandbytes/QLoRA, get quant config from the adapter model.

From d3bd1358824062c853e4d710d5f620ce14272cdf Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 10 Jun 2024 02:13:39 -0700
Subject: [PATCH 80/93] [Frontend][Misc] Enforce Pixel Values as Input Type for
 VLMs in API Server (#5374)

---
 vllm/entrypoints/openai/api_server.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py
index 95417718b51f..e7503b965583 100644
--- a/vllm/entrypoints/openai/api_server.py
+++ b/vllm/entrypoints/openai/api_server.py
@@ -183,6 +183,16 @@ async def authentication(request: Request, call_next):
         served_model_names = [args.model]
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
+
+    # Enforce pixel values as image input type for vision language models
+    # when serving with API server
+    if engine_args.image_input_type is not None and \
+        engine_args.image_input_type.upper() != "PIXEL_VALUES":
+        raise ValueError(
+            f"Invalid image_input_type: {engine_args.image_input_type}. "
+            "Only --image-input-type 'pixel_values' is supported for serving "
+            "vision language models with the vLLM API server.")
+
     engine = AsyncLLMEngine.from_engine_args(
         engine_args, usage_context=UsageContext.OPENAI_API_SERVER)
 

From b21be06de65efe56231b3beebb7d77659b679459 Mon Sep 17 00:00:00 2001
From: youkaichao <youkaichao@gmail.com>
Date: Mon, 10 Jun 2024 02:51:02 -0700
Subject: [PATCH 81/93] [misc][typo] fix typo (#5372)

---
 .../distributed/device_communicators/custom_all_reduce_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/distributed/device_communicators/custom_all_reduce_utils.py b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
index 24ef3cb45b19..4b89a23dfc46 100644
--- a/vllm/distributed/device_communicators/custom_all_reduce_utils.py
+++ b/vllm/distributed/device_communicators/custom_all_reduce_utils.py
@@ -166,7 +166,7 @@ def gpu_p2p_access_check(i: int, j: int) -> bool:
             and (not os.path.exists(path))):
         # only the local master process (with local_rank == 0) can
         #  enter this block to calculate the cache
-        logger.info("generating GPU P2P access cache for in %s", path)
+        logger.info("generating GPU P2P access cache in %s", path)
         cache = {}
         for _i in range(num_dev):
             for _j in range(num_dev):

From 1b41d112fd1c113fd9a18a4e83b447ba342826f9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 10 Jun 2024 19:38:49 +0800
Subject: [PATCH 82/93] [Misc] Improve error message when LoRA parsing fails
 (#5194)

---
 tests/lora/test_utils.py | 14 +++++++++++++-
 vllm/lora/utils.py       | 15 +++++++--------
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/tests/lora/test_utils.py b/tests/lora/test_utils.py
index 892f6081e2aa..4ff9715b4ca8 100644
--- a/tests/lora/test_utils.py
+++ b/tests/lora/test_utils.py
@@ -1,12 +1,13 @@
 from collections import OrderedDict
 
+import pytest
 from torch import nn
 
 from vllm.lora.utils import parse_fine_tuned_lora_name, replace_submodule
 from vllm.utils import LRUCache
 
 
-def test_parse_fine_tuned_lora_name():
+def test_parse_fine_tuned_lora_name_valid():
     fixture = {
         ("base_model.model.lm_head.lora_A.weight", "lm_head", True),
         ("base_model.model.lm_head.lora_B.weight", "lm_head", False),
@@ -35,6 +36,17 @@ def test_parse_fine_tuned_lora_name():
         assert (module_name, is_lora_a) == parse_fine_tuned_lora_name(name)
 
 
+def test_parse_fine_tuned_lora_name_invalid():
+    fixture = {
+        "weight",
+        "base_model.weight",
+        "base_model.model.weight",
+    }
+    for name in fixture:
+        with pytest.raises(ValueError, match="unsupported LoRA weight"):
+            parse_fine_tuned_lora_name(name)
+
+
 def test_replace_submodule():
     model = nn.Sequential(
         OrderedDict([
diff --git a/vllm/lora/utils.py b/vllm/lora/utils.py
index b0198a50b1c5..4a86c16cf64d 100644
--- a/vllm/lora/utils.py
+++ b/vllm/lora/utils.py
@@ -94,13 +94,12 @@ def parse_fine_tuned_lora_name(name: str) -> Tuple[str, bool]:
             is_lora_a whether the tensor is lora_a or lora_b.
     """
     parts = name.split(".")
-    assert parts[0] == "base_model"
-    assert parts[1] == "model"
-    if parts[-1] == "weight":
-        assert parts[-2] == "lora_A" or parts[-2] == "lora_B"
-        return ".".join(parts[2:-2]), parts[-2] == "lora_A"
 
-    if parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
-        return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A"
+    if len(parts) >= 2 and parts[0] == "base_model" and parts[1] == "model":
+        if parts[-1] == "weight":
+            if parts[-2] == "lora_A" or parts[-2] == "lora_B":
+                return ".".join(parts[2:-2]), parts[-2] == "lora_A"
+        elif parts[-1] == "lora_embedding_A" or parts[-1] == "lora_embedding_B":
+            return ".".join(parts[2:-1]), parts[-1] == "lora_embedding_A"
 
-    raise ValueError(f"{name} is unsupported format")
+    raise ValueError(f"{name} is unsupported LoRA weight")

From f932e32579f30b9775eba2b119e67014e5de2af9 Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 10 Jun 2024 20:47:15 +0800
Subject: [PATCH 83/93] [Model] Initial support for LLaVA-NeXT (#4199)

Co-authored-by: Roger Wang <ywang@roblox.com>
---
 docs/source/models/supported_models.rst  |   6 +-
 tests/models/test_llava.py               |   2 -
 tests/models/test_llava_next.py          | 123 +++++++
 tests/multimodal/test_processor.py       |  62 +++-
 vllm/model_executor/models/__init__.py   |   2 +
 vllm/model_executor/models/llava.py      |  18 +-
 vllm/model_executor/models/llava_next.py | 445 +++++++++++++++++++++++
 7 files changed, 640 insertions(+), 18 deletions(-)
 create mode 100644 tests/models/test_llava_next.py
 create mode 100644 vllm/model_executor/models/llava_next.py

diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst
index 24fa83df7d75..5d3f55be1271 100644
--- a/docs/source/models/supported_models.rst
+++ b/docs/source/models/supported_models.rst
@@ -89,7 +89,11 @@ Alongside each architecture, we include some popular models that use it.
     - ✅︎
   * - :code:`LlavaForConditionalGeneration`
     - LLaVA-1.5
-    - :code:`llava-hf/llava-1.5-7b-hf`\*, :code:`llava-hf/llava-1.5-13b-hf`\*, etc.
+    - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc.
+    -
+  * - :code:`LlavaNextForConditionalGeneration`
+    - LLaVA-NeXT
+    - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc.
     -
   * - :code:`MiniCPMForCausalLM`
     - MiniCPM
diff --git a/tests/models/test_llava.py b/tests/models/test_llava.py
index 1f446362167a..a1f0cff1cc0e 100644
--- a/tests/models/test_llava.py
+++ b/tests/models/test_llava.py
@@ -39,8 +39,6 @@ def iter_llava_configs(model_name: str):
 
 model_and_vl_config = [
     *iter_llava_configs("llava-hf/llava-1.5-7b-hf"),
-    # Not enough memory
-    # *iter_llava_configs("llava-hf/llava-1.5-13b-hf"),
 ]
 
 
diff --git a/tests/models/test_llava_next.py b/tests/models/test_llava_next.py
new file mode 100644
index 000000000000..aa6ee268ae58
--- /dev/null
+++ b/tests/models/test_llava_next.py
@@ -0,0 +1,123 @@
+from typing import List, Tuple
+
+import pytest
+from transformers import AutoTokenizer
+
+from vllm.config import VisionLanguageConfig
+
+from ..conftest import IMAGE_FILES
+
+pytestmark = pytest.mark.llava
+
+_PREFACE = (
+    "A chat between a curious human and an artificial intelligence assistant. "
+    "The assistant gives helpful, detailed, and polite answers to the human's "
+    "questions.")
+
+# The image token is placed before "user" on purpose so that the test can pass
+HF_IMAGE_PROMPTS = [
+    f"{_PREFACE} <image>\nUSER: What's the content of the image? ASSISTANT:",
+    f"{_PREFACE} <image>\nUSER: What is the season? ASSISTANT:",
+]
+
+assert len(HF_IMAGE_PROMPTS) == len(IMAGE_FILES)
+
+
+def iter_llava_next_configs(model_name: str):
+    image_hw_to_feature_size = {
+        (336, 336): 1176,
+        (672, 672): 2928,
+        (1344, 336): 1944,
+        (336, 1344): 1890,
+    }
+
+    for (h, w), f in image_hw_to_feature_size.items():
+        for input_type, input_shape in [
+            (VisionLanguageConfig.ImageInputType.PIXEL_VALUES, (1, 3, h, w)),
+        ]:
+            yield (model_name,
+                   VisionLanguageConfig(image_input_type=input_type,
+                                        image_feature_size=f,
+                                        image_token_id=32000,
+                                        image_input_shape=input_shape,
+                                        image_processor=model_name,
+                                        image_processor_revision=None))
+
+
+model_and_vl_config = [
+    *iter_llava_next_configs("llava-hf/llava-v1.6-vicuna-7b-hf"),
+]
+
+
+def vllm_to_hf_output(vllm_output: Tuple[List[int], str],
+                      vlm_config: VisionLanguageConfig, model_id: str):
+    """Sanitize vllm output to be comparable with hf output.
+    The function reduces `input_ids` from 1, 32000, 32000, ..., 32000,
+    x1, x2, x3 ... to 1, 32000, x1, x2, x3 ...
+    It also reduces `output_str` from "<image><image>bla" to "bla".
+    """
+    input_ids, output_str = vllm_output
+    image_token_id = vlm_config.image_token_id
+
+    tokenizer = AutoTokenizer.from_pretrained(model_id)
+    image_token_str = tokenizer.decode(image_token_id)
+
+    hf_input_ids = [
+        input_id for idx, input_id in enumerate(input_ids)
+        if input_id != image_token_id or input_ids[idx - 1] != image_token_id
+    ]
+    hf_output_str = output_str \
+        .replace(image_token_str * vlm_config.image_feature_size, " ")
+
+    return hf_input_ids, hf_output_str
+
+
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("model_and_config", model_and_vl_config)
+@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("max_tokens", [128])
+def test_models(hf_runner, vllm_runner, hf_images, vllm_images,
+                model_and_config, dtype: str, max_tokens: int) -> None:
+    """Inference result should be the same between hf and vllm.
+
+    All the image fixtures for the test are under tests/images.
+    For huggingface runner, we provide the PIL images as input.
+    For vllm runner, we provide MultiModalData objects and corresponding
+    vision language config as input.
+    Note, the text input is also adjusted to abide by vllm contract.
+    The text output is sanitized to be able to compare with hf.
+    """
+    model_id, vlm_config = model_and_config
+
+    with hf_runner(model_id, dtype=dtype, is_vision_model=True) as hf_model:
+        hf_outputs = hf_model.generate_greedy(HF_IMAGE_PROMPTS,
+                                              max_tokens,
+                                              images=hf_images)
+
+    vllm_image_prompts = [
+        p.replace("<image>", "<image>" * vlm_config.image_feature_size)
+        for p in HF_IMAGE_PROMPTS
+    ]
+
+    with vllm_runner(
+            model_id,
+            dtype=dtype,
+            # should be greater than image_feature_size
+            max_model_len=4096,
+            enforce_eager=True,
+            **vlm_config.as_cli_args_dict(),
+    ) as vllm_model:
+        vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
+                                                  max_tokens,
+                                                  images=vllm_images)
+
+    for i in range(len(HF_IMAGE_PROMPTS)):
+        hf_output_ids, hf_output_str = hf_outputs[i]
+        vllm_output_ids, vllm_output_str = vllm_to_hf_output(
+            vllm_outputs[i], vlm_config, model_id)
+        assert hf_output_str == vllm_output_str, (
+            f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
+        assert hf_output_ids == vllm_output_ids, (
+            f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
diff --git a/tests/multimodal/test_processor.py b/tests/multimodal/test_processor.py
index 3df28e782dd8..51c352361702 100644
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -1,6 +1,6 @@
 import numpy as np
 import pytest
-from transformers import CLIPImageProcessor
+from transformers import CLIPImageProcessor, LlavaNextImageProcessor
 
 from vllm.config import ModelConfig, VisionLanguageConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -12,7 +12,7 @@
 @pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(hf_images, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
     assert isinstance(hf_processor, CLIPImageProcessor)
@@ -55,10 +55,61 @@ def test_clip_image_processor(hf_images, dtype):
             assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
 
 
+@pytest.mark.xfail(
+    reason="Inconsistent image processor being used due to lack "
+    "of support for dynamic image token replacement")
+@pytest.mark.parametrize("dtype", ["half", "float"])
+def test_llava_next_image_processor(hf_images, dtype):
+    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
+
+    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
+    assert isinstance(hf_processor, LlavaNextImageProcessor)
+
+    model_config = ModelConfig(
+        model=MODEL_NAME,
+        tokenizer=MODEL_NAME,
+        tokenizer_mode="auto",
+        trust_remote_code=False,
+        seed=0,
+        dtype=dtype,
+        revision=None,
+    )
+    vlm_config = VisionLanguageConfig(
+        image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
+        image_token_id=64000,
+        image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
+        image_feature_size=2928,
+        image_processor=MODEL_NAME,
+        image_processor_revision=None,
+    )
+
+    for image in hf_images:
+        hf_result = hf_processor.preprocess(
+            image,
+            return_tensors="pt",
+        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
+        vllm_result = MULTIMODAL_REGISTRY.process_input(
+            ImagePixelData(image),
+            model_config=model_config,
+            vlm_config=vlm_config,
+        )
+
+        assert hf_result.keys() == vllm_result.keys()
+        for key, hf_tensor in hf_result.items():
+            hf_arr: np.ndarray = hf_tensor.numpy()
+            vllm_arr: np.ndarray = vllm_result[key].numpy()
+
+            assert hf_arr.shape == vllm_arr.shape, f"Failed for key={key}"
+            assert np.allclose(hf_arr, vllm_arr), f"Failed for key={key}"
+
+
+@pytest.mark.xfail(
+    reason="Example image pixels were not processed using HuggingFace")
 @pytest.mark.parametrize("dtype", ["float"])
 def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
     MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 33
+    IMAGE_HEIGHT = IMAGE_WIDTH = 560
 
     model_config = ModelConfig(
         model=MODEL_NAME,
@@ -95,7 +146,4 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
             tensor_arr: np.ndarray = tensor_result[key].numpy()
 
             assert image_arr.shape == tensor_arr.shape, f"Failed for key={key}"
-
-            # The examples in PR#3042 have slightly different preprocessing from
-            # HuggingFace's LlavaProcessor, causing the test to fail.
-            # assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
+            assert np.allclose(image_arr, tensor_arr), f"Failed for key={key}"
diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py
index a92abe6b5b8d..4446914c67c8 100755
--- a/vllm/model_executor/models/__init__.py
+++ b/vllm/model_executor/models/__init__.py
@@ -33,6 +33,8 @@
     "LlamaForCausalLM": ("llama", "LlamaForCausalLM"),
     "LlavaForConditionalGeneration":
     ("llava", "LlavaForConditionalGeneration"),
+    "LlavaNextForConditionalGeneration":
+    ("llava_next", "LlavaNextForConditionalGeneration"),
     # For decapoda-research/llama-*
     "LLaMAForCausalLM": ("llama", "LlamaForCausalLM"),
     "MistralForCausalLM": ("llama", "LlamaForCausalLM"),
diff --git a/vllm/model_executor/models/llava.py b/vllm/model_executor/models/llava.py
index 3332bcc57846..67b32a08833b 100644
--- a/vllm/model_executor/models/llava.py
+++ b/vllm/model_executor/models/llava.py
@@ -1,7 +1,7 @@
 from typing import Iterable, List, Literal, Optional, Tuple, TypedDict, Union
 
 import torch
-from torch import nn
+import torch.nn as nn
 # TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
 # transformers' impl.
 from transformers import CLIPVisionModel, LlavaConfig
@@ -51,10 +51,10 @@ def forward(self, image_features: torch.Tensor) -> torch.Tensor:
         return hidden_states
 
 
-def _merge_vision_embeddings(input_ids: torch.Tensor,
-                             inputs_embeds: torch.Tensor,
-                             vision_embeddings: torch.Tensor,
-                             image_token_id: int) -> torch.Tensor:
+def merge_vision_embeddings(input_ids: torch.Tensor,
+                            inputs_embeds: torch.Tensor,
+                            vision_embeddings: torch.Tensor,
+                            image_token_id: int) -> torch.Tensor:
     """In place merges in vision_embeddings with inputs_embeds."""
     mask = (input_ids == image_token_id)
 
@@ -151,7 +151,8 @@ def _parse_and_validate_image_input(
                 return None
 
             if not isinstance(pixel_values, torch.Tensor):
-                raise ValueError("Incorrect type of pixel values")
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
 
             return LlavaImagePixelInputs(
                 type="pixel_values",
@@ -166,7 +167,8 @@ def _parse_and_validate_image_input(
                 return None
 
             if not isinstance(image_features, torch.Tensor):
-                raise ValueError("Incorrect type of image features")
+                raise ValueError("Incorrect type of image features. "
+                                 f"Got type: {type(image_features)}")
 
             return LlavaImageFeatureInputs(
                 type="image_features",
@@ -268,7 +270,7 @@ def forward(
             vision_embeddings = self._process_image_input(image_input)
             inputs_embeds = self.language_model.get_input_embeddings(input_ids)
 
-            inputs_embeds = _merge_vision_embeddings(
+            inputs_embeds = merge_vision_embeddings(
                 input_ids, inputs_embeds, vision_embeddings,
                 self.vision_language_config.image_token_id)
 
diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
new file mode 100644
index 000000000000..bb15dcb8ed91
--- /dev/null
+++ b/vllm/model_executor/models/llava_next.py
@@ -0,0 +1,445 @@
+from typing import (Dict, Iterable, List, Literal, Optional, Tuple, TypedDict,
+                    Union)
+
+import torch
+import torch.nn as nn
+from PIL import Image
+# TODO(xwjiang): We should port CLIPVisionModel's code over to not depend on
+# transformers' impl.
+from transformers import CLIPVisionModel, LlavaNextConfig
+from transformers.models.llava_next.modeling_llava_next import (
+    get_anyres_image_grid_shape, unpad_image)
+from typing_extensions import NotRequired
+
+from vllm.attention import AttentionMetadata
+from vllm.config import CacheConfig, ModelConfig, VisionLanguageConfig
+from vllm.logger import init_logger
+from vllm.model_executor.layers.logits_processor import LogitsProcessor
+from vllm.model_executor.layers.quantization.base_config import (
+    QuantizationConfig)
+from vllm.model_executor.layers.sampler import Sampler
+from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead
+from vllm.model_executor.model_loader.weight_utils import default_weight_loader
+from vllm.model_executor.models.llama import LlamaModel
+from vllm.model_executor.sampling_metadata import SamplingMetadata
+from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalData
+from vllm.multimodal.image import ImagePixelData, get_dummy_image_data
+from vllm.sequence import SamplerOutput, SequenceData
+
+from .llava import LlavaMultiModalProjector, merge_vision_embeddings
+from .vlm_base import VisionLanguageModelBase
+
+logger = init_logger(__name__)
+
+_KEYS_TO_MODIFY_MAPPING = {
+    "language_model.lm_head": "lm_head",
+    "language_model.model": "language_model",
+}
+
+
+class LlavaNextImagePixelInputs(TypedDict):
+    type: Literal["pixel_values"]
+    data: torch.Tensor
+    """Shape: (batch_size, 1 + num_patches, num_channels, height, width)"""
+
+    image_sizes: NotRequired[torch.Tensor]
+    """Shape: (batch_size, 2)"""
+
+
+class LlavaNextImageFeatureInputs(TypedDict):
+    type: Literal["image_features"]
+    data: torch.Tensor
+    """Shape: (batch_size, 1 + num_patches, image_feature_size, hidden_size)"""
+
+    image_sizes: NotRequired[torch.Tensor]
+    """Shape: (batch_size, 2)"""
+
+
+LlavaNextImageInputs = Union[LlavaNextImagePixelInputs,
+                             LlavaNextImageFeatureInputs]
+
+
+def _get_dummy_image_data(
+    seq_len: int,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Tuple[SequenceData, MultiModalData]:
+    seq_data, fake_mm_data = get_dummy_image_data(seq_len, model_config,
+                                                  vlm_config)
+
+    config_input_type = vlm_config.image_input_type
+    ImageInputType = VisionLanguageConfig.ImageInputType
+
+    if config_input_type == ImageInputType.PIXEL_VALUES:
+        _, c, h, w = vlm_config.image_input_shape
+        mode = {1: "L", 3: "RGB"}[c]
+        fake_mm_data = ImagePixelData(Image.new(mode, (w, h), color=0))
+
+    return seq_data, fake_mm_data
+
+
+def _image_pixel_processor(
+    data: ImagePixelData,
+    model_config: ModelConfig,
+    vlm_config: VisionLanguageConfig,
+) -> Dict[str, torch.Tensor]:
+    image = data.image
+
+    if isinstance(image, torch.Tensor):
+        pixel_values = image.to(model_config.dtype)
+        batch_size, _, _, h, w = pixel_values.shape
+        image_sizes = torch.tensor([(w, h) for _ in range(batch_size)])
+
+        return {"pixel_values": pixel_values, "image_sizes": image_sizes}
+
+    # Temporary patch before dynamic number of image tokens is supported
+    _, _, h, w = vlm_config.image_input_shape
+    if (w, h) != (image.width, image.height):
+        logger.warning(
+            "Dynamic image shape is currently not supported. "
+            "Resizing input image to (%d, %d).", w, h)
+
+        data.image = image.resize((w, h))
+
+    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
+        ._default_input_processor(data, model_config, vlm_config)
+
+
+@MULTIMODAL_REGISTRY.register_image_pixel_input(_image_pixel_processor)
+@MULTIMODAL_REGISTRY.register_dummy_data(_get_dummy_image_data)
+class LlavaNextForConditionalGeneration(VisionLanguageModelBase):
+    """
+    Args to `forward()`:
+        input_ids: Flattened (concatenated) input_ids corresponding to a
+            batch.
+        pixel_values: For PIXEL_VALUES, expects a batch with shape
+            [1, num_patches, 3, 336, 336].
+        image_features: For IMAGE_FEATURES, expects a batch with shape
+            [1, num_patches, 1176, 1024].
+    """
+
+    def __init__(self,
+                 config: LlavaNextConfig,
+                 vision_language_config: VisionLanguageConfig,
+                 cache_config: Optional[CacheConfig] = None,
+                 quant_config: Optional[QuantizationConfig] = None) -> None:
+        super().__init__(vision_language_config)
+
+        # Update the type annotation from that of its superclass
+        self.config = config
+
+        if self.vision_language_config.image_input_type == (
+                VisionLanguageConfig.ImageInputType.PIXEL_VALUES):
+            self.vision_tower = CLIPVisionModel(config.vision_config)
+        else:
+            raise TypeError("Image features are not supported by LLaVA-NeXT")
+
+        self.multi_modal_projector = LlavaMultiModalProjector(
+            vision_hidden_size=config.vision_config.hidden_size,
+            text_hidden_size=config.text_config.hidden_size,
+            projector_hidden_act=config.projector_hidden_act)
+
+        self.quant_config = quant_config
+        self.language_model = LlamaModel(config.text_config, cache_config,
+                                         quant_config)
+        self.unpadded_vocab_size = config.text_config.vocab_size
+        self.lm_head = ParallelLMHead(
+            self.unpadded_vocab_size,
+            config.text_config.hidden_size,
+            org_num_embeddings=self.language_model.org_vocab_size)
+        logit_scale = getattr(config, "logit_scale", 1.0)
+        self.logits_processor = LogitsProcessor(self.unpadded_vocab_size,
+                                                config.vocab_size, logit_scale)
+        self.sampler = Sampler()
+
+        self.image_newline = nn.Parameter(
+            torch.empty(config.text_config.hidden_size))
+
+    def _validate_image_pixels(self, data: torch.Tensor) -> torch.Tensor:
+        _, num_channels, _, _ = self.vision_language_config.image_input_shape
+
+        # Note that this is different from that of vLLM vision_language_config
+        # since the image is resized by the HuggingFace preprocessor
+        height = width = self.config.vision_config.image_size
+
+        if list(data.shape[2:]) != [num_channels, height, width]:
+            raise ValueError(
+                f"The expected image tensor shape is batch dimension plus "
+                f"num_patches plus {[num_channels, height, width]}. "
+                f"You supplied {data.shape}. "
+                f"If you are using vLLM's entrypoint, make sure your "
+                f"supplied image input is consistent with "
+                f"image_input_shape in engine args.")
+
+        return data
+
+    def _validate_image_sizes(self, data: torch.Tensor) -> torch.Tensor:
+        if list(data.shape[1:]) != [2]:
+            raise ValueError(
+                f"The expected image sizes shape is batch dimension plus "
+                f"{[2]}. You supplied {data.shape}.")
+
+        return data
+
+    def _parse_and_validate_image_input(
+            self, **kwargs: object) -> Optional[LlavaNextImageInputs]:
+        pixel_values = kwargs.pop("pixel_values", None)
+        image_sizes = kwargs.pop("image_sizes", None)
+        image_features = kwargs.pop("image_features", None)
+
+        expected_input_type = self.vision_language_config.image_input_type
+        ImageInputType = VisionLanguageConfig.ImageInputType
+
+        if expected_input_type == ImageInputType.PIXEL_VALUES:
+            if image_features is not None:
+                raise ValueError(
+                    "Expected pixel values but got image features")
+            if pixel_values is None:
+                return None
+
+            if not isinstance(pixel_values, torch.Tensor):
+                raise ValueError("Incorrect type of pixel values. "
+                                 f"Got type: {type(pixel_values)}")
+
+            if not isinstance(image_sizes, torch.Tensor):
+                raise ValueError("Incorrect type of image sizes. "
+                                 f"Got type: {type(image_sizes)}")
+
+            return LlavaNextImagePixelInputs(
+                type="pixel_values",
+                data=self._validate_image_pixels(pixel_values),
+                image_sizes=self._validate_image_sizes(image_sizes),
+            )
+
+        assert expected_input_type != ImageInputType.IMAGE_FEATURES, (
+            "Failed to validate this at initialization time")
+
+        return None
+
+    def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
+                                      patch_embeddings: torch.Tensor, *,
+                                      strategy: str) -> torch.Tensor:
+        # Based on: https://github.com/haotian-liu/LLaVA/blob/main/llava/model/llava_arch.py
+        if strategy == "flat":
+            return patch_embeddings.flatten(0, 1)
+
+        if strategy.startswith("spatial"):
+            orig_width, orig_height = image_size
+            height = width = self.config.vision_config.image_size \
+                // self.config.vision_config.patch_size
+
+            base_patch_embeds = patch_embeddings[0]
+            if height * width != base_patch_embeds.shape[0]:
+                raise ValueError(
+                    "The number of patches is not consistent with the "
+                    "image size.")
+
+            if patch_embeddings.shape[0] > 1:
+                other_patch_embeds = patch_embeddings[1:]
+
+                # image_aspect_ratio == "anyres"
+                num_patch_width, num_patch_height = get_anyres_image_grid_shape(
+                    (orig_width, orig_height),
+                    self.config.image_grid_pinpoints,
+                    self.config.vision_config.image_size,
+                )
+                other_patch_embeds = other_patch_embeds \
+                    .view(num_patch_width, num_patch_height, height, width, -1)
+
+                if "unpad" in strategy:
+                    other_patch_embeds = other_patch_embeds \
+                        .permute(4, 0, 2, 1, 3).contiguous() \
+                        .flatten(1, 2).flatten(2, 3)
+                    other_patch_embeds = unpad_image(other_patch_embeds,
+                                                     image_size)
+                    other_patch_embeds = torch.cat((
+                        other_patch_embeds,
+                        self.image_newline[:, None, None] \
+                            .expand(*other_patch_embeds.shape[:-1], 1) \
+                            .to(other_patch_embeds.device),
+                    ), dim=-1)
+                    other_patch_embeds = other_patch_embeds \
+                        .flatten(1, 2).transpose(0, 1)
+                else:
+                    other_patch_embeds = other_patch_embeds \
+                        .permute(0, 2, 1, 3, 4).contiguous() \
+                        .flatten(0, 3)
+
+                merged_patch_embeddings = torch.cat(
+                    (base_patch_embeds, other_patch_embeds), dim=0)
+            else:
+                if "unpad" in strategy:
+                    merged_patch_embeddings = torch.cat(
+                        (base_patch_embeds,
+                         self.image_newline[None] \
+                            .to(base_patch_embeds.device)
+                    ), dim=0)
+                else:
+                    merged_patch_embeddings = base_patch_embeds
+
+            return merged_patch_embeddings
+
+        raise ValueError(f"Unexpected patch merge strategy: {strategy}")
+
+    def _process_image_pixels(
+            self, inputs: LlavaNextImagePixelInputs) -> torch.Tensor:
+        assert self.vision_tower is not None
+
+        pixel_values = inputs["data"]
+
+        b, num_patches, c, h, w = pixel_values.shape
+        stacked_pixel_values = pixel_values.view(b * num_patches, c, h, w)
+
+        stacked_image_features = self._image_pixels_to_features(
+            self.vision_tower, stacked_pixel_values)
+
+        return stacked_image_features.view(b, num_patches,
+                                           *stacked_image_features.shape[-2:])
+
+    def _process_image_input(
+            self, image_input: LlavaNextImageInputs) -> torch.Tensor:
+        if image_input["type"] == "pixel_values":
+            assert self.vision_tower is not None
+            image_features = self._process_image_pixels(image_input)
+        else:
+            image_features = image_input["data"]
+
+        patch_embeddings = self.multi_modal_projector(image_features)
+
+        image_sizes = image_input.get("image_sizes")
+        if image_sizes is None:
+            batch_size = image_input["data"].shape[0]
+            vision_config = self.config.vision_config
+            default_width = default_height = vision_config.image_size
+            image_sizes = torch.as_tensor([[default_width, default_height]
+                                           for _ in range(batch_size)])
+
+        merged_patch_embeddings = [
+            self._merge_image_patch_embeddings(image_sizes[i],
+                                               patch_features,
+                                               strategy="spatial_unpad")
+            for i, patch_features in enumerate(patch_embeddings)
+        ]
+
+        return torch.stack(merged_patch_embeddings, dim=0)
+
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        positions: torch.Tensor,
+        kv_caches: List[torch.Tensor],
+        attn_metadata: AttentionMetadata,
+        **kwargs: object,
+    ) -> SamplerOutput:
+        """Run forward pass for LLaVA-NeXT.
+
+        One key thing to understand is the `input_ids` already accounts for the
+        positions of the to-be-inserted image embeddings.
+        Concretely, consider a text prompt:
+        "<image>\nUSER: What's the content of the image?\nASSISTANT:".
+        Tokenizer outputs:
+        [1, 32000, 29871, 13, 11889, 29901, 1724, 29915, 29879, 278,
+        2793, 310, 278, 1967, 29973, 13, 22933, 9047, 13566, 29901].
+        The to-be-inserted image has a size of 576 (24 * 24) along the context
+        length dimension.
+        `input_ids` is thus [1, 32000, ..., 32000, 29871, 13, 11889, 29901,
+        1724, 29915, 29879, 278, 2793, 310, 278, 1967, 29973, 13, 22933,
+        9047, 13566, 29901].
+        There will be 576 `32000` in the `input_ids`.
+        (32000 is the token id for `<image>`.)
+
+        This way, the `positions` and `attn_metadata` are consistent
+        with the `input_ids`.
+
+        The model takes two types of image inputs:
+        PIXEL_VALUES and IMAGE_FEATURES.
+        The following shows how each maps to huggingface implementation.
+        PIXEL_VALUES:
+        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L353
+        IMAGE_FEATURES:
+        - https://github.com/huggingface/transformers/blob/07bdbeb/src/transformers/models/llava/modeling_llava.py#L430
+        before going through the multi modal projector.
+
+        Args:
+            input_ids: Flattened (concatenated) input_ids corresponding to a
+                batch.
+            pixel_values: For PIXEL_VALUES, expects a batch with shape
+                [1, num_patches, 3, 336, 336].
+            image_features: For IMAGE_FEATURES, expects a batch with shape
+                [1, 576, 1024].
+        """
+        image_input = self._parse_and_validate_image_input(**kwargs)
+
+        if image_input is not None:
+            vision_embeddings = self._process_image_input(image_input)
+            inputs_embeds = self.language_model.get_input_embeddings(input_ids)
+
+            inputs_embeds = merge_vision_embeddings(
+                input_ids, inputs_embeds, vision_embeddings,
+                self.vision_language_config.image_token_id)
+
+            input_ids = None
+        else:
+            inputs_embeds = None
+
+        hidden_states = self.language_model(input_ids,
+                                            positions,
+                                            kv_caches,
+                                            attn_metadata,
+                                            inputs_embeds=inputs_embeds)
+
+        return hidden_states
+
+    def compute_logits(self, hidden_states: torch.Tensor,
+                       sampling_metadata: SamplingMetadata) -> torch.Tensor:
+        logits = self.logits_processor(self.lm_head.weight, hidden_states,
+                                       sampling_metadata)
+        return logits
+
+    def sample(
+        self,
+        logits: torch.Tensor,
+        sampling_metadata: SamplingMetadata,
+    ) -> Optional[SamplerOutput]:
+        next_tokens = self.sampler(logits, sampling_metadata)
+        return next_tokens
+
+    def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
+        # only doing this for language model part for now.
+        stacked_params_mapping = [
+            # (param_name, shard_name, shard_id)
+            ("qkv_proj", "q_proj", "q"),
+            ("qkv_proj", "k_proj", "k"),
+            ("qkv_proj", "v_proj", "v"),
+            ("gate_up_proj", "gate_proj", 0),
+            ("gate_up_proj", "up_proj", 1),
+        ]
+        params_dict = dict(self.named_parameters())
+        for name, loaded_weight in weights:
+            if "rotary_emb.inv_freq" in name:
+                continue
+            for key_to_modify, new_key in _KEYS_TO_MODIFY_MAPPING.items():
+                if key_to_modify in name:
+                    name = name.replace(key_to_modify, new_key)
+            use_default_weight_loading = False
+            if "vision" in name:
+                if self.vision_tower is not None:
+                    # We only do sharding for language model and
+                    # not vision model for now.
+                    use_default_weight_loading = True
+            else:
+                for (param_name, weight_name,
+                     shard_id) in stacked_params_mapping:
+                    if weight_name not in name:
+                        continue
+                    param = params_dict[name.replace(weight_name, param_name)]
+                    weight_loader = param.weight_loader
+                    weight_loader(param, loaded_weight, shard_id)
+                    break
+                else:
+                    use_default_weight_loading = True
+            if use_default_weight_loading:
+                param = params_dict[name]
+                weight_loader = getattr(param, "weight_loader",
+                                        default_weight_loader)
+                weight_loader(param, loaded_weight)

From e3f0b3263c40f4c3a54b36503f49c4f6ecbf8047 Mon Sep 17 00:00:00 2001
From: Itay Etelis <92247226+Etelis@users.noreply.github.com>
Date: Mon, 10 Jun 2024 17:22:09 +0300
Subject: [PATCH 84/93] [Feature][Frontend]:  Continued `stream_options`
 implementation also in CompletionRequest (#5319)

---
 tests/entrypoints/test_openai_server.py       | 236 ++++++++++--------
 vllm/entrypoints/openai/protocol.py           |   9 +
 vllm/entrypoints/openai/serving_chat.py       |  35 ++-
 vllm/entrypoints/openai/serving_completion.py |  26 +-
 4 files changed, 180 insertions(+), 126 deletions(-)

diff --git a/tests/entrypoints/test_openai_server.py b/tests/entrypoints/test_openai_server.py
index c6fc4769987f..06bd0bf15c84 100644
--- a/tests/entrypoints/test_openai_server.py
+++ b/tests/entrypoints/test_openai_server.py
@@ -477,8 +477,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
         temperature=0.0,
     )
     single_output = single_completion.choices[0].text
-    single_usage = single_completion.usage
-
     stream = await client.completions.create(model=model_name,
                                              prompt=prompt,
                                              max_tokens=5,
@@ -494,7 +492,6 @@ async def test_completion_streaming(server, client: openai.AsyncOpenAI,
     assert finish_reason_count == 1
     assert chunk.choices[0].finish_reason == "length"
     assert chunk.choices[0].text
-    assert chunk.usage == single_usage
     assert "".join(chunks) == single_output
 
 
@@ -549,6 +546,138 @@ async def test_chat_streaming(server, client: openai.AsyncOpenAI,
     assert "".join(chunks) == output
 
 
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_chat_completion_stream_options(server,
+                                              client: openai.AsyncOpenAI,
+                                              model_name: str):
+    messages = [{
+        "role": "system",
+        "content": "You are a helpful assistant."
+    }, {
+        "role": "user",
+        "content": "What is the capital of France?"
+    }]
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.chat.completions.create(
+        model=model_name,
+        messages=messages,
+        max_tokens=10,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.chat.completions.create(
+            model=model_name,
+            messages=messages,
+            max_tokens=10,
+            temperature=0.0,
+            stream=False,
+            stream_options={"include_usage": True})
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize(
+    "model_name",
+    ["HuggingFaceH4/zephyr-7b-beta", "zephyr-lora"],
+)
+async def test_completion_stream_options(server, client: openai.AsyncOpenAI,
+                                         model_name: str):
+    prompt = "What is the capital of France?"
+
+    # Test stream=True, stream_options={"include_usage": False}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": False})
+    async for chunk in stream:
+        assert chunk.usage is None
+
+    # Test stream=True, stream_options={"include_usage": True}
+    stream = await client.completions.create(
+        model=model_name,
+        prompt=prompt,
+        max_tokens=5,
+        temperature=0.0,
+        stream=True,
+        stream_options={"include_usage": True})
+    async for chunk in stream:
+        if chunk.choices[0].finish_reason is None:
+            assert chunk.usage is None
+        else:
+            assert chunk.usage is None
+            final_chunk = await stream.__anext__()
+            assert final_chunk.usage is not None
+            assert final_chunk.usage.prompt_tokens > 0
+            assert final_chunk.usage.completion_tokens > 0
+            assert final_chunk.usage.total_tokens == (
+                final_chunk.usage.prompt_tokens +
+                final_chunk.usage.completion_tokens)
+            assert final_chunk.choices == []
+
+    # Test stream=False, stream_options={"include_usage": None}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": None})
+
+    # Test stream=False, stream_options={"include_usage": True}
+    with pytest.raises(BadRequestError):
+        await client.completions.create(model=model_name,
+                                        prompt=prompt,
+                                        max_tokens=5,
+                                        temperature=0.0,
+                                        stream=False,
+                                        stream_options={"include_usage": True})
+
+
 @pytest.mark.asyncio
 @pytest.mark.parametrize(
     # just test 1 lora hereafter
@@ -1342,106 +1471,5 @@ async def test_batch_embedding(embedding_server, client: openai.AsyncOpenAI,
     assert embeddings.usage.total_tokens == 17
 
 
-@pytest.mark.parametrize(
-    "model_name",
-    [MODEL_NAME],
-)
-async def test_stream_options(server, client: openai.AsyncOpenAI,
-                              model_name: str):
-    prompt = "What is the capital of France?"
-
-    # Test stream=True, stream_options=None
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options=None,
-    )
-    chunks = []
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-    assert len(chunks) > 0
-    assert "usage" not in chunk
-
-    # Test stream=True, stream_options={"include_usage": False}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": False},
-    )
-    chunks = []
-    async for chunk in stream:
-        chunks.append(chunk.choices[0].text)
-    assert len(chunks) > 0
-    assert "usage" not in chunk
-
-    # Test stream=True, stream_options={"include_usage": True}
-    stream = await client.completions.create(
-        model=model_name,
-        prompt=prompt,
-        max_tokens=5,
-        temperature=0.0,
-        stream=True,
-        stream_options={"include_usage": True},
-    )
-    chunks = []
-    finish_reason_count = 0
-    async for chunk in stream:
-        if chunk.choices[0].finish_reason is None:
-            assert chunk.usage is None
-            chunks.append(chunk.choices[0].text)
-        else:
-            assert chunk.usage is None
-            finish_reason_count += 1
-
-    # The last message should have usage and no choices
-    last_message = await stream.__anext__()
-    assert last_message.usage is not None
-    assert last_message.usage.prompt_tokens > 0
-    assert last_message.usage.completion_tokens > 0
-    assert last_message.usage.total_tokens == (
-        last_message.usage.prompt_tokens +
-        last_message.usage.completion_tokens)
-    assert last_message.choices == []
-
-    # Test stream=False, stream_options={"include_usage": None}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": None},
-        )
-
-    # Test stream=False, stream_options={"include_usage": False}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": False},
-        )
-
-    # Test stream=False, stream_options={"include_usage": True}
-    with pytest.raises(BadRequestError):
-        await client.completions.create(
-            model=model_name,
-            prompt=prompt,
-            max_tokens=5,
-            temperature=0.0,
-            stream=False,
-            stream_options={"include_usage": True},
-        )
-
-
 if __name__ == "__main__":
     pytest.main([__file__])
diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py
index fa33318786b9..9424ccc959d1 100644
--- a/vllm/entrypoints/openai/protocol.py
+++ b/vllm/entrypoints/openai/protocol.py
@@ -346,6 +346,7 @@ class CompletionRequest(OpenAIBaseModel):
                                 le=torch.iinfo(torch.long).max)
     stop: Optional[Union[str, List[str]]] = Field(default_factory=list)
     stream: Optional[bool] = False
+    stream_options: Optional[StreamOptions] = None
     suffix: Optional[str] = None
     temperature: Optional[float] = 1.0
     top_p: Optional[float] = 1.0
@@ -482,6 +483,14 @@ def check_logprobs(cls, data):
                               " in the interval [0, 5]."))
         return data
 
+    @model_validator(mode="before")
+    @classmethod
+    def validate_stream_options(cls, data):
+        if data.get("stream_options") and not data.get("stream"):
+            raise ValueError(
+                "Stream options can only be defined when stream is True.")
+        return data
+
 
 class EmbeddingRequest(BaseModel):
     # Ordered by official OpenAI API documentation
diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py
index c025e7e96826..dae60e4ec99f 100644
--- a/vllm/entrypoints/openai/serving_chat.py
+++ b/vllm/entrypoints/openai/serving_chat.py
@@ -441,25 +441,24 @@ async def chat_completion_stream_generator(
                         yield f"data: {data}\n\n"
                         finish_reason_sent[i] = True
 
-                    if (request.stream_options
-                            and request.stream_options.include_usage):
-                        final_usage = UsageInfo(
-                            prompt_tokens=prompt_tokens,
-                            completion_tokens=previous_num_tokens[i],
-                            total_tokens=prompt_tokens +
-                            previous_num_tokens[i],
-                        )
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage = UsageInfo(
+                    prompt_tokens=prompt_tokens,
+                    completion_tokens=sum(previous_num_tokens),
+                    total_tokens=prompt_tokens + sum(previous_num_tokens),
+                )
 
-                        final_usage_chunk = ChatCompletionStreamResponse(
-                            id=request_id,
-                            object=chunk_object_type,
-                            created=created_time,
-                            choices=[],
-                            model=model_name,
-                            usage=final_usage)
-                        final_usage_data = (final_usage_chunk.model_dump_json(
-                            exclude_unset=True, exclude_none=True))
-                        yield f"data: {final_usage_data}\n\n"
+                final_usage_chunk = ChatCompletionStreamResponse(
+                    id=request_id,
+                    object=chunk_object_type,
+                    created=created_time,
+                    choices=[],
+                    model=model_name,
+                    usage=final_usage)
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
 
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
diff --git a/vllm/entrypoints/openai/serving_completion.py b/vllm/entrypoints/openai/serving_completion.py
index 572878b5527d..c3c40f2b97d1 100644
--- a/vllm/entrypoints/openai/serving_completion.py
+++ b/vllm/entrypoints/openai/serving_completion.py
@@ -264,7 +264,8 @@ async def completion_stream_generator(
                         )
                     else:
                         final_usage = None
-                    response_json = CompletionStreamResponse(
+
+                    chunk = CompletionStreamResponse(
                         id=request_id,
                         created=created_time,
                         model=model_name,
@@ -276,10 +277,27 @@ async def completion_stream_generator(
                                 finish_reason=finish_reason,
                                 stop_reason=stop_reason,
                             )
-                        ],
-                        usage=final_usage,
-                    ).model_dump_json(exclude_unset=True)
+                        ])
+                    if (request.stream_options
+                            and request.stream_options.include_usage):
+                        chunk.usage = None
+
+                    response_json = chunk.model_dump_json(exclude_unset=True)
                     yield f"data: {response_json}\n\n"
+
+            if (request.stream_options
+                    and request.stream_options.include_usage):
+                final_usage_chunk = CompletionStreamResponse(
+                    id=request_id,
+                    created=created_time,
+                    model=model_name,
+                    choices=[],
+                    usage=final_usage,
+                )
+                final_usage_data = (final_usage_chunk.model_dump_json(
+                    exclude_unset=True, exclude_none=True))
+                yield f"data: {final_usage_data}\n\n"
+
         except ValueError as e:
             # TODO: Use a vllm-specific Validation Error
             data = self.create_streaming_error_response(str(e))

From f8392d69a1ce4d2e24eaf09ace4685305560498c Mon Sep 17 00:00:00 2001
From: Cyrus Leung <tlleungac@connect.ust.hk>
Date: Mon, 10 Jun 2024 23:38:47 +0800
Subject: [PATCH 85/93] [Bugfix] Fix LLaVA-NeXT (#5380)

---
 vllm/model_executor/models/llava_next.py | 24 ++++++++++++++++++++++++
 vllm/multimodal/utils.py                 |  2 +-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/llava_next.py b/vllm/model_executor/models/llava_next.py
index bb15dcb8ed91..57cbd1e4a601 100644
--- a/vllm/model_executor/models/llava_next.py
+++ b/vllm/model_executor/models/llava_next.py
@@ -216,6 +216,30 @@ def _parse_and_validate_image_input(
 
         return None
 
+    def _select_image_features(self, image_features: torch.Tensor, *,
+                               strategy: str) -> torch.Tensor:
+        # Copied from https://github.com/huggingface/transformers/blob/39c3c0a72af6fbda5614dde02ff236069bb79827/src/transformers/models/llava/modeling_llava.py#L421  # noqa
+        if strategy == "default":
+            return image_features[:, 1:]
+        elif strategy == "full":
+            return image_features
+
+        raise ValueError(f"Unexpected select feature strategy: {strategy}")
+
+    def _image_pixels_to_features(self, vision_tower: CLIPVisionModel,
+                                  pixel_values: torch.Tensor) -> torch.Tensor:
+        # TODO(xwjiang): Maybe port minimal CLIPVisionModel over.
+        image_outputs = vision_tower(pixel_values.to(vision_tower.device),
+                                     output_hidden_states=True)
+
+        image_features = image_outputs.hidden_states[
+            self.config.vision_feature_layer]
+
+        return self._select_image_features(
+            image_features,
+            strategy=self.config.vision_feature_select_strategy,
+        )
+
     def _merge_image_patch_embeddings(self, image_size: torch.Tensor,
                                       patch_embeddings: torch.Tensor, *,
                                       strategy: str) -> torch.Tensor:
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index b8ad6f8f78e2..c6311d60e0bd 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -77,7 +77,7 @@ def get_full_image_text_prompt(image_prompt: str, text_prompt: str,
     """Combine image and text prompts for vision language model depending on
     the model architecture."""
 
-    if config.hf_config.model_type == "llava":
+    if config.hf_config.model_type in ("llava", "llava_next"):
         full_prompt = f"{image_prompt}\n{text_prompt}"
     else:
         raise ValueError(

From 9d8243316d65ee32644f3e8c9bb9ee79695f71c8 Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 10 Jun 2024 09:21:11 -0700
Subject: [PATCH 86/93] [ci] Use small_cpu_queue for doc build (#5331)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/test-template-aws.j2 | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 9f7d07acca29..4967583055ec 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -22,7 +22,9 @@ steps:
   {% for step in steps %}
   - label: "{{ step.label }}"
     agents:
-      {% if step.no_gpu %}
+      {% if step.label == "Documentation Build" %}
+      queue: small_cpu_queue
+      {% elif step.no_gpu %}
       queue: cpu_queue
       {% elif step.num_gpus == 2 or step.num_gpus == 4 %}
       queue: gpu_4_queue

From a9bd95b8200b35becb88a0b844e523ece6d49c6f Mon Sep 17 00:00:00 2001
From: "Kevin H. Luu" <kevin@anyscale.com>
Date: Mon, 10 Jun 2024 09:22:34 -0700
Subject: [PATCH 87/93] [ci] Mount buildkite agent on Docker container to
 upload benchmark results (#5330)

Signed-off-by: kevin <kevin@anyscale.com>
---
 .buildkite/run-benchmarks.sh    | 6 +++---
 .buildkite/test-template-aws.j2 | 3 +++
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/.buildkite/run-benchmarks.sh b/.buildkite/run-benchmarks.sh
index 6283cd106401..75e9cf6a6579 100644
--- a/.buildkite/run-benchmarks.sh
+++ b/.buildkite/run-benchmarks.sh
@@ -54,12 +54,12 @@ tail -n 24 benchmark_serving.txt >> benchmark_results.md # last 24 lines
 echo '```' >> benchmark_results.md
 
 # if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /workspace/buildkite-agent ]; then
+if [ ! -f buildkite-agent ]; then
     exit 0
 fi
 
 # upload the results to buildkite
-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
+buildkite-agent annotate --style "info" --context "benchmark-results" < benchmark_results.md
 
 # exit with the exit code of the benchmarks
 if [ $bench_latency_exit_code -ne 0 ]; then
@@ -75,4 +75,4 @@ if [ $bench_serving_exit_code -ne 0 ]; then
 fi
 
 rm ShareGPT_V3_unfiltered_cleaned_split.json
-/workspace/buildkite-agent artifact upload "*.json"
+buildkite-agent artifact upload "*.json"
diff --git a/.buildkite/test-template-aws.j2 b/.buildkite/test-template-aws.j2
index 4967583055ec..3b5d36b24667 100644
--- a/.buildkite/test-template-aws.j2
+++ b/.buildkite/test-template-aws.j2
@@ -49,6 +49,9 @@ steps:
           {% if not step.no_gpu %}
           gpus: all
           {% endif %}
+          {% if step.label == "Benchmarks" %}
+          mount-buildkite-agent: true
+          {% endif %}
           command: ["bash", "-c", "cd {{ (step.working_dir or default_working_dir) | safe  }} && {{ step.command  or (step.commands | join(' && ')) | safe }}"]
           environment:
             - VLLM_USAGE_SOURCE=ci-test

From 6823d9e8c01a7056945d92469d115bb38a9fc2b7 Mon Sep 17 00:00:00 2001
From: Roger Wang <136131678+ywang96@users.noreply.github.com>
Date: Mon, 10 Jun 2024 09:53:50 -0700
Subject: [PATCH 88/93] [Docs] Add Docs on Limitations of VLM Support (#5383)

---
 docs/source/conf.py        | 1 +
 docs/source/models/vlm.rst | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/docs/source/conf.py b/docs/source/conf.py
index f1a7013edd33..ee0f6c53bd1b 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -92,6 +92,7 @@ def setup(app):
     "vllm._C",
     "PIL",
     "numpy",
+    "triton",
     "tqdm",
     "tensorizer",
 ]
diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst
index b917688a529d..33aa8246b2e6 100644
--- a/docs/source/models/vlm.rst
+++ b/docs/source/models/vlm.rst
@@ -16,6 +16,14 @@ The following :ref:`engine arguments <engine_args>` are specific to VLMs:
     :prog: -m vllm.entrypoints.openai.api_server
     :nodefaultconst:
 
+.. important::
+    Currently, the support for vision language models on vLLM has the following limitations:
+
+    * Only single image input is supported per text prompt.
+    * Dynamic ``image_input_shape`` is not supported: the input image will be resized to the static ``image_input_shape``. This means model output might not exactly match the huggingface implementation.
+
+    We are continuously improving user & developer experience for VLMs. Please raise an issue on GitHub if you have any feedback or feature requests.
+
 Offline Batched Inference
 -------------------------
 
@@ -31,7 +39,7 @@ To initialize a VLM, the aforementioned arguments must be passed to the ``LLM``
         image_feature_size=576,
     )
 
-For now, we only support a single image per text prompt. To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
+To pass an image to the model, note the following in :class:`vllm.inputs.PromptStrictInputs`:
 
 * ``prompt``: The prompt should have a number of ``<image>`` tokens equal to ``image_feature_size``.
 * ``multi_modal_data``: This should be an instance of :class:`~vllm.multimodal.image.ImagePixelData` or :class:`~vllm.multimodal.image.ImageFeatureData`.

From ca0ae3c58f688fd82e8df84bc383d9b49c617bbd Mon Sep 17 00:00:00 2001
From: Woosuk Kwon <woosuk.kwon@berkeley.edu>
Date: Mon, 10 Jun 2024 13:17:19 -0700
Subject: [PATCH 89/93] [Docs] Alphabetically sort sponsors (#5386)

---
 docs/source/community/sponsors.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/source/community/sponsors.md b/docs/source/community/sponsors.md
index 17586125fd6f..c8f2c16d3187 100644
--- a/docs/source/community/sponsors.md
+++ b/docs/source/community/sponsors.md
@@ -15,12 +15,12 @@ vLLM is a community project. Our compute resources for development and testing a
 - Dropbox
 - Lambda Lab
 - NVIDIA
-- Sequoia Capital
 - Replicate
 - Roblox
 - RunPod
+- Sequoia Capital
 - Trainy
 - UC Berkeley
 - UC San Diego
 
-We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.
\ No newline at end of file
+We also have an official fundraising venue through [OpenCollective](https://opencollective.com/vllm). We plan to use the fund to support the development, maintenance, and adoption of vLLM.

From 16be761ff70e5ebd232b6aa9fd2e1a32b9d7e062 Mon Sep 17 00:00:00 2001
From: Simon Mo <simon.mo@hey.com>
Date: Mon, 10 Jun 2024 17:56:06 -0500
Subject: [PATCH 90/93] Bump version to v0.5.0 (#5384)


From 14448229ae0be9f3cda87d743724ef367319695d Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 11 Jun 2024 01:38:52 +0000
Subject: [PATCH 91/93] format

---
 tests/models/test_marlin.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/models/test_marlin.py b/tests/models/test_marlin.py
index dee086733e4a..a3df2890f307 100644
--- a/tests/models/test_marlin.py
+++ b/tests/models/test_marlin.py
@@ -23,8 +23,6 @@
 from tests.models.utils import check_logprobs_close
 from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
 
-from .utils import check_logprobs_close
-
 marlin_not_supported = True
 
 if torch.cuda.is_available():

From 2df326fdce6f4e9369381c045c16d9afda62b1ce Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 11 Jun 2024 11:43:51 +0000
Subject: [PATCH 92/93] updated test model logprobs

---
 tests/models/test_models_logprobs.py | 18 +++++++-----------
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py
index 04c172e0a794..a07d4e1e5d89 100644
--- a/tests/models/test_models_logprobs.py
+++ b/tests/models/test_models_logprobs.py
@@ -13,7 +13,7 @@
     "meta-llama/Llama-2-7b-hf",
     "mistralai/Mistral-7B-v0.1",
     "Deci/DeciLM-7b",
-    "tiiuae/falcon-7b",
+    
     "gpt2",
     "bigcode/tiny_starcoder_py",
     "EleutherAI/gpt-j-6b",
@@ -33,8 +33,9 @@
     "bigcode/starcoder2-3b",
 ]
 
-SKIPPED_MODELS_OOM = [
-    "EleutherAI/gpt-j-6b",
+SKIPPED_MODELS_CI = [
+    "EleutherAI/gpt-j-6b",  # OOM on CPU RAM
+    "tiiuae/falcon-7b",     # Fails in vllm if trust_remote_code=True
 ]
 
 
@@ -54,9 +55,9 @@ def test_models(
     if model in SKIPPED_MODELS_ACC:
         pytest.skip(reason="Low priority models not currently passing. "
                     "We need to re-enable these.")
-    if model in SKIPPED_MODELS_OOM:
-        pytest.skip(reason="These models cause OOM issue on the CPU"
-                    "because it is a fp32 checkpoint.")
+    if model in SKIPPED_MODELS_CI:
+        pytest.skip(reason="These models cause some CI issue unrelated "
+                    "to the correctness of the implementation.")
 
     hf_model = hf_runner_nm(model, dtype=dtype)
     hf_outputs = hf_model.generate_greedy_logprobs_nm(example_prompts,
@@ -64,13 +65,8 @@ def test_models(
 
     del hf_model
 
-    trust_remote_code = True
-    # Falcon fails if trust_remote_code = True
-    # https://github.com/vllm-project/vllm/issues/5363
-    trust_remote_code = model != "tiiuae/falcon-7b"
     vllm_model = vllm_runner_nm(model,
                                 dtype=dtype,
-                                trust_remote_code=trust_remote_code,
                                 max_model_len=MODEL_MAX_LEN)
     vllm_outputs = vllm_model.generate_greedy_logprobs(example_prompts,
                                                        max_tokens,

From 446a14491a621e8bcd24dbd3851fe8036f7fa322 Mon Sep 17 00:00:00 2001
From: Robert Shaw <rshaw@neuralmagic.com>
Date: Tue, 11 Jun 2024 16:35:36 +0000
Subject: [PATCH 93/93] format

---
 tests/models/test_models_logprobs.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/tests/models/test_models_logprobs.py b/tests/models/test_models_logprobs.py
index a07d4e1e5d89..4ab78b8fbfe4 100644
--- a/tests/models/test_models_logprobs.py
+++ b/tests/models/test_models_logprobs.py
@@ -13,7 +13,6 @@
     "meta-llama/Llama-2-7b-hf",
     "mistralai/Mistral-7B-v0.1",
     "Deci/DeciLM-7b",
-    
     "gpt2",
     "bigcode/tiny_starcoder_py",
     "EleutherAI/gpt-j-6b",
@@ -35,7 +34,7 @@
 
 SKIPPED_MODELS_CI = [
     "EleutherAI/gpt-j-6b",  # OOM on CPU RAM
-    "tiiuae/falcon-7b",     # Fails in vllm if trust_remote_code=True
+    "tiiuae/falcon-7b",  # Fails in vllm if trust_remote_code=True
 ]