From 9eee8bc52ae975af8cd0aa8e8af7e5f1c8a4a621 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Mon, 27 Nov 2023 09:12:45 +0000
Subject: [PATCH 01/14] update code path of setup.py

---
 setup.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/setup.py b/setup.py
index 0e28b9360277..ffb682bef74b 100644
--- a/setup.py
+++ b/setup.py
@@ -24,11 +24,11 @@
 # TODO(woosuk): Should we use -O3?
 NVCC_FLAGS = ["-O2", "-std=c++17"]
 
-if torch.version.hip:
+if torch.cuda.is_available() and torch.version.hip:
     if ROCM_HOME is not None:
         NVCC_FLAGS += [f"-DUSE_ROCM"]
 
-if not torch.version.hip:
+if torch.cuda.is_available() and torch.version.cuda:
     if CUDA_HOME is None:
         raise RuntimeError(
             "Cannot find CUDA_HOME. CUDA must be available to build the package.")
@@ -37,9 +37,9 @@
 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 
-if CUDA_HOME is None:
-    raise RuntimeError(
-        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
+#if CUDA_HOME is None:
+#    raise RuntimeError(
+#        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
 
 
 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
@@ -93,7 +93,7 @@ def get_torch_arch_list() -> Set[str]:
 
 # First, check the TORCH_CUDA_ARCH_LIST environment variable.
 compute_capabilities = get_torch_arch_list()
-if not torch.version.hip:
+if torch.cuda.is_available() and torch.version.cuda:
     if not compute_capabilities:
         # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
         # GPUs on the current machine.
@@ -105,7 +105,7 @@ def get_torch_arch_list() -> Set[str]:
                     "GPUs with compute capability below 7.0 are not supported.")
             compute_capabilities.add(f"{major}.{minor}")
 
-if not torch.version.hip:
+if torch.cuda.is_available() and torch.version.cuda:
     nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
     if not compute_capabilities:
         # If no GPU is specified nor available, add all supported architectures
@@ -211,7 +211,7 @@ def get_torch_arch_list() -> Set[str]:
 ext_modules.append(activation_extension)
 
 # Quantization kernels.
-if not torch.version.hip:
+if torch.cuda.is_available() and torch.version.cuda:
     quantization_extension = CUDAExtension(
         name="vllm.quantization_ops",
         sources=[
@@ -224,7 +224,7 @@ def get_torch_arch_list() -> Set[str]:
             "nvcc": NVCC_FLAGS,
         },
     )
-else:
+if torch.cuda.is_available() and torch.version.hip:
     quantization_extension = CUDAExtension(
         name="vllm.quantization_ops",
         sources=[

From 74ed36ba23a1019e2ca7447a85db43d03f6483dd Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@akirakan.com>
Date: Tue, 28 Nov 2023 15:48:14 +0800
Subject: [PATCH 02/14] remove awq and enable squeezellm for rocm

---
 .../layers/quantization/__init__.py           |  5 +++--
 .../model_executor/layers/quantization/awq.py |  6 +++++-
 .../layers/quantization/squeezellm.py         | 19 +++++++++++++------
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 3d937ba64f9f..4eedfba75d16 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,14 +1,15 @@
 from typing import Type
 
-from vllm.model_executor.layers.quantization.awq import AWQConfig
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 
 _QUANTIZATION_CONFIG_REGISTRY = {
-    "awq": AWQConfig,
     "squeezellm": SqueezeLLMConfig,
 }
 
+if torch.cuda.is_available() and torch.version.cuda:
+    from vllm.model_executor.layers.quantization.awq import AWQConfig
+    _QUANTIZATION_CONFIG_REGISTRY["awq"] = AWQConfig
 
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py
index 0ab5819d930a..5561a9309fc3 100644
--- a/vllm/model_executor/layers/quantization/awq.py
+++ b/vllm/model_executor/layers/quantization/awq.py
@@ -2,8 +2,12 @@
 
 import torch
 from torch.nn.parameter import Parameter
+if torch.cuda.is_available() and torch.version.hip:
+    # do something specific for HIP
+    print("Warning: vLLM does not support AWQ on ROCm.")
+elif torch.cuda.is_available() and torch.version.cuda:
+    from vllm import quantization_ops
 
-from vllm import quantization_ops
 from vllm.model_executor.layers.linear import (LinearMethodBase,
                                                set_weight_attrs)
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 61ec8b79b6dd..17d6ca4cc133 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -112,12 +112,19 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         qweight = weights["qweight"]
         lookup_table = weights["lookup_table"]
-        out_shape = x.shape[:-1] + (qweight.shape[-1], )
-        reshaped_x = x.reshape(-1, x.shape[-1])
-        # NOTE: The output tensor should be zero-initialized.
-        out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
-        quantization_ops.squeezellm_gemm(reshaped_x, qweight, out,
-                                         lookup_table)
+        if torch.cuda.is_available() and torch.version.hip:
+            out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float)
+            quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float,
+                                            lookup_table)
+            out = out_float.to(dtype=torch.float16)
+            # do something specific for HIP
+        elif torch.cuda.is_available() and torch.version.cuda:
+            out_shape = x.shape[:-1] + (qweight.shape[-1], )
+            reshaped_x = x.reshape(-1, x.shape[-1])
+            # NOTE: The output tensor should be zero-initialized.
+            out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
+            quantization_ops.squeezellm_gemm(reshaped_x, qweight, out,
+                                            lookup_table)
 
         if bias is not None:
             out = out + bias

From d547da612cb2aa8789c9b64d81bcb6824eebe158 Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@embeddedllm.com>
Date: Tue, 28 Nov 2023 09:13:00 +0000
Subject: [PATCH 03/14] fix quantization code path to ignore awq

---
 csrc/quantization.cpp                           | 17 +++++++++++------
 .../layers/quantization/__init__.py             |  2 +-
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/csrc/quantization.cpp b/csrc/quantization.cpp
index dfe17a496c78..764188597fd3 100644
--- a/csrc/quantization.cpp
+++ b/csrc/quantization.cpp
@@ -1,11 +1,14 @@
 #include <torch/extension.h>
 
-torch::Tensor awq_gemm(
-  torch::Tensor _in_feats,
-  torch::Tensor _kernel,
-  torch::Tensor _scaling_factors,
-  torch::Tensor _zeros,
-  int split_k_iters);
+
+#ifndef USE_ROCM
+  torch::Tensor awq_gemm(
+    torch::Tensor _in_feats,
+    torch::Tensor _kernel,
+    torch::Tensor _scaling_factors,
+    torch::Tensor _zeros,
+    int split_k_iters);
+#endif
 
 void squeezellm_gemm(
   torch::Tensor vec,
@@ -14,6 +17,8 @@ void squeezellm_gemm(
   torch::Tensor lookup_table);
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+#ifndef USE_ROCM
   m.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ");
+#endif
   m.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM");
 }
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 4eedfba75d16..748a6f610f2a 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -1,5 +1,5 @@
 from typing import Type
-
+import torch
 from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig
 from vllm.model_executor.layers.quantization.base_config import QuantizationConfig
 

From b4eeb5b7e0b4843845e3809f25fa33a38007ecae Mon Sep 17 00:00:00 2001
From: tjtanaa <tunjian.tan@akirakan.com>
Date: Tue, 28 Nov 2023 17:25:45 +0800
Subject: [PATCH 04/14] use flash attention if rocm

---
 vllm/model_executor/layers/attention.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index e51bb311decd..7750a95c3f04 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -113,6 +113,7 @@ def multi_query_kv_attention(
             attn_bias=input_metadata.attn_bias,
             p=0.0,
             scale=self.scale,
+            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.view_as(output))
@@ -451,6 +452,7 @@ def multi_query_kv_attention(
             attn_bias=input_metadata.attn_bias,
             p=0.0,
             scale=self.scale,
+            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.view_as(output))

From fcaa7695cf053d27d68ee926e1888f2f6e2cf0f6 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 03:37:14 +0000
Subject: [PATCH 05/14] update setup.py to check for AMD GPU Arch supports, and
 load requirements-rocm.txt

---
 setup.py | 48 +++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/setup.py b/setup.py
index ffb682bef74b..061eb9fd0afa 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
 # Supported NVIDIA GPU architectures.
 NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
 ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030","gfx1100"}
-SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
+# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
 
 # Compiler flags.
 CXX_FLAGS = ["-g", "-O2", "-std=c++17"]
@@ -37,11 +37,23 @@
 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 
-#if CUDA_HOME is None:
-#    raise RuntimeError(
-#        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
-
-
+def get_amdgpu_offload_arch():
+    error_message = ""
+    command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
+    try:
+        output = subprocess.check_output([command])
+        return output.decode('utf-8').strip()
+    except subprocess.CalledProcessError as e:
+        error_message = f"Error: {e}"
+    except FileNotFoundError:
+        # If the command is not found, print an error message
+        error_message = f"The command {command} was not found."
+    
+    if error_message:
+        raise RuntimeError(error_message)
+    
+    return None
+        
 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.
 
@@ -72,7 +84,7 @@ def get_torch_arch_list() -> Set[str]:
         return set()
 
     # Filter out the invalid architectures and print a warning.
-    valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS})
+    valid_archs = NVIDIA_SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS})
     arch_list = torch_arch_list.intersection(valid_archs)
     # If none of the specified architectures are valid, raise an error.
     if not arch_list:
@@ -110,7 +122,7 @@ def get_torch_arch_list() -> Set[str]:
     if not compute_capabilities:
         # If no GPU is specified nor available, add all supported architectures
         # based on the NVCC CUDA version.
-        compute_capabilities = SUPPORTED_ARCHS.copy()
+        compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy()
         if nvcc_cuda_version < Version("11.1"):
             compute_capabilities.remove("8.6")
         if nvcc_cuda_version < Version("11.8"):
@@ -153,6 +165,14 @@ def get_torch_arch_list() -> Set[str]:
         num_threads = min(os.cpu_count(), 8)
         NVCC_FLAGS += ["--threads", str(num_threads)]
 
+elif torch.cuda.is_available() and torch.version.hip:
+    amd_arch = get_amdgpu_offload_arch()
+    if not amd_arch in ROCM_SUPPORTED_ARCHS:
+        raise RuntimeError(
+            f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}"
+            f"amdgpu_arch_found: {amd_arch}"
+        )
+    
 ext_modules = []
 
 # Cache operations.
@@ -224,7 +244,7 @@ def get_torch_arch_list() -> Set[str]:
             "nvcc": NVCC_FLAGS,
         },
     )
-if torch.cuda.is_available() and torch.version.hip:
+elif torch.cuda.is_available() and torch.version.hip:
     quantization_extension = CUDAExtension(
         name="vllm.quantization_ops",
         sources=[
@@ -287,8 +307,14 @@ def read_readme() -> str:
 
 def get_requirements() -> List[str]:
     """Get Python package dependencies from requirements.txt."""
-    with open(get_path("requirements.txt")) as f:
-        requirements = f.read().strip().split("\n")
+    if torch.cuda.is_available() and torch.version.hip:
+        with open(get_path("requirements-rocm.txt")) as f:
+            requirements = f.read().strip().split("\n")
+    elif torch.cuda.is_available() and torch.version.cuda:
+        with open(get_path("requirements.txt")) as f:
+            requirements = f.read().strip().split("\n")
+    print("requirements: ", requirements)
+    # exit()
     return requirements
 
 

From dea3cc394a293eb05eb8a2c940676008c493eafb Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 03:39:53 +0000
Subject: [PATCH 06/14] add patch for xformers-0.0.22.post7; add missing
 requirements-rocm.txt; add rocm.Dockerfile draft

---
 commonpy_xformers-0.0.22.post7.patch |  13 +++
 flashpy_xformers-0.0.22.post7.patch  | 134 +++++++++++++++++++++++++++
 launch_docker.sh                     |  14 +++
 patch_xformers-0.0.22.post7.sh       |  22 +++++
 requirements-rocm.txt                |  12 +++
 rocm.Dockerfile                      |  64 +++++++++++++
 6 files changed, 259 insertions(+)
 create mode 100644 commonpy_xformers-0.0.22.post7.patch
 create mode 100644 flashpy_xformers-0.0.22.post7.patch
 create mode 100644 launch_docker.sh
 create mode 100644 patch_xformers-0.0.22.post7.sh
 create mode 100644 requirements-rocm.txt
 create mode 100644 rocm.Dockerfile

diff --git a/commonpy_xformers-0.0.22.post7.patch b/commonpy_xformers-0.0.22.post7.patch
new file mode 100644
index 000000000000..4d7495cf13e1
--- /dev/null
+++ b/commonpy_xformers-0.0.22.post7.patch
@@ -0,0 +1,13 @@
+--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py	2023-11-29 03:17:03.930103539 +0000
++++ common.py	2023-11-28 16:14:19.846233146 +0000
+@@ -298,8 +298,8 @@
+         dtype = d.query.dtype
+         if device_type not in cls.SUPPORTED_DEVICES:
+             reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})")
+-        if device_type == "cuda" and not _built_with_cuda:
+-            reasons.append("xFormers wasn't build with CUDA support")
++        #if device_type == "cuda" and not _built_with_cuda:
++        #    reasons.append("xFormers wasn't build with CUDA support")
+         if device_type == "cuda":
+             device_capability = torch.cuda.get_device_capability(d.device)
+             if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY:
diff --git a/flashpy_xformers-0.0.22.post7.patch b/flashpy_xformers-0.0.22.post7.patch
new file mode 100644
index 000000000000..4798f1efd461
--- /dev/null
+++ b/flashpy_xformers-0.0.22.post7.patch
@@ -0,0 +1,134 @@
+--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py	2023-11-29 03:17:03.930103539 +0000
++++ flash.py	2023-11-28 16:14:25.206128903 +0000
+@@ -31,39 +31,39 @@
+ 
+ FLASH_VERSION = "0.0.0"
+ try:
+-    try:
+-        from ... import _C_flashattention  # type: ignore[attr-defined]
+-        from ..._cpp_lib import _build_metadata
+-
+-        if _build_metadata is not None:
+-            FLASH_VERSION = _build_metadata.flash_version
+-    except ImportError:
+-        import flash_attn
+-        from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
+-
+-        FLASH_VERSION = flash_attn.__version__
+-        flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2])
+-        if flash_ver_parsed < (2, 3):
+-            raise ImportError("Requires 2.3 for sliding window support")
++    #try:
++    #    from ... import _C_flashattention  # type: ignore[attr-defined]
++    #    from ..._cpp_lib import _build_metadata
++
++    #    if _build_metadata is not None:
++    #        FLASH_VERSION = _build_metadata.flash_version
++    #except ImportError:
++    import flash_attn
++    from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention
++
++    FLASH_VERSION = flash_attn.__version__
++    #    flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2])
++    #    if flash_ver_parsed < (2, 3):
++    #        raise ImportError("Requires 2.3 for sliding window support")
+ 
+     # create library so that flash-attn goes through the PyTorch Dispatcher
+-    _flash_lib = torch.library.Library("xformers_flash", "DEF")
++    #_flash_lib = torch.library.Library("xformers_flash", "DEF")
+ 
+-    _flash_lib.define(
+-        "flash_fwd(Tensor query, Tensor key, Tensor value, "
+-        "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, "
+-        "int max_seqlen_q, int max_seqlen_k, "
+-        "float p, float softmax_scale, "
+-        "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)"
+-    )
+-
+-    _flash_lib.define(
+-        "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
+-        "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
+-        "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
+-        "int max_seqlen_q, int max_seqlen_k, "
+-        "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
+-    )
++    #_flash_lib.define(
++    #    "flash_fwd(Tensor query, Tensor key, Tensor value, "
++    #    "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, "
++    #    "int max_seqlen_q, int max_seqlen_k, "
++    #    "float p, float softmax_scale, "
++    #    "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)"
++    #)
++
++    #_flash_lib.define(
++    #    "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, "
++    #    "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, "
++    #    "Tensor cu_seqlens_q, Tensor cu_seqlens_k, "
++    #    "int max_seqlen_q, int max_seqlen_k, "
++    #    "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)"
++    #)
+ 
+     def _flash_fwd(
+         query,
+@@ -98,8 +98,8 @@
+                 p,
+                 softmax_scale,
+                 is_causal,
+-                window_size - 1,  # window_size_left
+-                -1,  # window_size_right
++        #        window_size - 1,  # window_size_left
++        #        -1,  # window_size_right
+                 return_softmax,
+                 None,  # rng
+             )
+@@ -127,8 +127,8 @@
+                 softmax_scale,
+                 False,
+                 is_causal,
+-                window_size - 1,  # window_size_left
+-                -1,  # window_size_right
++         #       window_size - 1,  # window_size_left
++         #       -1,  # window_size_right
+                 return_softmax,
+                 None,
+             )
+@@ -169,8 +169,8 @@
+                 p,
+                 softmax_scale,
+                 is_causal,
+-                window_size - 1,  # window_size_left
+-                -1,  # window_size_right
++        #        window_size - 1,  # window_size_left
++        #        -1,  # window_size_right
+                 None,
+                 rng_state,
+             )
+@@ -193,15 +193,15 @@
+                 softmax_scale,
+                 False,  # zero_tensors
+                 is_causal,
+-                window_size - 1,  # window_size_left
+-                -1,  # window_size_right
++        #        window_size - 1,  # window_size_left
++        #        -1,  # window_size_right
+                 None,
+                 rng_state,
+             )
+         return dq, dk, dv
+ 
+-    _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
+-    _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
++    #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA")
++    #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA")
+ except ImportError:
+     pass
+ 
+@@ -348,7 +348,7 @@
+         implementation.
+     """
+ 
+-    OPERATOR = get_operator("xformers_flash", "flash_fwd")
++    OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd")
+     SUPPORTED_DEVICES: Set[str] = {"cuda"}
+     CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0)
+     SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16}
diff --git a/launch_docker.sh b/launch_docker.sh
new file mode 100644
index 000000000000..b26c8044e7de
--- /dev/null
+++ b/launch_docker.sh
@@ -0,0 +1,14 @@
+#!/bin/bash
+docker run -it \
+       --network=host \
+       --group-add=video \
+       --ipc=host \
+       --cap-add=SYS_PTRACE \
+       --security-opt seccomp=unconfined \
+       --shm-size 8G \
+       --device /dev/kfd \
+       --device /dev/dri \
+       -v /home/akk/tjtanaa/vllm-rocm:/app/libs/vllm-rocm-external \
+       -v /home/akk/hf_model:/app/hf_model \
+       vllm-rocm-tj \
+       bash
\ No newline at end of file
diff --git a/patch_xformers-0.0.22.post7.sh b/patch_xformers-0.0.22.post7.sh
new file mode 100644
index 000000000000..4ce38272bca8
--- /dev/null
+++ b/patch_xformers-0.0.22.post7.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)')
+export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)')
+
+echo $XFORMERS_FMHA_FLASH_PATH
+echo $XFORMERS_FMHA_COMMON_PATH
+
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"; then
+    echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
+    patch -p0 $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"
+    echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
+else
+    echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
+fi
+
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"; then
+    echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
+    patch -p0 $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"
+    echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
+else
+    echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
+fi
\ No newline at end of file
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
new file mode 100644
index 000000000000..ebbf4783c665
--- /dev/null
+++ b/requirements-rocm.txt
@@ -0,0 +1,12 @@
+ninja  # For faster builds.
+psutil
+ray >= 2.5.1
+pandas  # Required for Ray data.
+pyarrow  # Required for Ray data.
+sentencepiece  # Required for LLaMA tokenizer.
+numpy
+einops  # Required for phi-1_5
+transformers >= 4.34.0  # Required for Mistral.
+fastapi
+uvicorn[standard]
+pydantic == 1.10.13  # Required for OpenAI server.
diff --git a/rocm.Dockerfile b/rocm.Dockerfile
new file mode 100644
index 000000000000..27b57097740d
--- /dev/null
+++ b/rocm.Dockerfile
@@ -0,0 +1,64 @@
+FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1
+
+# Install some basic utilities
+RUN apt-get update && apt-get install python3 python3-pip -y
+
+# Install some basic utilities
+RUN apt-get update && apt-get install -y \
+    curl \
+    ca-certificates \
+    sudo \
+    git \
+    bzip2 \
+    libx11-6 \
+    build-essential \
+    wget \
+    unzip \
+    nvidia-cuda-toolkit \
+    tmux \
+ && rm -rf /var/lib/apt/lists/*
+
+### Mount Point ###
+# When launching the container, mount the code directory to /app
+ARG APP_MOUNT=/app
+VOLUME [ ${APP_MOUNT} ]
+WORKDIR ${APP_MOUNT}
+
+RUN python3 -m pip install --upgrade pip
+RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers
+
+ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer
+ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin:
+ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib:
+ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/:
+ENV PYTORCH_ROCM_ARCH=gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1101
+
+# Install ROCm flash-attention
+RUN mkdir libs \
+    && cd libs \
+    && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \
+    && cd flash-attention \
+    && git submodule update --init \
+    && sed -i -e "s/--offload-arch=native/--offload-arch=$(/opt/rocm/llvm/bin/amdgpu-offload-arch)/g" setup.py \
+    && patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \
+    && python3 setup.py install \
+    && cd ..
+
+COPY ./ /app/vllm-rocm/
+
+# RUN cd /app \
+#     && cd vllm-rocm \
+#     && git checkout v0.2.1.post1-rocm \
+#     && python3 setup.py install \
+#     && cd ..
+
+# RUN cd /app \
+#     && mkdir dataset \
+#     && cd ..
+
+# COPY ./benchmark_throughput.sh /app/benchmark_throughput.sh
+
+RUN python3 -m pip install --upgrade pip
+# RUN python3 -m pip install --no-cache-dir ray[all]
+
+CMD ["/bin/bash"]
\ No newline at end of file

From 4089864fc6935366b689d056eb17c276a2eb9121 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 07:14:08 +0000
Subject: [PATCH 07/14] update docs installation; reorganize patch into
 rocm_patch; restrict args if using ROCm

---
 .../getting_started/amd-installation.rst      | 104 ++++++++++++++++++
 launch_docker.sh => launch_amd_docker.sh      |   0
 ....sh => patch_xformers-0.0.22.post7.rocm.sh |   8 +-
 .../commonpy_xformers-0.0.22.post7.rocm.patch |   0
 .../flashpy_xformers-0.0.22.post7.rocm.patch  |   0
 vllm/engine/arg_utils.py                      | 100 +++++++++++------
 6 files changed, 175 insertions(+), 37 deletions(-)
 create mode 100644 docs/source/getting_started/amd-installation.rst
 rename launch_docker.sh => launch_amd_docker.sh (100%)
 rename patch_xformers-0.0.22.post7.sh => patch_xformers-0.0.22.post7.rocm.sh (60%)
 rename commonpy_xformers-0.0.22.post7.patch => rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch (100%)
 rename flashpy_xformers-0.0.22.post7.patch => rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch (100%)

diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
new file mode 100644
index 000000000000..7f2ebdbcba6d
--- /dev/null
+++ b/docs/source/getting_started/amd-installation.rst
@@ -0,0 +1,104 @@
+.. _installation:
+
+Installation with ROCm
+============
+
+vLLM-ROCm is here! Currently it is supporting llama-2.
+
+Requirements
+------------
+
+* OS: Linux
+* Python: 3.8 -- 3.11 (Recommended 3.10 as this is the version that has been tested on.)
+* GPU: MI210
+* Pytorch 2.0.1/2.1.1
+* ROCm 5.7
+
+
+Install with pip
+----------------
+
+You can install vLLM using pip:
+
+.. code-block:: console
+
+    $ # (Optional) Create a new conda environment.
+    $ conda create -n myenv python=3.8 -y
+    $ conda activate myenv
+
+    $ # Install vLLM with CUDA 12.1.
+    $ pip install vllm
+
+.. note::
+
+    As of now, vLLM's binaries are compiled on CUDA 12.1 by default.
+    However, you can install vLLM with CUDA 11.8 by running:
+
+    .. code-block:: console
+
+        $ # Install vLLM with CUDA 11.8.
+        $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`).
+        $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl
+
+        $ # Re-install PyTorch with CUDA 11.8.
+        $ pip uninstall torch -y
+        $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118
+
+
+.. _build_from_source:
+
+Build from source with docker
+-----------------
+
+You can also build and install vLLM from source:
+
+Build a docker image from `rocm.Dockerfile`, and launch a docker container.
+
+.. code-block:: console
+
+    $ docker build -f rocm.Dockerfile -t vllm-rocm . 
+    $ docker run -it \
+       --network=host \
+       --group-add=video \
+       --ipc=host \
+       --cap-add=SYS_PTRACE \
+       --security-opt seccomp=unconfined \
+       --shm-size 8G \
+       --device /dev/kfd \
+       --device /dev/dri \
+       -v <path/to/model>:/app/hf_model \
+       vllm-rocm \
+       bash
+
+If you are going to setup on new pytorch+rocm5.7 docker container, you can follow the following steps.
+
+1. Install flash-attention-2-rocm
+
+    If you are using Pytorch-2.0.1+rocm5.7.
+
+    Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm)
+
+
+    If you are using Pytorch-2.1.1+rocm5.7
+    Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm)
+    BUT, use the patch from this repo to patch the `hipify_python.py`
+
+    .. code-block:: console
+
+        $ bash patch_torch211_flash_attn2.rocm.sh
+
+    .. note::
+        Flash-attention-2 (v2.0.4) does not support sliding windows attention.
+
+2. Setup xformers==0.0.22.post7 without dependencies, and apply patches
+
+    .. code-block:: console
+
+        $ pip install xformers==0.0.22.post7 --no-deps
+        $ bash patch_xformers-0.0.22.post7.rocm.sh
+
+3. Build vllm.
+
+    .. code-block:: console
+        $ cd vllm
+        $ python setup.py install # This may take 5-10 minutes.
diff --git a/launch_docker.sh b/launch_amd_docker.sh
similarity index 100%
rename from launch_docker.sh
rename to launch_amd_docker.sh
diff --git a/patch_xformers-0.0.22.post7.sh b/patch_xformers-0.0.22.post7.rocm.sh
similarity index 60%
rename from patch_xformers-0.0.22.post7.sh
rename to patch_xformers-0.0.22.post7.rocm.sh
index 4ce38272bca8..c8e58f721ae8 100644
--- a/patch_xformers-0.0.22.post7.sh
+++ b/patch_xformers-0.0.22.post7.rocm.sh
@@ -5,17 +5,17 @@ export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops;
 echo $XFORMERS_FMHA_FLASH_PATH
 echo $XFORMERS_FMHA_COMMON_PATH
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}"
-    patch -p0 $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"
+    patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}"
 else
     echo "${XFORMERS_FMHA_FLASH_PATH} was patched before"
 fi
 
-if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"; then
+if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then
     echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}"
-    patch -p0 $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"
+    patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"
     echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}"
 else
     echo "${XFORMERS_FMHA_COMMON_PATH} was patched before"
diff --git a/commonpy_xformers-0.0.22.post7.patch b/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch
similarity index 100%
rename from commonpy_xformers-0.0.22.post7.patch
rename to rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch
diff --git a/flashpy_xformers-0.0.22.post7.patch b/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch
similarity index 100%
rename from flashpy_xformers-0.0.22.post7.patch
rename to rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c7e476c70474..c52b5f03b760 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -3,6 +3,8 @@
 from dataclasses import dataclass
 from typing import Optional, Tuple
 
+import torch
+
 from vllm.config import (CacheConfig, ModelConfig, ParallelConfig,
                          SchedulerConfig)
 
@@ -83,32 +85,54 @@ def add_cli_args(
                             help='directory to download and load the weights, '
                             'default to the default cache dir of '
                             'huggingface')
-        parser.add_argument(
-            '--load-format',
-            type=str,
-            default=EngineArgs.load_format,
-            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
-            help='The format of the model weights to load. '
-            '"auto" will try to load the weights in the safetensors format '
-            'and fall back to the pytorch bin format if safetensors format '
-            'is not available. '
-            '"pt" will load the weights in the pytorch bin format. '
-            '"safetensors" will load the weights in the safetensors format. '
-            '"npcache" will load the weights in pytorch format and store '
-            'a numpy cache to speed up the loading. '
-            '"dummy" will initialize the weights with random values, '
-            'which is mainly for profiling.')
-        parser.add_argument(
-            '--dtype',
-            type=str,
-            default=EngineArgs.dtype,
-            choices=[
-                'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
-            ],
-            help='data type for model weights and activations. '
-            'The "auto" option will use FP16 precision '
-            'for FP32 and FP16 models, and BF16 precision '
-            'for BF16 models.')
+        if torch.cuda.is_available() and torch.version.hip:
+            # do something specific for HIP
+            parser.add_argument(
+                '--load-format',
+                type=str,
+                default='pt',
+                choices=['pt'],
+                help='The format of the model weights to load. '
+                '"pt" will load the weights in the pytorch bin format. ')
+            parser.add_argument(
+                '--dtype',
+                type=str,
+                default='half',
+                choices=[
+                    'half', 'float16', 'bfloat16'
+                ],
+                help='data type for model weights and activations. '
+                'The default option is FP16 precision '
+                'Supports FP16 and BF16 ')
+        elif torch.cuda.is_available() and torch.version.cuda:
+            # do something specific for CUDA
+            parser.add_argument(
+                '--load-format',
+                type=str,
+                default=EngineArgs.load_format,
+                choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
+                help='The format of the model weights to load. '
+                '"auto" will try to load the weights in the safetensors format '
+                'and fall back to the pytorch bin format if safetensors format '
+                'is not available. '
+                '"pt" will load the weights in the pytorch bin format. '
+                '"safetensors" will load the weights in the safetensors format. '
+                '"npcache" will load the weights in pytorch format and store '
+                'a numpy cache to speed up the loading. '
+                '"dummy" will initialize the weights with random values, '
+                'which is mainly for profiling.')
+            parser.add_argument(
+                '--dtype',
+                type=str,
+                default=EngineArgs.dtype,
+                choices=[
+                    'auto', 'half', 'float16', 'bfloat16', 'float', 'float32'
+                ],
+                help='data type for model weights and activations. '
+                'The "auto" option will use FP16 precision '
+                'for FP32 and FP16 models, and BF16 precision '
+                'for BF16 models.')
+            
         parser.add_argument('--max-model-len',
                             type=int,
                             default=None,
@@ -171,13 +195,23 @@ def add_cli_args(
         parser.add_argument('--disable-log-stats',
                             action='store_true',
                             help='disable logging statistics')
-        # Quantization settings.
-        parser.add_argument('--quantization',
-                            '-q',
-                            type=str,
-                            choices=['awq', 'squeezellm', None],
-                            default=None,
-                            help='Method used to quantize the weights')
+        if torch.cuda.is_available() and torch.version.hip:
+            # Quantization settings.
+            parser.add_argument('--quantization',
+                                '-q',
+                                type=str,
+                                choices=['squeezellm', None],
+                                default=None,
+                                help='Method used to quantize the weights')
+            
+        elif torch.cuda.is_available() and torch.version.cuda:
+            # Quantization settings.
+            parser.add_argument('--quantization',
+                                '-q',
+                                type=str,
+                                choices=['awq', 'squeezellm', None],
+                                default=None,
+                                help='Method used to quantize the weights')
         return parser
 
     @classmethod

From a8e252cc5cfea6b9aacd68089cd071d573a42f6a Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 10:11:10 +0000
Subject: [PATCH 08/14] add num_gpus to ray_init to fix bug; update
 requirements-rocm.txt

---
 docs/source/getting_started/amd-installation.rst |  3 ++-
 patch_torch211_flash_attn2.rocm.sh               | 12 ++++++++++++
 requirements-rocm.txt                            |  4 ++++
 rocm_patch/hipify_patch-torch2.1.1.patch         | 16 ++++++++++++++++
 vllm/engine/ray_utils.py                         |  2 +-
 5 files changed, 35 insertions(+), 2 deletions(-)
 create mode 100644 patch_torch211_flash_attn2.rocm.sh
 create mode 100644 rocm_patch/hipify_patch-torch2.1.1.patch

diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index 7f2ebdbcba6d..eac88a0d47cf 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -88,7 +88,8 @@ If you are going to setup on new pytorch+rocm5.7 docker container, you can follo
         $ bash patch_torch211_flash_attn2.rocm.sh
 
     .. note::
-        Flash-attention-2 (v2.0.4) does not support sliding windows attention.
+        - Flash-attention-2 (v2.0.4) does not support sliding windows attention.
+        - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`)
 
 2. Setup xformers==0.0.22.post7 without dependencies, and apply patches
 
diff --git a/patch_torch211_flash_attn2.rocm.sh b/patch_torch211_flash_attn2.rocm.sh
new file mode 100644
index 000000000000..fe9e1c51d8a9
--- /dev/null
+++ b/patch_torch211_flash_attn2.rocm.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+export PYTORCH_VERSION=$(python -c 'import torch; print(torch.__version__)')
+
+echo $PYTORCH_VERSION
+
+if [[ $PYTORCH_VERSION == *"2.1"* ]]
+then
+echo "2.1.1"
+export GPU_ARCHS="gfx90a;gfx940;gfx941;gfx942"
+export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
+patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" "./rocm_patch/hipify_patch-torch2.1.1.patch"
+fi
\ No newline at end of file
diff --git a/requirements-rocm.txt b/requirements-rocm.txt
index ebbf4783c665..53fd3ea24d92 100644
--- a/requirements-rocm.txt
+++ b/requirements-rocm.txt
@@ -1,10 +1,14 @@
 ninja  # For faster builds.
+typing-extensions>=4.8.0
+starlette
 psutil
 ray >= 2.5.1
 pandas  # Required for Ray data.
 pyarrow  # Required for Ray data.
 sentencepiece  # Required for LLaMA tokenizer.
 numpy
+tokenizers>=0.15.0
+huggingface_hub<0.18,>=0.16.4
 einops  # Required for phi-1_5
 transformers >= 4.34.0  # Required for Mistral.
 fastapi
diff --git a/rocm_patch/hipify_patch-torch2.1.1.patch b/rocm_patch/hipify_patch-torch2.1.1.patch
new file mode 100644
index 000000000000..bae87c8d788b
--- /dev/null
+++ b/rocm_patch/hipify_patch-torch2.1.1.patch
@@ -0,0 +1,16 @@
+--- hipifypy.ori        2023-11-29 05:59:14.477283000 +0000
++++ hipifypy.edited     2023-11-29 06:13:55.937250000 +0000
+@@ -870,7 +870,12 @@
+                     return m.group(0)
+                 # Hipify header file first if needed
+                 if header_filepath not in HIPIFY_FINAL_RESULT:
+-                    preprocess_file_and_save_result(output_directory,
++                      #JCG added skip logic
++                    if "composable_kernel" in header_filepath:
++                        print("Force skipping hipification of CK file: " + header_filepath)
++                        HIPIFY_FINAL_RESULT[header_filepath] = HipifyResult(current_state=CurrentState.DONE, hipified_path = header_filepath)
++                    else:
++                        preprocess_file_and_save_result(output_directory,
+                                                     header_filepath,
+                                                     all_files, header_include_dirs, stats, hip_clang_launch,
+                                                     is_pytorch_extension, clean_ctx, show_progress)
\ No newline at end of file
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index ee58b8b9074a..d61abb92dbd2 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -73,7 +73,7 @@ def initialize_cluster(
                 "Ray is not installed. Please install Ray to use distributed "
                 "serving.")
         # Connect to a ray cluster.
-        ray.init(address=ray_address, ignore_reinit_error=True)
+        ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size)
 
     if not parallel_config.worker_use_ray:
         # Initialize cluster locally.

From 77d519ed7c6e3f7a6975496284332475f440c3e6 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 10:15:24 +0000
Subject: [PATCH 09/14] remove unncessary patch as torch 2.1 2.2 does not need
 one

---
 docs/source/getting_started/amd-installation.rst |  5 ++---
 patch_torch211_flash_attn2.rocm.sh               | 12 ------------
 rocm_patch/hipify_patch-torch2.1.1.patch         | 16 ----------------
 3 files changed, 2 insertions(+), 31 deletions(-)
 delete mode 100644 patch_torch211_flash_attn2.rocm.sh
 delete mode 100644 rocm_patch/hipify_patch-torch2.1.1.patch

diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst
index eac88a0d47cf..862b8a1d6e89 100644
--- a/docs/source/getting_started/amd-installation.rst
+++ b/docs/source/getting_started/amd-installation.rst
@@ -79,9 +79,8 @@ If you are going to setup on new pytorch+rocm5.7 docker container, you can follo
     Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm)
 
 
-    If you are using Pytorch-2.1.1+rocm5.7
-    Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm)
-    BUT, use the patch from this repo to patch the `hipify_python.py`
+    If you are using Pytorch-2.1.x+rocm5.7 or Pytorch-2.2.x+rocm5.7, you don't need to apply the `hipify_python.patch`.
+    You can directly build the flash-attention-2.
 
     .. code-block:: console
 
diff --git a/patch_torch211_flash_attn2.rocm.sh b/patch_torch211_flash_attn2.rocm.sh
deleted file mode 100644
index fe9e1c51d8a9..000000000000
--- a/patch_torch211_flash_attn2.rocm.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/bin/bash
-export PYTORCH_VERSION=$(python -c 'import torch; print(torch.__version__)')
-
-echo $PYTORCH_VERSION
-
-if [[ $PYTORCH_VERSION == *"2.1"* ]]
-then
-echo "2.1.1"
-export GPU_ARCHS="gfx90a;gfx940;gfx941;gfx942"
-export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
-patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" "./rocm_patch/hipify_patch-torch2.1.1.patch"
-fi
\ No newline at end of file
diff --git a/rocm_patch/hipify_patch-torch2.1.1.patch b/rocm_patch/hipify_patch-torch2.1.1.patch
deleted file mode 100644
index bae87c8d788b..000000000000
--- a/rocm_patch/hipify_patch-torch2.1.1.patch
+++ /dev/null
@@ -1,16 +0,0 @@
---- hipifypy.ori        2023-11-29 05:59:14.477283000 +0000
-+++ hipifypy.edited     2023-11-29 06:13:55.937250000 +0000
-@@ -870,7 +870,12 @@
-                     return m.group(0)
-                 # Hipify header file first if needed
-                 if header_filepath not in HIPIFY_FINAL_RESULT:
--                    preprocess_file_and_save_result(output_directory,
-+                      #JCG added skip logic
-+                    if "composable_kernel" in header_filepath:
-+                        print("Force skipping hipification of CK file: " + header_filepath)
-+                        HIPIFY_FINAL_RESULT[header_filepath] = HipifyResult(current_state=CurrentState.DONE, hipified_path = header_filepath)
-+                    else:
-+                        preprocess_file_and_save_result(output_directory,
-                                                     header_filepath,
-                                                     all_files, header_include_dirs, stats, hip_clang_launch,
-                                                     is_pytorch_extension, clean_ctx, show_progress)
\ No newline at end of file

From 046162043f95686bff18e81ee67a6789bdc33675 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 10:30:10 +0000
Subject: [PATCH 10/14] fix squeezellm variable

---
 vllm/model_executor/layers/quantization/squeezellm.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index 17d6ca4cc133..ccacb1c2f586 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -112,6 +112,8 @@ def apply_weights(self,
                       bias: Optional[torch.Tensor] = None) -> torch.Tensor:
         qweight = weights["qweight"]
         lookup_table = weights["lookup_table"]
+        out_shape = x.shape[:-1] + (qweight.shape[-1], )
+        reshaped_x = x.reshape(-1, x.shape[-1])
         if torch.cuda.is_available() and torch.version.hip:
             out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float)
             quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float,
@@ -119,8 +121,6 @@ def apply_weights(self,
             out = out_float.to(dtype=torch.float16)
             # do something specific for HIP
         elif torch.cuda.is_available() and torch.version.cuda:
-            out_shape = x.shape[:-1] + (qweight.shape[-1], )
-            reshaped_x = x.reshape(-1, x.shape[-1])
             # NOTE: The output tensor should be zero-initialized.
             out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
             quantization_ops.squeezellm_gemm(reshaped_x, qweight, out,

From e838483f7973edd5afe959764e6403bf893cbc1f Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 10:54:46 +0000
Subject: [PATCH 11/14] support vllm + rocm versioning

---
 setup.py | 39 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 34 insertions(+), 5 deletions(-)

diff --git a/setup.py b/setup.py
index 061eb9fd0afa..786c9d91579d 100644
--- a/setup.py
+++ b/setup.py
@@ -53,7 +53,26 @@ def get_amdgpu_offload_arch():
         raise RuntimeError(error_message)
     
     return None
-        
+
+def get_hipcc_rocm_version():
+    # Run the hipcc --version command
+    result = subprocess.run(['hipcc', '--version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+
+    # Check if the command was executed successfully
+    if result.returncode != 0:
+        print("Error running 'hipcc --version'")
+        return None
+
+    # Extract the version using a regular expression
+    match = re.search(r'HIP version: (\S+)', result.stdout)
+    if match:
+        # Return the version string
+        return match.group(1)
+    else:
+        print("Could not find HIP version in the output")
+        return None
+
+
 def get_nvcc_cuda_version(cuda_dir: str) -> Version:
     """Get the CUDA version from nvcc.
 
@@ -289,10 +308,20 @@ def find_version(filepath: str) -> str:
 
 def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "__init__.py"))
-    # cuda_version = str(nvcc_cuda_version)
-    # if cuda_version != MAIN_CUDA_VERSION:
-    #     cuda_version_str = cuda_version.replace(".", "")[:3]
-    #     version += f"+cu{cuda_version_str}"
+    
+    if torch.cuda.is_available() and torch.version.cuda:
+        cuda_version = str(nvcc_cuda_version)
+        if cuda_version != MAIN_CUDA_VERSION:
+            cuda_version_str = cuda_version.replace(".", "")[:3]
+            version += f"+cu{cuda_version_str}"
+        
+    elif torch.cuda.is_available() and torch.version.hip:
+        # Get the HIP version
+        hipcc_version = get_hipcc_rocm_version()
+        if hipcc_version != MAIN_CUDA_VERSION:
+            rocm_version_str = hipcc_version.replace(".", "")[:3]
+            version += f"+rocm{rocm_version_str}"
+    
     return version
 
 

From eadba42bad8c2baf47622e6adfe53ef76c907691 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 12:27:18 +0000
Subject: [PATCH 12/14] format.sh code

---
 pyproject.toml                                |  2 +-
 setup.py                                      | 73 ++++++++++---------
 vllm/engine/arg_utils.py                      |  8 +-
 vllm/engine/ray_utils.py                      |  4 +-
 vllm/model_executor/layers/attention.py       |  6 +-
 .../layers/quantization/__init__.py           |  1 +
 .../layers/quantization/squeezellm.py         |  8 +-
 7 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index e3e3e389f789..f9390ed8c52a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,7 +3,7 @@ requires = [
     "ninja",
     "packaging",
     "setuptools",
-    "torch >= 2.1.0",
+    # "torch >= 2.1.0", # commented out to accommodate ROCm
     "wheel",
 ]
 build-backend = "setuptools.build_meta"
diff --git a/setup.py b/setup.py
index 786c9d91579d..b75d0912a9bb 100644
--- a/setup.py
+++ b/setup.py
@@ -16,7 +16,7 @@
 
 # Supported NVIDIA GPU architectures.
 NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"}
-ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030","gfx1100"}
+ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"}
 # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS)
 
 # Compiler flags.
@@ -24,19 +24,18 @@
 # TODO(woosuk): Should we use -O3?
 NVCC_FLAGS = ["-O2", "-std=c++17"]
 
-if torch.cuda.is_available() and torch.version.hip:
-    if ROCM_HOME is not None:
-        NVCC_FLAGS += [f"-DUSE_ROCM"]
+if torch.cuda.is_available() and torch.version.hip and ROCM_HOME is not None:
+    NVCC_FLAGS += ["-DUSE_ROCM"]
 
-if torch.cuda.is_available() and torch.version.cuda:
-    if CUDA_HOME is None:
-        raise RuntimeError(
-            "Cannot find CUDA_HOME. CUDA must be available to build the package.")
+if torch.cuda.is_available() and torch.version.cuda and CUDA_HOME is None:
+    raise RuntimeError(
+        "Cannot find CUDA_HOME. CUDA must be available to build the package.")
 
 ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0
 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"]
 
+
 def get_amdgpu_offload_arch():
     error_message = ""
     command = "/opt/rocm/llvm/bin/amdgpu-offload-arch"
@@ -48,15 +47,19 @@ def get_amdgpu_offload_arch():
     except FileNotFoundError:
         # If the command is not found, print an error message
         error_message = f"The command {command} was not found."
-    
+
     if error_message:
         raise RuntimeError(error_message)
-    
+
     return None
 
+
 def get_hipcc_rocm_version():
     # Run the hipcc --version command
-    result = subprocess.run(['hipcc', '--version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True)
+    result = subprocess.run(['hipcc', '--version'],
+                            stdout=subprocess.PIPE,
+                            stderr=subprocess.STDOUT,
+                            text=True)
 
     # Check if the command was executed successfully
     if result.returncode != 0:
@@ -103,7 +106,9 @@ def get_torch_arch_list() -> Set[str]:
         return set()
 
     # Filter out the invalid architectures and print a warning.
-    valid_archs = NVIDIA_SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS})
+    valid_archs = NVIDIA_SUPPORTED_ARCHS.union(
+        {s + "+PTX"
+         for s in NVIDIA_SUPPORTED_ARCHS})
     arch_list = torch_arch_list.intersection(valid_archs)
     # If none of the specified architectures are valid, raise an error.
     if not arch_list:
@@ -124,17 +129,17 @@ def get_torch_arch_list() -> Set[str]:
 
 # First, check the TORCH_CUDA_ARCH_LIST environment variable.
 compute_capabilities = get_torch_arch_list()
-if torch.cuda.is_available() and torch.version.cuda:
-    if not compute_capabilities:
-        # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
-        # GPUs on the current machine.
-        device_count = torch.cuda.device_count()
-        for i in range(device_count):
-            major, minor = torch.cuda.get_device_capability(i)
-            if major < 7:
-                raise RuntimeError(
-                    "GPUs with compute capability below 7.0 are not supported.")
-            compute_capabilities.add(f"{major}.{minor}")
+if torch.cuda.is_available(
+) and torch.version.cuda and not compute_capabilities:
+    # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available
+    # GPUs on the current machine.
+    device_count = torch.cuda.device_count()
+    for i in range(device_count):
+        major, minor = torch.cuda.get_device_capability(i)
+        if major < 7:
+            raise RuntimeError(
+                "GPUs with compute capability below 7.0 are not supported.")
+        compute_capabilities.add(f"{major}.{minor}")
 
 if torch.cuda.is_available() and torch.version.cuda:
     nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME)
@@ -149,7 +154,8 @@ def get_torch_arch_list() -> Set[str]:
             compute_capabilities.remove("9.0")
     # Validate the NVCC CUDA version.
     if nvcc_cuda_version < Version("11.0"):
-        raise RuntimeError("CUDA 11.0 or higher is required to build the package.")
+        raise RuntimeError(
+            "CUDA 11.0 or higher is required to build the package.")
     if (nvcc_cuda_version < Version("11.1")
             and any(cc.startswith("8.6") for cc in compute_capabilities)):
         raise RuntimeError(
@@ -166,7 +172,7 @@ def get_torch_arch_list() -> Set[str]:
                 "Targeting compute capability 8.0 instead.",
                 stacklevel=2)
             compute_capabilities = set(cc for cc in compute_capabilities
-                                    if not cc.startswith("8.9"))
+                                       if not cc.startswith("8.9"))
             compute_capabilities.add("8.0+PTX")
         if any(cc.startswith("9.0") for cc in compute_capabilities):
             raise RuntimeError(
@@ -177,7 +183,9 @@ def get_torch_arch_list() -> Set[str]:
         num = capability[0] + capability[2]
         NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"]
         if capability.endswith("+PTX"):
-            NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"]
+            NVCC_FLAGS += [
+                "-gencode", f"arch=compute_{num},code=compute_{num}"
+            ]
 
     # Use NVCC threads to parallelize the build.
     if nvcc_cuda_version >= Version("11.2"):
@@ -186,12 +194,11 @@ def get_torch_arch_list() -> Set[str]:
 
 elif torch.cuda.is_available() and torch.version.hip:
     amd_arch = get_amdgpu_offload_arch()
-    if not amd_arch in ROCM_SUPPORTED_ARCHS:
+    if amd_arch not in ROCM_SUPPORTED_ARCHS:
         raise RuntimeError(
             f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}"
-            f"amdgpu_arch_found: {amd_arch}"
-        )
-    
+            f"amdgpu_arch_found: {amd_arch}")
+
 ext_modules = []
 
 # Cache operations.
@@ -308,20 +315,20 @@ def find_version(filepath: str) -> str:
 
 def get_vllm_version() -> str:
     version = find_version(get_path("vllm", "__init__.py"))
-    
+
     if torch.cuda.is_available() and torch.version.cuda:
         cuda_version = str(nvcc_cuda_version)
         if cuda_version != MAIN_CUDA_VERSION:
             cuda_version_str = cuda_version.replace(".", "")[:3]
             version += f"+cu{cuda_version_str}"
-        
+
     elif torch.cuda.is_available() and torch.version.hip:
         # Get the HIP version
         hipcc_version = get_hipcc_rocm_version()
         if hipcc_version != MAIN_CUDA_VERSION:
             rocm_version_str = hipcc_version.replace(".", "")[:3]
             version += f"+rocm{rocm_version_str}"
-    
+
     return version
 
 
diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py
index c52b5f03b760..c7612b3ac407 100644
--- a/vllm/engine/arg_utils.py
+++ b/vllm/engine/arg_utils.py
@@ -98,9 +98,7 @@ def add_cli_args(
                 '--dtype',
                 type=str,
                 default='half',
-                choices=[
-                    'half', 'float16', 'bfloat16'
-                ],
+                choices=['half', 'float16', 'bfloat16'],
                 help='data type for model weights and activations. '
                 'The default option is FP16 precision '
                 'Supports FP16 and BF16 ')
@@ -132,7 +130,7 @@ def add_cli_args(
                 'The "auto" option will use FP16 precision '
                 'for FP32 and FP16 models, and BF16 precision '
                 'for BF16 models.')
-            
+
         parser.add_argument('--max-model-len',
                             type=int,
                             default=None,
@@ -203,7 +201,7 @@ def add_cli_args(
                                 choices=['squeezellm', None],
                                 default=None,
                                 help='Method used to quantize the weights')
-            
+
         elif torch.cuda.is_available() and torch.version.cuda:
             # Quantization settings.
             parser.add_argument('--quantization',
diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py
index d61abb92dbd2..6bff8153e2a5 100644
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -73,7 +73,9 @@ def initialize_cluster(
                 "Ray is not installed. Please install Ray to use distributed "
                 "serving.")
         # Connect to a ray cluster.
-        ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size)
+        ray.init(address=ray_address,
+                 ignore_reinit_error=True,
+                 num_gpus=parallel_config.world_size)
 
     if not parallel_config.worker_use_ray:
         # Initialize cluster locally.
diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py
index 7750a95c3f04..2e042721d9a2 100644
--- a/vllm/model_executor/layers/attention.py
+++ b/vllm/model_executor/layers/attention.py
@@ -113,7 +113,8 @@ def multi_query_kv_attention(
             attn_bias=input_metadata.attn_bias,
             p=0.0,
             scale=self.scale,
-            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None,
+            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
+            (torch.cuda.is_available() and torch.version.hip) else None,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.view_as(output))
@@ -452,7 +453,8 @@ def multi_query_kv_attention(
             attn_bias=input_metadata.attn_bias,
             p=0.0,
             scale=self.scale,
-            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None,
+            op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if
+            (torch.cuda.is_available() and torch.version.hip) else None,
         )
         # TODO(woosuk): Unnecessary copy. Optimize.
         output.copy_(out.view_as(output))
diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py
index 748a6f610f2a..f4d25566cf59 100644
--- a/vllm/model_executor/layers/quantization/__init__.py
+++ b/vllm/model_executor/layers/quantization/__init__.py
@@ -11,6 +11,7 @@
     from vllm.model_executor.layers.quantization.awq import AWQConfig
     _QUANTIZATION_CONFIG_REGISTRY["awq"] = AWQConfig
 
+
 def get_quantization_config(quantization: str) -> Type[QuantizationConfig]:
     if quantization not in _QUANTIZATION_CONFIG_REGISTRY:
         raise ValueError(f"Invalid quantization method: {quantization}")
diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py
index ccacb1c2f586..be318b2ef205 100644
--- a/vllm/model_executor/layers/quantization/squeezellm.py
+++ b/vllm/model_executor/layers/quantization/squeezellm.py
@@ -115,16 +115,18 @@ def apply_weights(self,
         out_shape = x.shape[:-1] + (qweight.shape[-1], )
         reshaped_x = x.reshape(-1, x.shape[-1])
         if torch.cuda.is_available() and torch.version.hip:
-            out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float)
+            out_float = torch.zeros(out_shape,
+                                    device="cuda",
+                                    dtype=torch.float)
             quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float,
-                                            lookup_table)
+                                             lookup_table)
             out = out_float.to(dtype=torch.float16)
             # do something specific for HIP
         elif torch.cuda.is_available() and torch.version.cuda:
             # NOTE: The output tensor should be zero-initialized.
             out = torch.zeros(out_shape, device="cuda", dtype=torch.float16)
             quantization_ops.squeezellm_gemm(reshaped_x, qweight, out,
-                                            lookup_table)
+                                             lookup_table)
 
         if bias is not None:
             out = out + bias

From fb2610d7ac7a51a30431504e9b908192429d8ac3 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 12:28:23 +0000
Subject: [PATCH 13/14] rename rocm launch script

---
 launch_amd_docker.sh => launch_rocm_docker.sh | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename launch_amd_docker.sh => launch_rocm_docker.sh (100%)

diff --git a/launch_amd_docker.sh b/launch_rocm_docker.sh
similarity index 100%
rename from launch_amd_docker.sh
rename to launch_rocm_docker.sh

From e70c1feb1441cdbd84e90d5a8ce391ecbc1c11d8 Mon Sep 17 00:00:00 2001
From: miloice <kuanfu.liu@akirakan.com>
Date: Wed, 29 Nov 2023 12:29:10 +0000
Subject: [PATCH 14/14] remove rom docker launch script

---
 launch_rocm_docker.sh | 14 --------------
 1 file changed, 14 deletions(-)
 delete mode 100644 launch_rocm_docker.sh

diff --git a/launch_rocm_docker.sh b/launch_rocm_docker.sh
deleted file mode 100644
index b26c8044e7de..000000000000
--- a/launch_rocm_docker.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/bin/bash
-docker run -it \
-       --network=host \
-       --group-add=video \
-       --ipc=host \
-       --cap-add=SYS_PTRACE \
-       --security-opt seccomp=unconfined \
-       --shm-size 8G \
-       --device /dev/kfd \
-       --device /dev/dri \
-       -v /home/akk/tjtanaa/vllm-rocm:/app/libs/vllm-rocm-external \
-       -v /home/akk/hf_model:/app/hf_model \
-       vllm-rocm-tj \
-       bash
\ No newline at end of file