From 9eee8bc52ae975af8cd0aa8e8af7e5f1c8a4a621 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Mon, 27 Nov 2023 09:12:45 +0000 Subject: [PATCH 01/14] update code path of setup.py --- setup.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/setup.py b/setup.py index 0e28b9360277..ffb682bef74b 100644 --- a/setup.py +++ b/setup.py @@ -24,11 +24,11 @@ # TODO(woosuk): Should we use -O3? NVCC_FLAGS = ["-O2", "-std=c++17"] -if torch.version.hip: +if torch.cuda.is_available() and torch.version.hip: if ROCM_HOME is not None: NVCC_FLAGS += [f"-DUSE_ROCM"] -if not torch.version.hip: +if torch.cuda.is_available() and torch.version.cuda: if CUDA_HOME is None: raise RuntimeError( "Cannot find CUDA_HOME. CUDA must be available to build the package.") @@ -37,9 +37,9 @@ CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -if CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") +#if CUDA_HOME is None: +# raise RuntimeError( +# "Cannot find CUDA_HOME. CUDA must be available to build the package.") def get_nvcc_cuda_version(cuda_dir: str) -> Version: @@ -93,7 +93,7 @@ def get_torch_arch_list() -> Set[str]: # First, check the TORCH_CUDA_ARCH_LIST environment variable. compute_capabilities = get_torch_arch_list() -if not torch.version.hip: +if torch.cuda.is_available() and torch.version.cuda: if not compute_capabilities: # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available # GPUs on the current machine. @@ -105,7 +105,7 @@ def get_torch_arch_list() -> Set[str]: "GPUs with compute capability below 7.0 are not supported.") compute_capabilities.add(f"{major}.{minor}") -if not torch.version.hip: +if torch.cuda.is_available() and torch.version.cuda: nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) if not compute_capabilities: # If no GPU is specified nor available, add all supported architectures @@ -211,7 +211,7 @@ def get_torch_arch_list() -> Set[str]: ext_modules.append(activation_extension) # Quantization kernels. -if not torch.version.hip: +if torch.cuda.is_available() and torch.version.cuda: quantization_extension = CUDAExtension( name="vllm.quantization_ops", sources=[ @@ -224,7 +224,7 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS, }, ) -else: +if torch.cuda.is_available() and torch.version.hip: quantization_extension = CUDAExtension( name="vllm.quantization_ops", sources=[ From 74ed36ba23a1019e2ca7447a85db43d03f6483dd Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 28 Nov 2023 15:48:14 +0800 Subject: [PATCH 02/14] remove awq and enable squeezellm for rocm --- .../layers/quantization/__init__.py | 5 +++-- .../model_executor/layers/quantization/awq.py | 6 +++++- .../layers/quantization/squeezellm.py | 19 +++++++++++++------ 3 files changed, 21 insertions(+), 9 deletions(-) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 3d937ba64f9f..4eedfba75d16 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,14 +1,15 @@ from typing import Type -from vllm.model_executor.layers.quantization.awq import AWQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig _QUANTIZATION_CONFIG_REGISTRY = { - "awq": AWQConfig, "squeezellm": SqueezeLLMConfig, } +if torch.cuda.is_available() and torch.version.cuda: + from vllm.model_executor.layers.quantization.awq import AWQConfig + _QUANTIZATION_CONFIG_REGISTRY["awq"] = AWQConfig def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in _QUANTIZATION_CONFIG_REGISTRY: diff --git a/vllm/model_executor/layers/quantization/awq.py b/vllm/model_executor/layers/quantization/awq.py index 0ab5819d930a..5561a9309fc3 100644 --- a/vllm/model_executor/layers/quantization/awq.py +++ b/vllm/model_executor/layers/quantization/awq.py @@ -2,8 +2,12 @@ import torch from torch.nn.parameter import Parameter +if torch.cuda.is_available() and torch.version.hip: + # do something specific for HIP + print("Warning: vLLM does not support AWQ on ROCm.") +elif torch.cuda.is_available() and torch.version.cuda: + from vllm import quantization_ops -from vllm import quantization_ops from vllm.model_executor.layers.linear import (LinearMethodBase, set_weight_attrs) from vllm.model_executor.layers.quantization.base_config import QuantizationConfig diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 61ec8b79b6dd..17d6ca4cc133 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -112,12 +112,19 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: qweight = weights["qweight"] lookup_table = weights["lookup_table"] - out_shape = x.shape[:-1] + (qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) - # NOTE: The output tensor should be zero-initialized. - out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) - quantization_ops.squeezellm_gemm(reshaped_x, qweight, out, - lookup_table) + if torch.cuda.is_available() and torch.version.hip: + out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float) + quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float, + lookup_table) + out = out_float.to(dtype=torch.float16) + # do something specific for HIP + elif torch.cuda.is_available() and torch.version.cuda: + out_shape = x.shape[:-1] + (qweight.shape[-1], ) + reshaped_x = x.reshape(-1, x.shape[-1]) + # NOTE: The output tensor should be zero-initialized. + out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) + quantization_ops.squeezellm_gemm(reshaped_x, qweight, out, + lookup_table) if bias is not None: out = out + bias From d547da612cb2aa8789c9b64d81bcb6824eebe158 Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 28 Nov 2023 09:13:00 +0000 Subject: [PATCH 03/14] fix quantization code path to ignore awq --- csrc/quantization.cpp | 17 +++++++++++------ .../layers/quantization/__init__.py | 2 +- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/csrc/quantization.cpp b/csrc/quantization.cpp index dfe17a496c78..764188597fd3 100644 --- a/csrc/quantization.cpp +++ b/csrc/quantization.cpp @@ -1,11 +1,14 @@ #include -torch::Tensor awq_gemm( - torch::Tensor _in_feats, - torch::Tensor _kernel, - torch::Tensor _scaling_factors, - torch::Tensor _zeros, - int split_k_iters); + +#ifndef USE_ROCM + torch::Tensor awq_gemm( + torch::Tensor _in_feats, + torch::Tensor _kernel, + torch::Tensor _scaling_factors, + torch::Tensor _zeros, + int split_k_iters); +#endif void squeezellm_gemm( torch::Tensor vec, @@ -14,6 +17,8 @@ void squeezellm_gemm( torch::Tensor lookup_table); PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { +#ifndef USE_ROCM m.def("awq_gemm", &awq_gemm, "Quantized GEMM for AWQ"); +#endif m.def("squeezellm_gemm", &squeezellm_gemm, "Quantized GEMM for SqueezeLLM"); } diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 4eedfba75d16..748a6f610f2a 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -1,5 +1,5 @@ from typing import Type - +import torch from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig from vllm.model_executor.layers.quantization.base_config import QuantizationConfig From b4eeb5b7e0b4843845e3809f25fa33a38007ecae Mon Sep 17 00:00:00 2001 From: tjtanaa Date: Tue, 28 Nov 2023 17:25:45 +0800 Subject: [PATCH 04/14] use flash attention if rocm --- vllm/model_executor/layers/attention.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index e51bb311decd..7750a95c3f04 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -113,6 +113,7 @@ def multi_query_kv_attention( attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None, ) # TODO(woosuk): Unnecessary copy. Optimize. output.copy_(out.view_as(output)) @@ -451,6 +452,7 @@ def multi_query_kv_attention( attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None, ) # TODO(woosuk): Unnecessary copy. Optimize. output.copy_(out.view_as(output)) From fcaa7695cf053d27d68ee926e1888f2f6e2cf0f6 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 03:37:14 +0000 Subject: [PATCH 05/14] update setup.py to check for AMD GPU Arch supports, and load requirements-rocm.txt --- setup.py | 48 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/setup.py b/setup.py index ffb682bef74b..061eb9fd0afa 100644 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030","gfx1100"} -SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) +# SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) # Compiler flags. CXX_FLAGS = ["-g", "-O2", "-std=c++17"] @@ -37,11 +37,23 @@ CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] -#if CUDA_HOME is None: -# raise RuntimeError( -# "Cannot find CUDA_HOME. CUDA must be available to build the package.") - - +def get_amdgpu_offload_arch(): + error_message = "" + command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" + try: + output = subprocess.check_output([command]) + return output.decode('utf-8').strip() + except subprocess.CalledProcessError as e: + error_message = f"Error: {e}" + except FileNotFoundError: + # If the command is not found, print an error message + error_message = f"The command {command} was not found." + + if error_message: + raise RuntimeError(error_message) + + return None + def get_nvcc_cuda_version(cuda_dir: str) -> Version: """Get the CUDA version from nvcc. @@ -72,7 +84,7 @@ def get_torch_arch_list() -> Set[str]: return set() # Filter out the invalid architectures and print a warning. - valid_archs = SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS}) + valid_archs = NVIDIA_SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS}) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: @@ -110,7 +122,7 @@ def get_torch_arch_list() -> Set[str]: if not compute_capabilities: # If no GPU is specified nor available, add all supported architectures # based on the NVCC CUDA version. - compute_capabilities = SUPPORTED_ARCHS.copy() + compute_capabilities = NVIDIA_SUPPORTED_ARCHS.copy() if nvcc_cuda_version < Version("11.1"): compute_capabilities.remove("8.6") if nvcc_cuda_version < Version("11.8"): @@ -153,6 +165,14 @@ def get_torch_arch_list() -> Set[str]: num_threads = min(os.cpu_count(), 8) NVCC_FLAGS += ["--threads", str(num_threads)] +elif torch.cuda.is_available() and torch.version.hip: + amd_arch = get_amdgpu_offload_arch() + if not amd_arch in ROCM_SUPPORTED_ARCHS: + raise RuntimeError( + f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" + f"amdgpu_arch_found: {amd_arch}" + ) + ext_modules = [] # Cache operations. @@ -224,7 +244,7 @@ def get_torch_arch_list() -> Set[str]: "nvcc": NVCC_FLAGS, }, ) -if torch.cuda.is_available() and torch.version.hip: +elif torch.cuda.is_available() and torch.version.hip: quantization_extension = CUDAExtension( name="vllm.quantization_ops", sources=[ @@ -287,8 +307,14 @@ def read_readme() -> str: def get_requirements() -> List[str]: """Get Python package dependencies from requirements.txt.""" - with open(get_path("requirements.txt")) as f: - requirements = f.read().strip().split("\n") + if torch.cuda.is_available() and torch.version.hip: + with open(get_path("requirements-rocm.txt")) as f: + requirements = f.read().strip().split("\n") + elif torch.cuda.is_available() and torch.version.cuda: + with open(get_path("requirements.txt")) as f: + requirements = f.read().strip().split("\n") + print("requirements: ", requirements) + # exit() return requirements From dea3cc394a293eb05eb8a2c940676008c493eafb Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 03:39:53 +0000 Subject: [PATCH 06/14] add patch for xformers-0.0.22.post7; add missing requirements-rocm.txt; add rocm.Dockerfile draft --- commonpy_xformers-0.0.22.post7.patch | 13 +++ flashpy_xformers-0.0.22.post7.patch | 134 +++++++++++++++++++++++++++ launch_docker.sh | 14 +++ patch_xformers-0.0.22.post7.sh | 22 +++++ requirements-rocm.txt | 12 +++ rocm.Dockerfile | 64 +++++++++++++ 6 files changed, 259 insertions(+) create mode 100644 commonpy_xformers-0.0.22.post7.patch create mode 100644 flashpy_xformers-0.0.22.post7.patch create mode 100644 launch_docker.sh create mode 100644 patch_xformers-0.0.22.post7.sh create mode 100644 requirements-rocm.txt create mode 100644 rocm.Dockerfile diff --git a/commonpy_xformers-0.0.22.post7.patch b/commonpy_xformers-0.0.22.post7.patch new file mode 100644 index 000000000000..4d7495cf13e1 --- /dev/null +++ b/commonpy_xformers-0.0.22.post7.patch @@ -0,0 +1,13 @@ +--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/common.py 2023-11-29 03:17:03.930103539 +0000 ++++ common.py 2023-11-28 16:14:19.846233146 +0000 +@@ -298,8 +298,8 @@ + dtype = d.query.dtype + if device_type not in cls.SUPPORTED_DEVICES: + reasons.append(f"device={device_type} (supported: {cls.SUPPORTED_DEVICES})") +- if device_type == "cuda" and not _built_with_cuda: +- reasons.append("xFormers wasn't build with CUDA support") ++ #if device_type == "cuda" and not _built_with_cuda: ++ # reasons.append("xFormers wasn't build with CUDA support") + if device_type == "cuda": + device_capability = torch.cuda.get_device_capability(d.device) + if device_capability < cls.CUDA_MINIMUM_COMPUTE_CAPABILITY: diff --git a/flashpy_xformers-0.0.22.post7.patch b/flashpy_xformers-0.0.22.post7.patch new file mode 100644 index 000000000000..4798f1efd461 --- /dev/null +++ b/flashpy_xformers-0.0.22.post7.patch @@ -0,0 +1,134 @@ +--- /opt/conda/envs/py_3.10/lib/python3.10/site-packages/xformers/ops/fmha/flash.py 2023-11-29 03:17:03.930103539 +0000 ++++ flash.py 2023-11-28 16:14:25.206128903 +0000 +@@ -31,39 +31,39 @@ + + FLASH_VERSION = "0.0.0" + try: +- try: +- from ... import _C_flashattention # type: ignore[attr-defined] +- from ..._cpp_lib import _build_metadata +- +- if _build_metadata is not None: +- FLASH_VERSION = _build_metadata.flash_version +- except ImportError: +- import flash_attn +- from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention +- +- FLASH_VERSION = flash_attn.__version__ +- flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) +- if flash_ver_parsed < (2, 3): +- raise ImportError("Requires 2.3 for sliding window support") ++ #try: ++ # from ... import _C_flashattention # type: ignore[attr-defined] ++ # from ..._cpp_lib import _build_metadata ++ ++ # if _build_metadata is not None: ++ # FLASH_VERSION = _build_metadata.flash_version ++ #except ImportError: ++ import flash_attn ++ from flash_attn.flash_attn_interface import flash_attn_cuda as _C_flashattention ++ ++ FLASH_VERSION = flash_attn.__version__ ++ # flash_ver_parsed = tuple(int(s) for s in FLASH_VERSION.split(".")[:2]) ++ # if flash_ver_parsed < (2, 3): ++ # raise ImportError("Requires 2.3 for sliding window support") + + # create library so that flash-attn goes through the PyTorch Dispatcher +- _flash_lib = torch.library.Library("xformers_flash", "DEF") ++ #_flash_lib = torch.library.Library("xformers_flash", "DEF") + +- _flash_lib.define( +- "flash_fwd(Tensor query, Tensor key, Tensor value, " +- "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " +- "int max_seqlen_q, int max_seqlen_k, " +- "float p, float softmax_scale, " +- "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" +- ) +- +- _flash_lib.define( +- "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " +- "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " +- "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " +- "int max_seqlen_q, int max_seqlen_k, " +- "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" +- ) ++ #_flash_lib.define( ++ # "flash_fwd(Tensor query, Tensor key, Tensor value, " ++ # "Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, " ++ # "int max_seqlen_q, int max_seqlen_k, " ++ # "float p, float softmax_scale, " ++ # "bool is_causal, int window_size, bool return_softmax) -> (Tensor, Tensor, Tensor)" ++ #) ++ ++ #_flash_lib.define( ++ # "flash_bwd(Tensor dout, Tensor query, Tensor key, Tensor value, " ++ # "Tensor out, Tensor softmax_lse_, Tensor dq, Tensor dk, Tensor dv, " ++ # "Tensor cu_seqlens_q, Tensor cu_seqlens_k, " ++ # "int max_seqlen_q, int max_seqlen_k, " ++ # "float p, float softmax_scale, bool is_causal, int window_size, Tensor rng_state) -> (Tensor, Tensor, Tensor)" ++ #) + + def _flash_fwd( + query, +@@ -98,8 +98,8 @@ + p, + softmax_scale, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + return_softmax, + None, # rng + ) +@@ -127,8 +127,8 @@ + softmax_scale, + False, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + return_softmax, + None, + ) +@@ -169,8 +169,8 @@ + p, + softmax_scale, + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + None, + rng_state, + ) +@@ -193,15 +193,15 @@ + softmax_scale, + False, # zero_tensors + is_causal, +- window_size - 1, # window_size_left +- -1, # window_size_right ++ # window_size - 1, # window_size_left ++ # -1, # window_size_right + None, + rng_state, + ) + return dq, dk, dv + +- _flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") +- _flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") ++ #_flash_lib.impl("flash_fwd", _flash_fwd, "CUDA") ++ #_flash_lib.impl("flash_bwd", _flash_bwd, "CUDA") + except ImportError: + pass + +@@ -348,7 +348,7 @@ + implementation. + """ + +- OPERATOR = get_operator("xformers_flash", "flash_fwd") ++ OPERATOR = _flash_fwd # get_operator("xformers_flash", "flash_fwd") + SUPPORTED_DEVICES: Set[str] = {"cuda"} + CUDA_MINIMUM_COMPUTE_CAPABILITY = (8, 0) + SUPPORTED_DTYPES: Set[torch.dtype] = {torch.half, torch.bfloat16} diff --git a/launch_docker.sh b/launch_docker.sh new file mode 100644 index 000000000000..b26c8044e7de --- /dev/null +++ b/launch_docker.sh @@ -0,0 +1,14 @@ +#!/bin/bash +docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size 8G \ + --device /dev/kfd \ + --device /dev/dri \ + -v /home/akk/tjtanaa/vllm-rocm:/app/libs/vllm-rocm-external \ + -v /home/akk/hf_model:/app/hf_model \ + vllm-rocm-tj \ + bash \ No newline at end of file diff --git a/patch_xformers-0.0.22.post7.sh b/patch_xformers-0.0.22.post7.sh new file mode 100644 index 000000000000..4ce38272bca8 --- /dev/null +++ b/patch_xformers-0.0.22.post7.sh @@ -0,0 +1,22 @@ +#!/bin/bash +export XFORMERS_FMHA_FLASH_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.flash.__file__)') +export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; print(xops.fmha.common.__file__)') + +echo $XFORMERS_FMHA_FLASH_PATH +echo $XFORMERS_FMHA_COMMON_PATH + +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"; then + echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" + patch -p0 $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch" + echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}" +else + echo "${XFORMERS_FMHA_FLASH_PATH} was patched before" +fi + +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"; then + echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" + patch -p0 $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch" + echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}" +else + echo "${XFORMERS_FMHA_COMMON_PATH} was patched before" +fi \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt new file mode 100644 index 000000000000..ebbf4783c665 --- /dev/null +++ b/requirements-rocm.txt @@ -0,0 +1,12 @@ +ninja # For faster builds. +psutil +ray >= 2.5.1 +pandas # Required for Ray data. +pyarrow # Required for Ray data. +sentencepiece # Required for LLaMA tokenizer. +numpy +einops # Required for phi-1_5 +transformers >= 4.34.0 # Required for Mistral. +fastapi +uvicorn[standard] +pydantic == 1.10.13 # Required for OpenAI server. diff --git a/rocm.Dockerfile b/rocm.Dockerfile new file mode 100644 index 000000000000..27b57097740d --- /dev/null +++ b/rocm.Dockerfile @@ -0,0 +1,64 @@ +FROM rocm/pytorch:rocm5.7_ubuntu22.04_py3.10_pytorch_2.0.1 + +# Install some basic utilities +RUN apt-get update && apt-get install python3 python3-pip -y + +# Install some basic utilities +RUN apt-get update && apt-get install -y \ + curl \ + ca-certificates \ + sudo \ + git \ + bzip2 \ + libx11-6 \ + build-essential \ + wget \ + unzip \ + nvidia-cuda-toolkit \ + tmux \ + && rm -rf /var/lib/apt/lists/* + +### Mount Point ### +# When launching the container, mount the code directory to /app +ARG APP_MOUNT=/app +VOLUME [ ${APP_MOUNT} ] +WORKDIR ${APP_MOUNT} + +RUN python3 -m pip install --upgrade pip +RUN python3 -m pip install --no-cache-dir fastapi ninja tokenizers + +ENV LLVM_SYMBOLIZER_PATH=/opt/rocm/llvm/bin/llvm-symbolizer +ENV PATH=$PATH:/opt/rocm/bin:/libtorch/bin: +ENV LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib/:/libtorch/lib: +ENV CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/libtorch/include:/libtorch/include/torch/csrc/api/include/:/opt/rocm/include/: +ENV PYTORCH_ROCM_ARCH=gfx900;gfx906;gfx908;gfx90a;gfx1030;gfx1101 + +# Install ROCm flash-attention +RUN mkdir libs \ + && cd libs \ + && git clone https://github.com/ROCmSoftwarePlatform/flash-attention.git \ + && cd flash-attention \ + && git submodule update --init \ + && sed -i -e "s/--offload-arch=native/--offload-arch=$(/opt/rocm/llvm/bin/amdgpu-offload-arch)/g" setup.py \ + && patch /opt/conda/envs/py_3.10/lib/python3.10/site-packages/torch/utils/hipify/hipify_python.py hipify_patch.patch \ + && python3 setup.py install \ + && cd .. + +COPY ./ /app/vllm-rocm/ + +# RUN cd /app \ +# && cd vllm-rocm \ +# && git checkout v0.2.1.post1-rocm \ +# && python3 setup.py install \ +# && cd .. + +# RUN cd /app \ +# && mkdir dataset \ +# && cd .. + +# COPY ./benchmark_throughput.sh /app/benchmark_throughput.sh + +RUN python3 -m pip install --upgrade pip +# RUN python3 -m pip install --no-cache-dir ray[all] + +CMD ["/bin/bash"] \ No newline at end of file From 4089864fc6935366b689d056eb17c276a2eb9121 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 07:14:08 +0000 Subject: [PATCH 07/14] update docs installation; reorganize patch into rocm_patch; restrict args if using ROCm --- .../getting_started/amd-installation.rst | 104 ++++++++++++++++++ launch_docker.sh => launch_amd_docker.sh | 0 ....sh => patch_xformers-0.0.22.post7.rocm.sh | 8 +- .../commonpy_xformers-0.0.22.post7.rocm.patch | 0 .../flashpy_xformers-0.0.22.post7.rocm.patch | 0 vllm/engine/arg_utils.py | 100 +++++++++++------ 6 files changed, 175 insertions(+), 37 deletions(-) create mode 100644 docs/source/getting_started/amd-installation.rst rename launch_docker.sh => launch_amd_docker.sh (100%) rename patch_xformers-0.0.22.post7.sh => patch_xformers-0.0.22.post7.rocm.sh (60%) rename commonpy_xformers-0.0.22.post7.patch => rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch (100%) rename flashpy_xformers-0.0.22.post7.patch => rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch (100%) diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst new file mode 100644 index 000000000000..7f2ebdbcba6d --- /dev/null +++ b/docs/source/getting_started/amd-installation.rst @@ -0,0 +1,104 @@ +.. _installation: + +Installation with ROCm +============ + +vLLM-ROCm is here! Currently it is supporting llama-2. + +Requirements +------------ + +* OS: Linux +* Python: 3.8 -- 3.11 (Recommended 3.10 as this is the version that has been tested on.) +* GPU: MI210 +* Pytorch 2.0.1/2.1.1 +* ROCm 5.7 + + +Install with pip +---------------- + +You can install vLLM using pip: + +.. code-block:: console + + $ # (Optional) Create a new conda environment. + $ conda create -n myenv python=3.8 -y + $ conda activate myenv + + $ # Install vLLM with CUDA 12.1. + $ pip install vllm + +.. note:: + + As of now, vLLM's binaries are compiled on CUDA 12.1 by default. + However, you can install vLLM with CUDA 11.8 by running: + + .. code-block:: console + + $ # Install vLLM with CUDA 11.8. + $ # Replace `cp310` with your Python version (e.g., `cp38`, `cp39`, `cp311`). + $ pip install https://github.com/vllm-project/vllm/releases/download/v0.2.2/vllm-0.2.2+cu118-cp310-cp310-manylinux1_x86_64.whl + + $ # Re-install PyTorch with CUDA 11.8. + $ pip uninstall torch -y + $ pip install torch --upgrade --index-url https://download.pytorch.org/whl/cu118 + + +.. _build_from_source: + +Build from source with docker +----------------- + +You can also build and install vLLM from source: + +Build a docker image from `rocm.Dockerfile`, and launch a docker container. + +.. code-block:: console + + $ docker build -f rocm.Dockerfile -t vllm-rocm . + $ docker run -it \ + --network=host \ + --group-add=video \ + --ipc=host \ + --cap-add=SYS_PTRACE \ + --security-opt seccomp=unconfined \ + --shm-size 8G \ + --device /dev/kfd \ + --device /dev/dri \ + -v :/app/hf_model \ + vllm-rocm \ + bash + +If you are going to setup on new pytorch+rocm5.7 docker container, you can follow the following steps. + +1. Install flash-attention-2-rocm + + If you are using Pytorch-2.0.1+rocm5.7. + + Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm) + + + If you are using Pytorch-2.1.1+rocm5.7 + Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm) + BUT, use the patch from this repo to patch the `hipify_python.py` + + .. code-block:: console + + $ bash patch_torch211_flash_attn2.rocm.sh + + .. note:: + Flash-attention-2 (v2.0.4) does not support sliding windows attention. + +2. Setup xformers==0.0.22.post7 without dependencies, and apply patches + + .. code-block:: console + + $ pip install xformers==0.0.22.post7 --no-deps + $ bash patch_xformers-0.0.22.post7.rocm.sh + +3. Build vllm. + + .. code-block:: console + $ cd vllm + $ python setup.py install # This may take 5-10 minutes. diff --git a/launch_docker.sh b/launch_amd_docker.sh similarity index 100% rename from launch_docker.sh rename to launch_amd_docker.sh diff --git a/patch_xformers-0.0.22.post7.sh b/patch_xformers-0.0.22.post7.rocm.sh similarity index 60% rename from patch_xformers-0.0.22.post7.sh rename to patch_xformers-0.0.22.post7.rocm.sh index 4ce38272bca8..c8e58f721ae8 100644 --- a/patch_xformers-0.0.22.post7.sh +++ b/patch_xformers-0.0.22.post7.rocm.sh @@ -5,17 +5,17 @@ export XFORMERS_FMHA_COMMON_PATH=$(python -c 'from xformers import ops as xops; echo $XFORMERS_FMHA_FLASH_PATH echo $XFORMERS_FMHA_COMMON_PATH -if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch"; then +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch"; then echo "Applying patch to ${XFORMERS_FMHA_FLASH_PATH}" - patch -p0 $XFORMERS_FMHA_FLASH_PATH "flashpy_xformers-0.0.22.post7.patch" + patch -p0 $XFORMERS_FMHA_FLASH_PATH "./rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch" echo "Successfully patch ${XFORMERS_FMHA_FLASH_PATH}" else echo "${XFORMERS_FMHA_FLASH_PATH} was patched before" fi -if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch"; then +if ! patch -R -p0 -s -f --dry-run $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch"; then echo "Applying patch to ${XFORMERS_FMHA_COMMON_PATH}" - patch -p0 $XFORMERS_FMHA_COMMON_PATH "commonpy_xformers-0.0.22.post7.patch" + patch -p0 $XFORMERS_FMHA_COMMON_PATH "./rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch" echo "Successfully patch ${XFORMERS_FMHA_COMMON_PATH}" else echo "${XFORMERS_FMHA_COMMON_PATH} was patched before" diff --git a/commonpy_xformers-0.0.22.post7.patch b/rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch similarity index 100% rename from commonpy_xformers-0.0.22.post7.patch rename to rocm_patch/commonpy_xformers-0.0.22.post7.rocm.patch diff --git a/flashpy_xformers-0.0.22.post7.patch b/rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch similarity index 100% rename from flashpy_xformers-0.0.22.post7.patch rename to rocm_patch/flashpy_xformers-0.0.22.post7.rocm.patch diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c7e476c70474..c52b5f03b760 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -3,6 +3,8 @@ from dataclasses import dataclass from typing import Optional, Tuple +import torch + from vllm.config import (CacheConfig, ModelConfig, ParallelConfig, SchedulerConfig) @@ -83,32 +85,54 @@ def add_cli_args( help='directory to download and load the weights, ' 'default to the default cache dir of ' 'huggingface') - parser.add_argument( - '--load-format', - type=str, - default=EngineArgs.load_format, - choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'], - help='The format of the model weights to load. ' - '"auto" will try to load the weights in the safetensors format ' - 'and fall back to the pytorch bin format if safetensors format ' - 'is not available. ' - '"pt" will load the weights in the pytorch bin format. ' - '"safetensors" will load the weights in the safetensors format. ' - '"npcache" will load the weights in pytorch format and store ' - 'a numpy cache to speed up the loading. ' - '"dummy" will initialize the weights with random values, ' - 'which is mainly for profiling.') - parser.add_argument( - '--dtype', - type=str, - default=EngineArgs.dtype, - choices=[ - 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' - ], - help='data type for model weights and activations. ' - 'The "auto" option will use FP16 precision ' - 'for FP32 and FP16 models, and BF16 precision ' - 'for BF16 models.') + if torch.cuda.is_available() and torch.version.hip: + # do something specific for HIP + parser.add_argument( + '--load-format', + type=str, + default='pt', + choices=['pt'], + help='The format of the model weights to load. ' + '"pt" will load the weights in the pytorch bin format. ') + parser.add_argument( + '--dtype', + type=str, + default='half', + choices=[ + 'half', 'float16', 'bfloat16' + ], + help='data type for model weights and activations. ' + 'The default option is FP16 precision ' + 'Supports FP16 and BF16 ') + elif torch.cuda.is_available() and torch.version.cuda: + # do something specific for CUDA + parser.add_argument( + '--load-format', + type=str, + default=EngineArgs.load_format, + choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'], + help='The format of the model weights to load. ' + '"auto" will try to load the weights in the safetensors format ' + 'and fall back to the pytorch bin format if safetensors format ' + 'is not available. ' + '"pt" will load the weights in the pytorch bin format. ' + '"safetensors" will load the weights in the safetensors format. ' + '"npcache" will load the weights in pytorch format and store ' + 'a numpy cache to speed up the loading. ' + '"dummy" will initialize the weights with random values, ' + 'which is mainly for profiling.') + parser.add_argument( + '--dtype', + type=str, + default=EngineArgs.dtype, + choices=[ + 'auto', 'half', 'float16', 'bfloat16', 'float', 'float32' + ], + help='data type for model weights and activations. ' + 'The "auto" option will use FP16 precision ' + 'for FP32 and FP16 models, and BF16 precision ' + 'for BF16 models.') + parser.add_argument('--max-model-len', type=int, default=None, @@ -171,13 +195,23 @@ def add_cli_args( parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics') - # Quantization settings. - parser.add_argument('--quantization', - '-q', - type=str, - choices=['awq', 'squeezellm', None], - default=None, - help='Method used to quantize the weights') + if torch.cuda.is_available() and torch.version.hip: + # Quantization settings. + parser.add_argument('--quantization', + '-q', + type=str, + choices=['squeezellm', None], + default=None, + help='Method used to quantize the weights') + + elif torch.cuda.is_available() and torch.version.cuda: + # Quantization settings. + parser.add_argument('--quantization', + '-q', + type=str, + choices=['awq', 'squeezellm', None], + default=None, + help='Method used to quantize the weights') return parser @classmethod From a8e252cc5cfea6b9aacd68089cd071d573a42f6a Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 10:11:10 +0000 Subject: [PATCH 08/14] add num_gpus to ray_init to fix bug; update requirements-rocm.txt --- docs/source/getting_started/amd-installation.rst | 3 ++- patch_torch211_flash_attn2.rocm.sh | 12 ++++++++++++ requirements-rocm.txt | 4 ++++ rocm_patch/hipify_patch-torch2.1.1.patch | 16 ++++++++++++++++ vllm/engine/ray_utils.py | 2 +- 5 files changed, 35 insertions(+), 2 deletions(-) create mode 100644 patch_torch211_flash_attn2.rocm.sh create mode 100644 rocm_patch/hipify_patch-torch2.1.1.patch diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index 7f2ebdbcba6d..eac88a0d47cf 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -88,7 +88,8 @@ If you are going to setup on new pytorch+rocm5.7 docker container, you can follo $ bash patch_torch211_flash_attn2.rocm.sh .. note:: - Flash-attention-2 (v2.0.4) does not support sliding windows attention. + - Flash-attention-2 (v2.0.4) does not support sliding windows attention. + - You might need to downgrade the "ninja" version to 1.10 it is not used when compiling flash-attention-2 (e.g. `pip install ninja==1.10.2.4`) 2. Setup xformers==0.0.22.post7 without dependencies, and apply patches diff --git a/patch_torch211_flash_attn2.rocm.sh b/patch_torch211_flash_attn2.rocm.sh new file mode 100644 index 000000000000..fe9e1c51d8a9 --- /dev/null +++ b/patch_torch211_flash_attn2.rocm.sh @@ -0,0 +1,12 @@ +#!/bin/bash +export PYTORCH_VERSION=$(python -c 'import torch; print(torch.__version__)') + +echo $PYTORCH_VERSION + +if [[ $PYTORCH_VERSION == *"2.1"* ]] +then +echo "2.1.1" +export GPU_ARCHS="gfx90a;gfx940;gfx941;gfx942" +export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') +patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" "./rocm_patch/hipify_patch-torch2.1.1.patch" +fi \ No newline at end of file diff --git a/requirements-rocm.txt b/requirements-rocm.txt index ebbf4783c665..53fd3ea24d92 100644 --- a/requirements-rocm.txt +++ b/requirements-rocm.txt @@ -1,10 +1,14 @@ ninja # For faster builds. +typing-extensions>=4.8.0 +starlette psutil ray >= 2.5.1 pandas # Required for Ray data. pyarrow # Required for Ray data. sentencepiece # Required for LLaMA tokenizer. numpy +tokenizers>=0.15.0 +huggingface_hub<0.18,>=0.16.4 einops # Required for phi-1_5 transformers >= 4.34.0 # Required for Mistral. fastapi diff --git a/rocm_patch/hipify_patch-torch2.1.1.patch b/rocm_patch/hipify_patch-torch2.1.1.patch new file mode 100644 index 000000000000..bae87c8d788b --- /dev/null +++ b/rocm_patch/hipify_patch-torch2.1.1.patch @@ -0,0 +1,16 @@ +--- hipifypy.ori 2023-11-29 05:59:14.477283000 +0000 ++++ hipifypy.edited 2023-11-29 06:13:55.937250000 +0000 +@@ -870,7 +870,12 @@ + return m.group(0) + # Hipify header file first if needed + if header_filepath not in HIPIFY_FINAL_RESULT: +- preprocess_file_and_save_result(output_directory, ++ #JCG added skip logic ++ if "composable_kernel" in header_filepath: ++ print("Force skipping hipification of CK file: " + header_filepath) ++ HIPIFY_FINAL_RESULT[header_filepath] = HipifyResult(current_state=CurrentState.DONE, hipified_path = header_filepath) ++ else: ++ preprocess_file_and_save_result(output_directory, + header_filepath, + all_files, header_include_dirs, stats, hip_clang_launch, + is_pytorch_extension, clean_ctx, show_progress) \ No newline at end of file diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index ee58b8b9074a..d61abb92dbd2 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -73,7 +73,7 @@ def initialize_cluster( "Ray is not installed. Please install Ray to use distributed " "serving.") # Connect to a ray cluster. - ray.init(address=ray_address, ignore_reinit_error=True) + ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) if not parallel_config.worker_use_ray: # Initialize cluster locally. From 77d519ed7c6e3f7a6975496284332475f440c3e6 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 10:15:24 +0000 Subject: [PATCH 09/14] remove unncessary patch as torch 2.1 2.2 does not need one --- docs/source/getting_started/amd-installation.rst | 5 ++--- patch_torch211_flash_attn2.rocm.sh | 12 ------------ rocm_patch/hipify_patch-torch2.1.1.patch | 16 ---------------- 3 files changed, 2 insertions(+), 31 deletions(-) delete mode 100644 patch_torch211_flash_attn2.rocm.sh delete mode 100644 rocm_patch/hipify_patch-torch2.1.1.patch diff --git a/docs/source/getting_started/amd-installation.rst b/docs/source/getting_started/amd-installation.rst index eac88a0d47cf..862b8a1d6e89 100644 --- a/docs/source/getting_started/amd-installation.rst +++ b/docs/source/getting_started/amd-installation.rst @@ -79,9 +79,8 @@ If you are going to setup on new pytorch+rocm5.7 docker container, you can follo Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm) - If you are using Pytorch-2.1.1+rocm5.7 - Install flash-attention-2 (v2.0.4) following the instruction from [ROCmSoftwarePlatform/flash-attention](https://github.com/ROCmSoftwarePlatform/flash-attention/tree/flash_attention_for_rocm) - BUT, use the patch from this repo to patch the `hipify_python.py` + If you are using Pytorch-2.1.x+rocm5.7 or Pytorch-2.2.x+rocm5.7, you don't need to apply the `hipify_python.patch`. + You can directly build the flash-attention-2. .. code-block:: console diff --git a/patch_torch211_flash_attn2.rocm.sh b/patch_torch211_flash_attn2.rocm.sh deleted file mode 100644 index fe9e1c51d8a9..000000000000 --- a/patch_torch211_flash_attn2.rocm.sh +++ /dev/null @@ -1,12 +0,0 @@ -#!/bin/bash -export PYTORCH_VERSION=$(python -c 'import torch; print(torch.__version__)') - -echo $PYTORCH_VERSION - -if [[ $PYTORCH_VERSION == *"2.1"* ]] -then -echo "2.1.1" -export GPU_ARCHS="gfx90a;gfx940;gfx941;gfx942" -export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])') -patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" "./rocm_patch/hipify_patch-torch2.1.1.patch" -fi \ No newline at end of file diff --git a/rocm_patch/hipify_patch-torch2.1.1.patch b/rocm_patch/hipify_patch-torch2.1.1.patch deleted file mode 100644 index bae87c8d788b..000000000000 --- a/rocm_patch/hipify_patch-torch2.1.1.patch +++ /dev/null @@ -1,16 +0,0 @@ ---- hipifypy.ori 2023-11-29 05:59:14.477283000 +0000 -+++ hipifypy.edited 2023-11-29 06:13:55.937250000 +0000 -@@ -870,7 +870,12 @@ - return m.group(0) - # Hipify header file first if needed - if header_filepath not in HIPIFY_FINAL_RESULT: -- preprocess_file_and_save_result(output_directory, -+ #JCG added skip logic -+ if "composable_kernel" in header_filepath: -+ print("Force skipping hipification of CK file: " + header_filepath) -+ HIPIFY_FINAL_RESULT[header_filepath] = HipifyResult(current_state=CurrentState.DONE, hipified_path = header_filepath) -+ else: -+ preprocess_file_and_save_result(output_directory, - header_filepath, - all_files, header_include_dirs, stats, hip_clang_launch, - is_pytorch_extension, clean_ctx, show_progress) \ No newline at end of file From 046162043f95686bff18e81ee67a6789bdc33675 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 10:30:10 +0000 Subject: [PATCH 10/14] fix squeezellm variable --- vllm/model_executor/layers/quantization/squeezellm.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index 17d6ca4cc133..ccacb1c2f586 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -112,6 +112,8 @@ def apply_weights(self, bias: Optional[torch.Tensor] = None) -> torch.Tensor: qweight = weights["qweight"] lookup_table = weights["lookup_table"] + out_shape = x.shape[:-1] + (qweight.shape[-1], ) + reshaped_x = x.reshape(-1, x.shape[-1]) if torch.cuda.is_available() and torch.version.hip: out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float) quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float, @@ -119,8 +121,6 @@ def apply_weights(self, out = out_float.to(dtype=torch.float16) # do something specific for HIP elif torch.cuda.is_available() and torch.version.cuda: - out_shape = x.shape[:-1] + (qweight.shape[-1], ) - reshaped_x = x.reshape(-1, x.shape[-1]) # NOTE: The output tensor should be zero-initialized. out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) quantization_ops.squeezellm_gemm(reshaped_x, qweight, out, From e838483f7973edd5afe959764e6403bf893cbc1f Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 10:54:46 +0000 Subject: [PATCH 11/14] support vllm + rocm versioning --- setup.py | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/setup.py b/setup.py index 061eb9fd0afa..786c9d91579d 100644 --- a/setup.py +++ b/setup.py @@ -53,7 +53,26 @@ def get_amdgpu_offload_arch(): raise RuntimeError(error_message) return None - + +def get_hipcc_rocm_version(): + # Run the hipcc --version command + result = subprocess.run(['hipcc', '--version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + + # Check if the command was executed successfully + if result.returncode != 0: + print("Error running 'hipcc --version'") + return None + + # Extract the version using a regular expression + match = re.search(r'HIP version: (\S+)', result.stdout) + if match: + # Return the version string + return match.group(1) + else: + print("Could not find HIP version in the output") + return None + + def get_nvcc_cuda_version(cuda_dir: str) -> Version: """Get the CUDA version from nvcc. @@ -289,10 +308,20 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - # cuda_version = str(nvcc_cuda_version) - # if cuda_version != MAIN_CUDA_VERSION: - # cuda_version_str = cuda_version.replace(".", "")[:3] - # version += f"+cu{cuda_version_str}" + + if torch.cuda.is_available() and torch.version.cuda: + cuda_version = str(nvcc_cuda_version) + if cuda_version != MAIN_CUDA_VERSION: + cuda_version_str = cuda_version.replace(".", "")[:3] + version += f"+cu{cuda_version_str}" + + elif torch.cuda.is_available() and torch.version.hip: + # Get the HIP version + hipcc_version = get_hipcc_rocm_version() + if hipcc_version != MAIN_CUDA_VERSION: + rocm_version_str = hipcc_version.replace(".", "")[:3] + version += f"+rocm{rocm_version_str}" + return version From eadba42bad8c2baf47622e6adfe53ef76c907691 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 12:27:18 +0000 Subject: [PATCH 12/14] format.sh code --- pyproject.toml | 2 +- setup.py | 73 ++++++++++--------- vllm/engine/arg_utils.py | 8 +- vllm/engine/ray_utils.py | 4 +- vllm/model_executor/layers/attention.py | 6 +- .../layers/quantization/__init__.py | 1 + .../layers/quantization/squeezellm.py | 8 +- 7 files changed, 57 insertions(+), 45 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index e3e3e389f789..f9390ed8c52a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = [ "ninja", "packaging", "setuptools", - "torch >= 2.1.0", + # "torch >= 2.1.0", # commented out to accommodate ROCm "wheel", ] build-backend = "setuptools.build_meta" diff --git a/setup.py b/setup.py index 786c9d91579d..b75d0912a9bb 100644 --- a/setup.py +++ b/setup.py @@ -16,7 +16,7 @@ # Supported NVIDIA GPU architectures. NVIDIA_SUPPORTED_ARCHS = {"7.0", "7.5", "8.0", "8.6", "8.9", "9.0"} -ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030","gfx1100"} +ROCM_SUPPORTED_ARCHS = {"gfx90a", "gfx908", "gfx906", "gfx1030", "gfx1100"} # SUPPORTED_ARCHS = NVIDIA_SUPPORTED_ARCHS.union(ROCM_SUPPORTED_ARCHS) # Compiler flags. @@ -24,19 +24,18 @@ # TODO(woosuk): Should we use -O3? NVCC_FLAGS = ["-O2", "-std=c++17"] -if torch.cuda.is_available() and torch.version.hip: - if ROCM_HOME is not None: - NVCC_FLAGS += [f"-DUSE_ROCM"] +if torch.cuda.is_available() and torch.version.hip and ROCM_HOME is not None: + NVCC_FLAGS += ["-DUSE_ROCM"] -if torch.cuda.is_available() and torch.version.cuda: - if CUDA_HOME is None: - raise RuntimeError( - "Cannot find CUDA_HOME. CUDA must be available to build the package.") +if torch.cuda.is_available() and torch.version.cuda and CUDA_HOME is None: + raise RuntimeError( + "Cannot find CUDA_HOME. CUDA must be available to build the package.") ABI = 1 if torch._C._GLIBCXX_USE_CXX11_ABI else 0 CXX_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] NVCC_FLAGS += [f"-D_GLIBCXX_USE_CXX11_ABI={ABI}"] + def get_amdgpu_offload_arch(): error_message = "" command = "/opt/rocm/llvm/bin/amdgpu-offload-arch" @@ -48,15 +47,19 @@ def get_amdgpu_offload_arch(): except FileNotFoundError: # If the command is not found, print an error message error_message = f"The command {command} was not found." - + if error_message: raise RuntimeError(error_message) - + return None + def get_hipcc_rocm_version(): # Run the hipcc --version command - result = subprocess.run(['hipcc', '--version'], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) + result = subprocess.run(['hipcc', '--version'], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + text=True) # Check if the command was executed successfully if result.returncode != 0: @@ -103,7 +106,9 @@ def get_torch_arch_list() -> Set[str]: return set() # Filter out the invalid architectures and print a warning. - valid_archs = NVIDIA_SUPPORTED_ARCHS.union({s + "+PTX" for s in NVIDIA_SUPPORTED_ARCHS}) + valid_archs = NVIDIA_SUPPORTED_ARCHS.union( + {s + "+PTX" + for s in NVIDIA_SUPPORTED_ARCHS}) arch_list = torch_arch_list.intersection(valid_archs) # If none of the specified architectures are valid, raise an error. if not arch_list: @@ -124,17 +129,17 @@ def get_torch_arch_list() -> Set[str]: # First, check the TORCH_CUDA_ARCH_LIST environment variable. compute_capabilities = get_torch_arch_list() -if torch.cuda.is_available() and torch.version.cuda: - if not compute_capabilities: - # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available - # GPUs on the current machine. - device_count = torch.cuda.device_count() - for i in range(device_count): - major, minor = torch.cuda.get_device_capability(i) - if major < 7: - raise RuntimeError( - "GPUs with compute capability below 7.0 are not supported.") - compute_capabilities.add(f"{major}.{minor}") +if torch.cuda.is_available( +) and torch.version.cuda and not compute_capabilities: + # If TORCH_CUDA_ARCH_LIST is not defined or empty, target all available + # GPUs on the current machine. + device_count = torch.cuda.device_count() + for i in range(device_count): + major, minor = torch.cuda.get_device_capability(i) + if major < 7: + raise RuntimeError( + "GPUs with compute capability below 7.0 are not supported.") + compute_capabilities.add(f"{major}.{minor}") if torch.cuda.is_available() and torch.version.cuda: nvcc_cuda_version = get_nvcc_cuda_version(CUDA_HOME) @@ -149,7 +154,8 @@ def get_torch_arch_list() -> Set[str]: compute_capabilities.remove("9.0") # Validate the NVCC CUDA version. if nvcc_cuda_version < Version("11.0"): - raise RuntimeError("CUDA 11.0 or higher is required to build the package.") + raise RuntimeError( + "CUDA 11.0 or higher is required to build the package.") if (nvcc_cuda_version < Version("11.1") and any(cc.startswith("8.6") for cc in compute_capabilities)): raise RuntimeError( @@ -166,7 +172,7 @@ def get_torch_arch_list() -> Set[str]: "Targeting compute capability 8.0 instead.", stacklevel=2) compute_capabilities = set(cc for cc in compute_capabilities - if not cc.startswith("8.9")) + if not cc.startswith("8.9")) compute_capabilities.add("8.0+PTX") if any(cc.startswith("9.0") for cc in compute_capabilities): raise RuntimeError( @@ -177,7 +183,9 @@ def get_torch_arch_list() -> Set[str]: num = capability[0] + capability[2] NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=sm_{num}"] if capability.endswith("+PTX"): - NVCC_FLAGS += ["-gencode", f"arch=compute_{num},code=compute_{num}"] + NVCC_FLAGS += [ + "-gencode", f"arch=compute_{num},code=compute_{num}" + ] # Use NVCC threads to parallelize the build. if nvcc_cuda_version >= Version("11.2"): @@ -186,12 +194,11 @@ def get_torch_arch_list() -> Set[str]: elif torch.cuda.is_available() and torch.version.hip: amd_arch = get_amdgpu_offload_arch() - if not amd_arch in ROCM_SUPPORTED_ARCHS: + if amd_arch not in ROCM_SUPPORTED_ARCHS: raise RuntimeError( f"Only the following arch is supported: {ROCM_SUPPORTED_ARCHS}" - f"amdgpu_arch_found: {amd_arch}" - ) - + f"amdgpu_arch_found: {amd_arch}") + ext_modules = [] # Cache operations. @@ -308,20 +315,20 @@ def find_version(filepath: str) -> str: def get_vllm_version() -> str: version = find_version(get_path("vllm", "__init__.py")) - + if torch.cuda.is_available() and torch.version.cuda: cuda_version = str(nvcc_cuda_version) if cuda_version != MAIN_CUDA_VERSION: cuda_version_str = cuda_version.replace(".", "")[:3] version += f"+cu{cuda_version_str}" - + elif torch.cuda.is_available() and torch.version.hip: # Get the HIP version hipcc_version = get_hipcc_rocm_version() if hipcc_version != MAIN_CUDA_VERSION: rocm_version_str = hipcc_version.replace(".", "")[:3] version += f"+rocm{rocm_version_str}" - + return version diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index c52b5f03b760..c7612b3ac407 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -98,9 +98,7 @@ def add_cli_args( '--dtype', type=str, default='half', - choices=[ - 'half', 'float16', 'bfloat16' - ], + choices=['half', 'float16', 'bfloat16'], help='data type for model weights and activations. ' 'The default option is FP16 precision ' 'Supports FP16 and BF16 ') @@ -132,7 +130,7 @@ def add_cli_args( 'The "auto" option will use FP16 precision ' 'for FP32 and FP16 models, and BF16 precision ' 'for BF16 models.') - + parser.add_argument('--max-model-len', type=int, default=None, @@ -203,7 +201,7 @@ def add_cli_args( choices=['squeezellm', None], default=None, help='Method used to quantize the weights') - + elif torch.cuda.is_available() and torch.version.cuda: # Quantization settings. parser.add_argument('--quantization', diff --git a/vllm/engine/ray_utils.py b/vllm/engine/ray_utils.py index d61abb92dbd2..6bff8153e2a5 100644 --- a/vllm/engine/ray_utils.py +++ b/vllm/engine/ray_utils.py @@ -73,7 +73,9 @@ def initialize_cluster( "Ray is not installed. Please install Ray to use distributed " "serving.") # Connect to a ray cluster. - ray.init(address=ray_address, ignore_reinit_error=True, num_gpus=parallel_config.world_size) + ray.init(address=ray_address, + ignore_reinit_error=True, + num_gpus=parallel_config.world_size) if not parallel_config.worker_use_ray: # Initialize cluster locally. diff --git a/vllm/model_executor/layers/attention.py b/vllm/model_executor/layers/attention.py index 7750a95c3f04..2e042721d9a2 100644 --- a/vllm/model_executor/layers/attention.py +++ b/vllm/model_executor/layers/attention.py @@ -113,7 +113,8 @@ def multi_query_kv_attention( attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (torch.cuda.is_available() and torch.version.hip) else None, ) # TODO(woosuk): Unnecessary copy. Optimize. output.copy_(out.view_as(output)) @@ -452,7 +453,8 @@ def multi_query_kv_attention( attn_bias=input_metadata.attn_bias, p=0.0, scale=self.scale, - op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if (torch.cuda.is_available() and torch.version.hip) else None, + op=xops.fmha.MemoryEfficientAttentionFlashAttentionOp[0] if + (torch.cuda.is_available() and torch.version.hip) else None, ) # TODO(woosuk): Unnecessary copy. Optimize. output.copy_(out.view_as(output)) diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 748a6f610f2a..f4d25566cf59 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -11,6 +11,7 @@ from vllm.model_executor.layers.quantization.awq import AWQConfig _QUANTIZATION_CONFIG_REGISTRY["awq"] = AWQConfig + def get_quantization_config(quantization: str) -> Type[QuantizationConfig]: if quantization not in _QUANTIZATION_CONFIG_REGISTRY: raise ValueError(f"Invalid quantization method: {quantization}") diff --git a/vllm/model_executor/layers/quantization/squeezellm.py b/vllm/model_executor/layers/quantization/squeezellm.py index ccacb1c2f586..be318b2ef205 100644 --- a/vllm/model_executor/layers/quantization/squeezellm.py +++ b/vllm/model_executor/layers/quantization/squeezellm.py @@ -115,16 +115,18 @@ def apply_weights(self, out_shape = x.shape[:-1] + (qweight.shape[-1], ) reshaped_x = x.reshape(-1, x.shape[-1]) if torch.cuda.is_available() and torch.version.hip: - out_float = torch.zeros(out_shape, device="cuda", dtype=torch.float) + out_float = torch.zeros(out_shape, + device="cuda", + dtype=torch.float) quantization_ops.squeezellm_gemm(reshaped_x, qweight, out_float, - lookup_table) + lookup_table) out = out_float.to(dtype=torch.float16) # do something specific for HIP elif torch.cuda.is_available() and torch.version.cuda: # NOTE: The output tensor should be zero-initialized. out = torch.zeros(out_shape, device="cuda", dtype=torch.float16) quantization_ops.squeezellm_gemm(reshaped_x, qweight, out, - lookup_table) + lookup_table) if bias is not None: out = out + bias From fb2610d7ac7a51a30431504e9b908192429d8ac3 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 12:28:23 +0000 Subject: [PATCH 13/14] rename rocm launch script --- launch_amd_docker.sh => launch_rocm_docker.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename launch_amd_docker.sh => launch_rocm_docker.sh (100%) diff --git a/launch_amd_docker.sh b/launch_rocm_docker.sh similarity index 100% rename from launch_amd_docker.sh rename to launch_rocm_docker.sh From e70c1feb1441cdbd84e90d5a8ce391ecbc1c11d8 Mon Sep 17 00:00:00 2001 From: miloice Date: Wed, 29 Nov 2023 12:29:10 +0000 Subject: [PATCH 14/14] remove rom docker launch script --- launch_rocm_docker.sh | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 launch_rocm_docker.sh diff --git a/launch_rocm_docker.sh b/launch_rocm_docker.sh deleted file mode 100644 index b26c8044e7de..000000000000 --- a/launch_rocm_docker.sh +++ /dev/null @@ -1,14 +0,0 @@ -#!/bin/bash -docker run -it \ - --network=host \ - --group-add=video \ - --ipc=host \ - --cap-add=SYS_PTRACE \ - --security-opt seccomp=unconfined \ - --shm-size 8G \ - --device /dev/kfd \ - --device /dev/dri \ - -v /home/akk/tjtanaa/vllm-rocm:/app/libs/vllm-rocm-external \ - -v /home/akk/hf_model:/app/hf_model \ - vllm-rocm-tj \ - bash \ No newline at end of file