vllm-project · simon-mo · Apr 3, 2024 · Feb 5, 2024 · Feb 6, 2024 · Feb 7, 2024
diff --git a/.gitignore b/.gitignore
@@ -181,6 +181,7 @@ _build/
 # hip files generated by PyTorch
 *.hip
 *_hip*
+hip_compat.h
 
 # Benchmark dataset
 *.json
@@ -16,7 +16,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
 set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")
 
 # Supported AMD GPU architectures.
-set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
+set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")
 
 #
 # Supported/expected torch versions for CUDA/ROCm.

diff --git a/benchmarks/benchmark_latency.py b/benchmarks/benchmark_latency.py
@@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
               dtype=args.dtype,
               enforce_eager=args.enforce_eager,
               kv_cache_dtype=args.kv_cache_dtype,
+              quantization_param_path=args.quantization_param_path,
               device=args.device,
               ray_workers_use_nsight=args.ray_workers_use_nsight,
               download_dir=args.download_dir)
@@ -125,10 +126,23 @@ def run_to_completion(profile_dir: Optional[str] = None):
     parser.add_argument(
         "--kv-cache-dtype",
         type=str,
-        choices=['auto', 'fp8_e5m2'],
+        choices=['auto', 'fp8'],
         default='auto',
         help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         '--profile',
         action='store_true',

diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py
@@ -72,6 +72,7 @@ def run_vllm(
     max_model_len: Optional[int],
     enforce_eager: bool,
     kv_cache_dtype: str,
+    quantization_param_path: Optional[str],
     device: str,
     enable_prefix_caching: bool,
     gpu_memory_utilization: float = 0.9,
@@ -89,6 +90,7 @@ def run_vllm(
               gpu_memory_utilization=gpu_memory_utilization,
               enforce_eager=enforce_eager,
               kv_cache_dtype=kv_cache_dtype,
+              quantization_param_path=quantization_param_path,
               device=device,
               enable_prefix_caching=enable_prefix_caching,
               download_dir=download_dir)
@@ -215,7 +217,8 @@ def main(args: argparse.Namespace):
                                 args.seed, args.n, args.use_beam_search,
                                 args.trust_remote_code, args.dtype,
                                 args.max_model_len, args.enforce_eager,
-                                args.kv_cache_dtype, args.device,
+                                args.kv_cache_dtype,
+                                args.quantization_param_path, args.device,
                                 args.enable_prefix_caching,
                                 args.gpu_memory_utilization, args.download_dir)
     elif args.backend == "hf":
@@ -304,10 +307,23 @@ def main(args: argparse.Namespace):
     parser.add_argument(
         "--kv-cache-dtype",
         type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
         default="auto",
         help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
+    parser.add_argument(
+        '--quantization-param-path',
+        type=str,
+        default=None,
+        help='Path to the JSON file containing the KV cache scaling factors. '
+        'This should generally be supplied, when KV cache dtype is FP8. '
+        'Otherwise, KV cache scaling factors default to 1.0, which may cause '
+        'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
+        'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
+        'instead supported for common inference criteria.')
     parser.add_argument(
         "--device",
         type=str,

diff --git a/benchmarks/kernels/benchmark_paged_attention.py b/benchmarks/kernels/benchmark_paged_attention.py
@@ -97,6 +97,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
             torch.cuda.cudart().cudaProfilerStart()
         start_time = time.perf_counter()
 
+        # Use the default KV cache scaling factor (kv_scale = 1.0).
+        kv_scale = 1.0
+
         for _ in range(num_iters):
             if version == "v1":
                 ops.paged_attention_v1(
@@ -112,6 +115,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                     max_context_len,
                     alibi_slopes,
                     kv_cache_dtype,
+                    kv_scale,
                 )
             elif version == "v2":
                 ops.paged_attention_v2(
@@ -130,6 +134,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
                     max_context_len,
                     alibi_slopes,
                     kv_cache_dtype,
+                    kv_scale,
                 )
             else:
                 raise ValueError(f"Invalid version: {version}")
@@ -179,11 +184,13 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
     parser.add_argument(
         "--kv-cache-dtype",
         type=str,
-        choices=["auto", "fp8_e5m2"],
+        choices=["auto", "fp8"],
         default="auto",
         help=
-        'Data type for kv cache storage. If "auto", will use model data type.')
-    parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
+        'Data type for kv cache storage. If "auto", will use model data type. '
+        'FP8_E5M2 (without scaling) is only supported on cuda version greater '
+        'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
+        'common inference criteria.')
     args = parser.parse_args()
     print(args)
 

diff --git a/cmake/utils.cmake b/cmake/utils.cmake
@@ -112,6 +112,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)
 
     list(APPEND GPU_FLAGS
       "-DUSE_ROCM"
+      "-DENABLE_FP8_E4M3"
       "-U__HIP_NO_HALF_CONVERSIONS__"
       "-U__HIP_NO_HALF_OPERATORS__"
       "-fno-gpu-rdc")

diff --git a/csrc/attention/attention_dtypes.h b/csrc/attention/attention_dtypes.h
@@ -4,4 +4,4 @@
 #include "dtype_float16.cuh"
 #include "dtype_float32.cuh"
 #include "dtype_bfloat16.cuh"
-#include "dtype_fp8_e5m2.cuh"
+#include "dtype_fp8.cuh"