Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
178 commits
Select commit Hold shift + click to select a range
ad53f74
Add hip_fp8 datatype and conversions
gshtras Feb 5, 2024
9b1577a
Add 3rdparty quantizer utility and usage to quantize models (HF default)
HaiShaw Feb 6, 2024
644b165
Update 3rdparty quantizer utility and usage with ammo updates
HaiShaw Feb 7, 2024
0ed1d98
Use e4m3 and e5m2 interchangeably
gshtras Feb 7, 2024
18b2516
Using fp8 in any cache tests that could support it
gshtras Feb 7, 2024
81a6859
Integrate e4m3 alongside e5m2 and adapt cache tests
gshtras Feb 8, 2024
83089d0
Add gfx942 to the arch list
gshtras Feb 8, 2024
926e2b8
Less forgiving atol in fp8 tests
gshtras Feb 8, 2024
475a2ef
Merge pull request #1 from ROCm/greg/fp8_tests
HaiShaw Feb 9, 2024
1b8bc9f
enable fp8-e4m3 kv cache on rocm
Feb 9, 2024
777cc35
Rename remaining fp8_e5m2 to general fp8
gshtras Feb 9, 2024
a2897d6
Fix or comparisons
mawong-amd Feb 9, 2024
a06ddac
Add e4m3 to attention kernels
gshtras Feb 9, 2024
f358dcd
Remove remaining mentions of e5m2 where it refers to general fp8
gshtras Feb 9, 2024
4db0038
Address naming conventions
Feb 9, 2024
2e85bc7
Merge branch 'fp8-e4m3-kvcache-rocm' of https://github.com/ROCm/vllm-…
Feb 9, 2024
17a91a0
More verbose help message for fp8 cache type
gshtras Feb 9, 2024
4fbc915
Updated fp8 help text in additional files similar to arg_utils
gshtras Feb 9, 2024
7f5623d
Merge pull request #3 from ROCm/greg/tweaks
HaiShaw Feb 9, 2024
0da44bc
Merge branch 'fp8_kv' of https://github.com/ROCm/vllm-fp8 into fp8-e4…
Feb 9, 2024
2c525de
Fix merge conflict
Feb 9, 2024
54d1d4d
generalize fp8 convention
Feb 9, 2024
4a0d880
Merge pull request #2 from ROCm/fp8-e4m3-kvcache-rocm
HaiShaw Feb 9, 2024
7a9db00
Update log info and args description w.r.t. FP8 KV cache.
HaiShaw Feb 10, 2024
0d59cfb
Initial skeleton; scaling factors in CacheEngine and PagedAttention
mawong-amd Feb 12, 2024
eaf08ff
Initial conversion back to KV cache scales in model, using float scal…
mawong-amd Feb 15, 2024
0a80226
Completing KV cache scaling factors ingest (TP>1 todo), clean up code…
HaiShaw Feb 20, 2024
7b26ec9
Fix typos, add a few more sanity checks to the KV cache scales loader…
mawong-amd Feb 20, 2024
936821e
Add additional checks to the scaling factor loader and fail gracefull…
mawong-amd Feb 20, 2024
74b2b3f
Remove lingering PT fallback in extraction utility
mawong-amd Feb 20, 2024
2c7ce96
Add ROCm clarification to extract scales script
mawong-amd Feb 20, 2024
5985e25
Merge pull request #4 from ROCm/fp8_ingest_stage1_model
AdrianAbeyta Feb 20, 2024
763b283
Preliminary TP rank > 1 extraction and loading support
mawong-amd Feb 21, 2024
ef26716
Ensure loaded dictionary has same TP size as currently running engine
mawong-amd Feb 21, 2024
c7e2587
Add tp_size argument for user to specify TP size to expect in quantiz…
mawong-amd Feb 23, 2024
61f2046
Add specific FP8 E4M3 and ROCm flavor text to the --quantized_model a…
mawong-amd Feb 23, 2024
dc71088
Small tweak on expected TP size flavor text for clarity
mawong-amd Feb 23, 2024
4cd76e9
Add output filename argument, rename output_path to output_dir, and c…
mawong-amd Feb 23, 2024
ad8b841
Fix up remaining 'output_path's from the rename
mawong-amd Feb 23, 2024
fec2232
Add scaling factor correction for ROCm FP8
mawong-amd Feb 23, 2024
7fdcf10
Add example output for extract_scales
Feb 23, 2024
6a6bbcd
Strip out download functionality in scale extraction utility
mawong-amd Feb 23, 2024
9336cdb
Merge pull request #5 from ROCm/fp8_ingest_stage1_model
AdrianAbeyta Feb 23, 2024
8e108d3
Correcting a stray type hint
mawong-amd Feb 23, 2024
31ebfa6
Merge branch 'fp8_kv' into fp8_ingest_scales_correction
mawong-amd Feb 23, 2024
4dd7d1e
Correct a stray type hint
mawong-amd Feb 23, 2024
9a03b96
Create README.md and add usage example
AdrianAbeyta Feb 23, 2024
4064973
Added benchmark description
AdrianAbeyta Feb 23, 2024
988ffc3
Clean up readme
AdrianAbeyta Feb 24, 2024
0650ae4
Merge pull request #6 from ROCm/fp8_ingest_scales_correction
AdrianAbeyta Feb 26, 2024
ee6ba29
Change convention: Initialize scaling factors always if KV cache is F…
mawong-amd Feb 26, 2024
4007656
Updated example descriptions
AdrianAbeyta Feb 26, 2024
6f2b248
Merge pull request #8 from ROCm/fp8_ingest_stage1_model
AdrianAbeyta Feb 26, 2024
0a45612
Merge pull request #7 from ROCm/fp8_doc
AdrianAbeyta Feb 26, 2024
c8059c2
Kernel and Device functions to enable FP8 KV cache scaling factors
HaiShaw Feb 27, 2024
fc2cdaf
Make KV cache scaling factors default to 1.0 instead of None
HaiShaw Feb 27, 2024
76c6058
Update KV cache scales loader name to clarify that we are not using a…
mawong-amd Feb 27, 2024
c825bb3
Fix test cases from the introduction of KV cache scaling factors, usi…
HaiShaw Feb 28, 2024
4f574cd
Cleanup comments according to reviews
HaiShaw Feb 29, 2024
86f06ca
Merge pull request #9 from ROCm/fp8_kv_cache
HaiShaw Feb 29, 2024
f325cb0
Add hip_fp8 datatype and conversions
gshtras Feb 5, 2024
4bb8dac
Add 3rdparty quantizer utility and usage to quantize models (HF default)
HaiShaw Feb 6, 2024
8b1279b
Update 3rdparty quantizer utility and usage with ammo updates
HaiShaw Feb 7, 2024
9c4226e
Use e4m3 and e5m2 interchangeably
gshtras Feb 7, 2024
30bba1c
Using fp8 in any cache tests that could support it
gshtras Feb 7, 2024
692f5ad
Integrate e4m3 alongside e5m2 and adapt cache tests
gshtras Feb 8, 2024
c9321a0
Add gfx942 to the arch list
gshtras Feb 8, 2024
4e1f89a
Less forgiving atol in fp8 tests
gshtras Feb 8, 2024
7bc2574
enable fp8-e4m3 kv cache on rocm
Feb 9, 2024
ebf7542
Address naming conventions
Feb 9, 2024
ad44055
Fix or comparisons
mawong-amd Feb 9, 2024
e5e0e7c
Rename remaining fp8_e5m2 to general fp8
gshtras Feb 9, 2024
c86b2ec
Add e4m3 to attention kernels
gshtras Feb 9, 2024
a432815
Remove remaining mentions of e5m2 where it refers to general fp8
gshtras Feb 9, 2024
4b77126
Updated fp8 help text in additional files similar to arg_utils
gshtras Feb 9, 2024
bbf6d49
generalize fp8 convention
Feb 9, 2024
4dfb26d
Update log info and args description w.r.t. FP8 KV cache.
HaiShaw Feb 10, 2024
0f492f5
Initial skeleton; scaling factors in CacheEngine and PagedAttention
mawong-amd Feb 12, 2024
030c9eb
Initial conversion back to KV cache scales in model, using float scal…
mawong-amd Feb 15, 2024
e00a86d
Completing KV cache scaling factors ingest (TP>1 todo), clean up code…
HaiShaw Feb 20, 2024
d3e98f3
Fix typos, add a few more sanity checks to the KV cache scales loader…
mawong-amd Feb 20, 2024
714e42c
Add additional checks to the scaling factor loader and fail gracefull…
mawong-amd Feb 20, 2024
3ff51b1
Remove lingering PT fallback in extraction utility
mawong-amd Feb 20, 2024
e97a31e
Add ROCm clarification to extract scales script
mawong-amd Feb 20, 2024
0ba975d
Add scaling factor correction for ROCm FP8
mawong-amd Feb 23, 2024
6991c59
Correcting a stray type hint
mawong-amd Feb 23, 2024
2292776
Preliminary TP rank > 1 extraction and loading support
mawong-amd Feb 21, 2024
221699b
Ensure loaded dictionary has same TP size as currently running engine
mawong-amd Feb 21, 2024
96a7546
Add tp_size argument for user to specify TP size to expect in quantiz…
mawong-amd Feb 23, 2024
9d08a92
Add specific FP8 E4M3 and ROCm flavor text to the --quantized_model a…
mawong-amd Feb 23, 2024
553209b
Small tweak on expected TP size flavor text for clarity
mawong-amd Feb 23, 2024
c852549
Add output filename argument, rename output_path to output_dir, and c…
mawong-amd Feb 23, 2024
e23379e
Fix up remaining 'output_path's from the rename
mawong-amd Feb 23, 2024
40171c9
Strip out download functionality in scale extraction utility
mawong-amd Feb 23, 2024
7666587
Correct a stray type hint
mawong-amd Feb 23, 2024
7c0bf6e
Change convention: Initialize scaling factors always if KV cache is F…
mawong-amd Feb 26, 2024
42e2aef
Add example output for extract_scales
Feb 23, 2024
d3dbb1a
Create README.md and add usage example
AdrianAbeyta Feb 23, 2024
f39839b
Added benchmark description
AdrianAbeyta Feb 23, 2024
2cfea65
Clean up readme
AdrianAbeyta Feb 24, 2024
257a7da
Updated example descriptions
AdrianAbeyta Feb 26, 2024
730562a
Update KV cache scales loader name to clarify that we are not using a…
mawong-amd Feb 27, 2024
f20eceb
Kernel and Device functions to enable FP8 KV cache scaling factors
HaiShaw Feb 27, 2024
8834917
Make KV cache scaling factors default to 1.0 instead of None
HaiShaw Feb 27, 2024
3187582
Fix test cases from the introduction of KV cache scaling factors, usi…
HaiShaw Feb 28, 2024
49df502
Cleanup comments according to reviews
HaiShaw Feb 29, 2024
12f7650
Remove load_dummy_kv_cache_scales as convention change in PR#9 render…
mawong-amd Mar 4, 2024
4a8d06c
Add back removal of gather cached kv kernel for use with FP8
Mar 5, 2024
b87aec1
Clean up IFU
Mar 5, 2024
fa6fbce
Clean up IFU
Mar 5, 2024
65f70d7
Schema change: preliminary changes to extract script, TODO: loading l…
mawong-amd Mar 6, 2024
f5c0236
Fix runtime issues with upstream rebase
Mar 6, 2024
d2a42f9
[Minor fix] The domain dns.google may cause a socket.gaierror excepti…
ttbachyinsda Mar 4, 2024
4b3e4b0
Preliminary refactoring: KV cache scales JSON into general scales JSO…
mawong-amd Mar 7, 2024
00f5113
Merge branch 'fp8_kv' into fp8_ingest_stage1_model
mawong-amd Mar 7, 2024
a7e6e81
Fixing stray syntax errors and typos, refactoring rank_keyword detection
mawong-amd Mar 7, 2024
ef85f98
Address reviewer comments
mawong-amd Mar 7, 2024
d8b2843
Address Greg's strong type checking :)
mawong-amd Mar 7, 2024
52df603
Add an additional TODO
mawong-amd Mar 7, 2024
b484112
Merge remote-tracking branch 'upstream/main' into IFU-2024-03-01-fp8-kv
Mar 7, 2024
7b72159
Merge pull request #16 from ROCm/fp8_ingest_stage1_model
AdrianAbeyta Mar 7, 2024
18c55d2
Fix OOM bug in quantize script, remove extraneous model_export
mawong-amd Mar 7, 2024
7d0fa2f
Fix rocm build conditions
Mar 7, 2024
e7db6af
Keep previous build flow for neuron
Mar 7, 2024
660dbb3
Merge remote-tracking branch 'origin/fp8_kv' into IFU-2024-03-01-fp8-kv
Mar 7, 2024
ca1b39c
Measure model memory usage (#3120)
mgoin Mar 7, 2024
fd6e57e
Possible fix for conflict between Automated Prefix Caching (#2762) an…
jacobthebanana Mar 7, 2024
90c2cd4
Update fp8 examples
Mar 7, 2024
2d520f2
Merge remote-tracking branch 'upstream/main' into IFU-2024-03-01-fp8-kv
Mar 8, 2024
9e6144a
[FIX] Make `flash_attn` optional (#3269)
WoosukKwon Mar 8, 2024
fd01e9a
Fix setup.py up to where it should be before the excitement of the la…
mawong-amd Mar 8, 2024
6edfbf1
Fix missing enable FP8_E4M3 flag and cherry pick newest load convention
mawong-amd Mar 8, 2024
be92918
Merge branch 'IFU-2024-03-01-fp8-kv' of https://github.com/ROCm/vllm-…
Mar 8, 2024
dd469df
Add model flag as example option
Mar 8, 2024
b3d81e0
Merge pull request #17 from ROCm/IFU-2024-03-01-fp8-kv
AdrianAbeyta Mar 8, 2024
5a2d747
Merge branch 'main' into fp8_kv
AdrianAbeyta Mar 11, 2024
267c847
Merge remote-tracking branch 'upstream/main' into fp8_kv
Mar 11, 2024
2f60ad7
Fix ruff syntax errors
Mar 13, 2024
31d96dd
Merge remote-tracking branch 'upstream/main' into fp8_kv
Mar 13, 2024
94c2e7c
Update model config for scales path
Mar 13, 2024
a350641
Add .rst for fp8_e4m3_kvcache and rename fp8_kvcache to fp8_e5m2
Mar 13, 2024
eb8e3d8
Skip fp8 UT test on CUDA for e4m3
Mar 14, 2024
f9eba0c
Fix device id formatting
gshtras Mar 14, 2024
db8f29c
Fix scales_path location
mawong-amd Mar 15, 2024
49d1593
Fix yapf formatting
mawong-amd Mar 15, 2024
b5ebb41
Skipping certain cache tests when using fp8 cache with e5m2 type. The…
gshtras Mar 15, 2024
45d5912
Fix yapf ci error
Mar 15, 2024
4e8d5a8
Relocate kv_scale into self_attn, some yapf fixes
mawong-amd Mar 19, 2024
f738593
Fix incorrect yapf versioning
mawong-amd Mar 19, 2024
bcc9e8b
Fix type hints
mawong-amd Mar 19, 2024
0e498d2
Merge branch 'main' of github.com:vllm-project/vllm into fp8_kv
mawong-amd Mar 20, 2024
7f1db2a
Remove legacy setup.py contents
mawong-amd Mar 20, 2024
4e82548
Add kv_scale to flash attn
mawong-amd Mar 20, 2024
81c2547
Merge branch 'main' into fp8_kv
mawong-amd Mar 20, 2024
0ebbe3f
Preliminary batch of linter fixes, more to come
mawong-amd Mar 21, 2024
5914548
Linter fixes
mawong-amd Mar 21, 2024
2b165bc
Merge branch 'main' of github.com:vllm-project/vllm into fp8_kv
mawong-amd Mar 21, 2024
69ad9c8
Fix incorrect legacy code left behind by merge
mawong-amd Mar 21, 2024
9efcee0
Small style fixes aligning with upstream, remove orphan internlm
mawong-amd Mar 21, 2024
3ddcd9d
Merge branch 'main' of github.com:vllm-project/vllm into fp8_kv
mawong-amd Mar 26, 2024
c647d3f
Fix merge imperfections, linter fixes
mawong-amd Mar 26, 2024
84d4950
Fix import order (isort lint)
mawong-amd Mar 26, 2024
a7a0009
Revert unnecessary changes from 2f60ad72815d3e84fb2250097aa6e1d77de567b0
gshtras Mar 26, 2024
1c060fb
Address review comments, removing unnecessary comment code
HaiShaw Mar 26, 2024
de9fa4e
Address review comments, removing gather_cached_kv stale code
HaiShaw Mar 27, 2024
ae41b49
Address review comments, rewording doc references
HaiShaw Mar 27, 2024
8709bce
Update scales_path to the clearer quantization_param_path; update FP8…
mawong-amd Mar 27, 2024
6ec281d
Fix minor typo
mawong-amd Mar 27, 2024
8197bd8
Merge branch 'main' into fp8_kv
mawong-amd Mar 27, 2024
a92f8fc
Fix things left out from latest merge
mawong-amd Mar 28, 2024
ac47949
Address review comments on model path and NVIDIA name
HaiShaw Mar 29, 2024
b6cd2e1
Adding reference to kv cache scaling factor file generation to docs
HaiShaw Mar 29, 2024
ebceda5
Relocating 3rdparty to examples/fp8/3rdparty
HaiShaw Mar 29, 2024
0676bbe
Update loading logic to use Pydantic schema checking
mawong-amd Mar 29, 2024
b4fea5a
Address review comments on docs and miscellaneous
HaiShaw Apr 2, 2024
ef8d9fb
Enforce line length to 80
HaiShaw Apr 2, 2024
14d55ec
Refactor quant param schema extraction + loading, address reviewer co…
mawong-amd Apr 2, 2024
fb3d245
Fix merge conflict and add example output to documentation
mawong-amd Apr 2, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ _build/
# hip files generated by PyTorch
*.hip
*_hip*
hip_compat.h

# Benchmark dataset
*.json
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ set(PYTHON_SUPPORTED_VERSIONS "3.8" "3.9" "3.10" "3.11")
set(CUDA_SUPPORTED_ARCHS "7.0;7.5;8.0;8.6;8.9;9.0")

# Supported AMD GPU architectures.
set(HIP_SUPPORTED_ARCHS "gfx908;gfx90a;gfx942;gfx1100")
set(HIP_SUPPORTED_ARCHS "gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100")

#
# Supported/expected torch versions for CUDA/ROCm.
Expand Down
18 changes: 16 additions & 2 deletions benchmarks/benchmark_latency.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def main(args: argparse.Namespace):
dtype=args.dtype,
enforce_eager=args.enforce_eager,
kv_cache_dtype=args.kv_cache_dtype,
quantization_param_path=args.quantization_param_path,
device=args.device,
ray_workers_use_nsight=args.ray_workers_use_nsight,
download_dir=args.download_dir)
Expand Down Expand Up @@ -125,10 +126,23 @@ def run_to_completion(profile_dir: Optional[str] = None):
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=['auto', 'fp8_e5m2'],
choices=['auto', 'fp8'],
default='auto',
help=
'Data type for kv cache storage. If "auto", will use model data type.')
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
'--profile',
action='store_true',
Expand Down
22 changes: 19 additions & 3 deletions benchmarks/benchmark_throughput.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ def run_vllm(
max_model_len: Optional[int],
enforce_eager: bool,
kv_cache_dtype: str,
quantization_param_path: Optional[str],
device: str,
enable_prefix_caching: bool,
gpu_memory_utilization: float = 0.9,
Expand All @@ -89,6 +90,7 @@ def run_vllm(
gpu_memory_utilization=gpu_memory_utilization,
enforce_eager=enforce_eager,
kv_cache_dtype=kv_cache_dtype,
quantization_param_path=quantization_param_path,
device=device,
enable_prefix_caching=enable_prefix_caching,
download_dir=download_dir)
Expand Down Expand Up @@ -215,7 +217,8 @@ def main(args: argparse.Namespace):
args.seed, args.n, args.use_beam_search,
args.trust_remote_code, args.dtype,
args.max_model_len, args.enforce_eager,
args.kv_cache_dtype, args.device,
args.kv_cache_dtype,
args.quantization_param_path, args.device,
args.enable_prefix_caching,
args.gpu_memory_utilization, args.download_dir)
elif args.backend == "hf":
Expand Down Expand Up @@ -304,10 +307,23 @@ def main(args: argparse.Namespace):
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
choices=["auto", "fp8"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
parser.add_argument(
'--quantization-param-path',
type=str,
default=None,
help='Path to the JSON file containing the KV cache scaling factors. '
'This should generally be supplied, when KV cache dtype is FP8. '
'Otherwise, KV cache scaling factors default to 1.0, which may cause '
'accuracy issues. FP8_E5M2 (without scaling) is only supported on '
'cuda version greater than 11.8. On ROCm (AMD GPU), FP8_E4M3 is '
'instead supported for common inference criteria.')
parser.add_argument(
"--device",
type=str,
Expand Down
13 changes: 10 additions & 3 deletions benchmarks/kernels/benchmark_paged_attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,9 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
torch.cuda.cudart().cudaProfilerStart()
start_time = time.perf_counter()

# Using default kv_scale
kv_scale = 1.0

for _ in range(num_iters):
if version == "v1":
ops.paged_attention_v1(
Expand All @@ -112,6 +115,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
)
elif version == "v2":
ops.paged_attention_v2(
Expand All @@ -130,6 +134,7 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
max_context_len,
alibi_slopes,
kv_cache_dtype,
kv_scale,
)
else:
raise ValueError(f"Invalid version: {version}")
Expand Down Expand Up @@ -179,11 +184,13 @@ def run_cuda_benchmark(num_iters: int, profile: bool = False) -> float:
parser.add_argument(
"--kv-cache-dtype",
type=str,
choices=["auto", "fp8_e5m2"],
choices=["auto", "fp8"],
default="auto",
help=
'Data type for kv cache storage. If "auto", will use model data type.')
parser.add_argument("--device", type=str, choices=["cuda"], default="cuda")
'Data type for kv cache storage. If "auto", will use model data type. '
'FP8_E5M2 (without scaling) is only supported on cuda version greater '
'than 11.8. On ROCm (AMD GPU), FP8_E4M3 is instead supported for '
'common inference criteria.')
args = parser.parse_args()
print(args)

Expand Down
1 change: 1 addition & 0 deletions cmake/utils.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,7 @@ function (get_torch_gpu_compiler_flags OUT_GPU_FLAGS GPU_LANG)

list(APPEND GPU_FLAGS
"-DUSE_ROCM"
"-DENABLE_FP8_E4M3"
"-U__HIP_NO_HALF_CONVERSIONS__"
"-U__HIP_NO_HALF_OPERATORS__"
"-fno-gpu-rdc")
Expand Down
2 changes: 1 addition & 1 deletion csrc/attention/attention_dtypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,4 @@
#include "dtype_float16.cuh"
#include "dtype_float32.cuh"
#include "dtype_bfloat16.cuh"
#include "dtype_fp8_e5m2.cuh"
#include "dtype_fp8.cuh"
Loading