diff --git a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
index 21cec66c8b..85cb388679 100644
--- a/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
+++ b/examples/sparse_2of4_quantization_fp8/llama3_8b_2of4.py
@@ -116,7 +116,5 @@ def get_recipe(fp8_enabled):
 print("==========================================\n")
 
 # Save compressed model and tokenizer
-model.save_pretrained(
-    save_dir, save_compressed=args.fp8, disable_sparse_compression=True
-)
+model.save_pretrained(save_dir, save_compressed=args.fp8)
 tokenizer.save_pretrained(save_dir)
diff --git a/tests/e2e/vLLM/configs/sparse_24.yaml b/tests/e2e/vLLM/configs/sparse_24.yaml
index 653168b977..d0805a1b0d 100644
--- a/tests/e2e/vLLM/configs/sparse_24.yaml
+++ b/tests/e2e/vLLM/configs/sparse_24.yaml
@@ -5,4 +5,4 @@ recipe: tests/e2e/vLLM/recipes/Sparse_2of4/recipe_sparse_2of4.yaml
 scheme: sparse2of4_only
 dataset_id: HuggingFaceH4/ultrachat_200k
 dataset_split: train_sft
-save_compressed: False
\ No newline at end of file
+save_compressed: True
\ No newline at end of file