huggingface · regisss · Jul 8, 2024 · Jun 21, 2024 · Jul 4, 2024
@@ -469,6 +469,43 @@ python ../gaudi_spawn.py \
     --low_cpu_mem_usage True
 ```
 
+- Multi-card finetuning of Llama2-7B with FP8:
+```bash
+LOWER_LIST=ops_bf16.txt python ../gaudi_spawn.py \
+	--world_size 8 --use_mpi run_lora_clm.py \
+	--model_name_or_path meta-llama/Llama-2-7b-hf \
+	--dataset_name tatsu-lab/alpaca \
+	--bf16 True \
+	--output_dir ./model_lora_llama \
+	--num_train_epochs 3 \
+	--per_device_train_batch_size 16 \
+	--gradient_accumulation_steps 1 \
+	--evaluation_strategy "no" \
+	--save_strategy "no" \
+	--learning_rate 3e-4 \
+	--warmup_ratio 0.03 \
+	--lr_scheduler_type "constant" \
+	--max_grad_norm 0.3 \
+	--logging_steps 20 \
+	--do_train \
+	--do_eval \
+	--use_habana \
+	--use_lazy_mode \
+	--throughput_warmup_steps 18 \
+	--lora_rank=8 \
+	--lora_alpha=16 \
+	--lora_dropout=0.05 \
+	--lora_target_modules "q_proj" "v_proj" \
+	--dataset_concatenation \
+	--max_seq_length 512 \
+	--ddp_bucket_cap_mb 50 \
+	--adam_epsilon 1e-08 \
+	--validation_split_percentage 10 \
+	--low_cpu_mem_usage True \
+	--pipelining_fwd_bwd \
+	--fp8 True
+```
+
 - Multi-card finetuning of codegen-16B-mono:
 ```bash
 python ../gaudi_spawn.py \

@@ -274,6 +274,43 @@
                     ]
                 }
             }
+        },
+        "tatsu-lab/alpaca_fp8": {
+            "num_train_epochs": 3,
+            "eval_batch_size": 4,
+            "distribution": {
+                "multi_card": {
+                    "learning_rate": 3e-4,
+                    "train_batch_size": 16,
+                    "perplexity": 2.3692,
+                    "train_runtime": 411.9935,
+                    "train_samples_per_second": 232.439,
+                    "extra_arguments": [
+                        "--bf16",
+                        "--gradient_accumulation_steps 1",
+                        "--evaluation_strategy no",
+                        "--save_strategy no",
+                        "--warmup_ratio 0.03",
+                        "--lr_scheduler_type constant",
+                        "--logging_steps 40",
+                        "--lora_rank 8",
+                        "--lora_alpha 16",
+                        "--lora_dropout 0.05",
+                        "--lora_target_modules q_proj v_proj",
+                        "--dataset_concatenation",
+                        "--max_seq_length 512",
+                        "--low_cpu_mem_usage True",
+                        "--adam_epsilon 1e-08",
+                        "--ddp_bucket_cap_mb 50",
+                        "--validation_split_percentage 10",
+                        "--pipelining_fwd_bwd",
+                        "--throughput_warmup_steps 18",
+                        "--use_lazy_mode",
+                        "--max_grad_norm 0.3",
+                        "--fp8"
+                    ]
+                }
+            }
         }
     }
-}
+}
@@ -192,7 +192,7 @@ class ExampleTestMeta(type):
     """
 
     @staticmethod
-    def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool):
+    def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool, fp8: bool):
         models_with_specific_rules = [
             "albert-xxlarge-v1",
             "gpt2-xl",
@@ -208,7 +208,7 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
             "meta-llama/LlamaGuard-7b",
         ]
 
-        if fsdp and not IS_GAUDI2:
+        if (fsdp or fp8) and not IS_GAUDI2:
             return False
         elif (
             "sft" in example_name
@@ -241,7 +241,7 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
             return True
         elif "bridgetower" in model_name and IS_GAUDI2:
             return True
-        elif "falcon" in model_name and IS_GAUDI2 and not fsdp:
+        elif "falcon" in model_name and IS_GAUDI2 and not fsdp and not fp8:
             return True
         elif "bloom" in model_name and deepspeed and not IS_GAUDI2:
             return True
@@ -253,7 +253,16 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
         return False
 
     def __new__(
-        cls, name, bases, attrs, example_name=None, multi_card=False, deepspeed=False, fsdp=False, torch_compile=False
+        cls,
+        name,
+        bases,
+        attrs,
+        example_name=None,
+        multi_card=False,
+        deepspeed=False,
+        fsdp=False,
+        torch_compile=False,
+        fp8=False,
     ):
         distribution = "single_card"
         if multi_card:
@@ -274,9 +283,9 @@ def __new__(
                     )
 
         for model_name, gaudi_config_name in models_to_test:
-            if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp):
+            if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp, fp8):
                 attrs[f"test_{example_name}_{model_name.split('/')[-1]}_{distribution}"] = cls._create_test(
-                    model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile
+                    model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile, fp8
                 )
         attrs["EXAMPLE_NAME"] = example_name
         return super().__new__(cls, name, bases, attrs)
@@ -290,6 +299,7 @@ def _create_test(
         deepspeed: bool = False,
         fsdp: bool = False,
         torch_compile: bool = False,
+        fp8: bool = False,
     ) -> Callable[[], None]:
         """
         Create a test function that runs an example for a specific (model_name, gaudi_config_name) pair.
@@ -393,6 +403,9 @@ def test(self):
             elif deepspeed and "gpt-neox-20b" in model_name:
                 env_variables["LD_PRELOAD"] = ""
 
+            if fp8 and "llama" in model_name:
+                env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")
+
             extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
 
             if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip":
@@ -784,3 +797,10 @@ class MultiCardCausalLanguageModelingPTuningExampleTester(
 ):
     TASK_NAME = ["p-tuning"]
     DATASET_NAME = "ought/raft"
+
+
+class MultiCardCausalLanguageModelingLoRAFP8ExampleTester(
+    ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_lora_clm", multi_card=True, fp8=True
+):
+    TASK_NAME = "tatsu-lab/alpaca_fp8"
+    DATASET_NAME = "tatsu-lab/alpaca"