Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions examples/language-modeling/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -469,6 +469,43 @@ python ../gaudi_spawn.py \
--low_cpu_mem_usage True
```

- Multi-card finetuning of Llama2-7B with FP8:
```bash
LOWER_LIST=ops_bf16.txt python ../gaudi_spawn.py \
--world_size 8 --use_mpi run_lora_clm.py \
--model_name_or_path meta-llama/Llama-2-7b-hf \
--dataset_name tatsu-lab/alpaca \
--bf16 True \
--output_dir ./model_lora_llama \
--num_train_epochs 3 \
--per_device_train_batch_size 16 \
--gradient_accumulation_steps 1 \
--evaluation_strategy "no" \
--save_strategy "no" \
--learning_rate 3e-4 \
--warmup_ratio 0.03 \
--lr_scheduler_type "constant" \
--max_grad_norm 0.3 \
--logging_steps 20 \
--do_train \
--do_eval \
--use_habana \
--use_lazy_mode \
--throughput_warmup_steps 18 \
--lora_rank=8 \
--lora_alpha=16 \
--lora_dropout=0.05 \
--lora_target_modules "q_proj" "v_proj" \
--dataset_concatenation \
--max_seq_length 512 \
--ddp_bucket_cap_mb 50 \
--adam_epsilon 1e-08 \
--validation_split_percentage 10 \
--low_cpu_mem_usage True \
--pipelining_fwd_bwd \
--fp8 True
```

- Multi-card finetuning of codegen-16B-mono:
```bash
python ../gaudi_spawn.py \
Expand Down
39 changes: 38 additions & 1 deletion tests/baselines/llama_7b.json
Original file line number Diff line number Diff line change
Expand Up @@ -274,6 +274,43 @@
]
}
}
},
"tatsu-lab/alpaca_fp8": {
"num_train_epochs": 3,
"eval_batch_size": 4,
"distribution": {
"multi_card": {
"learning_rate": 3e-4,
"train_batch_size": 16,
"perplexity": 2.3692,
"train_runtime": 411.9935,
"train_samples_per_second": 232.439,
"extra_arguments": [
"--bf16",
"--gradient_accumulation_steps 1",
"--evaluation_strategy no",
"--save_strategy no",
"--warmup_ratio 0.03",
"--lr_scheduler_type constant",
"--logging_steps 40",
"--lora_rank 8",
"--lora_alpha 16",
"--lora_dropout 0.05",
"--lora_target_modules q_proj v_proj",
"--dataset_concatenation",
"--max_seq_length 512",
"--low_cpu_mem_usage True",
"--adam_epsilon 1e-08",
"--ddp_bucket_cap_mb 50",
"--validation_split_percentage 10",
"--pipelining_fwd_bwd",
"--throughput_warmup_steps 18",
"--use_lazy_mode",
"--max_grad_norm 0.3",
"--fp8"
]
}
}
}
}
}
}
32 changes: 26 additions & 6 deletions tests/test_examples.py
Original file line number Diff line number Diff line change
Expand Up @@ -192,7 +192,7 @@ class ExampleTestMeta(type):
"""

@staticmethod
def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool):
def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: str, fsdp: bool, fp8: bool):
models_with_specific_rules = [
"albert-xxlarge-v1",
"gpt2-xl",
Expand All @@ -208,7 +208,7 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
"meta-llama/LlamaGuard-7b",
]

if fsdp and not IS_GAUDI2:
if (fsdp or fp8) and not IS_GAUDI2:
return False
elif (
"sft" in example_name
Expand Down Expand Up @@ -241,7 +241,7 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
return True
elif "bridgetower" in model_name and IS_GAUDI2:
return True
elif "falcon" in model_name and IS_GAUDI2 and not fsdp:
elif "falcon" in model_name and IS_GAUDI2 and not fsdp and not fp8:
return True
elif "bloom" in model_name and deepspeed and not IS_GAUDI2:
Comment thread
libinta marked this conversation as resolved.
return True
Expand All @@ -253,7 +253,16 @@ def to_test(model_name: str, multi_card: bool, deepspeed: bool, example_name: st
return False

def __new__(
cls, name, bases, attrs, example_name=None, multi_card=False, deepspeed=False, fsdp=False, torch_compile=False
cls,
name,
bases,
attrs,
example_name=None,
multi_card=False,
deepspeed=False,
fsdp=False,
torch_compile=False,
fp8=False,
):
distribution = "single_card"
if multi_card:
Expand All @@ -274,9 +283,9 @@ def __new__(
)

for model_name, gaudi_config_name in models_to_test:
if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp):
if cls.to_test(model_name, multi_card, deepspeed, example_name, fsdp, fp8):
attrs[f"test_{example_name}_{model_name.split('/')[-1]}_{distribution}"] = cls._create_test(
model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile
model_name, gaudi_config_name, multi_card, deepspeed, fsdp, torch_compile, fp8
)
attrs["EXAMPLE_NAME"] = example_name
return super().__new__(cls, name, bases, attrs)
Expand All @@ -290,6 +299,7 @@ def _create_test(
deepspeed: bool = False,
fsdp: bool = False,
torch_compile: bool = False,
fp8: bool = False,
) -> Callable[[], None]:
"""
Create a test function that runs an example for a specific (model_name, gaudi_config_name) pair.
Expand Down Expand Up @@ -393,6 +403,9 @@ def test(self):
elif deepspeed and "gpt-neox-20b" in model_name:
env_variables["LD_PRELOAD"] = ""

if fp8 and "llama" in model_name:
env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")

extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])

if os.environ.get("DATA_CACHE", None) is not None and self.EXAMPLE_NAME == "run_clip":
Expand Down Expand Up @@ -784,3 +797,10 @@ class MultiCardCausalLanguageModelingPTuningExampleTester(
):
TASK_NAME = ["p-tuning"]
DATASET_NAME = "ought/raft"


# Multi-card FP8 variant of the `run_lora_clm` example tests.
# The test methods themselves are generated by `ExampleTestMeta`; passing
# `fp8=True` makes `to_test` skip every model when not running on Gaudi2.
class MultiCardCausalLanguageModelingLoRAFP8ExampleTester(
    ExampleTesterBase,
    metaclass=ExampleTestMeta,
    example_name="run_lora_clm",
    multi_card=True,
    fp8=True,
):
    # Same string as the "tatsu-lab/alpaca_fp8" entry in the llama_7b baseline
    # file — presumably used as the baseline lookup key; confirm against
    # ExampleTesterBase.
    TASK_NAME = "tatsu-lab/alpaca_fp8"
    # Dataset identifier for the run; it differs from TASK_NAME, which carries
    # the "_fp8" suffix.
    DATASET_NAME = "tatsu-lab/alpaca"