From e97330b7c87426c14bcb6f84cde9219997c240e9 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Wed, 17 Jan 2024 17:00:39 -0800 Subject: [PATCH 1/5] Falcon changes for 1.14.0 1) fix for invalid input shape with DS 0.12.4 2) revert --skip_hash_with_views changes 3) add falcon to model_on_meta() --- examples/text-generation/README.md | 5 ++--- examples/text-generation/run_generation.py | 5 ----- examples/text-generation/utils.py | 3 +-- optimum/habana/checkpoint_utils.py | 2 +- optimum/habana/transformers/models/falcon/modeling_falcon.py | 4 +++- tests/test_text_generation_example.py | 3 --- 6 files changed, 7 insertions(+), 15 deletions(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index bc57fe30f3..aa9cab9ae2 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -135,7 +135,7 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --trim_logits ``` -To run Falcon inference, use the following command. Please note that the option `--skip_hash_with_views` is added to the command to disable the `hash_with_views` feature in HPU graphs, which requires SynapseAI 1.13.0 or later: +To run Falcon inference, use the following command: ```bash python run_generation.py \ --model_name_or_path tiiuae/falcon-7b \ @@ -144,8 +144,7 @@ python run_generation.py \ --use_kv_cache \ --batch_size 1 \ --max_new_tokens 128 \ - --do_sample \ - --skip_hash_with_views + --do_sample ``` > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should: diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py index ab308e7023..7e932ef48e 100644 --- a/examples/text-generation/run_generation.py +++ b/examples/text-generation/run_generation.py @@ -201,11 +201,6 @@ def setup_parser(parser): action="store_true", help="Whether to reuse key/value cache for decoding. It should save memory.", ) - parser.add_argument( - "--skip_hash_with_views", - action="store_true", - help="Whether to skip hash with views for HPU graphs. When skip_hash_with_views is not used, the input to HPU graphs includes both view and base tensors.", - ) parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers") parser.add_argument( "--simulate_dyn_prompt", diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 5c03de7dc6..8adc523a6a 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -166,8 +166,7 @@ def setup_model(args, model_dtype, model_kwargs, logger): # TODO: remove the following check from SynapseAI v1.15 if check_habana_frameworks_version("1.13.0"): if model.config.model_type == "falcon": - args.skip_hash_with_views = True - model = wrap_in_hpu_graph(model, hash_with_views=not args.skip_hash_with_views) + model = wrap_in_hpu_graph(model, hash_with_views=False) else: model = wrap_in_hpu_graph(model) return model diff --git a/optimum/habana/checkpoint_utils.py b/optimum/habana/checkpoint_utils.py index 45509c0bed..e0fc139f5d 100644 --- a/optimum/habana/checkpoint_utils.py +++ b/optimum/habana/checkpoint_utils.py @@ -76,7 +76,7 @@ def model_on_meta(config): """ Checks if load the model to meta. """ - return config.model_type in ["bloom", "llama"] + return config.model_type in ["bloom", "llama", "falcon"] def get_optimized_model_name(config): diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py index ebb141fa5d..122d68824a 100644 --- a/optimum/habana/transformers/models/falcon/modeling_falcon.py +++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py @@ -123,10 +123,12 @@ def gaudi_falcon_attention_split_heads( if self.config.num_attention_heads != self.num_heads: # When DS divides heads for TP num_heads = self.config.num_attention_heads + num_kv_heads = self.config.num_kv_heads else: # When DS not in use num_heads = self.num_heads + num_kv_heads = self.num_kv_heads - qkv = fused_qkv.view(batch, seq_len, -1, num_heads // self.num_kv_heads + 2, self.head_dim) + qkv = fused_qkv.view(batch, seq_len, -1, num_heads // num_kv_heads + 2, self.head_dim) # query = qkv[:, :, :, :-2] # key = qkv[:, :, :, [-2]] # value = qkv[:, :, :, [-1]] diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py index faae5b79db..9abdc0e8dd 100644 --- a/tests/test_text_generation_example.py +++ b/tests/test_text_generation_example.py @@ -76,9 +76,6 @@ def _test_text_generation(model_name: str, baseline: float, token: str, deepspee if not deepspeed: command.append("--bf16") - if "falcon" in model_name: - command.append("--skip_hash_with_views") - with TemporaryDirectory() as tmp_dir: command.append(f"--output_dir {tmp_dir}") print(f"\n\nCommand to test: {' '.join(command)}\n") From b557297bbf3b0dd9c074cc399123022f86bba34e Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Tue, 23 Jan 2024 00:19:14 +0000 Subject: [PATCH 2/5] address comments --- examples/text-generation/README.md | 12 ++++++++++++ examples/text-generation/utils.py | 1 - 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index aa9cab9ae2..6298904ccf 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -147,6 +147,18 @@ python run_generation.py \ --do_sample ``` +To run Falcon-40b inference on 8 Gaudi2 cards, use the following command: +```bash +python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ +--model_name_or_path tiiuae/falcon-40b \ +--max_new_tokens 2048 \ +--bf16 \ +--use_hpu_graphs \ +--use_kv_cache \ +--batch_size 1 \ +--do_sample +``` + > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should: > - have a HF account > - agree to the terms of use of the model in its model card on the HF Hub diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 8adc523a6a..fafac1b872 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -163,7 +163,6 @@ def setup_model(args, model_dtype, model_kwargs, logger): if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph - # TODO: remove the following check from SynapseAI v1.15 if check_habana_frameworks_version("1.13.0"): if model.config.model_type == "falcon": model = wrap_in_hpu_graph(model, hash_with_views=False) From a8857f8a528e7b242767ce8ef3cf9ca9f4999e40 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Tue, 23 Jan 2024 01:54:09 +0000 Subject: [PATCH 3/5] README change --- examples/text-generation/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md index 6298904ccf..9a747a8bc7 100644 --- a/examples/text-generation/README.md +++ b/examples/text-generation/README.md @@ -135,7 +135,7 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --trim_logits ``` -To run Falcon inference, use the following command: +To run Falcon-7B inference, use the following command: ```bash python run_generation.py \ --model_name_or_path tiiuae/falcon-7b \ @@ -147,7 +147,7 @@ python run_generation.py \ --do_sample ``` -To run Falcon-40b inference on 8 Gaudi2 cards, use the following command: +To run Falcon-40B inference on 8 Gaudi2 cards, use the following command: ```bash python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \ --model_name_or_path tiiuae/falcon-40b \ From dee41afe80e9feecce31e6bd5684e2c7328e7867 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Wed, 24 Jan 2024 17:16:15 +0000 Subject: [PATCH 4/5] make PR#654 backward compatible for other models --- examples/text-generation/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 821f790604..49529dfdd2 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -177,6 +177,8 @@ def setup_model(args, model_dtype, model_kwargs, logger): if check_habana_frameworks_version("1.13.0"): if model.config.model_type == "falcon": model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model, hash_with_views=True) else: model = wrap_in_hpu_graph(model) From 5a6753043edc78340e77749fa528f4a2560e4d22 Mon Sep 17 00:00:00 2001 From: Sun Choi Date: Wed, 24 Jan 2024 18:08:58 +0000 Subject: [PATCH 5/5] address comments --- examples/text-generation/utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py index 49529dfdd2..53e4c3bab6 100644 --- a/examples/text-generation/utils.py +++ b/examples/text-generation/utils.py @@ -174,11 +174,8 @@ def setup_model(args, model_dtype, model_kwargs, logger): if args.use_hpu_graphs: from habana_frameworks.torch.hpu import wrap_in_hpu_graph - if check_habana_frameworks_version("1.13.0"): - if model.config.model_type == "falcon": - model = wrap_in_hpu_graph(model, hash_with_views=False) - else: - model = wrap_in_hpu_graph(model, hash_with_views=True) + if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon": + model = wrap_in_hpu_graph(model, hash_with_views=False) else: model = wrap_in_hpu_graph(model)