From e97330b7c87426c14bcb6f84cde9219997c240e9 Mon Sep 17 00:00:00 2001
From: Sun Choi <schoi@habana.ai>
Date: Wed, 17 Jan 2024 17:00:39 -0800
Subject: [PATCH 1/5] Falcon changes for 1.14.0

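1) Fix the invalid input shape in Falcon's fused QKV head split when running
   with DeepSpeed (DS) 0.12.4
2) Revert the --skip_hash_with_views option changes; Falcon is now wrapped
   with hash_with_views=False directly
3) Add falcon to model_on_meta()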
---
 examples/text-generation/README.md                           | 5 ++---
 examples/text-generation/run_generation.py                   | 5 -----
 examples/text-generation/utils.py                            | 3 +--
 optimum/habana/checkpoint_utils.py                           | 2 +-
 optimum/habana/transformers/models/falcon/modeling_falcon.py | 4 +++-
 tests/test_text_generation_example.py                        | 3 ---
 6 files changed, 7 insertions(+), 15 deletions(-)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index bc57fe30f3..aa9cab9ae2 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -135,7 +135,7 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --trim_logits
 ```
 
-To run Falcon inference, use the following command. Please note that the option `--skip_hash_with_views` is added to the command to disable the `hash_with_views` feature in HPU graphs, which requires SynapseAI 1.13.0 or later:
+To run Falcon inference, use the following command:
 ```bash
 python run_generation.py \
  --model_name_or_path tiiuae/falcon-7b \
@@ -144,8 +144,7 @@ python run_generation.py \
  --use_kv_cache \
  --batch_size 1 \
  --max_new_tokens 128 \
- --do_sample \
- --skip_hash_with_views
+ --do_sample
 ```
 
 > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should:
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index ab308e7023..7e932ef48e 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -201,11 +201,6 @@ def setup_parser(parser):
         action="store_true",
         help="Whether to reuse key/value cache for decoding. It should save memory.",
     )
-    parser.add_argument(
-        "--skip_hash_with_views",
-        action="store_true",
-        help="Whether to skip hash with views for HPU graphs. When skip_hash_with_views is not used, the input to HPU graphs includes both view and base tensors.",
-    )
     parser.add_argument("--verbose_workers", action="store_true", help="Enable output from non-master workers")
     parser.add_argument(
         "--simulate_dyn_prompt",
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 5c03de7dc6..8adc523a6a 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -166,8 +166,7 @@ def setup_model(args, model_dtype, model_kwargs, logger):
         # TODO: remove the following check from SynapseAI v1.15
         if check_habana_frameworks_version("1.13.0"):
             if model.config.model_type == "falcon":
-                args.skip_hash_with_views = True
-            model = wrap_in_hpu_graph(model, hash_with_views=not args.skip_hash_with_views)
+                model = wrap_in_hpu_graph(model, hash_with_views=False)
         else:
             model = wrap_in_hpu_graph(model)
     return model
diff --git a/optimum/habana/checkpoint_utils.py b/optimum/habana/checkpoint_utils.py
index 45509c0bed..e0fc139f5d 100644
--- a/optimum/habana/checkpoint_utils.py
+++ b/optimum/habana/checkpoint_utils.py
@@ -76,7 +76,7 @@ def model_on_meta(config):
     """
     Checks if load the model to meta.
     """
-    return config.model_type in ["bloom", "llama"]
+    return config.model_type in ["bloom", "llama", "falcon"]
 
 
 def get_optimized_model_name(config):
diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py
index ebb141fa5d..122d68824a 100644
--- a/optimum/habana/transformers/models/falcon/modeling_falcon.py
+++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py
@@ -123,10 +123,12 @@ def gaudi_falcon_attention_split_heads(
 
         if self.config.num_attention_heads != self.num_heads:  # When DS divides heads for TP
             num_heads = self.config.num_attention_heads
+            num_kv_heads = self.config.num_kv_heads
         else:  # When DS not in use
             num_heads = self.num_heads
+            num_kv_heads = self.num_kv_heads
 
-        qkv = fused_qkv.view(batch, seq_len, -1, num_heads // self.num_kv_heads + 2, self.head_dim)
+        qkv = fused_qkv.view(batch, seq_len, -1, num_heads // num_kv_heads + 2, self.head_dim)
         # query = qkv[:, :, :, :-2]
         # key = qkv[:, :, :, [-2]]
         # value = qkv[:, :, :, [-1]]
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index faae5b79db..9abdc0e8dd 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -76,9 +76,6 @@ def _test_text_generation(model_name: str, baseline: float, token: str, deepspee
     if not deepspeed:
         command.append("--bf16")
 
-    if "falcon" in model_name:
-        command.append("--skip_hash_with_views")
-
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")
         print(f"\n\nCommand to test: {' '.join(command)}\n")

From b557297bbf3b0dd9c074cc399123022f86bba34e Mon Sep 17 00:00:00 2001
From: Sun Choi <schoi@habana.ai>
Date: Tue, 23 Jan 2024 00:19:14 +0000
Subject: [PATCH 2/5] Address review comments: add Falcon-40B README example, drop TODO

---
 examples/text-generation/README.md | 12 ++++++++++++
 examples/text-generation/utils.py  |  1 -
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index aa9cab9ae2..6298904ccf 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -147,6 +147,18 @@ python run_generation.py \
  --do_sample
 ```
 
+To run Falcon-40b inference on 8 Gaudi2 cards, use the following command:
+```bash
+python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
+--model_name_or_path tiiuae/falcon-40b \
+--max_new_tokens 2048 \
+--bf16 \
+--use_hpu_graphs \
+--use_kv_cache \
+--batch_size 1 \
+--do_sample
+```
+
 > To be able to run gated models like [StarCoder](https://huggingface.co/bigcode/starcoder), you should:
 > - have a HF account
 > - agree to the terms of use of the model in its model card on the HF Hub
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 8adc523a6a..fafac1b872 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -163,7 +163,6 @@ def setup_model(args, model_dtype, model_kwargs, logger):
     if args.use_hpu_graphs:
         from habana_frameworks.torch.hpu import wrap_in_hpu_graph
 
-        # TODO: remove the following check from SynapseAI v1.15
         if check_habana_frameworks_version("1.13.0"):
             if model.config.model_type == "falcon":
                 model = wrap_in_hpu_graph(model, hash_with_views=False)

From a8857f8a528e7b242767ce8ef3cf9ca9f4999e40 Mon Sep 17 00:00:00 2001
From: Sun Choi <schoi@habana.ai>
Date: Tue, 23 Jan 2024 01:54:09 +0000
Subject: [PATCH 3/5] README: use Falcon-7B/Falcon-40B naming

---
 examples/text-generation/README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 6298904ccf..9a747a8bc7 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -135,7 +135,7 @@ python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --trim_logits
 ```
 
-To run Falcon inference, use the following command:
+To run Falcon-7B inference, use the following command:
 ```bash
 python run_generation.py \
  --model_name_or_path tiiuae/falcon-7b \
@@ -147,7 +147,7 @@ python run_generation.py \
  --do_sample
 ```
 
-To run Falcon-40b inference on 8 Gaudi2 cards, use the following command:
+To run Falcon-40B inference on 8 Gaudi2 cards, use the following command:
 ```bash
 python ../gaudi_spawn.py --use_deepspeed --world_size 8 run_generation.py \
 --model_name_or_path tiiuae/falcon-40b \

From dee41afe80e9feecce31e6bd5684e2c7328e7867 Mon Sep 17 00:00:00 2001
From: Sun Choi <schoi@habana.ai>
Date: Wed, 24 Jan 2024 17:16:15 +0000
Subject: [PATCH 4/5] make PR#654 backward compatible for other models

---
 examples/text-generation/utils.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 821f790604..49529dfdd2 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -177,6 +177,8 @@ def setup_model(args, model_dtype, model_kwargs, logger):
         if check_habana_frameworks_version("1.13.0"):
             if model.config.model_type == "falcon":
                 model = wrap_in_hpu_graph(model, hash_with_views=False)
+            else:
+                model = wrap_in_hpu_graph(model, hash_with_views=True)
         else:
             model = wrap_in_hpu_graph(model)
 

From 5a6753043edc78340e77749fa528f4a2560e4d22 Mon Sep 17 00:00:00 2001
From: Sun Choi <schoi@habana.ai>
Date: Wed, 24 Jan 2024 18:08:58 +0000
Subject: [PATCH 5/5] Address review comments: simplify HPU graph wrapping condition

---
 examples/text-generation/utils.py | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index 49529dfdd2..53e4c3bab6 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -174,11 +174,8 @@ def setup_model(args, model_dtype, model_kwargs, logger):
     if args.use_hpu_graphs:
         from habana_frameworks.torch.hpu import wrap_in_hpu_graph
 
-        if check_habana_frameworks_version("1.13.0"):
-            if model.config.model_type == "falcon":
-                model = wrap_in_hpu_graph(model, hash_with_views=False)
-            else:
-                model = wrap_in_hpu_graph(model, hash_with_views=True)
+        if check_habana_frameworks_version("1.13.0") and model.config.model_type == "falcon":
+            model = wrap_in_hpu_graph(model, hash_with_views=False)
         else:
             model = wrap_in_hpu_graph(model)