diff --git a/.github/workflows/fast_tests.yml b/.github/workflows/fast_tests.yml
index 31de2b8408..bfb463466c 100644
--- a/.github/workflows/fast_tests.yml
+++ b/.github/workflows/fast_tests.yml
@@ -27,7 +27,7 @@ jobs:
     runs-on: ubuntu-22.04
     env:
       AWS_REGION: us-east-1
-      EC2_AMI_ID: ami-0a2179742e502fdfe
+      EC2_AMI_ID: ami-04fe9856174d852b8
       EC2_INSTANCE_TYPE: dl1.24xlarge
       EC2_SUBNET_ID: subnet-b7533b96
       EC2_SECURITY_GROUP: sg-08af7938042271373
@@ -77,7 +77,7 @@ jobs:
           ref: ${{ github.event.pull_request.merge_commit_sha }}
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -89,7 +89,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/fast_tests.sh
   diffusers:
     name: Run tests for optimum.habana.diffusers
@@ -113,7 +113,7 @@ jobs:
           ref: ${{ github.event.pull_request.merge_commit_sha }}
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -125,7 +125,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/fast_tests_diffusers.sh
   stop-runner:
     name: Stop self-hosted EC2 runner
diff --git a/.github/workflows/slow_tests.yml b/.github/workflows/slow_tests.yml
index 5e18f2460e..755e54e161 100644
--- a/.github/workflows/slow_tests.yml
+++ b/.github/workflows/slow_tests.yml
@@ -15,7 +15,7 @@ jobs:
     runs-on: ubuntu-22.04
     env:
       AWS_REGION: us-west-2
-      EC2_AMI_ID: ami-0961e95b539f72c46
+      EC2_AMI_ID: ami-03549026a9aa06f99
       EC2_INSTANCE_TYPE: dl1.24xlarge
       EC2_SUBNET_ID: subnet-452c913d
       EC2_SECURITY_GROUP: sg-0894f4f70dd6bd778
@@ -55,7 +55,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -67,7 +67,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/example_diff_tests.sh
   stable-diffusion:
     name: Test Stable Diffusion
@@ -83,7 +83,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -95,7 +95,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_diffusers.sh
   deepspeed:
     name: Test DeepSpeed models
@@ -112,7 +112,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -124,7 +124,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_deepspeed.sh
   multi-card:
     name: Test multi-card models
@@ -141,7 +141,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -153,7 +153,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_8x.sh
   single-card:
     name: Test single-card models
@@ -171,7 +171,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -183,7 +183,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_1x.sh
   albert-xxl-single-card:
     name: Test single-card ALBERT XXL
@@ -204,7 +204,7 @@ jobs:
       - name: Pull image
         if: github.event.schedule == '0 21 * * 6'
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run test
         if: github.event.schedule == '0 21 * * 6'
         run: |
@@ -217,7 +217,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/albert_xxl_1x.sh
       - name: Warning
         if: github.event.schedule != '0 21 * * 6'
@@ -240,7 +240,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -252,7 +252,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
   stop-runner:
     name: Stop self-hosted EC2 runner
diff --git a/.github/workflows/slow_tests_gaudi2.yml b/.github/workflows/slow_tests_gaudi2.yml
index a54b70f77c..1f540d432d 100644
--- a/.github/workflows/slow_tests_gaudi2.yml
+++ b/.github/workflows/slow_tests_gaudi2.yml
@@ -17,7 +17,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -30,7 +30,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_diffusers.sh
   deepspeed:
     name: Test DeepSpeed models
@@ -43,7 +43,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -56,7 +56,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_deepspeed.sh
   fsdp:
     name: Test FSDP models
@@ -69,7 +69,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -82,7 +82,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             make slow_tests_fsdp TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
   multi-card:
     name: Test multi-card models
@@ -95,7 +95,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -108,7 +108,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_8x.sh
   single-card:
     name: Test single-card models
@@ -122,7 +122,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -136,7 +136,7 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             /bin/bash tests/ci/slow_tests_1x.sh
   text-generation:
     name: Test text-generation example
@@ -151,7 +151,7 @@ jobs:
         uses: actions/checkout@v2
       - name: Pull image
         run: |
-            docker pull vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+            docker pull vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
       - name: Run tests
         run: |
             docker run \
@@ -164,5 +164,5 @@ jobs:
             --cap-add=sys_nice \
             --net=host \
             --ipc=host \
-            vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest \
+            vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest \
             make slow_tests_text_generation_example TOKEN=${{ secrets.TEXT_GENERATION_CI_HUB_TOKEN }}
diff --git a/Makefile b/Makefile
index c5de7c04fe..988435a0d4 100644
--- a/Makefile
+++ b/Makefile
@@ -51,7 +51,7 @@ slow_tests_8x: test_installs
 
 # Run DeepSpeed non-regression tests
 slow_tests_deepspeed: test_installs
-	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 	python -m pytest tests/test_examples.py -v -s -k "deepspeed"
 
 slow_tests_diffusers: test_installs
@@ -63,7 +63,7 @@ slow_tests_diffusers: test_installs
 
 # Run text-generation non-regression tests
 slow_tests_text_generation_example: test_installs
-	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+	python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 	python -m pytest tests/test_text_generation_example.py tests/test_encoder_decoder.py -v -s --token $(TOKEN)
 
 slow_tests_fsdp: test_installs
diff --git a/README.md b/README.md
index 390214c47e..aa92ba63f1 100644
--- a/README.md
+++ b/README.md
@@ -45,9 +45,9 @@ The `--upgrade-strategy eager` option is needed to ensure `optimum-habana` is up
 To use the example associated with the latest stable release, run:
 > ```
 > git clone https://github.com/huggingface/optimum-habana
-> cd optimum-habana && git checkout v1.10.4
+> cd optimum-habana && git checkout v1.11.0
 > ```
-> with `v1.10.4` the version number of this release.
+> with `v1.11.0` the version number of this release.
 
 ### Option 2: Use the latest main branch under development
 
@@ -62,7 +62,7 @@ git clone https://github.com/huggingface/optimum-habana
 
 To use DeepSpeed on HPUs, you also need to run the following command:
 >```bash
->pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+>pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 >```
 
 To install the requirements for every example:
@@ -230,7 +230,7 @@ Please refer to Habana Gaudi's official [installation guide](https://docs.habana
 
 > Tests should be run in a Docker container based on Habana Docker images.
 >
-> The current version has been validated for SynapseAI 1.14.
+> The current version has been validated for SynapseAI 1.15.
 
 
 ## Development
diff --git a/docs/Dockerfile b/docs/Dockerfile
index 0c60a33b86..3d253fd361 100644
--- a/docs/Dockerfile
+++ b/docs/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
 
 ARG commit_sha
 ARG clone_url
diff --git a/docs/source/installation.mdx b/docs/source/installation.mdx
index eaad58cfd1..3dd8c2c1b0 100644
--- a/docs/source/installation.mdx
+++ b/docs/source/installation.mdx
@@ -23,6 +23,6 @@ python -m pip install --upgrade-strategy eager optimum[habana]
 To use DeepSpeed on HPUs, you also need to run the following command:
 
 ```bash
-python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+python -m pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 ```
 
diff --git a/docs/source/usage_guides/deepspeed.mdx b/docs/source/usage_guides/deepspeed.mdx
index b115554c83..51734bb42f 100644
--- a/docs/source/usage_guides/deepspeed.mdx
+++ b/docs/source/usage_guides/deepspeed.mdx
@@ -31,7 +31,7 @@ You can find more information about DeepSpeed Gaudi integration [here](https://d
 To use DeepSpeed on Gaudi, you need to install Optimum Habana and [Habana's DeepSpeed fork](https://github.com/HabanaAI/DeepSpeed) with:
 ```bash
 pip install optimum[habana]
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 ```
 
 
@@ -78,7 +78,7 @@ It is strongly advised to read [this section](https://huggingface.co/docs/transf
 
 </Tip>
 
-Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.14.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Habana.
+Other examples of configurations for HPUs are proposed [here](https://github.com/HabanaAI/Model-References/tree/1.15.0/PyTorch/nlp/DeepSpeedExamples/deepspeed-bert/scripts) by Habana.
 
 The [Transformers documentation](https://huggingface.co/docs/transformers/main_classes/deepspeed#configuration) explains how to write a configuration from scratch very well.
 A more complete description of all configuration possibilities is available [here](https://www.deepspeed.ai/docs/config-json/).
diff --git a/examples/audio-classification/README.md b/examples/audio-classification/README.md
index 58af855758..ecd227e018 100644
--- a/examples/audio-classification/README.md
+++ b/examples/audio-classification/README.md
@@ -100,7 +100,7 @@ On 8 HPUs, this script should run in ~12 minutes and yield an accuracy of **80.4
 
 > You need to install DeepSpeed with:
 > ```bash
-> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 > ```
 
 DeepSpeed can be used with almost the same command as for a multi-card run:
diff --git a/examples/gaudi_spawn.py b/examples/gaudi_spawn.py
index a7cda9b9eb..b7833c4177 100644
--- a/examples/gaudi_spawn.py
+++ b/examples/gaudi_spawn.py
@@ -84,7 +84,7 @@ def main():
         if not is_deepspeed_available():
             raise ImportError(
                 "--use_deepspeed requires deepspeed: `pip install"
-                " git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`."
+                " git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`."
             )
 
     # Patch sys.argv
diff --git a/examples/language-modeling/README.md b/examples/language-modeling/README.md
index abf19c457b..776993aca1 100644
--- a/examples/language-modeling/README.md
+++ b/examples/language-modeling/README.md
@@ -562,41 +562,41 @@ python3 ../gaudi_spawn.py --use_deepspeed  --world_size 8  run_lora_clm.py \
 - Multi-card finetuning of Llama2-70B with FSDP and LoRA:
 
 ```bash
-PT_HPU_MAX_COMPOUND_OP_SIZE=10 DEEPSPEED_HPU_ZERO3_SYNC_MARK_STEP_REQUIRED=1 \
-python3 ../gaudi_spawn.py --use_mpi  --world_size 8  run_lora_clm.py \
+LOWER_LIST=ops_bf16.txt PT_HPU_LAZY_MODE=0 \
+python3 ../gaudi_spawn.py --world_size 8 --use_mpi run_lora_clm.py \
   --model_name_or_path meta-llama/Llama-2-70b-hf \
   --dataset_name tatsu-lab/alpaca \
   --bf16 True \
   --output_dir ./lora_out \
-  --num_train_epochs 2 \
   --max_seq_len 2048 \
-  --per_device_train_batch_size 10 \
-  --per_device_eval_batch_size 10 \
   --gradient_checkpointing \
-  --evaluation_strategy epoch \
-  --eval_delay 2 \
+  --per_device_train_batch_size 5 \
   --save_strategy no \
   --learning_rate 0.0004 \
   --warmup_ratio 0.03 \
   --lr_scheduler_type "constant" \
   --logging_steps 1 \
   --dataset_concatenation \
-  --attn_softmax_bf16 True \
   --do_train \
-  --do_eval \
   --use_habana \
-  --use_lazy_mode False \
-  --pipelining_fwd_bwd False \
   --throughput_warmup_steps 3 \
   --lora_rank 4 \
   --lora_target_modules "q_proj" "v_proj" "k_proj" "o_proj" \
+  --attn_softmax_bf16 True \
   --validation_split_percentage 4 \
-  --use_flash_attention True \
+  --use_lazy_mode False \
   --fsdp_config fsdp_config.json \
-  --fsdp "auto_wrap" \
-  --torch_compile_backend hpu_backend \.
+  --fsdp auto_wrap \
+  --num_train_epochs 2 \
+  --evaluation_strategy epoch \
+  --per_device_eval_batch_size 1 \
+  --eval_delay 2 \
+  --do_eval \
+  --pipelining_fwd_bwd False \
+  --use_fused_rope False \
+  --torch_compile_backend hpu_backend \
   --torch_compile \
-  --use_fused_rope False
+  --gradient_accumulation_steps 2
 ```
 
 - Multi-card finetuning of Falcon-180B:
diff --git a/examples/multi-node-training/EFA/Dockerfile b/examples/multi-node-training/EFA/Dockerfile
index a3d4d3ca99..2b97d0e54c 100644
--- a/examples/multi-node-training/EFA/Dockerfile
+++ b/examples/multi-node-training/EFA/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
 
 # Installs pdsh and upgrade pip
 RUN apt-get update && apt-get install -y pdsh && \
@@ -18,7 +18,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \
 
 # Installs Optimum Habana and Habana's fork of DeepSpeed
 RUN pip install optimum[habana] && \
-   pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+   pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 
 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \
    chmod 600 ~/.ssh/id_rsa && \
diff --git a/examples/multi-node-training/GaudiNIC/Dockerfile b/examples/multi-node-training/GaudiNIC/Dockerfile
index 9e73a4528f..a35013ea47 100644
--- a/examples/multi-node-training/GaudiNIC/Dockerfile
+++ b/examples/multi-node-training/GaudiNIC/Dockerfile
@@ -1,4 +1,4 @@
-FROM vault.habana.ai/gaudi-docker/1.14.0/ubuntu22.04/habanalabs/pytorch-installer-2.1.1:latest
+FROM vault.habana.ai/gaudi-docker/1.15.0/ubuntu22.04/habanalabs/pytorch-installer-2.2.0:latest
 
 # Installs pdsh and upgrade pip
 RUN apt-get update && apt-get install -y pdsh && \
@@ -12,7 +12,7 @@ RUN sed -i 's/#Port 22/Port 3022/g' /etc/ssh/sshd_config && \
 
 # Installs Optimum Habana and Habana's fork of DeepSpeed
 RUN pip install optimum[habana] && \
-   pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+   pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 
 CMD ssh-keygen -t rsa -b 4096 -N '' -f ~/.ssh/id_rsa && \
    chmod 600 ~/.ssh/id_rsa && \
diff --git a/examples/speech-recognition/README.md b/examples/speech-recognition/README.md
index 510a52d213..50ae873ffb 100644
--- a/examples/speech-recognition/README.md
+++ b/examples/speech-recognition/README.md
@@ -78,13 +78,15 @@ python run_speech_recognition_ctc.py \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
     --throughput_warmup_steps="3" \
-    --bf16
+    --bf16 \
+    --use_hpu_graphs_for_training \
+    --use_hpu_graphs_for_inference
 ```
 
 On a single HPU, this script should run in *ca.* 6 hours and yield a CTC loss of **0.059** and a word error rate of **0.0423**.
 
 > If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error.
-> Resampling with the `datasets` library is not supported on HPUs yet.
+> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15.
 
 ### Multi-HPU CTC
 
@@ -117,20 +119,22 @@ python ../gaudi_spawn.py \
     --use_lazy_mode \
     --gaudi_config_name Habana/wav2vec2 \
     --throughput_warmup_steps 3 \
-    --bf16
+    --bf16 \
+    --use_hpu_graphs_for_training \
+    --use_hpu_graphs_for_inference
 ```
 
 On 8 HPUs, this script should run in *ca.* 49 minutes and yield a CTC loss of **0.0613** and a word error rate of **0.0458**.
 
 > If your data has a sampling rate which is different from the one of the data the model was trained on, this script will raise an error.
-> Resampling with the `datasets` library is not supported on HPUs yet.
+> Resampling with the `datasets` library is not supported on HPUs yet. HPU graphs are supported only on Gaudi2 and from SynapseAI v1.15.
 
 
 ## DeepSpeed
 
 > You need to install DeepSpeed with:
 > ```bash
-> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+> pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 > ```
 
 DeepSpeed can be used with almost the same command as for a multi-card run:
@@ -196,7 +200,8 @@ python run_speech_recognition_ctc.py \
     --use_habana \
     --use_lazy_mode \
     --gaudi_config_name="Habana/wav2vec2" \
-    --bf16
+    --bf16 \
+    --use_hpu_graphs_for_inference
 ```
 ## Sequence to Sequence
 
diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
index 83a481970c..0f9a2c7b16 100644
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -28,7 +28,7 @@ pip install -r requirements.txt
 
 Then, if you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html) (e.g. to use BLOOM/BLOOMZ), you should install DeepSpeed as follows:
 ```bash
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 ```
 
 
@@ -108,7 +108,6 @@ Here are a few settings you may be interested in:
 - `--attn_softmax_bf16` to run attention softmax layer in bfloat16 precision provided that the model (such as Llama) supports it
 - `--trim_logits` to calculate logits only for the last token in the first time step provided that the model (such as Llama) supports it
 - `--fp8` Enable Quantization to fp8
-- `--kv_cache_fp8` Deprecated - Store kv-cache in float8 when kv-cache is used. should not be used with HQT(The Quantization Toolkit)
 
 For example, you can reproduce the results presented in [this blog post](https://huggingface.co/blog/habana-gaudi-2-bloom) with the following command:
 ```bash
@@ -241,7 +240,7 @@ While `--bucket_size` works for any model without model file changes, an even mo
 
 ### Running with FP8
 
-Llama2-70b, Llama2-7b and Mixtral-8x7B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch.
+Llama2-70b, Llama2-7b, Mixtral-8x7B, Falcon-7B, Falcon-40B, and Falcon-180B in FP8 are enabled using the Quantization Toolkit (HQT), which provides model measurement and quantization capabilities in PyTorch.
 
 More information on enabling fp8 in SynapseAI is available here:
 https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Inference_Using_FP8.html
@@ -321,6 +320,38 @@ QUANT_CONFIG=./quantization_config/maxabs_quant_mixtral.json python run_generati
 --bf16 \
 --fp8
 ```
+
+Here is an example to measure the tensor quantization statistics on Falcon-180B with 8 cards:
+> Please note that Falcon-180B is a gated model, and users are required to request access to it. Please refer to the instructions provided in the StarCoder example above.
+```bash
+QUANT_CONFIG=./quantization_config/maxabs_measure_include_outputs.json python ../gaudi_spawn.py \
+--use_deepspeed --world_size 8 run_lm_eval.py \
+-o acc_falcon180b_bs1_quant.txt \
+--model_name_or_path tiiuae/falcon-180B \
+--use_hpu_graphs \
+--use_kv_cache \
+--trim_logits \
+--batch_size 1 \
+--bf16 \
+--reuse_cache
+```
+
+Here is an example to quantize the model based on previous measurements for Falcon-180B with 8 cards:
+```bash
+QUANT_CONFIG=./quantization_config/maxabs_quant.json python ../gaudi_spawn.py \
+--use_deepspeed --world_size 8 run_generation.py \
+--model_name_or_path tiiuae/falcon-180B \
+--use_hpu_graphs \
+--use_kv_cache \
+--limit_hpu_graphs \
+--max_input_tokens 128 \
+--max_new_tokens 2048 \
+--batch_size 110 \
+--bf16 \
+--reuse_cache \
+--trim_logits \
+--fp8
+```
 `--fp8` is required to enable quantization in fp8.
 
 
diff --git a/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json b/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json
index c83fa281f6..602a147baa 100644
--- a/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json
+++ b/examples/text-generation/quantization_config/act_maxabs_hw_weights_pcs_maxabs_pow2_quant.json
@@ -2,9 +2,9 @@
     "method": "HOOKS",
     "mode": "QUANTIZE",
     "observer": "maxabs",
-    "scale_method": "ACT_MAXABS_HW_WEIGHTS_PCS_MAXABS_POW2",
-    "whitelist": {"types": [], "names":  []},
-    "blacklist": {"types": [], "names":  []},
+    "scale_method": "ACT_MAXABS_POW2_WEIGHTS_PCS_OPT_POW2",
+    "allowlist": {"types": [], "names":  []},
+    "blocklist": {"types": [], "names":  []},
     "dump_stats_path": "./hqt_output/measure",
     "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
 }
diff --git a/examples/text-generation/quantization_config/maxabs_measure.json b/examples/text-generation/quantization_config/maxabs_measure.json
index 3715b506b6..3645fe743a 100644
--- a/examples/text-generation/quantization_config/maxabs_measure.json
+++ b/examples/text-generation/quantization_config/maxabs_measure.json
@@ -2,8 +2,8 @@
     "method": "HOOKS",
     "mode": "MEASURE",
     "observer": "maxabs",
-    "whitelist": {"types": [], "names":  []},
-    "blacklist": {"types": [], "names":  []},
+    "allowlist": {"types": [], "names":  []},
+    "blocklist": {"types": [], "names":  []},
     "dump_stats_path": "./hqt_output/measure",
     "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
 }
\ No newline at end of file
diff --git a/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json b/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json
new file mode 100644
index 0000000000..6de845a54d
--- /dev/null
+++ b/examples/text-generation/quantization_config/maxabs_measure_include_outputs.json
@@ -0,0 +1,10 @@
+{
+    "method": "HOOKS",
+    "mode": "MEASURE",
+    "observer": "maxabs",
+    "measure_exclude": "NONE",
+    "allowlist": {"types": [], "names":  []},
+    "blocklist": {"types": [], "names":  []},
+    "dump_stats_path": "./hqt_output/measure",
+    "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
+}
\ No newline at end of file
diff --git a/examples/text-generation/quantization_config/maxabs_quant.json b/examples/text-generation/quantization_config/maxabs_quant.json
index cb37e98a6e..02314a728e 100644
--- a/examples/text-generation/quantization_config/maxabs_quant.json
+++ b/examples/text-generation/quantization_config/maxabs_quant.json
@@ -3,8 +3,8 @@
     "mode": "QUANTIZE",
     "observer": "maxabs",
     "scale_method": "maxabs_hw",
-    "whitelist": {"types": [], "names":  []},
-    "blacklist": {"types": [], "names":  []},
+    "allowlist": {"types": [], "names":  []},
+    "blocklist": {"types": [], "names":  []},
     "dump_stats_path": "./hqt_output/measure",
     "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
 }
\ No newline at end of file
diff --git a/examples/text-generation/quantization_config/unit_scale_quant.json b/examples/text-generation/quantization_config/unit_scale_quant.json
index e2d709da61..caad4bb2a4 100644
--- a/examples/text-generation/quantization_config/unit_scale_quant.json
+++ b/examples/text-generation/quantization_config/unit_scale_quant.json
@@ -3,8 +3,8 @@
     "mode": "QUANTIZE",
     "observer": "maxabs",
     "scale_method": "unit_scale",
-    "whitelist": {"types": [], "names":  []},
-    "blacklist": {"types": [], "names":  []},
+    "allowlist": {"types": [], "names":  []},
+    "blocklist": {"types": [], "names":  []},
     "dump_stats_path": "./hqt_output/measure",
     "dump_stats_xlsx_path": "./hqt_output/measure/fp8stats.xlsx"
 }
diff --git a/examples/text-generation/quantization_tools/unify_measurements.py b/examples/text-generation/quantization_tools/unify_measurements.py
new file mode 100644
index 0000000000..75ae329a44
--- /dev/null
+++ b/examples/text-generation/quantization_tools/unify_measurements.py
@@ -0,0 +1,171 @@
+import argparse
+import json
+import os
+import sys
+
+import numpy as np
+
+
+def find_measurement_path(measurement, measurements_dir_path, scales, group_size):
+    """Return the path of the first measurement JSON found for one card, or None if there is none.
+
+    A file qualifies when its name ends with ".json", contains `measurement + "_" + str(group_size)`
+    and has no "_mod_list" in it; with `scales` set it must also contain "MAXABS", otherwise it must not.
+    """
+    card_tag = measurement + "_" + str(group_size)
+    for entry in os.listdir(measurements_dir_path):
+        name = os.fsdecode(entry)
+        is_candidate = name.endswith(".json") and "_mod_list" not in name and card_tag in name
+        if is_candidate and ("MAXABS" in name) == bool(scales):
+            return os.path.join(measurements_dir_path, entry)
+
+
+def unify_measurements(measurement_group, measurements_dir_path, output_path, scales=False):
+    """Merge the measurement files of one group of cards into one unified JSON and NPZ file.
+
+    For every node of the first card's file, the unified file keeps the maximum over all cards
+    of the group of its inputs and, when present, of its outputs and weight params.
+
+    The unified files are written to `output_path` and named after the first card's file with
+    its "_CARD_GROUPSIZE" tag removed.
+
+    Args:
+        measurement_group: identifiers of the cards in the group, e.g. ["0", "1"].
+        measurements_dir_path: directory containing the per-card measurement JSON files.
+        output_path: directory in which the unified JSON and NPZ files are written.
+        scales: when True unify the "MAXABS" files, otherwise the remaining measurement files.
+
+    Raises:
+        FileNotFoundError: if no measurement file is found for a card of the group.
+    """
+    group_size = len(measurement_group)
+
+    # save all the jsons paths in the given measurement group
+    measurements_paths = []
+    for measurement in measurement_group:
+        measurement_path = find_measurement_path(measurement, measurements_dir_path, scales, group_size)
+        if measurement_path is None:
+            # fail here with a clear message instead of a TypeError from open(None) further down
+            raise FileNotFoundError(
+                f"no measurement file found for card {measurement} in {measurements_dir_path} (scales={scales})"
+            )
+        measurements_paths.append(measurement_path)
+
+    # save all the jsons content in the given measurement group
+    measurements_jsons = []
+    for measurement_path in measurements_paths:
+        with open(measurement_path, "r") as f:
+            measurements_jsons.append(json.load(f)["Nodes"])
+
+    # create a name for the unified json that will be created for this measurement group;
+    # os.path.basename is used because the path separator is not "/" on every platform
+    unified_json_name = os.path.basename(measurements_paths[0]).replace(
+        "_" + measurement_group[0] + "_" + str(group_size), ""
+    )
+    unified_json_path = os.path.join(output_path, unified_json_name)
+
+    # the unified json starts from the content of the first card's file
+    with open(measurements_paths[0], "r") as origin:
+        unified_json = json.load(origin)
+
+    # iterate all unified json nodes
+    for node_name, node_values in unified_json["Nodes"].items():
+        max_inputs = node_values["inputs"]
+        params = node_values.get("params")
+        has_outputs = node_values.get("outputs") is not None
+        has_weight = params is not None and params.get("weight") is not None
+
+        # iterate over all the measurement group and take the maximum for each tensor and its channel;
+        # the maxima are written straight into the unified json
+        for measurement_json in measurements_jsons:
+            if scales:
+                # scale files: "inputs"[0], "outputs" and the weight are each compared as a whole
+                max_inputs[0] = max(measurement_json[node_name]["inputs"][0], max_inputs[0])
+                if has_outputs:
+                    node_values["outputs"] = max(measurement_json[node_name]["outputs"], node_values["outputs"])
+                if has_weight:
+                    params["weight"] = max(measurement_json[node_name]["params"]["weight"], params["weight"])
+                continue
+            # measurement files: only element [0] of every innermost entry is compared
+            for i in range(len(max_inputs)):
+                for j in range(len(max_inputs[i])):
+                    max_inputs[i][j][0] = max(measurement_json[node_name]["inputs"][i][j][0], max_inputs[i][j][0])
+            if has_outputs:
+                max_outputs = node_values["outputs"]
+                for i in range(len(max_outputs)):
+                    max_outputs[i][0] = max(measurement_json[node_name]["outputs"][i][0], max_outputs[i][0])
+            if has_weight:
+                max_weight = params["weight"]
+                for i in range(len(max_weight)):
+                    max_weight[i][0] = max(measurement_json[node_name]["params"]["weight"][i][0], max_weight[i][0])
+
+    with open(unified_json_path, "w") as json_file:
+        json.dump(unified_json, json_file)
+
+    # create unified npz file from the unified json
+    layers = {}
+    for layer, dlayer in unified_json["Nodes"].items():
+        layers[layer] = {"inputs": [np.array(x) for x in dlayer["inputs"]]}
+        if dlayer.get("outputs") is not None:
+            layers[layer]["outputs"] = np.array(dlayer["outputs"])
+        if dlayer.get("params") is not None and dlayer["params"].get("weight") is not None:
+            layers[layer]["params"] = {"weight": np.array(dlayer["params"]["weight"])}
+    # GlobalRank and LocalRank are always stored as None
+    df = {
+        "GlobalRank": None,
+        "LocalRank": None,
+        "Mode": unified_json["Mode"],
+        "Nodes": layers,
+    }
+    unified_npz_path = os.path.join(output_path, unified_json_name.replace(".json", ".npz"))
+    # np.savez opens and writes the file itself, so no separate open() is needed
+    np.savez(unified_npz_path, df)
+
+
+def parse_args(args):
+    """
+    Parse the unifier arguments: `-m` and `-g` are mandatory, `-o` defaults to the current directory.
+    """
+    parser = argparse.ArgumentParser(
+        description="Run the measurements parser", formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "-m", "--measurements", required=True, help="path to the directory of the measurements that will be unified"
+    )
+    parser.add_argument(
+        "-g",
+        "--groups",
+        type=list,  # splits each group string into its characters, e.g. "01" -> ["0", "1"]
+        nargs="+",
+        required=True,
+        help="the groups of cards that are going to be unified- e.g. 01 23 45 67",
+    )
+    parser.add_argument(
+        "-o", "--out", default=os.getcwd(), help="path to the directory where the unified measurements will be written"
+    )
+    return parser.parse_args(args)
+
+
+def main(args):
+    """Unify the measurement files and the scale files of every card group given on the command line."""
+    args = parse_args(args)
+    measurements_path = args.measurements
+    groups = args.groups
+
+    # validate the input directory before listing it, so a bad path fails with a clear message
+    if not os.path.isdir(measurements_path):
+        raise NotADirectoryError(f"measurements path is not a directory: {measurements_path}")
+    num_jsons = sum(1 for path in os.listdir(measurements_path) if path.endswith(".json"))
+    if num_jsons % len(groups) != 0:
+        raise ValueError(f"{num_jsons} json files cannot be split evenly over {len(groups)} groups")
+    os.makedirs(args.out, exist_ok=True)  # also creates missing parent directories
+
+    for group in groups:
+        unify_measurements(group, measurements_path, args.out, scales=False)
+        unify_measurements(group, measurements_path, args.out, scales=True)
+
+    print("finished measurement unifier script")
+
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/examples/text-generation/run_generation.py b/examples/text-generation/run_generation.py
index 6b0b2e4695..1f503ed5e1 100644
--- a/examples/text-generation/run_generation.py
+++ b/examples/text-generation/run_generation.py
@@ -221,11 +221,6 @@ def setup_parser(parser):
         help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)",
     )
 
-    parser.add_argument(
-        "--kv_cache_fp8",
-        action="store_true",
-        help="Store kv-cache in float8 when kv-cache is used. Can't use this argument together with QUANT_CONFIG env var",
-    )
     parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8")
     parser.add_argument(
         "--use_flash_attention",
@@ -239,7 +234,17 @@ def setup_parser(parser):
     )
     parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation")
     parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling")
-
+    parser.add_argument(
+        "--const_serialization_path",
+        "--csp",
+        type=str,
+        help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.",
+    )
+    parser.add_argument(
+        "--disk_offload",
+        action="store_true",
+        help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.",
+    )
     args = parser.parse_args()
 
     if args.torch_compile:
@@ -249,10 +254,6 @@ def setup_parser(parser):
         args.limit_hpu_graphs = False
 
     args.quant_config = os.getenv("QUANT_CONFIG", "")
-    if args.quant_config and args.kv_cache_fp8:
-        # can't use both quant_config and kv_cache_fp8, since quant_config may trigger kv cache quantization
-        # with habana quantization toolkit
-        raise parser.error("Can't use QUANT_CONFIG env var with kv_cache_fp8 argument")
     return args
 
 
@@ -561,6 +562,10 @@ def generate_dataset(batch):
         import habana_quantization_toolkit
 
         habana_quantization_toolkit.finish_measurements(model)
+    if args.const_serialization_path and os.path.isdir(args.const_serialization_path):
+        import shutil
+
+        shutil.rmtree(args.const_serialization_path)
 
 
 if __name__ == "__main__":
diff --git a/examples/text-generation/run_lm_eval.py b/examples/text-generation/run_lm_eval.py
index 4ae8dcb26c..8d61118890 100644
--- a/examples/text-generation/run_lm_eval.py
+++ b/examples/text-generation/run_lm_eval.py
@@ -75,10 +75,15 @@ def __init__(self, tokenizer, model, args, options):
         self.options = options
         self._device = args.device
         self.model_inputs = {"use_cache": self.options.use_cache}
-        if self.model.config.model_type == "llama":
+        if self.model.config.model_type in ["llama", "falcon"]:  # `== "llama" or "falcon"` was always true
             self.model_inputs.update(
                 {
                     "reuse_cache": self.options.reuse_cache,
+                }
+            )
+        if self.model.config.model_type == "llama":
+            self.model_inputs.update(
+                {
                     "attn_softmax_bf16": self.options.attn_softmax_bf16,
                 }
             )
@@ -131,12 +136,7 @@ def _model_call(self, inps):
         if self.options.static_shapes:
             bucket_length = self.find_bucket(seq_length)
             if self.options.use_cache and self.options.reuse_cache:
-                self.model.allocate_kv_cache(
-                    bs,
-                    bucket_length + 1,
-                    bucket_length,
-                    False,
-                )
+                self.model.allocate_kv_cache(bs, bucket_length + 1, bucket_length)
             padding_length = bucket_length - seq_length
             inps = F.pad(inps, (0, padding_length), value=self.model.config.pad_token_id)
         logits = self.model(inps.to(self._device), **self.model_inputs)["logits"].cpu()
@@ -176,6 +176,10 @@ def main():
         import habana_quantization_toolkit
 
         habana_quantization_toolkit.finish_measurements(model)
+    if args.const_serialization_path and os.path.isdir(args.const_serialization_path):
+        import shutil
+
+        shutil.rmtree(args.const_serialization_path)
 
 
 if __name__ == "__main__":
diff --git a/examples/text-generation/text-generation-pipeline/README.md b/examples/text-generation/text-generation-pipeline/README.md
index e73243dc8f..203b9ff333 100644
--- a/examples/text-generation/text-generation-pipeline/README.md
+++ b/examples/text-generation/text-generation-pipeline/README.md
@@ -28,7 +28,7 @@ export PYTHONPATH=${PYTHONPATH}:${OPTIMUM_HABANA_PATH}/examples/text-generation
 
 If you plan to use [DeepSpeed-inference](https://docs.habana.ai/en/latest/PyTorch/DeepSpeed/Inference_Using_DeepSpeed.html), you should install DeepSpeed as follows:
 ```bash
-pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0
+pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0
 ```
 
 If you would like to use the pipeline with LangChain classes, you can install LangChain as follows:
diff --git a/examples/text-generation/utils.py b/examples/text-generation/utils.py
index e8c847c2f7..54d08d017f 100644
--- a/examples/text-generation/utils.py
+++ b/examples/text-generation/utils.py
@@ -96,18 +96,15 @@ def setup_distributed(args):
     args.global_rank = int(os.getenv("RANK", "0"))
 
 
-def setup_quantization(args, model):
-    import habana_frameworks.torch.core as htcore
-    from habana_frameworks.torch.core.quantization import _check_params_as_const, _mark_params_as_const
-    from habana_frameworks.torch.hpu import hpu
-
-    print("Initializing inference with quantization")
-    _mark_params_as_const(model)
-    _check_params_as_const(model)
-    if not args.quant_config:
-        hpu.enable_quantization()
-    htcore.hpu_initialize(model)
-    return model
+def setup_const_serialization(const_serialization_path):
+    """Enable const section serialization into a new uuid-named directory inside the given path."""
+    import uuid
+    from habana_frameworks.torch.hpu import enable_const_section_serialization
+
+    const_serialization_path = os.path.join(const_serialization_path, uuid.uuid4().hex)
+    os.makedirs(const_serialization_path)
+    print("Serializing const params to {}".format(const_serialization_path))
+    enable_const_section_serialization(const_serialization_path, False, True)
 
 
 def setup_env(args):
@@ -237,7 +234,7 @@ def setup_distributed_model(args, model_dtype, model_kwargs, logger):
 
     model = deepspeed.init_inference(model, **ds_inference_kwargs)
     model = model.module
-    if model.config.model_type == "llama":
+    if model.config.model_type in ["llama", "falcon"]:
         patch_scoped_linear_all_reduce(model)
 
     if args.quant_config:
@@ -349,7 +346,6 @@ def setup_generation_config(args, model, tokenizer):
     generation_config.reduce_recompile = args.reduce_recompile
     if generation_config.reduce_recompile:
         assert generation_config.bucket_size > 0
-    generation_config.kv_cache_fp8 = args.kv_cache_fp8
     generation_config.use_flash_attention = args.use_flash_attention
     return generation_config
 
@@ -373,6 +369,10 @@ def initialize_model(args, logger):
         "revision": args.model_revision,
         "token": args.token,
     }
+    if args.disk_offload:
+        model_kwargs["device_map"] = "auto"
+        model_kwargs["offload_folder"] = "/tmp/offload_folder/"
+
     model = (
         setup_model(args, model_dtype, model_kwargs, logger)
         if not use_deepspeed
@@ -380,8 +380,16 @@ def initialize_model(args, logger):
     )
     tokenizer, model = setup_tokenizer(args, model)
     generation_config = setup_generation_config(args, model, tokenizer)
+
+    if args.const_serialization_path:
+        setup_const_serialization(args.const_serialization_path)
     if args.fp8:
-        model = setup_quantization(args, model)
+        import habana_frameworks.torch.core as htcore
+
+        print("Initializing inference mode")
+        const_marking = os.getenv("ENABLE_CONST_MARKING", "True")
+        if const_marking == "True":
+            htcore.hpu_initialize(model)
     init_end = time.perf_counter()
     logger.info(f"Args: {args}")
     logger.info(f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}")
diff --git a/notebooks/AI_HW_Summit_2022.ipynb b/notebooks/AI_HW_Summit_2022.ipynb
index 37075fd3ef..cf6c8bdea5 100644
--- a/notebooks/AI_HW_Summit_2022.ipynb
+++ b/notebooks/AI_HW_Summit_2022.ipynb
@@ -261,7 +261,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0"
+    "!pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0"
    ]
   },
   {
diff --git a/optimum/habana/accelerate/accelerator.py b/optimum/habana/accelerate/accelerator.py
index e33f5210db..84b8fb6d7f 100644
--- a/optimum/habana/accelerate/accelerator.py
+++ b/optimum/habana/accelerate/accelerator.py
@@ -141,7 +141,7 @@ def __init__(
         if deepspeed_plugin:
             if not is_deepspeed_available():
                 raise ImportError(
-                    "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`."
+                    "DeepSpeed is not installed => run `pip install git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`."
                 )
 
             mixed_precision = (
diff --git a/optimum/habana/accelerate/state.py b/optimum/habana/accelerate/state.py
index e29651efa9..ab1cae7fef 100644
--- a/optimum/habana/accelerate/state.py
+++ b/optimum/habana/accelerate/state.py
@@ -55,7 +55,7 @@ def __init__(self, cpu: bool = False, **kwargs):
                     if not is_deepspeed_available():
                         raise ImportError(
                             "DeepSpeed is not available, install it with: `pip install"
-                            " git+https://github.com/HabanaAI/DeepSpeed.git@1.14.0`."
+                            " git+https://github.com/HabanaAI/DeepSpeed.git@1.15.0`."
                         )
                     self.distributed_type = GaudiDistributedType.DEEPSPEED
                     import deepspeed
diff --git a/optimum/habana/transformers/generation/configuration_utils.py b/optimum/habana/transformers/generation/configuration_utils.py
index e75e48a7c7..93df1335db 100644
--- a/optimum/habana/transformers/generation/configuration_utils.py
+++ b/optimum/habana/transformers/generation/configuration_utils.py
@@ -29,8 +29,6 @@ class GaudiGenerationConfig(GenerationConfig):
         Only active if `static_shapes` is used. Can't be used with `reuse_cache`.
     bucket_internal (`bool`, *optional*):
         Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.
-    kv_cache_fp8 (`bool`, *optional*):
-        Store kv-cache in float8 when kv-cache is used
     use_flash_attention (`bool`, *optional*):
         Whether to use flash attention optimization.
     flash_attention_recompute (`bool`, *optional*):
@@ -48,7 +46,6 @@ def __init__(self, **kwargs):
         self.bucket_size = kwargs.get("bucket_size", -1)
         self.bucket_internal = kwargs.get("bucket_internal", None)
         self.reduce_recompile = kwargs.get("reduce_recompile", None)
-        self.kv_cache_fp8 = kwargs.get("kv_cache_fp8", None)
         self.use_flash_attention = kwargs.get("use_flash_attention", None)
         self.flash_attention_recompute = kwargs.get("flash_attention_recompute", None)
         self.use_fused_rope = kwargs.get("use_fused_rope", None)
diff --git a/optimum/habana/transformers/generation/utils.py b/optimum/habana/transformers/generation/utils.py
index aa7d92ebce..92df17bb50 100755
--- a/optimum/habana/transformers/generation/utils.py
+++ b/optimum/habana/transformers/generation/utils.py
@@ -584,7 +584,8 @@ def generate(
             assert self.config.model_type in [
                 "llama",
                 "mistral",
-            ], "reuse_cache only supported by llama and mistral at the moment"
+                "falcon",
+            ], "reuse_cache only supported by llama, mistral and falcon at the moment"
             if not generation_config.bucket_internal:
                 assert (
                     generation_config.bucket_size <= 0
@@ -733,14 +734,11 @@ def generate(
                 bs, _ = input_ids.shape
                 if not is_greedy_or_beam_and_bucket:
                     unwrap_deepspeed_model(self).allocate_kv_cache(
-                        bs * generation_config.num_beams,
-                        calculated_max_length,
-                        token_idx,
-                        generation_config.kv_cache_fp8,
+                        bs * generation_config.num_beams, calculated_max_length, token_idx
                     )
                     model_kwargs["kv_cache_len"] = calculated_max_length
 
-            if self.config.model_type in ["llama"]:
+            if self.config.model_type in ["llama", "falcon"]:
                 if self.config.max_position_embeddings < calculated_max_length:
                     unwrap_deepspeed_model(self).update_sincos_cache(seq_len=calculated_max_length)
 
diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py
index 9d4e473aab..6dc40a73bf 100644
--- a/optimum/habana/transformers/modeling_utils.py
+++ b/optimum/habana/transformers/modeling_utils.py
@@ -26,7 +26,10 @@
     GaudiBloomMLP,
     GaudiCodeGenAttention,
     GaudiCodeGenForCausalLM,
+    GaudiFalconAttention,
+    GaudiFalconDecoderLayer,
     GaudiFalconForCausalLM,
+    GaudiFalconMLP,
     GaudiFalconModel,
     GaudiGPT2Attention,
     GaudiGPT2LMHeadModel,
@@ -84,9 +87,7 @@
     gaudi_conv1d_forward,
     gaudi_esm_for_protein_folding_forward,
     gaudi_esmfolding_trunk_forward,
-    gaudi_falcon_attention_forward,
     gaudi_falcon_attention_split_heads,
-    gaudi_falcon_decoder_layer_forward,
     gaudi_generate_speech,
     gaudi_get_extended_attention_mask,
     gaudi_gpt2_block_forward,
@@ -135,6 +136,7 @@
     gaudi_wav2vec2_encoder_forward,
     gaudi_wav2vec2_forward,
     gaudi_wav2vec2_tdnnlayer_forward,
+    gaudi_wav2vec2forctc_forward,
 )
 
 
@@ -161,6 +163,7 @@ def adapt_transformers_to_gaudi():
     )
     transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward = gaudi_wav2vec2_forward
     transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Encoder.forward = gaudi_wav2vec2_encoder_forward
+    transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward = gaudi_wav2vec2forctc_forward
     transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer.forward = gaudi_wav2vec2_tdnnlayer_forward
 
     # Generation is modified to run faster in lazy mode
@@ -298,10 +301,11 @@ def adapt_transformers_to_gaudi():
     transformers.models.llama.modeling_llama.LlamaRMSNorm.forward = gaudi_llama_rmsnorm_forward
 
     # Optimization for falcon generation on Gaudi
+    transformers.models.falcon.modeling_falcon.FalconAttention = GaudiFalconAttention
     transformers.models.falcon.modeling_falcon.FalconForCausalLM = GaudiFalconForCausalLM
+    transformers.models.falcon.modeling_falcon.FalconMLP = GaudiFalconMLP
     transformers.models.falcon.modeling_falcon.FalconModel = GaudiFalconModel
-    transformers.models.falcon.modeling_falcon.FalconDecoderLayer.forward = gaudi_falcon_decoder_layer_forward
-    transformers.models.falcon.modeling_falcon.FalconAttention.forward = gaudi_falcon_attention_forward
+    transformers.models.falcon.modeling_falcon.FalconDecoderLayer = GaudiFalconDecoderLayer
     transformers.models.falcon.modeling_falcon.FalconAttention._split_heads = gaudi_falcon_attention_split_heads
 
     # Optimization for t5 on Gaudi
diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py
index d0eb8b2dcd..1582d3f09e 100644
--- a/optimum/habana/transformers/models/__init__.py
+++ b/optimum/habana/transformers/models/__init__.py
@@ -43,11 +43,12 @@
     gaudi_rot_vec_mul,
 )
 from .falcon import (
+    GaudiFalconAttention,
+    GaudiFalconDecoderLayer,
     GaudiFalconForCausalLM,
+    GaudiFalconMLP,
     GaudiFalconModel,
-    gaudi_falcon_attention_forward,
     gaudi_falcon_attention_split_heads,
-    gaudi_falcon_decoder_layer_forward,
 )
 from .gpt2 import GaudiGPT2Attention, GaudiGPT2LMHeadModel, gaudi_gpt2_block_forward, gaudi_gpt2_forward
 from .gpt_bigcode import (
@@ -146,4 +147,5 @@
     gaudi_wav2vec2_encoder_forward,
     gaudi_wav2vec2_forward,
     gaudi_wav2vec2_tdnnlayer_forward,
+    gaudi_wav2vec2forctc_forward,
 )
diff --git a/optimum/habana/transformers/models/falcon/__init__.py b/optimum/habana/transformers/models/falcon/__init__.py
index 44ac5451f6..00c73ad110 100644
--- a/optimum/habana/transformers/models/falcon/__init__.py
+++ b/optimum/habana/transformers/models/falcon/__init__.py
@@ -1,7 +1,8 @@
 from .modeling_falcon import (
+    GaudiFalconAttention,
+    GaudiFalconDecoderLayer,
     GaudiFalconForCausalLM,
+    GaudiFalconMLP,
     GaudiFalconModel,
-    gaudi_falcon_attention_forward,
     gaudi_falcon_attention_split_heads,
-    gaudi_falcon_decoder_layer_forward,
 )
diff --git a/optimum/habana/transformers/models/falcon/modeling_falcon.py b/optimum/habana/transformers/models/falcon/modeling_falcon.py
index 98e3555e95..9b9a74c12f 100644
--- a/optimum/habana/transformers/models/falcon/modeling_falcon.py
+++ b/optimum/habana/transformers/models/falcon/modeling_falcon.py
@@ -1,5 +1,6 @@
 import contextlib
 import math
+import os
 import warnings
 from typing import Optional, Tuple, Union
 
@@ -27,6 +28,7 @@
 
 
 import habana_frameworks.torch.core as htcore
+from torch import nn
 from torch.nn import CrossEntropyLoss
 from torch.nn import functional as F
 from transformers.modeling_attn_mask_utils import _prepare_4d_causal_attention_mask_for_sdpa
@@ -34,12 +36,15 @@
     BaseModelOutputWithPastAndCrossAttentions,
     CausalLMOutputWithCrossAttentions,
 )
+from transformers.models.falcon.configuration_falcon import FalconConfig
 from transformers.models.falcon.modeling_falcon import (
+    FalconAttention,
+    FalconDecoderLayer,
     FalconForCausalLM,
+    FalconMLP,
     FalconModel,
     apply_rotary_pos_emb,
     build_alibi_tensor,
-    dropout_add,
 )
 from transformers.utils import logging
 
@@ -52,6 +57,20 @@
 logger = logging.get_logger(__name__)
 
 
+def dropout_add(x: torch.Tensor, residual: torch.Tensor, prob: float, training: bool) -> torch.Tensor:
+    """
+    Apply dropout to `x` and add `residual` to the result.
+    In training mode a new tensor is returned; otherwise the sum is written in place into `residual`.
+
+    Copied from transformers.models.falcon.modeling_falcon/dropout_add
+    https://github.com/huggingface/transformers/blob/b338a6c3b8eda29610d4d472cad8cd87cbfdaaed/src/transformers/models/falcon/modeling_falcon.py#L248
+    """
+    dropped = F.dropout(x, p=prob, training=training)
+    if not training:
+        return residual.add_(dropped)
+    return residual + dropped
+
+
 def apply_customized_rope(q, k, cos, sin, position_ids):
     if q.device.type == "hpu" and FusedRoPE:
         # TODO: remove `.clone()` when it is fixed in SynapseAI
@@ -111,257 +130,506 @@ def gaudi_falcon_attention_split_heads(
         return query, key, value
 
 
-def gaudi_falcon_attention_forward(
-    self,
-    hidden_states: torch.Tensor,
-    alibi: Optional[torch.Tensor],
-    attention_mask: torch.Tensor,
-    position_ids: Optional[torch.LongTensor] = None,
-    layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    head_mask: Optional[torch.Tensor] = None,
-    use_cache: bool = False,
-    output_attentions: bool = False,
-    token_idx: Optional[torch.Tensor] = None,
-    **kwargs,
-):
+class Softmax(nn.Module):
+    """Module wrapper around `torch.ops.hpu.softmax_fp8`."""
+
+    def forward(self, x, dim=None, invAttnHead=None):
+        # The third and fourth positional arguments of the op are always passed as None.
+        return torch.ops.hpu.softmax_fp8(x, dim, None, None, invAttnHead)
+
+
+class Matmul(nn.Module):
+    """Module wrapper around `torch.matmul`."""
+
+    def forward(self, *args, **kwargs):
+        # Positional and keyword arguments are forwarded unchanged.
+        return torch.matmul(*args, **kwargs)
+
+
+# ScaledDotProductAttention is based on torch.nn.functional.scaled_dot_product_attention
+class ScaledDotProductAttention(nn.Module):
+    def __init__(self, config: FalconConfig):
+        super().__init__()
+        self.head_dim = config.hidden_size // config.num_attention_heads
+        self.bmm1 = Matmul()
+        self.bmm2 = Matmul()
+        self.softmax = Softmax()
+
+    def forward(self, query, key, value, attn_mask=None, dropout_p=0.0, is_causal=False, scale=None) -> torch.Tensor:
+        """Unfused attention: Q @ K^T, add mask, HPU softmax (given the scale as `invAttnHead`), dropout, @ V."""
+        L, S = query.size(-2), key.size(-2)
+        # Honour an explicit `scale`; otherwise default to 1/sqrt(head_dim).
+        scale_factor = 1 / math.sqrt(self.head_dim) if scale is None else scale
+        invAttnHead = torch.tensor(scale_factor, dtype=torch.float32).to(query.device)
+
+        if is_causal:
+            assert attn_mask is None
+            # Lower-triangular keep-mask; turned into an additive bias just below.
+            attn_mask = torch.ones(L, S, dtype=torch.bool, device=query.device).tril(diagonal=0)
+
+        if attn_mask is not None and attn_mask.dtype == torch.bool:
+            # Build a new additive bias rather than writing -inf into the (boolean) mask tensor itself.
+            attn_mask = torch.zeros_like(attn_mask, dtype=query.dtype).masked_fill(~attn_mask, float("-inf"))
+
+        attn_weight = self.bmm1(query, key.transpose(-2, -1))
+        if attn_mask is not None:
+            attn_weight += attn_mask
+        attn_weight = self.softmax(attn_weight, dim=-1, invAttnHead=invAttnHead)
+        attn_weight = torch.dropout(attn_weight, dropout_p, train=True)
+        return self.bmm2(attn_weight, value)
+
+
+def update(prev, cur, dim, idx, inp_seq_len):  # write `cur` into the cache tensor `prev`
+    orig_cur = cur
+    cur = cur.to(dtype=prev.dtype)  # match the cache dtype before copying
+
+    if prev.shape == cur.shape:
+        prev.copy_(cur)  # `cur` covers the whole cache: overwrite it and hand back the uncast input
+        return orig_cur
+
+    if cur.shape[-2] > 1 and cur.shape[-2] <= prev.shape[-2]:
+        # Multi-entry `cur` that fits in the cache: fill the first `inp_seq_len` positions along dim 2.
+        prev[:, :, :inp_seq_len, :].copy_(cur)
+        return orig_cur
+    assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}"
+    if idx is not None:
+        prev.index_copy_(dim, idx - 1, cur)  # single new entry goes to position `idx - 1` along `dim`
+        prev_cast = prev.to(orig_cur.dtype)  # returned in the dtype the caller passed in
+        return prev_cast
+    else:
+        return torch.cat((prev, cur), dim=dim)  # no index: result is a new, longer tensor (in `prev`'s dtype)
+
+
+class KVCache(torch.nn.Module):
+    """Holds one pre-allocated cache tensor plus the input sequence length it was allocated for."""
+
+    def __init__(self):
+        super().__init__()
+        self.cache, self.inp_seq_len = None, -1
+
+    def allocate(self, inp_seq_len, dtype, device, shape):
+        if self.cache is not None and self.cache.shape == shape:
+            # Same shape as before: zero the existing buffer, but only for an unchanged input length.
+            assert (
+                self.inp_seq_len == inp_seq_len
+            ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}"
+            self.cache.fill_(0)
+        else:
+            self.inp_seq_len = inp_seq_len
+            self.cache = torch.zeros(shape, dtype=dtype, device=device)
+
+    def get_shape(self):
+        return None if self.cache is None else self.cache.shape
+
+    def forward(self, cur, dim, idx):
+        return self.update(self.cache, cur, dim, idx, self.inp_seq_len)
+
+    def update(self, prev, cur, dim, idx, inp_seq_len):
+        return update(prev, cur, dim, idx, inp_seq_len)
+
+
+class GaudiFalconAttention(FalconAttention):
     """
-    Copied from FalconAttention.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py
+    Inherits from FalconAttention: https://github.com/huggingface/transformers/blob/838b87abe231fd70be5132088d0dee72a7bb8d62/src/transformers/models/falcon/modeling_falcon.py#L267
     The only differences are:
     - add new args token_idx and position_ids
-    - replace F.scaled_dot_product_attention with Habana torch's version
+    - replace F.scaled_dot_product_attention with Habana torch's version for BF16
+    - use ScaledDotProductAttention for FP8 quantization
+    - add new arg reuse_cache
     """
-    if "padding_mask" in kwargs:
-        warnings.warn(
-            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-        )
 
-    fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
-    # 3 x [batch_size, seq_length, num_heads, head_dim]
-    (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
 
-    batch_size, query_length, _, _ = query_layer.shape
+        if os.getenv("QUANT_CONFIG", ""):  # pre_attn_forward re-reads this variable and then expects `self.sdpa`
+            self.sdpa = ScaledDotProductAttention(config)
 
-    query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
-    key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
-    value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
+        self.k_cache = KVCache()  # buffers are created later by allocate_kv_cache()
+        self.v_cache = KVCache()
+        self.inp_seq_len = -1  # passed to the cache update on the non-reuse_cache path
+        self.max_position_embeddings = config.max_position_embeddings  # raised by update_sincos_cache()
 
-    kv_seq_len = key_layer.shape[-2]
-    if layer_past is not None:
-        if token_idx is not None:
-            # When token_idx is used,
-            # past_kv_length = 0
-            # static seq len = (input token len + max output token len)
-            kv_seq_len = layer_past[0].shape[-2]
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):  # size this layer's K and V caches
+        if self.config.new_decoder_architecture:
+            cache_shape = (batch_size, self.num_heads, max_seq_len, self.head_dim)
         else:
-            kv_seq_len += layer_past[0].shape[-2]
-    if alibi is None:
-        cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len)
-        query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids)
-
-    if layer_past is not None:
-        past_key, past_value = layer_past
-        if token_idx is not None:
-            past_key.index_copy_(-2, token_idx - 1, key_layer)
-            past_value.index_copy_(-2, token_idx - 1, value_layer)
-            key_layer = past_key
-            value_layer = past_value
-        else:
-            # concatenate along seq_length dimension:
-            #  - key: [batch_size, self.num_heads, kv_length, head_dim]
-            #  - value: [batch_size, self.num_heads, kv_length, head_dim]
-            key_layer = torch.cat((past_key, key_layer), dim=-2)
-            value_layer = torch.cat((past_value, value_layer), dim=-2)
-
-    kv_length = key_layer.shape[-2]
-    if use_cache:
-        present = (key_layer, value_layer)
-    else:
-        present = None
+            cache_shape = (batch_size, 1, max_seq_len, self.head_dim)  # NOTE(review): one KV head assumed — confirm
+        device = self.query_key_value.weight.device
+        dtype = self.config.torch_dtype  # cache dtype comes from the config, not from the weights
+        self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape)
+        self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape)
+
+    def update_sincos_cache(self, seq_len):
+        # Rebuild the rotary cos/sin cache up front when inferring beyond self.max_position_embeddings.
+        # This avoids creating the cache during the actual model forward pass, which reduces memory
+        # consumption and improves performance.
+        if seq_len <= self.max_position_embeddings:
+            return
+        self.max_position_embeddings = seq_len
+        qkv_weight = self.query_key_value.weight
+        self.rotary_emb._set_cos_sin_cache(seq_len, qkv_weight.device, qkv_weight.dtype)
 
-    if alibi is None:
-        if output_attentions:
-            attention_scores = query_layer @ key_layer.transpose(-1, -2)
-            attention_scores /= math.sqrt(self.head_dim)
+    def pre_attn_forward(
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        token_idx: Optional[torch.Tensor] = None,
+        reuse_cache: Optional[bool] = False,
+        cache_idx: int = None,
+        **kwargs,
+    ):
+        """
+        Fused QKV projection, rotary embedding (only when `alibi` is None), KV-cache update and attention,
+        ending with the `dense` output projection.
+
+        Returns `(attn_output, present, attention_weights)`; the last item is None unless `output_attentions`.
+        """
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+
+        fused_qkv = self.query_key_value(hidden_states)  # [batch_size, seq_length, 3 x hidden_size]
+        # 3 x [batch_size, seq_length, num_heads, head_dim]
+        (query_layer, key_layer, value_layer) = self._split_heads(fused_qkv)
+
+        batch_size, query_length, _, _ = query_layer.shape
+
+        query_layer = query_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
+        key_layer = key_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
+        value_layer = value_layer.transpose(1, 2).reshape(batch_size, -1, query_length, self.head_dim)
+
+        kv_seq_len = key_layer.shape[-2]
+        if layer_past is not None:
+            if token_idx is not None:
+                if reuse_cache:
+                    kv_seq_len = layer_past[0][-2]  # presumably `layer_past` holds shapes here (cf. `present`) — confirm
+                else:
+                    kv_seq_len = layer_past[0].shape[-2]
+            else:
+                kv_seq_len += layer_past[0].shape[-2]
+
+        if alibi is None:
+            cos, sin = self.rotary_emb(value_layer, seq_len=kv_seq_len)
+            query_layer, key_layer = apply_customized_rope(query_layer, key_layer, cos, sin, position_ids)
+
+        if use_cache:
+            if self.training:
+                present = None
+            else:
+                if reuse_cache:
+                    key_layer = self.k_cache(key_layer, -2, token_idx)
+                    value_layer = self.v_cache(value_layer, -2, token_idx)
+                    present = (self.k_cache.get_shape(), self.v_cache.get_shape())  # shapes only, not tensors
+                else:
+                    if layer_past is None:
+                        past_key = torch.zeros(
+                            key_layer.shape,
+                            dtype=self.query_key_value.weight.dtype,
+                            device=self.query_key_value.weight.device,
+                        )
+                        past_value = torch.zeros(
+                            key_layer.shape,
+                            dtype=self.query_key_value.weight.dtype,
+                            device=self.query_key_value.weight.device,
+                        )
+                        layer_past = (past_key, past_value)
+                    key_layer = self.k_cache.update(
+                        layer_past[0], key_layer, -2, token_idx, self.inp_seq_len
+                    )  # k_layer bs*1, q_len, head_dim
+                    value_layer = self.v_cache.update(layer_past[1], value_layer, -2, token_idx, self.inp_seq_len)
+                    if token_idx is None:
+                        layer_past = (key_layer, value_layer)
+                    present = layer_past
+
+                if cache_idx is not None and query_length == 1:
+                    key_layer = key_layer[:, :, :cache_idx, :]
+                    value_layer = value_layer[:, :, :cache_idx, :]
+                    attention_mask = attention_mask[:, :, :, :cache_idx]
+        else:
+            present = None
 
-            attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
-            # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
-            attn_output = attention_scores @ value_layer
+        if self.training or present is None:
+            kv_length = key_layer.shape[-2]
         else:
-            if FusedSDPA:
-                with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext():
-                    attn_output = FusedSDPA.apply(
+            kv_length = present[0][-2] if reuse_cache else present[0].shape[-2]
+
+        if alibi is None:
+            if output_attentions:
+                attention_scores = query_layer @ key_layer.transpose(-1, -2)
+                attention_scores /= math.sqrt(self.head_dim)
+
+                attention_scores = F.softmax(attention_scores + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # It is unclear why neither dropout nor head_mask is applied here (while it is with alibi).
+                attn_output = attention_scores @ value_layer
+            else:
+                if FusedSDPA:
+                    if os.getenv("QUANT_CONFIG", ""):
+                        attn_output = self.sdpa(
+                            query_layer, key_layer, value_layer, attention_mask, 0.0, is_causal=False
+                        )
+                    else:
+                        with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext():
+                            attn_output = FusedSDPA.apply(
+                                query_layer,
+                                key_layer,
+                                value_layer,
+                                attention_mask,
+                                0.0,
+                                # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
+                                self.is_causal and attention_mask is None and query_length > 1,
+                            )
+                else:
+                    # Workaround util scaled_dot_product_attention support broadcast.
+                    if self.training is True and query_layer.shape != key_layer.shape:
+                        key_layer = torch.broadcast_to(key_layer, query_layer.shape)
+                        value_layer = torch.broadcast_to(value_layer, query_layer.shape)
+                    attn_output = F.scaled_dot_product_attention(
                         query_layer,
                         key_layer,
                         value_layer,
                         attention_mask,
                         0.0,
                         # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
-                        self.is_causal and attention_mask is None and query_length > 1,
+                        is_causal=self.is_causal and attention_mask is None and query_length > 1,
                     )
-            else:
-                # Workaround util scaled_dot_product_attention support broadcast.
-                if self.training is True and query_layer.shape != key_layer.shape:
-                    key_layer = torch.broadcast_to(key_layer, query_layer.shape)
-                    value_layer = torch.broadcast_to(value_layer, query_layer.shape)
-                attn_output = F.scaled_dot_product_attention(
-                    query_layer,
-                    key_layer,
-                    value_layer,
-                    attention_mask,
-                    0.0,
-                    # The query_length > 1 is necessary to match with AttentionMaskConverter.to_causal_4d that does not create a causal mask in case query_length == 1.
-                    is_causal=self.is_causal and attention_mask is None and query_length > 1,
-                )
-            # Performance improvement for HPU
-            if self.training is True and htcore:
-                htcore.mark_step()
-            attention_scores = None
+                # Performance improvement for HPU
+                if self.training is True and htcore:
+                    htcore.mark_step()
+                attention_scores = None
 
-        attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim)
-        attn_output = attn_output.permute(0, 2, 1, 3)
-        attn_output = attn_output.reshape(batch_size, query_length, -1)
+            attn_output = attn_output.view(batch_size, -1, query_length, self.head_dim)
+            attn_output = attn_output.permute(0, 2, 1, 3)
+            attn_output = attn_output.reshape(batch_size, query_length, -1)
 
-        attn_output = self.dense(attn_output)
+            attn_output = self.dense(attn_output)
 
-        if output_attentions:
-            return attn_output, present, attention_scores
-        else:
-            return attn_output, present
+            return attn_output, present, attention_scores  # already None when output_attentions is False
 
-    else:
-        if self._use_sdpa and not output_attentions and head_mask is None:
-            if FusedSDPA:
-                with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext():
-                    attn_output = FusedSDPA.apply(
+        else:
+            if self._use_sdpa and not output_attentions and head_mask is None:
+                if FusedSDPA:
+                    with sdp_kernel(enable_recompute=False) if SDPContext else contextlib.nullcontext():
+                        attn_output = FusedSDPA.apply(
+                            query_layer,
+                            key_layer,
+                            value_layer,
+                            attention_mask,
+                            self.attention_dropout.p if self.training else 0.0,
+                            self.is_causal and attention_mask is None and query_length > 1,
+                        )
+                else:
+                    attn_output = F.scaled_dot_product_attention(
                         query_layer,
                         key_layer,
                         value_layer,
-                        attention_mask,
-                        self.attention_dropout.p if self.training else 0.0,
-                        self.is_causal and attention_mask is None and query_length > 1,
+                        attn_mask=attention_mask,
+                        dropout_p=self.attention_dropout.p if self.training else 0.0,
+                        is_causal=self.is_causal and attention_mask is None and query_length > 1,
                     )
+                attn_output = attn_output.transpose(1, 2)
+                attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+
+                attn_output = self.dense(attn_output)
             else:
-                attn_output = F.scaled_dot_product_attention(
-                    query_layer,
-                    key_layer,
-                    value_layer,
-                    attn_mask=attention_mask,
-                    dropout_p=self.attention_dropout.p if self.training else 0.0,
-                    is_causal=self.is_causal and attention_mask is None and query_length > 1,
-                )
-            attn_output = attn_output.transpose(1, 2)
-            attn_output = attn_output.reshape(batch_size, query_length, self.num_heads * self.head_dim)
+                matmul_result = query_layer @ key_layer.transpose(-1, -2)
 
-            attn_output = self.dense(attn_output)
-        else:
-            matmul_result = query_layer @ key_layer.transpose(-1, -2)
+                # change view to [batch_size, num_heads, q_length, kv_length]
+                attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
 
-            # change view to [batch_size, num_heads, q_length, kv_length]
-            attention_scores = matmul_result.view(batch_size, self.num_heads, query_length, kv_length)
+                # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
+                input_dtype = attention_scores.dtype
+                # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
+                if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
+                    attention_scores = attention_scores.to(torch.float32)
 
-            # cast attention scores to fp32, compute scaled softmax and cast back to initial dtype - [batch_size, num_heads, q_length, kv_length]
-            input_dtype = attention_scores.dtype
-            # `float16` has a minimum value of -65504.0, whereas `bfloat16` and `float32` have a minimum value of `-3.4e+38`
-            if input_dtype == torch.float16 or input_dtype == torch.bfloat16:
-                attention_scores = attention_scores.to(torch.float32)
+                attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
+                attention_logits *= self.inv_norm_factor
+                attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
+                # [batch_size, num_heads, q_length, kv_length]
+                attention_probs = self.attention_dropout(attention_probs)
 
-            attention_logits = attention_scores + alibi.view(batch_size, self.num_heads, 1, -1)
-            attention_logits *= self.inv_norm_factor
-            attention_probs = F.softmax(attention_logits + attention_mask, dim=-1, dtype=hidden_states.dtype)
-            # [batch_size, num_heads, q_length, kv_length]
-            attention_probs = self.attention_dropout(attention_probs)
+                if head_mask is not None:
+                    attention_probs = attention_probs * head_mask
 
-            if head_mask is not None:
-                attention_probs = attention_probs * head_mask
+                # change view [batch_size, num_heads, q_length, kv_length]
+                attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
 
-            # change view [batch_size, num_heads, q_length, kv_length]
-            attention_probs_reshaped = attention_probs.view(batch_size, self.num_heads, query_length, kv_length)
+                # matmul: [batch_size * num_heads, q_length, head_dim]
+                attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1)
 
-            # matmul: [batch_size * num_heads, q_length, head_dim]
-            attn_output = (attention_probs_reshaped @ value_layer).flatten(0, 1)
+                # change view [batch_size, q_length, num_heads * head_dim]
+                attn_output = self._merge_heads(attn_output)
 
-            # change view [batch_size, q_length, num_heads * head_dim]
-            attn_output = self._merge_heads(attn_output)
+                attn_output = self.dense(attn_output)
 
-            attn_output = self.dense(attn_output)
+            return attn_output, present, attention_probs if output_attentions else None
 
-        if output_attentions:
-            return attn_output, present, attention_probs
-        else:
-            return attn_output, present
-
-
-def gaudi_falcon_decoder_layer_forward(
-    self,
-    hidden_states: torch.Tensor,
-    alibi: Optional[torch.Tensor],
-    attention_mask: torch.Tensor,
-    position_ids: Optional[torch.LongTensor] = None,
-    layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
-    head_mask: Optional[torch.Tensor] = None,
-    use_cache: bool = False,
-    output_attentions: bool = False,
-    token_idx: Optional[torch.Tensor] = None,
-    **kwargs,
-):
+    def attention_all_reduce(self, attn_output):  # returns None; any result is delivered through `attn_output`
+        if hasattr(self.dense, "all_reduce"):  # no-op when `dense` has no `all_reduce` attribute
+            self.dense.all_reduce(attn_output)
+
+    def post_attn_forward(self, attn_output):  # step run after attention_all_reduce(); returns the same tensor object
+        if hasattr(self.dense, "all_reduce"):  # NOTE(review): guards on `all_reduce` but calls `post_all_reduce`
+            self.dense.post_all_reduce(attn_output)  # return value is discarded
+        return attn_output
+
+
+class GaudiFalconMLP(FalconMLP):
+    """
+    Inherits from FalconMLP: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py
+    """
+
+    def pre_mlp_forward(self, x):
+        # dense_h_to_4h -> act -> dense_4h_to_h
+        activated = self.act(self.dense_h_to_4h(x))
+        return self.dense_4h_to_h(activated)
+
+    def mlp_all_reduce(self, x):
+        if hasattr(self.dense_4h_to_h, "all_reduce"):  # no-op when the layer has no `all_reduce`
+            self.dense_4h_to_h.all_reduce(x)
+
+    def post_mlp_forward(self, x):
+        if hasattr(self.dense_4h_to_h, "all_reduce"):  # same guard as mlp_all_reduce
+            self.dense_4h_to_h.post_all_reduce(x)
+        return x
+
+
+class GaudiFalconDecoderLayer(FalconDecoderLayer):
     """
-    Copied from FalconDecoderLayer.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py
+    Inherits from FalconDecoderLayer: https://github.com/huggingface/transformers/blob/main/src/transformers/models/falcon/modeling_falcon.py
     The only differences are:
     - add new args token_idx and position_ids
     - add token_idx and position_ids into attention inputs
+    - add new args reuse_cache
     """
-    if "padding_mask" in kwargs:
-        warnings.warn(
-            "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
-        )
 
-    residual = hidden_states
+    def __init__(self, config: FalconConfig):
+        super().__init__(config)
+        self.self_attention = GaudiFalconAttention(config)  # presumably replaces the parent's attention — confirm
 
-    if self.config.new_decoder_architecture:
-        attention_layernorm_out = self.ln_attn(hidden_states)
-        mlp_layernorm_out = self.ln_mlp(hidden_states)
-    else:
-        attention_layernorm_out = self.input_layernorm(hidden_states)
-
-    # Self attention.
-    attn_outputs = self.self_attention(
-        attention_layernorm_out,
-        layer_past=layer_past,
-        attention_mask=attention_mask,
-        position_ids=position_ids,
-        alibi=alibi,
-        head_mask=head_mask,
-        use_cache=use_cache,
-        output_attentions=output_attentions,
-        token_idx=token_idx,
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):  # delegates to the attention module
+        self.self_attention.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)
+
+    def update_sincos_cache(self, seq_len):  # delegates to the attention module
+        self.self_attention.update_sincos_cache(seq_len)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        token_idx: Optional[torch.Tensor] = None,
+        reuse_cache: Optional[bool] = False,
+        cache_idx: int = None,
         **kwargs,
-    )
+    ):
+        if "padding_mask" in kwargs:
+            warnings.warn(
+                "Passing `padding_mask` is deprecated and will be removed in v4.37. Please make sure use `attention_mask` instead.`"
+            )
+        residual = hidden_states  # layer input, used as the residual branch below
+        (
+            hidden_states,
+            present,
+            attn_scores,
+            attention_layernorm_out,
+            mlp_layernorm_out,
+        ) = self.pre_attn(  # layernorm + attention before AllReduce
+            hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            alibi=alibi,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            token_idx=token_idx,
+            reuse_cache=reuse_cache,
+            cache_idx=cache_idx,
+            **kwargs,
+        )
 
-    attention_output = attn_outputs[0]
+        self.self_attention.attention_all_reduce(hidden_states)  # no-op unless `dense` exposes `all_reduce`
+        hidden_states = self.self_attention.post_attn_forward(hidden_states)
 
-    if not self.config.new_decoder_architecture:
-        if self.config.parallel_attn:
-            mlp_layernorm_out = attention_layernorm_out
-        else:
-            residual = dropout_add(attention_output, residual, self.config.attention_dropout, training=self.training)
-            mlp_layernorm_out = self.post_attention_layernorm(residual)
+        attention_output = hidden_states
 
-    outputs = attn_outputs[1:]
+        if not self.config.new_decoder_architecture:
+            if self.config.parallel_attn:
+                mlp_layernorm_out = attention_layernorm_out  # parallel attention: MLP reads the same layernorm output
+            else:
+                residual = dropout_add(
+                    attention_output, residual, self.config.attention_dropout, training=self.training
+                )
+                mlp_layernorm_out = self.post_attention_layernorm(residual)
 
-    # MLP.
-    mlp_output = self.mlp(mlp_layernorm_out)
+        outputs = (present, attn_scores)
 
-    if self.config.new_decoder_architecture or self.config.parallel_attn:
-        mlp_output += attention_output
+        hidden_states = self.mlp.pre_mlp_forward(mlp_layernorm_out)  # MLP, split into pre / all-reduce / post
+        self.mlp.mlp_all_reduce(hidden_states)
+        hidden_states = self.mlp.post_mlp_forward(hidden_states)
 
-    output = dropout_add(mlp_output, residual, self.config.hidden_dropout, training=self.training)
+        if self.config.new_decoder_architecture or self.config.parallel_attn:
+            hidden_states += attention_output  # in-place add of the attention branch
 
-    if use_cache:
-        outputs = (output,) + outputs
-    else:
-        outputs = (output,) + outputs[1:]
+        output = dropout_add(hidden_states, residual, self.config.hidden_dropout, training=self.training)
+
+        if use_cache:
+            outputs = (output,) + outputs
+        else:
+            outputs = (output,) + outputs[1:]  # drop `present` when not caching
+
+        return outputs  # hidden_states, present, attentions
+
+    def pre_attn(  # layernorm(s) + self-attention; returns the attention outputs and both layernorm outputs
+        self,
+        hidden_states: torch.Tensor,
+        alibi: Optional[torch.Tensor],
+        attention_mask: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        layer_past: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+        output_attentions: bool = False,
+        token_idx: Optional[torch.Tensor] = None,
+        reuse_cache: Optional[bool] = False,
+        cache_idx: int = None,
+        **kwargs,
+    ):
+        if self.config.new_decoder_architecture:
+            attention_layernorm_out = self.ln_attn(hidden_states)
+            mlp_layernorm_out = self.ln_mlp(hidden_states)
+        else:
+            attention_layernorm_out = self.input_layernorm(hidden_states)
+            mlp_layernorm_out = None  # filled in by forward() for the old architecture
 
-    return outputs  # hidden_states, present, attentions
+        attn_outputs, present, attn_scores = self.self_attention.pre_attn_forward(  # self attention
+            attention_layernorm_out,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            alibi=alibi,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            token_idx=token_idx,
+            reuse_cache=reuse_cache,
+            cache_idx=cache_idx,
+            **kwargs,  # forward() passes its extra kwargs here; without this parameter that call raised TypeError
+        )
+        return attn_outputs, present, attn_scores, attention_layernorm_out, mlp_layernorm_out
 
 
 class GaudiFalconModel(FalconModel):
@@ -370,11 +638,17 @@ class GaudiFalconModel(FalconModel):
     The only differences are:
     - add new args token_idx and position_ids
     - add token_idx and position_ids into decoder inputs
-    - set past_key_values_length=0 when token_idx is used (with static input shape)
-    - add new arg tgt_len to _expand_mask because past_key_values_length is no longer valid with token_idx
-    - use old version of _make_causal_mask to workaround toch.triu that is not supported in Synapse
+    - add new arg reuse_cache
     """
 
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
+        for layer in self.h:  # delegate to every layer in self.h with the same arguments
+            layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)
+
+    def update_sincos_cache(self, seq_len):
+        for layer in self.h:  # delegate to every layer in self.h with the same seq_len
+            layer.update_sincos_cache(seq_len)
+
     def forward(
         self,
         input_ids: Optional[torch.LongTensor] = None,
@@ -388,6 +662,8 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         token_idx: Optional[torch.Tensor] = None,
+        reuse_cache: Optional[bool] = False,
+        cache_idx: int = None,
     ) -> Union[Tuple[torch.Tensor, ...], BaseModelOutputWithPastAndCrossAttentions]:
         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
         output_hidden_states = (
@@ -426,7 +702,10 @@ def forward(
         # Compute alibi tensor: check build_alibi_tensor documentation
         past_key_values_length = 0
         if past_key_values[0] is not None and token_idx is None:
-            past_key_values_length = past_key_values[0][0].shape[-2]
+            if reuse_cache:
+                past_key_values_length = past_key_values[0][0][-2]
+            else:
+                past_key_values_length = past_key_values[0][0].shape[-2]
 
         if self.use_alibi:
             mask = (
@@ -489,6 +768,7 @@ def forward(
                 attention_mask = _gaudi_prepare_4d_causal_attention_mask(
                     attention_mask, (batch_size, seq_length), inputs_embeds, past_key_values_length
                 )
+
         else:
             # 4d mask is passed through the layers
             attention_mask = _gaudi_prepare_4d_causal_attention_mask(
@@ -501,6 +781,7 @@ def forward(
         # head_mask has shape n_layer x batch x num_heads x N x N
         head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
 
+        htcore.mark_step()
         for i, (block, layer_past) in enumerate(zip(self.h, past_key_values)):
             if output_hidden_states:
                 all_hidden_states = all_hidden_states + (hidden_states,)
@@ -529,6 +810,8 @@ def forward(
                     output_attentions=output_attentions,
                     alibi=alibi,
                     token_idx=token_idx,
+                    reuse_cache=reuse_cache,
+                    cache_idx=cache_idx,
                 )
 
             hidden_states = outputs[0]
@@ -563,8 +846,16 @@ class GaudiFalconForCausalLM(FalconForCausalLM):
     - add token_idx and position_ids into model inputs
     - from step2 when enable KV cache, slice next_input_ids from input_ids base on the token_idx
     - from step2 when enable KV cache, slice next_position_ids from position_ids base on the token_idx
+    - add new args reuse_cache
     """
 
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
+        self.transformer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)  # delegate to the wrapped transformer
+        self.kv_cache_len = max_seq_len  # remember the max_seq_len that was passed for this allocation
+
+    def update_sincos_cache(self, seq_len):
+        self.transformer.update_sincos_cache(seq_len)  # delegate to the wrapped transformer
+
     def prepare_inputs_for_generation(
         self,
         input_ids: torch.LongTensor,
@@ -574,6 +865,7 @@ def prepare_inputs_for_generation(
         token_idx: Optional[torch.Tensor] = None,
         **kwargs,
     ) -> dict:
+        reuse_cache = kwargs.get("reuse_cache")
         if past_key_values is not None:
             if token_idx is not None:
                 input_ids = torch.index_select(input_ids, 1, token_idx - 1)
@@ -588,6 +880,10 @@ def prepare_inputs_for_generation(
                     remove_prefix_length = input_ids.shape[1] - 1
 
                 input_ids = input_ids[:, remove_prefix_length:]
+        elif reuse_cache and token_idx is not None:
+            # With reuse_cache, KV cache is pre allocated hence for the 1st token we can slice the inputs till token idx for the fwd pass
+            input_ids = input_ids[:, :token_idx]
+            attention_mask = attention_mask[:, :token_idx]
 
         # Note: versions of Falcon with alibi do not use position_ids. It is used with RoPE.
         if (
@@ -612,6 +908,8 @@ def prepare_inputs_for_generation(
             "use_cache": kwargs.get("use_cache"),
             "attention_mask": attention_mask,
             "token_idx": token_idx,
+            "reuse_cache": reuse_cache,
+            "cache_idx": kwargs.get("cache_idx"),
         }
 
     def forward(
@@ -628,6 +926,9 @@ def forward(
         output_hidden_states: Optional[bool] = None,
         return_dict: Optional[bool] = None,
         token_idx: Optional[torch.Tensor] = None,
+        reuse_cache: Optional[bool] = False,
+        trim_logits: Optional[bool] = False,
+        cache_idx: int = None,
     ) -> Union[Tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
         r"""
         labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
@@ -649,9 +950,18 @@ def forward(
             output_hidden_states=output_hidden_states,
             return_dict=return_dict,
             token_idx=token_idx,
+            reuse_cache=reuse_cache,
+            cache_idx=cache_idx,
         )
         hidden_states = transformer_outputs[0]
 
+        _, seq_len, _ = hidden_states.shape
+        if seq_len > 1 and trim_logits and not self.training:
+            if token_idx is not None:
+                hidden_states = hidden_states.index_select(1, token_idx - 1)
+            else:
+                hidden_states = hidden_states[:, -1:, :]
+
         lm_logits = self.lm_head(hidden_states)
 
         loss = None
diff --git a/optimum/habana/transformers/models/llama/modeling_llama.py b/optimum/habana/transformers/models/llama/modeling_llama.py
index 72e4b0fa55..4d0f3513d7 100755
--- a/optimum/habana/transformers/models/llama/modeling_llama.py
+++ b/optimum/habana/transformers/models/llama/modeling_llama.py
@@ -33,9 +33,11 @@
 
 try:
     from habana_frameworks.torch.hpex.normalization import FusedRMSNorm as FusedRMSNorm
+
+    has_fused_rms_norm = True
 except ImportError:
+    has_fused_rms_norm = False
     print("Not using HPU fused kernel for RMSNorm")
-    FusedRMSNorm = None
 
 try:
     from habana_frameworks.torch.hpex.kernels import FusedSDPA
@@ -44,32 +46,13 @@
     FusedSDPA = None
 
 
-def update(prev, cur, dim, idx, inp_seq_len):
-    orig_cur = cur
-    if prev.dtype == torch.float8_e4m3fn:
-        from habana_frameworks.torch.hpex.kernels.Fp8Ops import cast_to_fp8_v2
-
-        cur = cast_to_fp8_v2(cur, None, False, False, prev.dtype)[0]
-    if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]:
-        # Initialize
-        prev[:, :, :inp_seq_len, :].copy_(cur)
-        return orig_cur
-    assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}"
-    if idx is not None:
-        prev.index_copy_(dim, idx - 1, cur)
-        prev_cast = prev.to(orig_cur.dtype)
-        return prev_cast
-    else:
-        return torch.cat((prev, cur), dim=dim)
-
-
 def gaudi_llama_rmsnorm_forward(self, hidden_states):
     """
     Copied from LlamaRMSNorm.forward: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
     The only differences are:
         - override RMSNorm with Habana fused RMSNorm
     """
-    if hidden_states.device.type == "hpu" and FusedRMSNorm:
+    if hidden_states.device.type == "hpu" and has_fused_rms_norm:
         # mixed dtypes are not good for FusedRMSNorm, both inputs need to have same dtype
         if hidden_states.dtype != self.weight.dtype:
             orig_dtype = hidden_states.dtype
@@ -169,11 +152,9 @@ def __init__(self):
         self.cache = None
         self.inp_seq_len = -1
 
-    def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape):
+    def allocate(self, inp_seq_len, dtype, device, shape):
         if self.cache is None or self.cache.shape != shape:
             self.inp_seq_len = inp_seq_len
-            if kv_cache_fp8:
-                dtype = torch.float8_e4m3fn
             self.cache = torch.zeros(shape, dtype=dtype, device=device)
         else:
             assert (
@@ -181,13 +162,29 @@ def allocate(self, inp_seq_len, kv_cache_fp8, dtype, device, shape):
             ), f"inp_seq_len must be the same. self.inp_seq_len:{self.inp_seq_len} inp_seq_len:{inp_seq_len}"
             self.cache.fill_(0)
 
+    def update(self, prev, cur, dim, idx, inp_seq_len):  # merge `cur` into cache tensor `prev`; returns the tensor to use
+        orig_cur = cur  # alias of the incoming tensor, returned as-is on the two copy paths below
+        if prev.shape == cur.shape:  # identical shapes: overwrite the whole buffer in place
+            prev.copy_(cur)
+            return orig_cur
+        if cur.shape[2] > 1 and cur.shape[2] <= prev.shape[2]:  # multi-position input that fits inside the buffer
+            # Initialize: copy `cur` into the first `inp_seq_len` positions along dim 2
+            prev[:, :, :inp_seq_len, :].copy_(cur)
+            return orig_cur
+        assert cur.shape[2] == 1, f"Cannot update kv-cache. Unsupported shapes. prev:{prev.shape} cur:{cur.shape}"
+        if idx is not None:
+            prev.index_copy_(dim, idx - 1, cur)  # in-place write of the single new entry at index idx - 1 along `dim`
+            return prev
+        else:
+            return torch.cat((prev, cur), dim=dim)  # no index given: return a new tensor with `cur` appended along `dim`
+
     def get_shape(self):
         if self.cache is None:
             return None
         return self.cache.shape
 
     def forward(self, cur, dim, idx):
-        return update(self.cache, cur, dim, idx, self.inp_seq_len)
+        return self.update(self.cache, cur, dim, idx, self.inp_seq_len)
 
 
 class GaudiLlamaRotaryEmbedding(torch.nn.Module):
@@ -271,12 +268,12 @@ def __init__(self, config: LlamaConfig, layer_idx: Optional[int] = None):
         self.inp_seq_len = -1
         self.norm_factor = 1.0 / math.sqrt(self.head_dim)
 
-    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8):
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
         cache_shape = (batch_size, self.num_key_value_heads, max_seq_len, self.head_dim)
         device = self.k_proj.weight.device
         dtype = self.config.torch_dtype
-        self.k_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape)
-        self.v_cache.allocate(inp_seq_len, kv_cache_fp8, dtype, device, cache_shape)
+        self.k_cache.allocate(inp_seq_len, dtype, device, cache_shape)
+        self.v_cache.allocate(inp_seq_len, dtype, device, cache_shape)
 
     def update_sincos_cache(self, seq_len):
         # Call rotary emb forward() to update cos/sin cache when infering more than self.max_position_embeddings
@@ -371,14 +368,23 @@ def pre_attn_forward(
         cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
         query_states, key_states = apply_customized_rope(query_states, key_states, cos, sin, position_ids)
 
-        if past_key_value is not None or reuse_cache:
+        if use_cache:
             # reuse k, v, self_attention
             if reuse_cache:
                 key_states = self.k_cache(key_states, 2, token_idx)
                 value_states = self.v_cache(value_states, 2, token_idx)
+                past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape())
             else:
-                key_states = update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len)
-                value_states = update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len)
+                if past_key_value is None:
+                    past_key = torch.zeros(key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device)
+                    past_value = torch.zeros(
+                        key_states.shape, dtype=self.k_proj.weight.dtype, device=key_states.device
+                    )
+                    past_key_value = (past_key, past_value)
+                key_states = self.k_cache.update(past_key_value[0], key_states, 2, token_idx, self.inp_seq_len)
+                value_states = self.v_cache.update(past_key_value[1], value_states, 2, token_idx, self.inp_seq_len)
+                if token_idx is None:
+                    past_key_value = (key_states, value_states)
 
             if cache_idx is not None and q_len == 1:
                 key_states = key_states[:, :, :cache_idx, :]
@@ -386,12 +392,6 @@ def pre_attn_forward(
                 if attention_mask is not None:
                     attention_mask = attention_mask[:, :, :, :cache_idx]
                 kv_seq_len = key_states.shape[-2]
-
-        if use_cache:
-            if reuse_cache:
-                past_key_value = (self.k_cache.get_shape(), self.v_cache.get_shape())
-            else:
-                past_key_value = (key_states.contiguous(), value_states.contiguous())
         else:
             past_key_value = None
 
@@ -473,8 +473,8 @@ def __init__(self, config: LlamaConfig, layer_idx: int):
         self.input_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
         self.post_attention_layernorm = LlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
 
-    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8):
-        self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8)
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
+        self.self_attn.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)
 
     def reorder_kv_cache(self, beam_idx: torch.LongTensor):
         return self.self_attn.reorder_kv_cache(beam_idx)
@@ -629,9 +629,9 @@ def __init__(self, config: LlamaConfig):
         # Initialize weights and apply final processing
         self.post_init()
 
-    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8):
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
         for layer in self.layers:
-            layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8)
+            layer.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)
 
     def reorder_kv_cache(self, beam_idx: torch.LongTensor):
         return tuple(layer.reorder_kv_cache(beam_idx) for layer in self.layers)
@@ -820,9 +820,8 @@ class GaudiLlamaForCausalLM(LlamaForCausalLM):
     - add new args reuse_cache
     """
 
-    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len, kv_cache_fp8):
-        self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len, kv_cache_fp8)
-        self.kv_cache_len = max_seq_len
+    def allocate_kv_cache(self, batch_size, max_seq_len, inp_seq_len):
+        self.model.allocate_kv_cache(batch_size, max_seq_len, inp_seq_len)
 
     def reorder_kv_cache(self, beam_idx: torch.LongTensor):
         return self.model.reorder_kv_cache(beam_idx)
diff --git a/optimum/habana/transformers/models/wav2vec2/__init__.py b/optimum/habana/transformers/models/wav2vec2/__init__.py
index 3a5bae22b8..84372061b6 100644
--- a/optimum/habana/transformers/models/wav2vec2/__init__.py
+++ b/optimum/habana/transformers/models/wav2vec2/__init__.py
@@ -5,4 +5,5 @@
     gaudi_wav2vec2_encoder_forward,
     gaudi_wav2vec2_forward,
     gaudi_wav2vec2_tdnnlayer_forward,
+    gaudi_wav2vec2forctc_forward,
 )
diff --git a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
index 983c5b5375..c6dd9cb546 100644
--- a/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
+++ b/optimum/habana/transformers/models/wav2vec2/modeling_wav2vec2.py
@@ -17,11 +17,23 @@
 from typing import Optional, Tuple, Union
 
 import torch
+from habana_frameworks.torch.hpu import get_device_name
 from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
 from transformers.modeling_outputs import (
     BaseModelOutput,
+    CausalLMOutput,
     Wav2Vec2BaseModelOutput,
 )
+from transformers.models.wav2vec2.modeling_wav2vec2 import _HIDDEN_STATES_START_POSITION
+
+
+try:
+    from habana_frameworks.torch.hpex.kernels import CTCLoss
+
+    custom_ctc_loss_fwd = CTCLoss.apply
+except ImportError:
+    print("Could not import Custom CTCLoss kernel. This Kernel is available only for SynapseAI >= 1.15.0")
+    custom_ctc_loss_fwd = None
 
 
 def _gaudi_wav2vec2_compute_mask_indices(
@@ -33,7 +45,8 @@ def _gaudi_wav2vec2_compute_mask_indices(
 ) -> torch.Tensor:
     """
     Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L135
-    The only difference is that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers).
+    The only differences are (1) that the processing is performed with PyTorch on HPUs (Numpy is used in Transformers), (2) epsilon is generated on HPU instead of CPU, (3) check
+    to ensure indices are not larger than sequence length is re-written to avoid host sync.
     """
     batch_size, sequence_length = shape
 
@@ -122,8 +135,13 @@ def compute_num_masked_span(input_length):
     spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
 
     # ensure that we cannot have indices larger than sequence_length
-    if spec_aug_mask_idxs.max() > sequence_length - 1:
-        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+    if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None:
+        if spec_aug_mask_idxs.max() > sequence_length - 1:
+            spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+    else:
+        mask = (spec_aug_mask_idxs > sequence_length - 1) * (spec_aug_mask_idxs.max() > sequence_length - 1)
+        inverse_mask = torch.bitwise_not(mask)
+        spec_aug_mask_idxs = spec_aug_mask_idxs * inverse_mask + (sequence_length - 1) * mask
 
     # scatter indices to mask
     spec_aug_mask.scatter_(-1, spec_aug_mask_idxs, 1)
@@ -172,6 +190,63 @@ def _gaudi_wav2vec2_sample_negative_indices(
     return sampled_negative_indices
 
 
+def gaudi_wav2vec2_forward(
+    self,
+    input_values: Optional[torch.Tensor],
+    attention_mask: Optional[torch.Tensor] = None,
+    mask_time_indices: Optional[torch.FloatTensor] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
+    """
+    Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282
+    The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error.
+    """
+    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+    output_hidden_states = (
+        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+    )
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+    extract_features = self.feature_extractor(input_values)
+    extract_features = extract_features.transpose(1, 2)  # swap dims 1 and 2 before the feature projection
+
+    if attention_mask is not None:
+        # compute reduced attention_mask corresponding to feature vectors
+        attention_mask = self._get_feature_vector_attention_mask(
+            extract_features.shape[1], attention_mask, add_adapter=False
+        )
+
+    hidden_states, extract_features = self.feature_projection(extract_features)
+    hidden_states = self._mask_hidden_states(
+        hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask  # clone: see docstring
+    )
+
+    encoder_outputs = self.encoder(
+        hidden_states,
+        attention_mask=attention_mask,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+
+    hidden_states = encoder_outputs[0]  # first encoder output; becomes last_hidden_state below
+
+    if self.adapter is not None:
+        hidden_states = self.adapter(hidden_states)
+
+    if not return_dict:  # tuple output: (hidden_states, extract_features, *remaining encoder outputs)
+        return (hidden_states, extract_features) + encoder_outputs[1:]
+
+    return Wav2Vec2BaseModelOutput(
+        last_hidden_state=hidden_states,
+        extract_features=extract_features,
+        hidden_states=encoder_outputs.hidden_states,
+        attentions=encoder_outputs.attentions,
+    )
+
+
 def _gaudi_wav2vec2_mask_hidden_states(
     self,
     hidden_states: torch.FloatTensor,
@@ -300,63 +375,6 @@ def gaudi_wav2vec2_encoder_forward(
     )
 
 
-def gaudi_wav2vec2_forward(
-    self,
-    input_values: Optional[torch.Tensor],
-    attention_mask: Optional[torch.Tensor] = None,
-    mask_time_indices: Optional[torch.FloatTensor] = None,
-    output_attentions: Optional[bool] = None,
-    output_hidden_states: Optional[bool] = None,
-    return_dict: Optional[bool] = None,
-) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
-    """
-    Copied from Transformers: https://github.com/huggingface/transformers/blob/bd469c40659ce76c81f69c7726759d249b4aef49/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1282
-    The only difference is that a clone of `hidden_states` is given to _mask_hidden_states to avoid an error.
-    """
-    output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
-    output_hidden_states = (
-        output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
-    )
-    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-    extract_features = self.feature_extractor(input_values)
-    extract_features = extract_features.transpose(1, 2)
-
-    if attention_mask is not None:
-        # compute reduced attention_mask corresponding to feature vectors
-        attention_mask = self._get_feature_vector_attention_mask(
-            extract_features.shape[1], attention_mask, add_adapter=False
-        )
-
-    hidden_states, extract_features = self.feature_projection(extract_features)
-    hidden_states = self._mask_hidden_states(
-        hidden_states.clone(), mask_time_indices=mask_time_indices, attention_mask=attention_mask
-    )
-
-    encoder_outputs = self.encoder(
-        hidden_states,
-        attention_mask=attention_mask,
-        output_attentions=output_attentions,
-        output_hidden_states=output_hidden_states,
-        return_dict=return_dict,
-    )
-
-    hidden_states = encoder_outputs[0]
-
-    if self.adapter is not None:
-        hidden_states = self.adapter(hidden_states)
-
-    if not return_dict:
-        return (hidden_states, extract_features) + encoder_outputs[1:]
-
-    return Wav2Vec2BaseModelOutput(
-        last_hidden_state=hidden_states,
-        extract_features=extract_features,
-        hidden_states=encoder_outputs.hidden_states,
-        attentions=encoder_outputs.attentions,
-    )
-
-
 def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
     """
     Copied from Transformers: https://github.com/huggingface/transformers/blob/v4.37.2/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L2290
@@ -374,3 +392,74 @@ def gaudi_wav2vec2_tdnnlayer_forward(self, hidden_states: torch.Tensor) -> torch
 
     hidden_states = self.activation(hidden_states)
     return hidden_states
+
+
+def gaudi_wav2vec2forctc_forward(
+    self,
+    input_values: Optional[torch.Tensor],
+    attention_mask: Optional[torch.Tensor] = None,
+    output_attentions: Optional[bool] = None,
+    output_hidden_states: Optional[bool] = None,
+    return_dict: Optional[bool] = None,
+    labels: Optional[torch.Tensor] = None,
+) -> Union[Tuple, CausalLMOutput]:
+    """
+    Copied from Transformers: https://github.com/huggingface/transformers/blob/e770f0316d2a9b787c9d1440f204fcb65e176682/src/transformers/models/wav2vec2/modeling_wav2vec2.py#L1950
+    The only differences are (1) the default attention_mask built with ones_like is created on HPU, (2) on the custom CTCLoss path masked_select is not applied to labels to
+    compute flattened_targets, which avoids changing the flattened_targets tensor shape across training iterations.
+    """
+    return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+    outputs = self.wav2vec2(
+        input_values,
+        attention_mask=attention_mask,
+        output_attentions=output_attentions,
+        output_hidden_states=output_hidden_states,
+        return_dict=return_dict,
+    )
+    hidden_states = outputs[0]
+    hidden_states = self.dropout(hidden_states)
+    logits = self.lm_head(hidden_states)
+    loss = None  # stays None when no labels are given; the loss is then left out of the tuple output
+    if labels is not None:
+        if labels.max() >= self.config.vocab_size:  # a label equal to vocab_size is already rejected
+            raise ValueError(f"Label values must be < vocab_size: {self.config.vocab_size}")
+        # retrieve loss input_lengths from attention_mask
+        attention_mask = (
+            attention_mask
+            if attention_mask is not None
+            else torch.ones_like(input_values, dtype=torch.long, device="hpu")
+        )
+        input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+        # assuming that padded tokens are filled with -100
+        # when not being attended to
+        labels_mask = labels >= 0
+        target_lengths = labels_mask.sum(-1)
+        # ctc_loss doesn't support fp16
+        log_probs = torch.nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+        if get_device_name() == "GAUDI" or custom_ctc_loss_fwd is None:  # fall back to the stock PyTorch CTC loss
+            flattened_targets = labels.masked_select(labels_mask)
+            loss = torch.nn.functional.ctc_loss(
+                log_probs,
+                flattened_targets,
+                input_lengths,
+                target_lengths,
+                blank=self.config.pad_token_id,
+                reduction=self.config.ctc_loss_reduction,
+                zero_infinity=self.config.ctc_zero_infinity,
+            )
+        else:
+            flattened_targets = labels  # labels passed unmasked so the target shape stays the same (see docstring)
+            loss = custom_ctc_loss_fwd(
+                log_probs,
+                flattened_targets,
+                input_lengths,
+                target_lengths,
+                self.config.pad_token_id,
+                self.config.ctc_loss_reduction,
+                self.config.ctc_zero_infinity,
+            )
+
+    if not return_dict:  # tuple output: optional loss, then logits, then the remaining wav2vec2 outputs
+        output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+        return ((loss,) + output) if loss is not None else output
+    return CausalLMOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)
diff --git a/optimum/habana/utils.py b/optimum/habana/utils.py
index a1707c5602..306e619471 100644
--- a/optimum/habana/utils.py
+++ b/optimum/habana/utils.py
@@ -31,7 +31,7 @@
 logger = logging.get_logger(__name__)
 
 
-CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.14.0")
+CURRENTLY_VALIDATED_SYNAPSE_VERSION = version.parse("1.15.0")
 
 
 def to_device_dtype(my_input: Any, target_device: torch.device = None, target_dtype: torch.dtype = None):
diff --git a/tests/baselines/albert_large_v2.json b/tests/baselines/albert_large_v2.json
index 62c685b473..2f13722a95 100644
--- a/tests/baselines/albert_large_v2.json
+++ b/tests/baselines/albert_large_v2.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 6e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 92.0109,
-                    "train_runtime": 3246.7928,
-                    "train_samples_per_second": 55.517,
+                    "eval_f1": 91.8679,
+                    "train_runtime": 2900.5518,
+                    "train_samples_per_second": 62.298,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -18,9 +18,9 @@
                 "multi_card": {
                     "learning_rate": 6e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 92.8155,
-                    "train_runtime": 497.1048,
-                    "train_samples_per_second": 449.321,
+                    "eval_f1": 92.7647,
+                    "train_runtime": 464.9893,
+                    "train_samples_per_second": 494.936,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -37,9 +37,9 @@
                 "single_card": {
                     "learning_rate": 6e-5,
                     "train_batch_size": 128,
-                    "eval_f1": 92.6585,
-                    "train_runtime": 659.795,
-                    "train_samples_per_second": 277.916,
+                    "eval_f1": 92.4235,
+                    "train_runtime": 571.138,
+                    "train_samples_per_second": 321.635,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -48,9 +48,9 @@
                 "multi_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 128,
-                    "eval_f1": 91.9053,
-                    "train_runtime": 126.0638,
-                    "train_samples_per_second": 2271.729,
+                    "eval_f1": 92.2111,
+                    "train_runtime": 115.15,
+                    "train_samples_per_second": 2464.403,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
diff --git a/tests/baselines/albert_xxlarge_v1.json b/tests/baselines/albert_xxlarge_v1.json
index 511344bf52..8efe5d729d 100644
--- a/tests/baselines/albert_xxlarge_v1.json
+++ b/tests/baselines/albert_xxlarge_v1.json
@@ -18,9 +18,9 @@
                 "multi_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 12,
-                    "eval_f1": 95.1629,
-                    "train_runtime": 1308.2465,
-                    "train_samples_per_second": 75.506,
+                    "eval_f1": 95.1221,
+                    "train_runtime": 1312.9496,
+                    "train_samples_per_second": 75.51,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -48,9 +48,9 @@
                 "multi_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 16,
-                    "eval_f1": 95.0743,
-                    "train_runtime": 218.7903,
-                    "train_samples_per_second": 442.758,
+                    "eval_f1": 95.1227,
+                    "train_runtime": 221.2125,
+                    "train_samples_per_second": 439.114,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
diff --git a/tests/baselines/bert_large_uncased_whole_word_masking.json b/tests/baselines/bert_large_uncased_whole_word_masking.json
index 62ea2558b7..d153328e4a 100644
--- a/tests/baselines/bert_large_uncased_whole_word_masking.json
+++ b/tests/baselines/bert_large_uncased_whole_word_masking.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 24,
-                    "eval_f1": 93.2812,
-                    "train_runtime": 1719.9389,
-                    "train_samples_per_second": 52.696,
+                    "eval_f1": 93.1962,
+                    "train_runtime": 1678.3456,
+                    "train_samples_per_second": 54.101,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -18,9 +18,9 @@
                 "multi_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 24,
-                    "eval_f1": 93.2092,
-                    "train_runtime": 306.8871,
-                    "train_samples_per_second": 397.041,
+                    "eval_f1": 93.1869,
+                    "train_runtime": 309.9553,
+                    "train_samples_per_second": 398.459,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -35,9 +35,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 0.8968,
-                    "train_runtime": 88.7004,
-                    "train_samples_per_second": 171.252,
+                    "eval_f1": 0.9022,
+                    "train_runtime": 90.3943,
+                    "train_samples_per_second": 172.792,
                     "extra_arguments": [
                         "--max_seq_length 128",
                         "--use_hpu_graphs_for_inference"
@@ -46,9 +46,9 @@
                 "multi_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 16,
-                    "eval_f1": 0.8885,
-                    "train_runtime": 61.8181,
-                    "train_samples_per_second": 893.265,
+                    "eval_f1": 0.8897,
+                    "train_runtime": 64.4986,
+                    "train_samples_per_second": 968.596,
                     "extra_arguments": [
                         "--max_seq_length 128",
                         "--use_hpu_graphs_for_inference"
@@ -65,9 +65,9 @@
                 "single_card": {
                     "learning_rate": 4e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 93.3512,
-                    "train_runtime": 323.3053,
-                    "train_samples_per_second": 287.096,
+                    "eval_f1": 93.2753,
+                    "train_runtime": 309.9491,
+                    "train_samples_per_second": 302.089,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -76,9 +76,9 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 92.9464,
-                    "train_runtime": 77.4588,
-                    "train_samples_per_second": 2178.613,
+                    "eval_f1": 93.0981,
+                    "train_runtime": 78.387,
+                    "train_samples_per_second": 2300.127,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -93,9 +93,9 @@
                 "single_card": {
                     "learning_rate": 9e-5,
                     "train_batch_size": 256,
-                    "eval_f1": 0.9027,
-                    "train_runtime": 29.8624,
-                    "train_samples_per_second": 1161.008,
+                    "eval_f1": 0.8998,
+                    "train_runtime": 33.2909,
+                    "train_samples_per_second": 1151.598,
                     "extra_arguments": [
                         "--max_seq_length 128",
                         "--use_hpu_graphs_for_inference"
@@ -104,9 +104,9 @@
                 "multi_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 40,
-                    "eval_f1": 0.8601,
-                    "train_runtime": 38.35,
-                    "train_samples_per_second": 2895.6,
+                    "eval_f1": 0.8758,
+                    "train_runtime": 41.4282,
+                    "train_samples_per_second": 2771.405,
                     "extra_arguments": [
                         "--max_seq_length 128",
                         "--use_hpu_graphs_for_inference"
diff --git a/tests/baselines/bridgetower_large_itm_mlm_itc.json b/tests/baselines/bridgetower_large_itm_mlm_itc.json
index c81f437c70..e188228256 100644
--- a/tests/baselines/bridgetower_large_itm_mlm_itc.json
+++ b/tests/baselines/bridgetower_large_itm_mlm_itc.json
@@ -7,8 +7,8 @@
                 "multi_card": {
                     "learning_rate": 1e-5,
                     "train_batch_size": 48,
-                    "train_runtime": 300.6945,
-                    "train_samples_per_second": 930.245,
+                    "train_runtime": 314.5877,
+                    "train_samples_per_second": 918.387,
                     "extra_arguments": [
                         "--dataset_config_name matching",
                         "--dataset_revision 3c6c4f6c0ff7e902833d3afa5f8f3875c2b036e6",
diff --git a/tests/baselines/clip_roberta.json b/tests/baselines/clip_roberta.json
index 50105645f1..b95d98c016 100644
--- a/tests/baselines/clip_roberta.json
+++ b/tests/baselines/clip_roberta.json
@@ -7,8 +7,8 @@
                 "multi_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 64,
-                    "train_runtime": 304.18,
-                    "train_samples_per_second": 2582.676,
+                    "train_runtime": 314.7726,
+                    "train_samples_per_second": 2560.999,
                     "extra_arguments": [
                         "--data_dir $PWD/",
                         "--dataset_config_name 2017",
diff --git a/tests/baselines/distilbert_base_uncased.json b/tests/baselines/distilbert_base_uncased.json
index e9bd14dafd..a85474a073 100644
--- a/tests/baselines/distilbert_base_uncased.json
+++ b/tests/baselines/distilbert_base_uncased.json
@@ -7,20 +7,20 @@
                 "single_card": {
                     "learning_rate": 1e-4,
                     "train_batch_size": 48,
-                    "eval_f1": 84.7137,
-                    "train_runtime": 271.2751,
-                    "train_samples_per_second": 334.792,
+                    "eval_f1": 84.5384,
+                    "train_runtime": 264.3669,
+                    "train_samples_per_second": 344.126,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
                     ]
                 },
                 "multi_card": {
-                    "learning_rate": 3e-4,
+                    "learning_rate": 4e-4,
                     "train_batch_size": 48,
-                    "eval_f1": 82.8831,
-                    "train_runtime": 54.0269,
-                    "train_samples_per_second": 2500.721,
+                    "eval_f1": 83.0667,
+                    "train_runtime": 54.5344,
+                    "train_samples_per_second": 2503.657,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -37,9 +37,9 @@
                 "single_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 64,
-                    "eval_f1": 84.87642669075069,
-                    "train_runtime": 131.655,
-                    "train_samples_per_second": 1377.209,
+                    "eval_f1": 84.5418,
+                    "train_runtime": 108.8333,
+                    "train_samples_per_second": 1676.689,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -48,9 +48,9 @@
                 "multi_card": {
                     "learning_rate": 3e-4,
                     "train_batch_size": 64,
-                    "eval_f1": 83.27897440376087,
-                    "train_runtime": 25.7792,
-                    "train_samples_per_second": 9951.533,
+                    "eval_f1": 83.2233,
+                    "train_runtime": 24.0441,
+                    "train_samples_per_second": 11144.651,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
diff --git a/tests/baselines/falcon_40b.json b/tests/baselines/falcon_40b.json
index 1b2b761907..cb08dc4ed4 100644
--- a/tests/baselines/falcon_40b.json
+++ b/tests/baselines/falcon_40b.json
@@ -7,9 +7,9 @@
                 "multi_card": {
                     "learning_rate": 4e-4,
                     "train_batch_size": 1,
-                    "perplexity": 4.0596,
-                    "train_runtime": 944.9201,
-                    "train_samples_per_second": 27.045,
+                    "perplexity": 4.0893,
+                    "train_runtime": 931.1213,
+                    "train_samples_per_second": 28.162,
                     "extra_arguments": [
                         "--bf16",
                         "--gradient_accumulation_steps 16",
diff --git a/tests/baselines/flan_t5_xxl.json b/tests/baselines/flan_t5_xxl.json
index 6b3f293f8f..779bc9fd83 100644
--- a/tests/baselines/flan_t5_xxl.json
+++ b/tests/baselines/flan_t5_xxl.json
@@ -7,9 +7,9 @@
                 "deepspeed": {
                     "learning_rate": 1e-4,
                     "train_batch_size": 22,
-                    "eval_rougeLsum": 0.0,
-                    "train_runtime": 90.2563,
-                    "train_samples_per_second": 27.175,
+                    "eval_rougeLsum": 0.1429,
+                    "train_runtime": 89.486,
+                    "train_samples_per_second": 27.299,
                     "extra_arguments": [
                         "--max_steps 10",
                         "--max_eval_samples 880",
diff --git a/tests/baselines/gpt2.json b/tests/baselines/gpt2.json
index d7f6d8dca6..889bdbd3d4 100644
--- a/tests/baselines/gpt2.json
+++ b/tests/baselines/gpt2.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 4,
-                    "perplexity": 22.2641,
-                    "train_runtime": 236.4595,
-                    "train_samples_per_second": 20.24,
+                    "perplexity": 22.2751,
+                    "train_runtime": 225.2898,
+                    "train_samples_per_second": 21.308,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
@@ -19,9 +19,9 @@
                 "multi_card": {
                     "learning_rate": 4e-4,
                     "train_batch_size": 4,
-                    "perplexity": 22.2696,
-                    "train_runtime": 72.1582,
-                    "train_samples_per_second": 150.303,
+                    "perplexity": 22.2699,
+                    "train_runtime": 68.9627,
+                    "train_samples_per_second": 156.241,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
@@ -39,9 +39,9 @@
                 "single_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 16,
-                    "perplexity": 21.0687,
-                    "train_runtime": 45.091,
-                    "train_samples_per_second": 118.884,
+                    "perplexity": 21.0729,
+                    "train_runtime": 43.9361,
+                    "train_samples_per_second": 130.785,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference"
@@ -50,9 +50,9 @@
                 "multi_card": {
                     "learning_rate": 8e-4,
                     "train_batch_size": 16,
-                    "perplexity": 21.7965,
-                    "train_runtime": 18.9527,
-                    "train_samples_per_second": 847.568,
+                    "perplexity": 21.7858,
+                    "train_runtime": 23.8993,
+                    "train_samples_per_second": 939.24,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference"
diff --git a/tests/baselines/gpt2_xl.json b/tests/baselines/gpt2_xl.json
index 2a5bd96ecf..ffd92331cb 100644
--- a/tests/baselines/gpt2_xl.json
+++ b/tests/baselines/gpt2_xl.json
@@ -7,9 +7,9 @@
                 "deepspeed": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 2,
-                    "perplexity": 12.6711,
-                    "train_runtime": 380.1311,
-                    "train_samples_per_second": 16.045,
+                    "perplexity": 12.6744,
+                    "train_runtime": 366.8694,
+                    "train_samples_per_second": 16.464,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
@@ -27,9 +27,9 @@
                 "deepspeed": {
                     "learning_rate": 4e-4,
                     "train_batch_size": 16,
-                    "perplexity": 13.0563,
-                    "train_runtime": 196.3264,
-                    "train_samples_per_second": 86.855,
+                    "perplexity": 13.0461,
+                    "train_runtime": 190.696,
+                    "train_samples_per_second": 89.877,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--gradient_checkpointing",
diff --git a/tests/baselines/gpt_neox_20b.json b/tests/baselines/gpt_neox_20b.json
index 165debd4ca..61b27156bf 100644
--- a/tests/baselines/gpt_neox_20b.json
+++ b/tests/baselines/gpt_neox_20b.json
@@ -8,8 +8,8 @@
                     "learning_rate": 5e-5,
                     "train_batch_size": 2,
                     "perplexity": 8.0545,
-                    "train_runtime": 745.7237,
-                    "train_samples_per_second": 7.242,
+                    "train_runtime": 721.5428,
+                    "train_samples_per_second": 7.571,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--gradient_checkpointing",
diff --git a/tests/baselines/llama_7b.json b/tests/baselines/llama_7b.json
index 7f05fde3b7..9bff377dbe 100644
--- a/tests/baselines/llama_7b.json
+++ b/tests/baselines/llama_7b.json
@@ -7,9 +7,9 @@
                 "multi_card": {
                     "learning_rate": 1e-4,
                     "train_batch_size": 2,
-                    "perplexity": 2.7362,
-                    "train_runtime": 538.1199,
-                    "train_samples_per_second": 20.37,
+                    "perplexity": 2.7542,
+                    "train_runtime": 538.0159,
+                    "train_samples_per_second": 20.397,
                     "extra_arguments": [
                         "--bf16",
                         "--gradient_accumulation_steps 4",
@@ -32,9 +32,9 @@
                 "multi_card": {
                     "learning_rate": 3e-4,
                     "train_batch_size": 8,
-                    "perplexity": 2.3666,
-                    "train_runtime": 303.8345,
-                    "train_samples_per_second": 144.392,
+                    "perplexity": 2.3665,
+                    "train_runtime": 294.5707,
+                    "train_samples_per_second": 148.093,
                     "extra_arguments": [
                         "--bf16",
                         "--gradient_accumulation_steps 2",
@@ -68,8 +68,8 @@
                     "learning_rate": 3e-4,
                     "train_batch_size": 8,
                     "perplexity": 2.4259,
-                    "train_runtime": 199.94,
-                    "train_samples_per_second": 88.664,
+                    "train_runtime": 186.2483,
+                    "train_samples_per_second": 93.5,
                     "extra_arguments": [
                         "--bf16 True",
                         "--gradient_accumulation_steps 2",
@@ -89,7 +89,7 @@
                         "--adam_epsilon 1e-08",
                         "--ddp_bucket_cap_mb 50",
                         "--validation_split_percentage 10",
-                        "--attn_softmax_bf16 True",
+                        "--attn_softmax_bf16",
                         "--pipelining_fwd_bwd False",
                         "--fsdp auto_wrap",
                         "--torch_compile_backend hpu_backend",
@@ -100,5 +100,4 @@
             }
         }
     }
-}
-
+}
\ No newline at end of file
diff --git a/tests/baselines/roberta_base.json b/tests/baselines/roberta_base.json
index c6dc95babc..210f608d27 100644
--- a/tests/baselines/roberta_base.json
+++ b/tests/baselines/roberta_base.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 12,
-                    "eval_f1": 91.8466,
-                    "train_runtime": 610.6291,
-                    "train_samples_per_second": 147.028,
+                    "eval_f1": 91.9903,
+                    "train_runtime": 599.9343,
+                    "train_samples_per_second": 149.781,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -18,9 +18,9 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 12,
-                    "eval_f1": 91.7635,
-                    "train_runtime": 102.8332,
-                    "train_samples_per_second": 1081.823,
+                    "eval_f1": 91.624,
+                    "train_runtime": 103.5987,
+                    "train_samples_per_second": 1083.304,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -35,9 +35,9 @@
                 "multi_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 24,
-                    "perplexity": 3.5988,
-                    "train_runtime": 41.6183,
-                    "train_samples_per_second": 553.572,
+                    "perplexity": 3.6338,
+                    "train_runtime": 43.1541,
+                    "train_samples_per_second": 554.787,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
@@ -55,9 +55,9 @@
                 "single_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 64,
-                    "eval_f1": 91.5167,
-                    "train_runtime": 111.4348,
-                    "train_samples_per_second": 851.971,
+                    "eval_f1": 91.5253,
+                    "train_runtime": 105.6042,
+                    "train_samples_per_second": 907.395,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -66,9 +66,9 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 64,
-                    "eval_f1": 90.7807,
-                    "train_runtime": 31.8781,
-                    "train_samples_per_second": 6634.081,
+                    "eval_f1": 90.8766,
+                    "train_runtime": 32.2213,
+                    "train_samples_per_second": 6568.625,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -83,9 +83,9 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 32,
-                    "perplexity": 3.6515,
-                    "train_runtime": 12.0388,
-                    "train_samples_per_second": 2754.437,
+                    "perplexity": 3.6691,
+                    "train_runtime": 12.3633,
+                    "train_samples_per_second": 2758.371,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
diff --git a/tests/baselines/roberta_large.json b/tests/baselines/roberta_large.json
index 0e82fae0d8..4f1ba4c89d 100644
--- a/tests/baselines/roberta_large.json
+++ b/tests/baselines/roberta_large.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 12,
-                    "eval_f1": 94.3634,
-                    "train_runtime": 1801.8127,
-                    "train_samples_per_second": 49.895,
+                    "eval_f1": 94.2959,
+                    "train_runtime": 1771.3319,
+                    "train_samples_per_second": 50.815,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -18,9 +18,9 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 12,
-                    "eval_f1": 94.0942,
-                    "train_runtime": 299.6718,
-                    "train_samples_per_second": 364.947,
+                    "eval_f1": 94.2867,
+                    "train_runtime": 304.9084,
+                    "train_samples_per_second": 366.177,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -36,8 +36,8 @@
                     "learning_rate": 5e-5,
                     "train_batch_size": 8,
                     "perplexity": 2.7851,
-                    "train_runtime": 72.0278,
-                    "train_samples_per_second": 217.107,
+                    "train_runtime": 75.0033,
+                    "train_samples_per_second": 217.752,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
@@ -55,9 +55,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 94.5763,
-                    "train_runtime": 325.6019,
-                    "train_samples_per_second": 286.78,
+                    "eval_f1": 94.5886,
+                    "train_runtime": 314.4407,
+                    "train_samples_per_second": 300.578,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -66,9 +66,9 @@
                 "multi_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 32,
-                    "eval_f1": 94.0626,
-                    "train_runtime": 76.6936,
-                    "train_samples_per_second": 2242.639,
+                    "eval_f1": 94.4348,
+                    "train_runtime": 79.1007,
+                    "train_samples_per_second": 2280.328,
                     "extra_arguments": [
                         "--max_seq_length 384",
                         "--use_hpu_graphs_for_inference"
@@ -83,9 +83,9 @@
                 "multi_card": {
                     "learning_rate": 7e-5,
                     "train_batch_size": 16,
-                    "perplexity": 2.8312,
-                    "train_runtime": 25.2018,
-                    "train_samples_per_second": 1075.842,
+                    "perplexity": 2.829,
+                    "train_runtime": 25.6323,
+                    "train_samples_per_second": 1183.796,
                     "extra_arguments": [
                         "--dataset_config_name wikitext-2-raw-v1",
                         "--use_hpu_graphs_for_inference",
diff --git a/tests/baselines/swin_base_patch4_window7_224_in22k.json b/tests/baselines/swin_base_patch4_window7_224_in22k.json
index f8f5576d42..b6c09b6dec 100644
--- a/tests/baselines/swin_base_patch4_window7_224_in22k.json
+++ b/tests/baselines/swin_base_patch4_window7_224_in22k.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 3e-5,
                     "train_batch_size": 64,
-                    "eval_accuracy": 0.9851,
-                    "train_runtime": 249.7865,
-                    "train_samples_per_second": 203.94,
+                    "eval_accuracy": 0.9871,
+                    "train_runtime": 246.4134,
+                    "train_samples_per_second": 212.722,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -24,9 +24,9 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 64,
-                    "eval_accuracy": 0.9836,
-                    "train_runtime": 113.9324,
-                    "train_samples_per_second": 1691.705,
+                    "eval_accuracy": 0.9819,
+                    "train_runtime": 117.6424,
+                    "train_samples_per_second": 1683.344,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -49,9 +49,9 @@
                 "single_card": {
                     "learning_rate": 6e-5,
                     "train_batch_size": 160,
-                    "eval_accuracy": 0.9845,
-                    "train_runtime": 77.0917,
-                    "train_samples_per_second": 862.671,
+                    "eval_accuracy": 0.9852,
+                    "train_runtime": 73.5918,
+                    "train_samples_per_second": 957.491,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -66,9 +66,9 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 160,
-                    "eval_accuracy": 0.9824,
-                    "train_runtime": 61.0788,
-                    "train_samples_per_second": 6170.79,
+                    "eval_accuracy": 0.9821,
+                    "train_runtime": 62.9986,
+                    "train_samples_per_second": 6202.525,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
diff --git a/tests/baselines/t5_small.json b/tests/baselines/t5_small.json
index ce1dcc588b..ebbb774f89 100644
--- a/tests/baselines/t5_small.json
+++ b/tests/baselines/t5_small.json
@@ -7,10 +7,10 @@
                 "multi_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 4,
-                    "eval_rougeLsum": 38.6197,
-                    "train_runtime": 1087.1076,
-                    "train_samples_per_second": 268.231,
-                    "eval_samples_per_second": 68.222,
+                    "eval_rougeLsum": 38.5895,
+                    "train_runtime": 1089.366,
+                    "train_samples_per_second": 267.843,
+                    "eval_samples_per_second": 71.913,
                     "extra_arguments": [
                         "--dataset_config \"3.0.0\"",
                         "--source_prefix \"summarize: \"",
@@ -30,9 +30,9 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 16,
-                    "eval_f1": 64.8034,
-                    "train_runtime": 228.8655,
-                    "train_samples_per_second": 1246.527,
+                    "eval_f1": 64.8769,
+                    "train_runtime": 230.6405,
+                    "train_samples_per_second": 1235.893,
                     "extra_arguments": [
                         "--context_column context",
                         "--question_column question",
@@ -57,10 +57,10 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 32,
-                    "eval_rougeLsum": 38.5749,
-                    "train_runtime": 162.5389,
-                    "train_samples_per_second": 1870.707,
-                    "eval_samples_per_second": 78.586,
+                    "eval_rougeLsum": 38.5648,
+                    "train_runtime": 164.962,
+                    "train_samples_per_second": 1912.578,
+                    "eval_samples_per_second": 116.48,
                     "extra_arguments": [
                         "--dataset_config \"3.0.0\"",
                         "--source_prefix \"summarize: \"",
@@ -80,9 +80,9 @@
                 "multi_card": {
                     "learning_rate": 2e-3,
                     "train_batch_size": 64,
-                    "eval_f1": 66.4991,
-                    "train_runtime": 53.9037,
-                    "train_samples_per_second": 5710.614,
+                    "eval_f1": 65.7157,
+                    "train_runtime": 49.5816,
+                    "train_samples_per_second": 6353.351,
                     "extra_arguments": [
                         "--context_column context",
                         "--question_column question",
diff --git a/tests/baselines/vit_base_patch16_224_in21k.json b/tests/baselines/vit_base_patch16_224_in21k.json
index 3762a6f06c..03cd9f6131 100644
--- a/tests/baselines/vit_base_patch16_224_in21k.json
+++ b/tests/baselines/vit_base_patch16_224_in21k.json
@@ -7,9 +7,9 @@
                 "single_card": {
                     "learning_rate": 5e-5,
                     "train_batch_size": 64,
-                    "eval_accuracy": 0.9828,
-                    "train_runtime": 139.4456,
-                    "train_samples_per_second": 349.624,
+                    "eval_accuracy": 0.9812,
+                    "train_runtime": 136.9418,
+                    "train_samples_per_second": 359.584,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -23,9 +23,9 @@
                 "multi_card": {
                     "learning_rate": 2e-4,
                     "train_batch_size": 64,
-                    "eval_accuracy": 0.98,
-                    "train_runtime": 58.345,
-                    "train_samples_per_second": 2509.51,
+                    "eval_accuracy": 0.9803,
+                    "train_runtime": 59.972,
+                    "train_samples_per_second": 2508.955,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -48,9 +48,9 @@
                 "single_card": {
                     "learning_rate": 6e-5,
                     "train_batch_size": 96,
-                    "eval_accuracy": 0.9819,
-                    "train_runtime": 53.7091,
-                    "train_samples_per_second": 916.872,
+                    "eval_accuracy": 0.9813,
+                    "train_runtime": 53.4501,
+                    "train_samples_per_second": 931.955,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
@@ -64,9 +64,9 @@
                 "multi_card": {
                     "learning_rate": 5e-4,
                     "train_batch_size": 96,
-                    "eval_accuracy": 0.9811,
-                    "train_runtime": 23.1594,
-                    "train_samples_per_second": 6528.949,
+                    "eval_accuracy": 0.9775,
+                    "train_runtime": 22.8292,
+                    "train_samples_per_second": 7337.003,
                     "extra_arguments": [
                         "--remove_unused_columns False",
                         "--image_column_name img",
diff --git a/tests/baselines/wav2vec2_base.json b/tests/baselines/wav2vec2_base.json
index 2778c1c036..3927ec4a5b 100644
--- a/tests/baselines/wav2vec2_base.json
+++ b/tests/baselines/wav2vec2_base.json
@@ -7,10 +7,10 @@
                 "multi_card": {
                     "learning_rate": 5e-4,
                     "train_batch_size": 32,
-                    "eval_accuracy": 0.8045,
-                    "train_runtime": 363.7165,
-                    "train_samples_per_second": 715.004,
-                    "eval_samples_per_second": 312.719,
+                    "eval_accuracy": 0.8013,
+                    "train_runtime": 366.8081,
+                    "train_samples_per_second": 716.385,
+                    "eval_samples_per_second": 329.12,
                     "extra_arguments": [
                         "--audio_column_name audio",
                         "--label_column_name language",
@@ -35,10 +35,10 @@
                 "multi_card": {
                     "learning_rate": 5e-4,
                     "train_batch_size": 32,
-                    "eval_accuracy": 0.795,
-                    "train_runtime": 109.4142,
-                    "train_samples_per_second": 2962.248,
-                    "eval_samples_per_second": 580.266,
+                    "eval_accuracy": 0.8006,
+                    "train_runtime": 109.2047,
+                    "train_samples_per_second": 3048.207,
+                    "eval_samples_per_second": 631.601,
                     "extra_arguments": [
                         "--audio_column_name audio",
                         "--label_column_name language",
diff --git a/tests/baselines/wav2vec2_large_lv60.json b/tests/baselines/wav2vec2_large_lv60.json
index b1071302fa..d645ced656 100644
--- a/tests/baselines/wav2vec2_large_lv60.json
+++ b/tests/baselines/wav2vec2_large_lv60.json
@@ -7,10 +7,10 @@
                 "multi_card": {
                     "learning_rate": 6e-4,
                     "train_batch_size": 8,
-                    "eval_wer": 0.0555,
-                    "train_runtime": 889.0079,
-                    "train_samples_per_second": 70.036,
-                    "eval_samples_per_second": 57.302,
+                    "eval_wer": 0.0496,
+                    "train_runtime": 984.3022,
+                    "train_samples_per_second": 63.043,
+                    "eval_samples_per_second": 54.189,
                     "extra_arguments": [
                         "--dataset_config_name clean",
                         "--train_split_name train.100",
@@ -33,12 +33,12 @@
             "eval_batch_size": 8,
             "distribution": {
                 "multi_card": {
-                    "learning_rate": 3e-4,
+                    "learning_rate": 4e-4,
                     "train_batch_size": 8,
-                    "eval_wer": 0.0531535105117017,
-                    "train_runtime": 356.4723,
-                    "train_samples_per_second": 183.245,
-                    "eval_samples_per_second": 158.985,
+                    "eval_wer": 0.06120587068623562,
+                    "train_runtime": 308.8036,
+                    "train_samples_per_second": 225.572,
+                    "eval_samples_per_second": 196.665,
                     "extra_arguments": [
                         "--dataset_config_name clean",
                         "--train_split_name train.100",
@@ -49,10 +49,12 @@
                         "--layerdrop 0.0",
                         "--freeze_feature_encoder",
                         "--dataloader_num_workers 8",
-                        "--chars_to_ignore ',?.!-;:\"“%‘”'"
+                        "--chars_to_ignore ',?.!-;:\"“%‘”'",
+                        "--use_hpu_graphs_for_training",
+                        "--use_hpu_graphs_for_inference"
                     ]
                 }
             }
         }
     }
 }
diff --git a/tests/baselines/whisper_small.json b/tests/baselines/whisper_small.json
index 513aeac2fa..42a18efe79 100644
--- a/tests/baselines/whisper_small.json
+++ b/tests/baselines/whisper_small.json
@@ -7,10 +7,10 @@
                 "multi_card": {
                     "learning_rate": 1e-4,
                     "train_batch_size": 8,
-                    "eval_wer": 1.352786940708788,
-                    "train_runtime": 532.0875,
-                    "train_samples_per_second": 147.56,
-                    "eval_samples_per_second": 7.683,
+                    "eval_wer": 2.1133,
+                    "train_runtime": 551.3249,
+                    "train_samples_per_second": 145.59,
+                    "eval_samples_per_second": 6.851,
                     "extra_arguments": [
                         "--dataset_config_name hi",
                         "--language hindi",
@@ -41,10 +41,10 @@
                 "multi_card": {
                     "learning_rate": 8e-5,
                     "train_batch_size": 32,
-                    "eval_wer": 1.2335690515806987,
-                    "train_runtime": 298.2158,
-                    "train_samples_per_second": 284.875,
-                    "eval_samples_per_second": 15.679,
+                    "eval_wer": 0.8477,
+                    "train_runtime": 287.0947,
+                    "train_samples_per_second": 307.526,
+                    "eval_samples_per_second": 12.069,
                     "extra_arguments": [
                         "--dataset_config_name hi",
                         "--language hindi",
diff --git a/tests/example_diff/run_generation.txt b/tests/example_diff/run_generation.txt
index 4bc6a80807..d68c09a4c9 100644
--- a/tests/example_diff/run_generation.txt
+++ b/tests/example_diff/run_generation.txt
@@ -48,7 +48,7 @@
 < from transformers.modeling_outputs import CausalLMOutputWithPast
 ---
 > from optimum.habana.utils import get_hpu_memory_stats
-62,263d42
+62,284d42
 < MAX_LENGTH = int(10000)  # Hardcoded max length to avoid infinite loop
 < 
 < MODEL_CLASSES = {
@@ -251,7 +251,7 @@
 <             attentions=None,
 <         )
 <         return fixed_output
-265,287c44,46
+< 
 <     def __getattr__(self, item):
 <         return getattr(self._default, item)
 < 
@@ -272,7 +272,7 @@
 <         """
 <         return self._default._reorder_cache(past_key_values, beam_idx)
 < 
-< 
+286,287c44,46
 < def main():
 <     parser = argparse.ArgumentParser()
 ---
@@ -477,56 +477,58 @@
 <     parser.add_argument("--prefix", type=str, default="", help="Text added prior to input.")
 <     parser.add_argument("--padding_text", type=str, default="", help="Deprecated, the use of `--prefix` is preferred.")
 <     parser.add_argument("--xlm_language", type=str, default="", help="Optional language when used with the XLM model.")
-323d223
+323c224
 <     parser.add_argument("--seed", type=int, default=42, help="random seed for initialization")
-325c225
+---
+>     parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8")
+325c226
 <         "--use_cpu",
 ---
->         "--kv_cache_fp8",
-327c227
+>         "--use_flash_attention",
+327c228
 <         help="Whether or not to use cpu. If set to False, " "we will use gpu/npu or mps device if available",
 ---
->         help="Store kv-cache in float8 when kv-cache is used. Can't use this argument together with QUANT_CONFIG env var",
-329c229
+>         help="Whether to enable Habana Flash Attention, provided that the model supports it.",
+329d229
 <     parser.add_argument("--num_return_sequences", type=int, default=1, help="The number of samples to generate.")
----
->     parser.add_argument("--fp8", action="store_true", help="Enable Quantization to fp8")
 331c231
 <         "--fp16",
 ---
->         "--use_flash_attention",
-333c233
+>         "--torch_compile",
+333c233,246
 <         help="Whether to use 16-bit (mixed) precision (through NVIDIA apex) instead of 32-bit",
 ---
->         help="Whether to enable Habana Flash Attention, provided that the model supports it.",
-335,336c235,241
-<     parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
-<     args = parser.parse_args()
----
->     parser.add_argument(
->         "--torch_compile",
->         action="store_true",
 >         help="Whether to use torch compiled model or not.",
 >     )
 >     parser.add_argument("--temperature", default=1.0, type=float, help="Temperature value for text generation")
 >     parser.add_argument("--top_p", default=1.0, type=float, help="Top_p value for generating text via sampling")
-338,339c243
+>     parser.add_argument(
+>         "--const_serialization_path",
+>         "--csp",
+>         type=str,
+>         help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.",
+>     )
+>     parser.add_argument(
+>         "--disk_offload",
+>         action="store_true",
+>         help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.",
+335d247
+<     parser.add_argument("--jit", action="store_true", help="Whether or not to use jit trace to accelerate inference")
+338,341c250,251
 <     # Initialize the distributed state.
 <     distributed_state = PartialState(cpu=args.use_cpu)
----
->     args = parser.parse_args()
-341c245,246
+< 
 <     logger.warning(f"device: {distributed_state.device}, 16-bits inference: {args.fp16}")
 ---
 >     if args.torch_compile:
 >         args.use_hpu_graphs = False
-343,344c248,249
+343,344c253,254
 <     if args.seed is not None:
 <         set_seed(args.seed)
 ---
 >     if not args.use_hpu_graphs:
 >         args.limit_hpu_graphs = False
-346,373c251,256
+346,373c256,257
 <     # Initialize the model and tokenizer
 <     try:
 <         args.model_type = args.model_type.lower()
@@ -557,17 +559,13 @@
 <         preprocessed_prompt_text = prepare_input(args, model, tokenizer, prompt_text)
 ---
 >     args.quant_config = os.getenv("QUANT_CONFIG", "")
->     if args.quant_config and args.kv_cache_fp8:
->         # can't use both quant_config and kv_cache_fp8, since quant_config may trigger kv cache quantization
->         # with habana quantization toolkit
->         raise parser.error("Can't use QUANT_CONFIG env var with kv_cache_fp8 argument")
 >     return args
-375,378d257
+375,378d258
 <         if model.__class__.__name__ in ["TransfoXLLMHeadModel"]:
 <             tokenizer_kwargs = {"add_space_before_punct_symbol": True}
 <         else:
 <             tokenizer_kwargs = {}
-380,386c259,262
+380,386c260,263
 <         encoded_prompt = tokenizer.encode(
 <             preprocessed_prompt_text, add_special_tokens=False, return_tensors="pt", **tokenizer_kwargs
 <         )
@@ -580,7 +578,7 @@
 >     parser = argparse.ArgumentParser()
 >     args = setup_parser(parser)
 >     model, tokenizer, generation_config = initialize_model(args, logger)
-388,389c264,413
+388,389c265,414
 <     if encoded_prompt.size()[-1] == 0:
 <         input_ids = None
 ---
@@ -734,7 +732,7 @@
 >         print(f"Graph compilation duration          = {compilation_duration} seconds")
 >         print(separator)
 >         print()
-391c415,432
+391c416,433
 <         input_ids = encoded_prompt
 ---
 >         # Downloading and loading a dataset from the hub.
@@ -755,7 +753,7 @@
 >             .shuffle()
 >             .select(range(args.dataset_max_samples if args.dataset_max_samples > 0 else (raw_dataset[split]).num_rows))
 >         )
-393,399c434,441
+393,399c435,442
 <     if args.jit:
 <         jit_input_texts = ["enable jit"]
 <         jit_inputs = prepare_jit_inputs(jit_input_texts, model, tokenizer)
@@ -772,7 +770,7 @@
 >             logger.info(
 >                 f"No column name was given so automatically choosing '{column_name}' for prompts. If you would like to use another column of the dataset, you can set the argument `--column_name`."
 >             )
-401,439c443,463
+401,439c444,464
 <             sig = inspect.signature(model.__call__)
 <         jit_inputs = tuple(jit_inputs[key] for key in sig.parameters if jit_inputs.get(key, None) is not None)
 <         traced_model = torch.jit.trace(model, jit_inputs, strict=False)
@@ -834,7 +832,7 @@
 >             preprocess_function,
 >             batched=True,
 >             desc="Running tokenizer on dataset",
-440a465,545
+440a466,546
 >         # After tokenization, we can remove the column of interest
 >         raw_dataset = raw_dataset.remove_columns([column_name])
 >         raw_dataset.set_format(type="torch")
@@ -916,7 +914,7 @@
 > 
 >         throughput = total_new_tokens_generated / duration
 >         # Print Stats
-442,443c547,561
+442,443c548,566
 <         generated_sequences.append(total_sequence)
 <         print(total_sequence)
 ---
@@ -935,7 +933,11 @@
 >         print(separator)
 >     if args.quant_config:
 >         import habana_quantization_toolkit
-445c563
+> 
+>         habana_quantization_toolkit.finish_measurements(model)
+>     if args.const_serialization_path and os.path.isdir(args.const_serialization_path):
+>         import shutil
+445c568
 <     return generated_sequences
 ---
->         habana_quantization_toolkit.finish_measurements(model)
+>         shutil.rmtree(args.const_serialization_path)
diff --git a/tests/test_diffusers.py b/tests/test_diffusers.py
index c4b5e07a69..25f9b6be7a 100755
--- a/tests/test_diffusers.py
+++ b/tests/test_diffusers.py
@@ -55,7 +55,7 @@
 
 
 if os.environ.get("GAUDI2_CI", "0") == "1":
-    THROUGHPUT_BASELINE_BF16 = 1.016
+    THROUGHPUT_BASELINE_BF16 = 1.086
     THROUGHPUT_BASELINE_AUTOCAST = 0.394
     TEXTUAL_INVERSION_THROUGHPUT = 104.29806
     TEXTUAL_INVERSION_RUNTIME = 114.1344320399221
@@ -64,10 +64,10 @@
 else:
     THROUGHPUT_BASELINE_BF16 = 0.309
     THROUGHPUT_BASELINE_AUTOCAST = 0.114
-    TEXTUAL_INVERSION_THROUGHPUT = 58.17508958300077
-    TEXTUAL_INVERSION_RUNTIME = 202.94231038199996
-    CONTROLNET_THROUGHPUT = 44.412012818816905
-    CONTROLNET_RUNTIME = 1124.0202105600001
+    TEXTUAL_INVERSION_THROUGHPUT = 60.5991479573174
+    TEXTUAL_INVERSION_RUNTIME = 196.43840550999994
+    CONTROLNET_THROUGHPUT = 44.7278034963213
+    CONTROLNET_RUNTIME = 1116.084316640001
 
 
 _run_custom_bf16_ops_test_ = parse_flag_from_env("CUSTOM_BF16_OPS", default=False)
diff --git a/tests/test_encoder_decoder.py b/tests/test_encoder_decoder.py
index 532b24da12..5608679470 100644
--- a/tests/test_encoder_decoder.py
+++ b/tests/test_encoder_decoder.py
@@ -16,15 +16,15 @@
     MODELS_TO_TEST = {
         "summarization": {
             "bf16": [
-                ("facebook/bart-large-cnn", "Habana/bart", 4.691, 26.0688, 2, 1),
-                ("t5-3b", "Habana/t5", 2.88, 21.56, 2, 1),
+                ("facebook/bart-large-cnn", "Habana/bart", 5.233, 26.6928, 2, 1),
+                ("t5-3b", "Habana/t5", 2.955, 21.8877, 2, 1),
             ],
         },
         "translation": {
             "bf16": [
-                ("Babelscape/mrebel-large", "Habana/t5", 1.41, 0.162, 2, 1),
-                ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.8, 0.813, 2, 1),
-                ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.496, 1.2531, 2, 1),
+                ("Babelscape/mrebel-large", "Habana/t5", 1.323, 0.1618, 2, 1),
+                ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.815, 0.8132, 2, 1),
+                ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.401, 1.2599, 2, 1),
             ],
         },
     }
@@ -33,15 +33,15 @@
     MODELS_TO_TEST = {
         "summarization": {
             "bf16": [
-                ("facebook/bart-large-cnn", "Habana/bart", 2.574, 26.5069, 2, 1),
-                ("t5-3b", "Habana/t5", 0.987, 21.3831, 2, 1),
+                ("facebook/bart-large-cnn", "Habana/bart", 2.628, 26.7494, 2, 1),
+                ("t5-3b", "Habana/t5", 1.005, 21.7286, 2, 1),
             ],
         },
         "translation": {
             "bf16": [
-                ("Babelscape/mrebel-large", "Habana/t5", 1.015, 0.162, 2, 1),
-                ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.421, 0.7995, 2, 1),
-                ("facebook/nllb-200-distilled-600M", "Habana/t5", 1.03, 1.2531, 2, 1),
+                ("Babelscape/mrebel-large", "Habana/t5", 0.995, 0.1784, 2, 1),
+                ("Helsinki-NLP/opus-mt-zh-en", "Habana/t5", 2.409, 0.7995, 2, 1),
+                ("facebook/nllb-200-distilled-600M", "Habana/t5", 0.998, 1.2457, 2, 1),
             ],
         },
     }
diff --git a/tests/test_examples.py b/tests/test_examples.py
old mode 100644
new mode 100755
index 961f8a1ab5..5cf2559f5f
--- a/tests/test_examples.py
+++ b/tests/test_examples.py
@@ -276,9 +276,12 @@ def test(self):
                 self.assertEqual(return_code, 0)
                 return
             elif self.EXAMPLE_NAME == "run_clip":
-                from .clip_coco_utils import COCO_URLS, create_clip_roberta_model, download_files
+                if not os.environ.get("DATA_CACHE"):  # no local data cache configured: download COCO
+                    from .clip_coco_utils import COCO_URLS, download_files
+
+                    download_files(COCO_URLS)
+                from .clip_coco_utils import create_clip_roberta_model
 
-                download_files(COCO_URLS)
                 create_clip_roberta_model()
 
             self._install_requirements(example_script.parent / "requirements.txt")
@@ -322,6 +325,11 @@ def test(self):
                     env_variables["LOWER_LIST"] = str(example_script.parent / "ops_bf16.txt")
                 env_variables["PT_HPU_LAZY_MODE"] = "0"
 
+            extra_command_line_arguments = baseline.get("distribution").get(distribution).get("extra_arguments", [])
+
+            if os.environ.get("DATA_CACHE") and self.EXAMPLE_NAME == "run_clip":  # use the cached data
+                extra_command_line_arguments[0] = "--data_dir {}".format(os.environ["DATA_CACHE"])
+
             with TemporaryDirectory() as tmp_dir:
                 cmd_line = self._create_command_line(
                     multi_card,
@@ -336,9 +344,7 @@ def test(self):
                     train_batch_size=baseline.get("distribution").get(distribution).get("train_batch_size"),
                     eval_batch_size=baseline.get("eval_batch_size"),
                     num_epochs=baseline.get("num_train_epochs"),
-                    extra_command_line_arguments=baseline.get("distribution")
-                    .get(distribution)
-                    .get("extra_arguments", []),
+                    extra_command_line_arguments=extra_command_line_arguments,
                 )
 
                 p = subprocess.Popen(cmd_line, env=env_variables)
@@ -577,6 +583,7 @@ class MultiCardSpeechRecognitionExampleTester(
     ExampleTesterBase, metaclass=ExampleTestMeta, example_name="run_speech_recognition_ctc", multi_card=True
 ):
     TASK_NAME = "regisss/librispeech_asr_for_optimum_habana_ci"
+    DATASET_NAME = os.environ.get("DATA_CACHE", 0)
 
 
 class MultiCardSummarizationExampleTester(
diff --git a/tests/test_fsdp_examples.py b/tests/test_fsdp_examples.py
index 29198ae7bd..13de801832 100644
--- a/tests/test_fsdp_examples.py
+++ b/tests/test_fsdp_examples.py
@@ -10,34 +10,37 @@
 from .test_examples import ACCURACY_PERF_FACTOR, TIME_PERF_FACTOR
 
 
-# Gaudi2 CI baselines
-# FSDP is not supported on Gaudi1
-MODELS_TO_TEST = {
-    "bf16": [
-        (
-            "bert-base-uncased",
-            "Habana/bert-base-uncased",
-            2807,
-            85.4688,
-            "question-answering",
-            24,
-            8,
-            "run_qa.py",
-            "full_shard",
-        ),
-        (
-            "meta-llama/Llama-2-7b-hf",
-            "",
-            54,
-            0.92,
-            "language-modeling",
-            8,
-            8,
-            "run_lora_clm.py",
-            "auto_wrap",
-        ),
-    ],
-}
+if os.environ.get("GAUDI2_CI", "0") == "1":
+    # Gaudi2 CI baselines
+    MODELS_TO_TEST = {
+        "bf16": [
+            (
+                "bert-base-uncased",
+                "Habana/bert-base-uncased",
+                3516.322,
+                85.5503,
+                "question-answering",
+                24,
+                8,
+                "run_qa.py",
+                "full_shard",
+            ),
+            (
+                "meta-llama/Llama-2-7b-hf",
+                "",
+                87.016,
+                0.9093,
+                "language-modeling",
+                8,
+                8,
+                "run_lora_clm.py",
+                "auto_wrap",
+            ),
+        ],
+    }
+else:
+    # FSDP is not supported on Gaudi1
+    MODELS_TO_TEST = {"bf16": []}
 
 
 def _test_fsdp(
@@ -54,8 +57,6 @@ def _test_fsdp(
     world_size: int = 8,
 ):
     os.environ["PT_HPU_LAZY_MODE"] = "0"
-    os.environ["PT_HPU_EAGER_4_STAGE_PIPELINE_ENABLE"] = "0"  # To be removed later
-    os.environ["PT_HPU_EAGER_PIPELINE_ENABLE"] = "0"  # To be removed later
     path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"
 
     # Install question-answering example requirements
diff --git a/tests/test_text_generation_example.py b/tests/test_text_generation_example.py
index af7e9cfab3..8f3da77526 100644
--- a/tests/test_text_generation_example.py
+++ b/tests/test_text_generation_example.py
@@ -14,50 +14,54 @@
     # Gaudi2 CI baselines
     MODELS_TO_TEST = {
         "bf16": [
-            ("bigscience/bloomz-7b1", 130.10463607610703),
-            ("gpt2-xl", 293.2967921508155),
-            ("EleutherAI/gpt-j-6b", 157.39646612198123),
-            ("EleutherAI/gpt-neox-20b", 49.65827341338015),
-            ("meta-llama/Llama-2-7b-hf", 142.00624811267403),
-            ("tiiuae/falcon-40b", 25.065388035178792),
-            ("bigcode/starcoder", 65.50236665863024),
-            ("Salesforce/codegen2-1B", 456.7740998156863),
-            ("mosaicml/mpt-30b", 35.64501131267502),
-            ("mistralai/Mistral-7B-v0.1", 125.26115369093216),
-            ("mistralai/Mixtral-8x7B-v0.1", 23.78652574031883),
-            ("microsoft/phi-2", 218.08752713569007),
+            ("bigscience/bloomz-7b1", 130.0472971205316),
+            ("gpt2-xl", 281.8734689674413),
+            ("EleutherAI/gpt-j-6b", 160.5823842101192),
+            ("EleutherAI/gpt-neox-20b", 50.67672679310354),
+            ("meta-llama/Llama-2-7b-hf", 141.25776956002076),
+            ("tiiuae/falcon-40b", 25.202450111088346),
+            ("bigcode/starcoder", 65.58632640700114),
+            ("Salesforce/codegen2-1B", 446.4029486883532),
+            ("mosaicml/mpt-30b", 36.06464336116623),
+            ("mistralai/Mistral-7B-v0.1", 130.2172236767782),
+            ("mistralai/Mixtral-8x7B-v0.1", 23.7931001677926),
+            ("microsoft/phi-2", 224.72307766211117),
+        ],
+        "fp8": [
+            ("tiiuae/falcon-180B", 52.85086442722326),
         ],
         "deepspeed": [
-            ("bigscience/bloomz", 36.34664210641816),
-            ("meta-llama/Llama-2-70b-hf", 61.973950428647164),
-            ("facebook/opt-66b", 28.16154122335556),
+            ("bigscience/bloomz", 36.77314954096159),
+            ("meta-llama/Llama-2-70b-hf", 64.10514998902435),
+            ("facebook/opt-66b", 28.48069266504111),
         ],
         "torch_compile": [
-            ("meta-llama/Llama-2-7b-hf", 12.468247401430999),
+            ("meta-llama/Llama-2-7b-hf", 102.27823420713148),
         ],
         "torch_compile_distributed": [
-            ("meta-llama/Llama-2-7b-hf", 20.178927030275947),
+            ("meta-llama/Llama-2-7b-hf", 39.72973199515235),
         ],
     }
 else:
     # Gaudi1 CI baselines
     MODELS_TO_TEST = {
         "bf16": [
-            ("bigscience/bloomz-7b1", 41.51855420676164),
-            ("gpt2-xl", 137.159223188195),
+            ("bigscience/bloomz-7b1", 41.7555095197846),
+            ("gpt2-xl", 142.11481820425706),
             # TODO: fix OPT 6.7B
             # ("facebook/opt-6.7b", 0.0),
-            ("EleutherAI/gpt-j-6b", 50.66146537939035),
-            ("meta-llama/Llama-2-7b-hf", 44.29688546702468),
-            ("tiiuae/falcon-7b", 44.217408724737744),
-            ("bigcode/starcoder", 15.948143541091655),
-            ("Salesforce/codegen2-1B", 153.79670508220687),
-            ("mosaicml/mpt-7b", 44.80241777760578),
-            ("mistralai/Mistral-7B-v0.1", 40.00435417311187),
-            ("microsoft/phi-2", 90.10751623430603),
+            ("EleutherAI/gpt-j-6b", 50.79545107991805),
+            ("meta-llama/Llama-2-7b-hf", 44.39616259946937),
+            ("tiiuae/falcon-7b", 44.82870145718665),
+            ("bigcode/starcoder", 15.945023767901013),
+            ("Salesforce/codegen2-1B", 155.32071248826423),
+            ("mosaicml/mpt-7b", 45.45168927038262),
+            ("mistralai/Mistral-7B-v0.1", 41.21906841459711),
+            ("microsoft/phi-2", 92.53083167241344),
         ],
+        "fp8": [],
         "deepspeed": [
-            ("bigscience/bloomz-7b1", 31.044523676681507),
+            ("bigscience/bloomz-7b1", 31.994268212011505),
         ],
         "torch_compile": [],
         "torch_compile_distributed": [],
@@ -71,6 +75,7 @@ def _test_text_generation(
     deepspeed: bool = False,
     world_size: int = 8,
     torch_compile: bool = False,
+    fp8: bool = False,
 ):
     command = ["python3"]
     path_to_example_dir = Path(__file__).resolve().parent.parent / "examples"
@@ -108,6 +113,12 @@ def _test_text_generation(
     if not deepspeed:
         command.append("--bf16")
 
+    if fp8:
+        command += [
+            "--reuse_cache",
+            "--trim_logits",
+        ]
+
     with TemporaryDirectory() as tmp_dir:
         command.append(f"--output_dir {tmp_dir}")
         print(f"\n\nCommand to test: {' '.join(command)}\n")
@@ -117,6 +128,16 @@ def _test_text_generation(
         pattern = re.compile(r"([\"\'].+?[\"\'])|\s")
         command = [x for y in command for x in re.split(pattern, y) if x]
 
+        if fp8:  # extra run with the measurement config before the final run below
+            env_variables["QUANT_CONFIG"] = os.path.join(
+                path_to_example_dir, "text-generation/quantization_config/maxabs_measure_include_outputs.json"
+            )
+            subprocess.run(command, env=env_variables)
+            env_variables["QUANT_CONFIG"] = os.path.join(
+                path_to_example_dir, "text-generation/quantization_config/maxabs_quant.json"
+            )
+            command.append("--fp8")  # append: inserting at -1 would split the last option from its value
+
         proc = subprocess.run(command, env=env_variables)
 
         # Ensure the run finished without any issue
@@ -140,6 +161,13 @@ def test_text_generation_bf16(model_name: str, baseline: float, token: str):
     _test_text_generation(model_name, baseline, token)
 
 
+@pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["fp8"])
+def test_text_generation_fp8(model_name: str, baseline: float, token: str):
+    is_falcon_180b = "falcon-180B" in model_name  # this model is run with deepspeed=True on 8 devices
+    n_devices = 8 if is_falcon_180b else None
+    _test_text_generation(model_name, baseline, token, deepspeed=is_falcon_180b, world_size=n_devices, fp8=True)
+
+
 @pytest.mark.parametrize("model_name, baseline", MODELS_TO_TEST["deepspeed"])
 def test_text_generation_deepspeed(model_name: str, baseline: float, token: str):
     world_size = 2 if "opt-66b" in model_name else 8
diff --git a/tests/transformers/tests/models/falcon/test_modeling_falcon.py b/tests/transformers/tests/models/falcon/test_modeling_falcon.py
index 1ab9f84cf9..20e9067f31 100644
--- a/tests/transformers/tests/models/falcon/test_modeling_falcon.py
+++ b/tests/transformers/tests/models/falcon/test_modeling_falcon.py
@@ -353,7 +353,7 @@ def test_past_key_values_format(self):
             outputs = model(**inputs)
 
             # If "past_key_values" is not returned, pass the test (e.g. RWKV uses a different cache name and format)
-            if "past_key_values" not in outputs:
+            if "past_key_values" not in outputs or all(ele is None for ele in outputs["past_key_values"]):
                 return
 
             num_hidden_layers = (