From 3e4b17ae78e88834b71dff00cf9bb2772a7821a4 Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Thu, 29 Feb 2024 07:24:12 +0000 Subject: [PATCH 1/2] Workaround for RoPE computed in bf16 --- optimum/habana/transformers/modeling_utils.py | 4 ++++ optimum/habana/transformers/models/__init__.py | 1 + .../habana/transformers/models/gpt_neox/__init__.py | 1 + .../transformers/models/gpt_neox/modeling_gpt_neox.py | 11 +++++++++++ 4 files changed, 17 insertions(+) diff --git a/optimum/habana/transformers/modeling_utils.py b/optimum/habana/transformers/modeling_utils.py index 4cd637c04c..3e5b74cef2 100644 --- a/optimum/habana/transformers/modeling_utils.py +++ b/optimum/habana/transformers/modeling_utils.py @@ -88,6 +88,7 @@ gaudi_gpt_neox_attention_forward, gaudi_gpt_neox_layer_forward, gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, gaudi_gptj_block_forward, gaudi_gptj_model_forward, gaudi_invert_attention_mask, @@ -251,6 +252,9 @@ def adapt_transformers_to_gaudi(): transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXModel.forward = gaudi_gpt_neox_model_forward transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXLayer.forward = gaudi_gpt_neox_layer_forward transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXAttention.forward = gaudi_gpt_neox_attention_forward + transformers.models.gpt_neox.modeling_gpt_neox.GPTNeoXRotaryEmbedding._set_cos_sin_cache = ( + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache + ) # Optimization for llama generation on Gaudi transformers.models.llama.modeling_llama.LlamaForCausalLM = GaudiLlamaForCausalLM diff --git a/optimum/habana/transformers/models/__init__.py b/optimum/habana/transformers/models/__init__.py index 08fd25c8c8..3cd1d0cba7 100644 --- a/optimum/habana/transformers/models/__init__.py +++ b/optimum/habana/transformers/models/__init__.py @@ -61,6 +61,7 @@ gaudi_gpt_neox_attention_forward, gaudi_gpt_neox_layer_forward, gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, ) from .gptj import ( GaudiGPTJAttention, diff --git a/optimum/habana/transformers/models/gpt_neox/__init__.py b/optimum/habana/transformers/models/gpt_neox/__init__.py index d3f6ab124d..cceb114b82 100644 --- a/optimum/habana/transformers/models/gpt_neox/__init__.py +++ b/optimum/habana/transformers/models/gpt_neox/__init__.py @@ -3,4 +3,5 @@ gaudi_gpt_neox_attention_forward, gaudi_gpt_neox_layer_forward, gaudi_gpt_neox_model_forward, + gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache, ) diff --git a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py b/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py index 9e2f9aaae0..66aca2a040 100644 --- a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -404,6 +404,17 @@ def prepare_inputs_for_generation( return model_inputs +def gaudi_gpt_neox_rotary_embedding_set_cos_sin_cache(self, seq_len, device, dtype): + self.max_seq_len_cached = seq_len + t = torch.arange(self.max_seq_len_cached, device=device, dtype=self.inv_freq.dtype) + + freqs = torch.outer(t, self.inv_freq) + # Different from paper, but it uses a different permutation in order to obtain the same calculation + emb = torch.cat((freqs, freqs), dim=-1) + self.cos_cached = emb.cos() + self.sin_cached = emb.sin() + + def apply_customized_rope(q, k, cos, sin, position_ids): if q.device.type == "hpu" and FusedRoPE: return FusedRoPE.apply( From 8f86826a50da8e8ac76d3a8431dd00b41cb8d1dd Mon Sep 17 00:00:00 2001 From: regisss <15324346+regisss@users.noreply.github.com> Date: Tue, 12 Mar 2024 16:46:02 +0000 Subject: [PATCH 2/2] Add workaround for FusedRoPE --- .../transformers/models/gpt_neox/modeling_gpt_neox.py | 5 +++++ tests/baselines/gpt_neox_20b.json | 6 +++--- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py b/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py index 66aca2a040..08f3433377 100644 --- a/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py +++ b/optimum/habana/transformers/models/gpt_neox/modeling_gpt_neox.py @@ -31,6 +31,11 @@ def gaudi_gpt_neox_attention_forward( - add new args token_idx - optimize KV cache """ + # Workaround till FusedRoPE is fixed + global FusedRoPE + if self.training and FusedRoPE is not None: + FusedRoPE = None + has_layer_past = layer_past is not None # Compute QKV diff --git a/tests/baselines/gpt_neox_20b.json b/tests/baselines/gpt_neox_20b.json index b3c8114d1d..165debd4ca 100644 --- a/tests/baselines/gpt_neox_20b.json +++ b/tests/baselines/gpt_neox_20b.json @@ -7,9 +7,9 @@ "deepspeed": { "learning_rate": 5e-5, "train_batch_size": 2, - "perplexity": 8.787531864839819, - "train_runtime": 670.5209, - "train_samples_per_second": 8.485, + "perplexity": 8.0545, + "train_runtime": 745.7237, + "train_samples_per_second": 7.242, "extra_arguments": [ "--dataset_config_name wikitext-2-raw-v1", "--gradient_checkpointing",