From 24182ad9ce760e8e50214014908845c4ffdf4340 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Mon, 29 Apr 2024 12:32:31 +0300
Subject: [PATCH 1/4] Fix rope scaling factor

---
 keras_nlp/src/layers/modeling/rotary_embedding.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/keras_nlp/src/layers/modeling/rotary_embedding.py b/keras_nlp/src/layers/modeling/rotary_embedding.py
index 8e375ba5ad..14d65ed131 100644
--- a/keras_nlp/src/layers/modeling/rotary_embedding.py
+++ b/keras_nlp/src/layers/modeling/rotary_embedding.py
@@ -35,7 +35,8 @@ class RotaryEmbedding(keras.layers.Layer):
     Args:
         max_wavelength: int. The maximum angular wavelength of the sine/cosine
             curves.
-        scaling_factor: float. The scaling factor used to scale frequency range.
+        scaling_factor: float. The scaling factor used to scale positions of
+            the tokens.
         sequence_axis: int. Sequence axis in the input tensor.
         feature_axis: int. Feature axis in the input tensor.
         **kwargs: other keyword arguments passed to `keras.layers.Layer`,
@@ -125,6 +126,7 @@ def _compute_cos_sin_embedding(self, inputs, start_index=0, positions=None):
         else:
             positions = ops.cast(positions, "float32")
 
+        positions = positions / ops.cast(self.scaling_factor, "float32")
         freq = ops.einsum("i,j->ij", positions, inverse_freq)
         embedding = ops.stack((freq, freq), axis=-2)
         embedding = ops.reshape(
@@ -143,12 +145,11 @@ def _compute_cos_sin_embedding(self, inputs, start_index=0, positions=None):
         return cos_emb, sin_emb
 
     def _get_inverse_freq(self, rotary_dim):
-        freq_range = ops.arange(0, rotary_dim, 2, dtype="float32")
-        freq_range = freq_range / ops.cast(self.scaling_factor, "float32")
-        inverse_freq = 1.0 / (
-            self.max_wavelength
-            ** (freq_range / ops.cast(rotary_dim, "float32"))
+        freq_range = ops.divide(
+            ops.arange(0, rotary_dim, 2, dtype="float32"),
+            ops.cast(rotary_dim, "float32"),
         )
+        inverse_freq = 1.0 / (self.max_wavelength**freq_range)
         return inverse_freq
 
     def get_config(self):

From f93a81391bb99b96354ed988ca56d598287daea8 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Mon, 29 Apr 2024 12:50:59 +0300
Subject: [PATCH 2/4] Fix format

---
 keras_nlp/src/layers/modeling/rotary_embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/keras_nlp/src/layers/modeling/rotary_embedding.py b/keras_nlp/src/layers/modeling/rotary_embedding.py
index 14d65ed131..515f606675 100644
--- a/keras_nlp/src/layers/modeling/rotary_embedding.py
+++ b/keras_nlp/src/layers/modeling/rotary_embedding.py
@@ -35,7 +35,7 @@ class RotaryEmbedding(keras.layers.Layer):
     Args:
         max_wavelength: int. The maximum angular wavelength of the sine/cosine
             curves.
-        scaling_factor: float. The scaling factor used to scale positions of 
+        scaling_factor: float. The scaling factor used to scale positions of
            the tokens.
        sequence_axis: int. Sequence axis in the input tensor.
        feature_axis: int. Feature axis in the input tensor.
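Note on the change above: after patches 1-2, `scaling_factor` implements linear RoPE scaling in the Llama "position interpolation" sense, i.e. token positions are divided by the factor before the rotation angles are computed, and the inverse frequencies no longer depend on it. Below is a minimal NumPy sketch of that behaviour; the `rope_rotate` helper is illustrative only (not part of the keras-nlp API) and uses the rotate-half formulation of the HF llama reference quoted in the tests that follow.

import numpy as np

def rope_rotate(x, positions, scaling_factor=1.0, max_wavelength=10000):
    # x: [seq_len, head_dim] with an even head_dim.
    head_dim = x.shape[-1]
    # After patch 1, the inverse frequencies depend only on head_dim and
    # max_wavelength, not on scaling_factor.
    freq_range = np.arange(0, head_dim, 2, dtype="float32") / head_dim
    inverse_freq = 1.0 / (max_wavelength**freq_range)
    # Linear RoPE scaling: stretch the position grid instead.
    positions = np.asarray(positions, dtype="float32") / scaling_factor
    freq = np.einsum("i,j->ij", positions, inverse_freq)
    cos, sin = np.cos(freq), np.sin(freq)
    # Rotate-half formulation, as in the HF llama reference in the tests.
    x1, x2 = x[..., : head_dim // 2], x[..., head_dim // 2 :]
    return np.concatenate([x1 * cos - x2 * sin, x2 * cos + x1 * sin], axis=-1)

# Positions 0..2 scaled by 2.0 behave like positions 0, 0.5, 1.0.
print(rope_rotate(np.ones((3, 4), dtype="float32"), np.arange(3), scaling_factor=2.0))

With `scaling_factor=2.0` and ones as input, this reproduces the reference rows used in `test_rope_scaling` below, e.g. position 1 maps to roughly [0.3982, 0.9950, 1.3570, 1.0050].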
From dbf4d05a0243a9db0bf21fc057ba4394840fa903 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Mon, 6 May 2024 23:38:49 +0300
Subject: [PATCH 3/4] Add tests

---
 .../layers/modeling/rotary_embedding_test.py | 59 +++++++++++++++++++
 1 file changed, 59 insertions(+)

diff --git a/keras_nlp/src/layers/modeling/rotary_embedding_test.py b/keras_nlp/src/layers/modeling/rotary_embedding_test.py
index 3ce7763236..80bbce6f9c 100644
--- a/keras_nlp/src/layers/modeling/rotary_embedding_test.py
+++ b/keras_nlp/src/layers/modeling/rotary_embedding_test.py
@@ -168,3 +168,62 @@ def test_positions_array(self):
         got = layer(x, positions=positions)
 
         np.testing.assert_allclose(expected, ops.convert_to_numpy(got))
+
+    def test_rope_scaling(self):
+        # Reference values computed from Huggingface llama implementation
+        # With `scaling_factor` = 2.0
+        # from transformers.models.llama.modeling_llama import (
+        #     LlamaLinearScalingRotaryEmbedding,apply_rotary_pos_emb
+        # )
+        # import torch
+        # torch.set_printoptions(precision=9)
+        # rotary_emb = LlamaLinearScalingRotaryEmbedding(
+        #     dim=4, max_position_embeddings=3, scaling_factor=2.0
+        # )
+        # query = torch.ones((1, 2, 3, 4)) # [bsz, num_heads, seq_len, head_dim]
+        # cos, sin = rotary_emb(
+        #     query, torch.unsqueeze(torch.arange(3, dtype=torch.int32), 0)
+        # )
+        # query, _ = apply_rotary_pos_emb(query, query, cos, sin)
+        # print(query.transpose(1, 2))
+        # fmt: off
+        expected = [[[[ 1.000000000, 1.000000000, 1.000000000, 1.000000000],
+                      [ 1.000000000, 1.000000000, 1.000000000, 1.000000000]],
+                     [[ 0.398157001, 0.994987488, 1.357008100, 1.004987478],
+                      [ 0.398157001, 0.994987488, 1.357008100, 1.004987478]],
+                     [[-0.301168621, 0.989950180, 1.381773233, 1.009949803],
+                      [-0.301168621, 0.989950180, 1.381773233, 1.009949803]]]] # fmt: on
+
+        layer = RotaryEmbedding(scaling_factor=2.0)
+        self.assertAllClose(
+            layer(ops.ones((1, 3, 2, 4))),
+            ops.convert_to_tensor(expected),
+        )
+
+    def test_rope_scaling_with_kv_cache(self):
+        # Reference values computed from Huggingface llama implementation
+        # With `scaling_factor` = 5.0
+        # from transformers.models.llama.modeling_llama import (
+        #     LlamaLinearScalingRotaryEmbedding,apply_rotary_pos_emb
+        # )
+        # import torch
+        # torch.set_printoptions(precision=9)
+        # rotary_emb = LlamaLinearScalingRotaryEmbedding(
+        #     dim=4, max_position_embeddings=3, scaling_factor=5.0
+        # )
+
+        # query = torch.ones((1, 2, 1, 4)) # [bsz, num_heads, seq_len, head_dim]
+        # cos, sin = rotary_emb(
+        #     query, torch.unsqueeze(torch.arange(12, 13, dtype=torch.int32), 0)
+        # )
+        # query, _ = apply_rotary_pos_emb(query, query, cos, sin)
+        # query.transpose(1, 2)
+        # fmt: off
+        expected = [[[[-1.412856817, 0.975714266, -0.061930716, 1.023709655],
+                      [-1.412856817, 0.975714266, -0.061930716, 1.023709655]]]] # fmt: on
+
+        layer = RotaryEmbedding(scaling_factor=5.0)
+        self.assertAllClose(
+            layer(ops.ones((1, 1, 2, 4)), start_index=12),
+            ops.convert_to_tensor(expected),
+        )

From 479d4e8f0a73a0cc4e8dfa6f4a22e0556b8ae4d2 Mon Sep 17 00:00:00 2001
From: abuelnasr0
Date: Mon, 6 May 2024 23:43:30 +0300
Subject: [PATCH 4/4] Fix format

---
 .../layers/modeling/rotary_embedding_test.py | 34 +++++++++++++------
 1 file changed, 24 insertions(+), 10 deletions(-)

diff --git a/keras_nlp/src/layers/modeling/rotary_embedding_test.py b/keras_nlp/src/layers/modeling/rotary_embedding_test.py
index 80bbce6f9c..6b10cb0534 100644
--- a/keras_nlp/src/layers/modeling/rotary_embedding_test.py
+++ b/keras_nlp/src/layers/modeling/rotary_embedding_test.py
@@ -186,13 +186,22 @@ def test_rope_scaling(self):
         # )
         # query, _ = apply_rotary_pos_emb(query, query, cos, sin)
         # print(query.transpose(1, 2))
-        # fmt: off
-        expected = [[[[ 1.000000000, 1.000000000, 1.000000000, 1.000000000],
-                      [ 1.000000000, 1.000000000, 1.000000000, 1.000000000]],
-                     [[ 0.398157001, 0.994987488, 1.357008100, 1.004987478],
-                      [ 0.398157001, 0.994987488, 1.357008100, 1.004987478]],
-                     [[-0.301168621, 0.989950180, 1.381773233, 1.009949803],
-                      [-0.301168621, 0.989950180, 1.381773233, 1.009949803]]]] # fmt: on
+        expected = [
+            [
+                [
+                    [1.000000000, 1.000000000, 1.000000000, 1.000000000],
+                    [1.000000000, 1.000000000, 1.000000000, 1.000000000],
+                ],
+                [
+                    [0.398157001, 0.994987488, 1.357008100, 1.004987478],
+                    [0.398157001, 0.994987488, 1.357008100, 1.004987478],
+                ],
+                [
+                    [-0.301168621, 0.989950180, 1.381773233, 1.009949803],
+                    [-0.301168621, 0.989950180, 1.381773233, 1.009949803],
+                ],
+            ]
+        ]
 
         layer = RotaryEmbedding(scaling_factor=2.0)
         self.assertAllClose(
@@ -218,9 +227,14 @@ def test_rope_scaling_with_kv_cache(self):
         # )
         # query, _ = apply_rotary_pos_emb(query, query, cos, sin)
         # query.transpose(1, 2)
-        # fmt: off
-        expected = [[[[-1.412856817, 0.975714266, -0.061930716, 1.023709655],
-                      [-1.412856817, 0.975714266, -0.061930716, 1.023709655]]]] # fmt: on
+        expected = [
+            [
+                [
+                    [-1.412856817, 0.975714266, -0.061930716, 1.023709655],
+                    [-1.412856817, 0.975714266, -0.061930716, 1.023709655],
+                ]
+            ]
+        ]
 
         layer = RotaryEmbedding(scaling_factor=5.0)
         self.assertAllClose(
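Taken together, these patches make `RotaryEmbedding(scaling_factor=...)` apply Llama-style linear position interpolation both for full sequences and for cached decoding via `start_index`. A short usage sketch follows; the shapes mirror the tests above, and `keras_nlp.layers.RotaryEmbedding` is the public export of the layer patched here.

import numpy as np
from keras_nlp.layers import RotaryEmbedding

layer = RotaryEmbedding(scaling_factor=2.0)

# Full sequence: [batch, seq_len, num_heads, head_dim], as in test_rope_scaling.
full = layer(np.ones((1, 3, 2, 4), dtype="float32"))
print(full.shape)  # (1, 3, 2, 4)

# Single cached-decoding step at absolute position 12, as in
# test_rope_scaling_with_kv_cache; positions are offset by start_index
# before being divided by scaling_factor.
step = layer(np.ones((1, 1, 2, 4), dtype="float32"), start_index=12)
print(step.shape)  # (1, 1, 2, 4)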