@@ -34,12 +34,14 @@ def __init__(
         rope_scaling_factor=1.0,
         kernel_initializer="glorot_uniform",
         sliding_window=512,
+        dropout=0,
         **kwargs,
     ):
         super().__init__(**kwargs)
         self._num_query_heads = num_query_heads
         self._num_key_value_heads = num_key_value_heads
         self._sliding_window = sliding_window
+        self._dropout = dropout

         self._num_key_value_groups = num_query_heads // num_key_value_heads
         self._rope_max_wavelength = rope_max_wavelength
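The stored ratio `num_query_heads // num_key_value_heads` is the grouped-query-attention group size: each key/value head is shared by a group of query heads. A minimal sketch of the arithmetic, with made-up head counts that are not taken from this model:

```python
# Hypothetical head counts, only to illustrate the grouping arithmetic.
num_query_heads = 8
num_key_value_heads = 2
num_key_value_groups = num_query_heads // num_key_value_heads

print(num_key_value_groups)  # 4 -> each key/value head serves 4 query heads
```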
@@ -51,24 +53,32 @@ def __init__(
         self._rope_scaling_factor = rope_scaling_factor

     def build(self, inputs_shape):
+        # Einsum variables:
+        # b = batch size
+        # q = query length
+        # k = key/value length
+        # m = model dim
+        # u = num query heads
+        # v = num key/value heads
+        # h = head dim
         self._hidden_dim = inputs_shape[-1]
-        self._attn_head_size = self._hidden_dim // self._num_query_heads
+        self._head_dim = self._hidden_dim // self._num_query_heads

         self._query_dense = keras.layers.EinsumDense(
-            equation="abc,cde->abde",
-            output_shape=(None, self._num_query_heads, self._attn_head_size),
+            equation="bqm,muh->bquh",
+            output_shape=(None, self._num_query_heads, self._head_dim),
             kernel_initializer=self._kernel_initializer,
             dtype=self.compute_dtype,
             name="query",
         )
         self._query_dense.build(inputs_shape)

         self._key_dense = keras.layers.EinsumDense(
-            equation="abc,cde->abde",
+            equation="bkm,mvh->bkvh",
             output_shape=(
                 None,
                 self._num_key_value_heads,
-                self._attn_head_size,
+                self._head_dim,
             ),
             kernel_initializer=self._kernel_initializer,
             dtype=self.compute_dtype,
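The rewritten equations name the axes (see the comment block added above) instead of the generic `abc,cde->abde`. A quick NumPy check with arbitrary illustrative sizes, showing that `bqm,muh->bquh` turns the hidden dim into per-head query vectors:

```python
import numpy as np

batch, q_len, model_dim = 2, 5, 16    # arbitrary illustration sizes
num_query_heads, head_dim = 4, 4      # head_dim = model_dim // num_query_heads

x = np.zeros((batch, q_len, model_dim))               # b, q, m
w = np.zeros((model_dim, num_query_heads, head_dim))  # m, u, h

out = np.einsum("bqm,muh->bquh", x, w)
print(out.shape)  # (2, 5, 4, 4) -> batch, query length, query heads, head dim
```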
@@ -77,11 +87,11 @@ def build(self, inputs_shape):
         self._key_dense.build(inputs_shape)

         self._value_dense = keras.layers.EinsumDense(
-            equation="abc,cde->abde",
+            equation="bkm,mvh->bkvh",
             output_shape=(
                 None,
                 self._num_key_value_heads,
-                self._attn_head_size,
+                self._head_dim,
             ),
             kernel_initializer=self._kernel_initializer,
             dtype=self.compute_dtype,
@@ -91,14 +101,20 @@ def build(self, inputs_shape):

         self._softmax = keras.layers.Softmax(axis=-1, name="attention_softmax")

+        self._dropout_layer = keras.layers.Dropout(
+            rate=self._dropout, dtype=self.compute_dtype
+        )
+
         self._output_dense = keras.layers.EinsumDense(
-            equation="abc,cd->abd",
+            equation="bquh,uhm->bqm",
             output_shape=(None, self._hidden_dim),
             kernel_initializer=self._kernel_initializer,
             dtype=self.compute_dtype,
             name="attention_output",
         )
-        self._output_dense.build(inputs_shape)
+        self._output_dense.build(
+            (None, None, self._num_query_heads, self._head_dim)
+        )

         self.rotary_embedding_layer = RotaryEmbedding(
             max_wavelength=self._rope_max_wavelength,
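The output projection `bquh,uhm->bqm` now consumes the per-head attention output directly and folds the head and head-dim axes back into the model dim, which is why `build` is given a `(None, None, num_query_heads, head_dim)` shape rather than `inputs_shape`. A rough NumPy check with the same illustrative sizes as above:

```python
import numpy as np

batch, q_len = 2, 5
num_query_heads, head_dim, model_dim = 4, 4, 16

attn = np.zeros((batch, q_len, num_query_heads, head_dim))  # b, q, u, h
w_o = np.zeros((num_query_heads, head_dim, model_dim))      # u, h, m

out = np.einsum("bquh,uhm->bqm", attn, w_o)
print(out.shape)  # (2, 5, 16) -> back to (batch, query length, model dim)
```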
@@ -114,6 +130,7 @@ def call(
         attention_mask=None,
         cache=None,
         cache_update_index=None,
+        training=None,
     ):
         seq_len = ops.shape(hidden_states)[1]
         start_index = (
@@ -221,14 +238,8 @@ def _compute_key_value(x):
             query, key, value, attention_mask
         )

-        attention_output_shape = ops.shape(attention_output)
-        attention_output = ops.reshape(
-            attention_output,
-            [
-                attention_output_shape[0],  # batch_shape
-                attention_output_shape[1],  # seq_len
-                self._hidden_dim,
-            ],
+        attention_output = self._dropout_layer(
+            attention_output, training=training
         )

         attention_output = self._output_dense(attention_output)
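The explicit reshape back to `(batch, seq_len, hidden_dim)` is no longer needed because the output EinsumDense above accepts the 4D per-head tensor; the only new step is dropout on the attention output, driven by the `training` flag threaded through `call`. A minimal sketch of that behaviour using `keras.layers.Dropout` on its own:

```python
import keras
import numpy as np

dropout = keras.layers.Dropout(rate=0.5)
x = np.ones((2, 5, 4, 4))  # e.g. (batch, seq, heads, head_dim)

y_train = dropout(x, training=True)   # some entries zeroed, survivors scaled by 1/(1-rate)
y_infer = dropout(x, training=False)  # identity at inference time
```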
@@ -247,9 +258,7 @@ def _masked_softmax(self, attention_scores, attention_mask=None):
     def _compute_attention(self, query, key, value, attention_mask=None):
         attention_scores = ops.einsum("aecd,abcd->acbe", key, query)

-        norm_factor = ops.sqrt(
-            ops.cast(self._attn_head_size, self.compute_dtype)
-        )
+        norm_factor = ops.sqrt(ops.cast(self._head_dim, self.compute_dtype))

         attention_scores = attention_scores / norm_factor

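The scaling itself is unchanged, only the attribute name: raw scores are still divided by sqrt(head_dim) before the masked softmax. As a worked example with an illustrative head size:

```python
import math

head_dim = 64                 # illustrative head size, not from this model
norm_factor = math.sqrt(head_dim)
print(norm_factor)            # 8.0 -> raw scores are divided by 8 before softmax
```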
@@ -274,6 +283,7 @@ def get_config(self):
                     self._kernel_initializer
                 ),
                 "sliding_window": self._sliding_window,
+                "dropout": self._dropout,
             }
         )
         return config
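With `dropout` added to `get_config()`, the rate survives a serialize/deserialize round trip. A toy stand-in layer (hypothetical, just to show the pattern this diff follows):

```python
import keras


class ToyAttention(keras.layers.Layer):
    """Tiny stand-in layer, only to show the config round trip for `dropout`."""

    def __init__(self, dropout=0, **kwargs):
        super().__init__(**kwargs)
        self._dropout = dropout

    def get_config(self):
        config = super().get_config()
        config.update({"dropout": self._dropout})
        return config


layer = ToyAttention(dropout=0.1)
restored = ToyAttention.from_config(layer.get_config())
print(restored._dropout)  # 0.1 -> the rate is preserved across serialization
```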