
Commit ad2ae64

Add docs; Make args keyword-only; Cosmetic fixes
1 parent 45b03a5 commit ad2ae64

7 files changed: +553 −2 lines changed

keras_nlp/models/mistral/mistral_attention.py

Lines changed: 3 additions & 1 deletion
@@ -24,10 +24,12 @@
 # TODO(tirthasheshpatel): Generalize the attention layer
 # TODO(tirthasheshpatel): Merge `LlamaAttention` with this layer
 # TODO(tirthasheshpatel): Use flash attention
-# TODO(tirthasheshpatel): Add dropout
 class CachedMistralAttention(keras.layers.Layer):
+    """A cached grouped query attention layer with sliding window."""
+
     def __init__(
         self,
+        *,
         num_query_heads,
         num_key_value_heads,
         rope_max_wavelength=10000,
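
With the bare `*` added to `__init__`, every constructor argument after `self` becomes keyword-only, matching the commit message. A minimal sketch of the effect on callers (the argument values are illustrative, and it assumes the arguments not shown in this hunk keep their defaults):

```python
from keras_nlp.models.mistral.mistral_attention import CachedMistralAttention

# Arguments must now be passed by name.
attention = CachedMistralAttention(
    num_query_heads=32,
    num_key_value_heads=8,
)

# A positional call such as `CachedMistralAttention(32, 8)` now raises a
# TypeError instead of silently binding arguments by position.
```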

keras_nlp/models/mistral/mistral_backbone.py

Lines changed: 68 additions & 0 deletions
@@ -30,8 +30,72 @@ def _mistral_kernel_initializer(stddev=0.02):
 
 @keras_nlp_export("keras_nlp.models.MistralBackbone")
 class MistralBackbone(Backbone):
+    """
+    The Mistral Transformer core architecture with hyperparameters.
+
+    This network implements a Transformer-based decoder network,
+    Mistral, as described in
+    ["Mistral 7B"](https://arxiv.org/pdf/2310.06825.pdf).
+    It includes the embedding lookups and transformer layers.
+
+    The default constructor gives a fully customizable, randomly initialized
+    Mistral model with any number of layers, heads, and embedding
+    dimensions. To load preset architectures and weights, use the `from_preset`
+    constructor.
+
+    Args:
+        vocabulary_size (int): The size of the token vocabulary.
+        num_layers (int): The number of transformer layers.
+        num_query_heads (int): The number of query attention heads for
+            each transformer.
+        hidden_dim (int): The size of the transformer encoding and pooling layers.
+        intermediate_dim (int): The output dimension of the first Dense layer in a
+            three-layer feedforward network for each transformer.
+        num_key_value_heads (int): The number of key and value attention heads for
+            each transformer.
+        rope_max_wavelength (int, optional): The maximum angular wavelength of the
+            sine/cosine curves, for rotary embeddings. Defaults to `10000`.
+        rope_scaling_factor (float, optional): The scaling factor for calculation
+            of rotary embeddings. Defaults to `1.0`.
+        layer_norm_epsilon (float, optional): Epsilon for the layer normalization
+            layers in the transformer decoder. Defaults to `1e-6`.
+        sliding_window (int, optional): The sliding window for the Mistral
+            attention layers. This controls the maximum cache size for the attention
+            layers in each transformer decoder. Only `sliding_window` number of tokens
+            are saved in the cache and used to generate the next token.
+            Defaults to `512`.
+
+    Examples:
+
+    ```python
+    input_data = {
+        "token_ids": np.ones(shape=(1, 12), dtype="int32"),
+        "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]),
+    }
+
+    # Pretrained Mistral decoder.
+    model = keras_nlp.models.MistralBackbone.from_preset("mistral7b_base_en")
+    model(input_data)
+
+    # Randomly initialized Mistral decoder with custom config.
+    model = keras_nlp.models.MistralBackbone(
+        vocabulary_size=10,
+        hidden_dim=512,
+        num_layers=2,
+        num_query_heads=32,
+        num_key_value_heads=8,
+        intermediate_dim=1024,
+        sliding_window=512,
+        layer_norm_epsilon=1e-6,
+        dtype="float32"
+    )
+    model(input_data)
+    ```
+    """
+
     def __init__(
         self,
+        *,
         vocabulary_size,
         num_layers,
         num_query_heads,
@@ -42,6 +106,7 @@ def __init__(
         rope_scaling_factor=1.0,
         layer_norm_epsilon=1e-6,
         sliding_window=512,
+        dropout=0,
         **kwargs,
     ):
         # Get the dtype
@@ -76,6 +141,7 @@ def __init__(
                 activation=ops.silu,
                 kernel_initializer=_mistral_kernel_initializer(stddev=0.02),
                 sliding_window=sliding_window,
+                dropout=dropout,
                 dtype=dtype,
                 name=f"transformer_layer_{i}",
             )(x, decoder_padding_mask=padding_mask)
@@ -107,6 +173,7 @@ def __init__(
         self.rope_scaling_factor = rope_scaling_factor
         self.sliding_window = sliding_window
         self.layer_norm_epsilon = layer_norm_epsilon
+        self.dropout = dropout
         self.token_embedding = token_embedding_layer
 
     def get_config(self):
@@ -123,6 +190,7 @@ def get_config(self):
                 "num_key_value_heads": self.num_key_value_heads,
                 "sliding_window": self.sliding_window,
                 "layer_norm_epsilon": self.layer_norm_epsilon,
+                "dropout": self.dropout,
             }
         )
         return config
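
Because `dropout` is now both a constructor argument and an entry in `get_config`, it survives serialization. A small sketch of the round trip, assuming the standard `Backbone.from_config(config)` behavior applies here (the layer sizes are illustrative, not a preset):

```python
import keras_nlp

# A tiny randomly initialized backbone using the new `dropout` argument.
model = keras_nlp.models.MistralBackbone(
    vocabulary_size=10,
    num_layers=2,
    num_query_heads=4,
    num_key_value_heads=2,
    hidden_dim=64,
    intermediate_dim=128,
    dropout=0.1,
)

# `dropout` is serialized, so restoring from the config preserves it.
config = model.get_config()
assert config["dropout"] == 0.1
restored = keras_nlp.models.MistralBackbone.from_config(config)
```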

keras_nlp/models/mistral/mistral_layer_norm.py

Lines changed: 4 additions & 0 deletions
@@ -15,7 +15,11 @@
 from keras_nlp.backend import ops
 
 
+# TODO: Deprecate this in favor of `keras.layers.LayerNormalization` once
+# Keras 2 support is removed.
 class MistralLayerNormalization(keras.layers.Layer):
+    """A normalization layer for Mistral that implements RMS normalization."""
+
     def __init__(self, epsilon=1e-6, **kwargs):
         super().__init__(**kwargs)
         self._epsilon = epsilon
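
For context on the new docstring: RMS normalization rescales each feature vector by the reciprocal of its root mean square and applies a learned per-feature scale, with no mean subtraction or bias. A minimal NumPy sketch of that computation (not the layer's actual implementation, which uses `keras_nlp.backend.ops`):

```python
import numpy as np


def rms_norm(x, weight, epsilon=1e-6):
    # Mean of squares over the feature axis, kept for broadcasting.
    mean_square = np.mean(np.square(x), axis=-1, keepdims=True)
    # Scale by 1/RMS, then by the learned per-feature weight.
    return x / np.sqrt(mean_square + epsilon) * weight


x = np.random.randn(2, 8).astype("float32")
weight = np.ones(8, dtype="float32")
print(rms_norm(x, weight).shape)  # (2, 8)
```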

keras_nlp/models/mistral/mistral_transformer_decoder.py

Lines changed: 3 additions & 1 deletion
@@ -26,10 +26,12 @@
 from keras_nlp.utils.keras_utils import clone_initializer
 
 
-# TODO(tirthasheshpatel): Add dropout
 class MistralTransformerDecoder(keras.layers.Layer):
+    """A Transformer decoder layer for the Mistral backbone."""
+
     def __init__(
         self,
+        *,
         intermediate_dim,
         num_query_heads,
         num_key_value_heads,
Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import pathlib
+
+import torch
+
+from keras_nlp.models import MistralBackbone
+
+from .scripts.mistral_torch import ModelArgs
+from .scripts.mistral_torch import Transformer as TorchTransformer
+
+MODEL_PATH = pathlib.Path("mistral-7B-v0.1")
+
+
+def port_weights(
+    model_k3: MistralBackbone, model_torch: TorchTransformer, params: ModelArgs
+):
+    model_k3.get_layer("token_embedding").embeddings.assign(
+        model_torch.tok_embeddings.weight.detach().cpu().numpy()
+    )
+
+    for i in range(model_k3.num_layers):
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._self_attention_layer._key_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .attention.wk.weight.T.reshape(
+                    params.dim, params.n_kv_heads, params.head_dim
+                )
+                .detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._self_attention_layer._query_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .attention.wq.weight.T.reshape(
+                    params.dim, params.n_heads, params.head_dim
+                )
+                .detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._self_attention_layer._value_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .attention.wv.weight.T.reshape(
+                    params.dim, params.n_kv_heads, params.head_dim
+                )
+                .detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._self_attention_layer._output_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .attention.wo.weight.T.reshape(
+                    params.n_heads, params.head_dim, params.dim
+                )
+                .detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._self_attention_layernorm.set_weights(
+            [model_torch.layers[i].attention_norm.weight.detach().cpu().numpy()]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._feedforward_intermediate_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .feed_forward.w3.weight.T.detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._feedforward_output_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .feed_forward.w2.weight.T.detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._feedforward_gate_dense.set_weights(
+            [
+                model_torch.layers[i]
+                .feed_forward.w1.weight.T.detach()
+                .cpu()
+                .numpy()
+            ]
+        )
+        model_k3.get_layer(
+            f"transformer_layer_{i}"
+        )._feedforward_layernorm.set_weights(
+            [model_torch.layers[i].ffn_norm.weight.detach().cpu().numpy()]
+        )
+
+    model_k3.get_layer("sequence_output_layernorm").set_weights(
+        [model_torch.norm.weight.detach().cpu().numpy()]
+    )
+    model_k3.get_layer("token_embedding").reverse_embeddings.assign(
+        model_torch.output.weight.T.detach().cpu().numpy()
+    )
+
+
+if __name__ == "__main__":
+    with open(MODEL_PATH / "params.json", "r") as params_file:
+        params = ModelArgs(**json.load(params_file))
+
+    model_torch = TorchTransformer.from_folder(
+        MODEL_PATH, device="cpu", dtype=torch.float16
+    )
+    print("Torch model loaded")
+    model_k3 = MistralBackbone(
+        vocabulary_size=32000,
+        hidden_dim=4096,
+        num_layers=32,
+        num_query_heads=32,
+        num_key_value_heads=8,
+        intermediate_dim=14336,
+        sliding_window=4096,
+        layer_norm_epsilon=1e-6,
+        dtype="float16",
+    )
+    print("Keras 3 model loaded.")
+
+    port_weights(model_k3, model_torch, params)
+    print("Weight transfer done.")
+
+    model_k3.save_weights("mistral_7b.weights.h5")
+    print("Weights saved.")
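
A note on the `.T.reshape(...)` pattern above: a torch `nn.Linear` stores its weight as `(out_features, in_features)`, while the Keras attention projections in this script receive a `(hidden_dim, num_heads, head_dim)` kernel via `set_weights`, so each weight is transposed and its output axis split per head. A small NumPy sketch of that shape mapping for the query projection, with Mistral-7B sizes used purely as an illustration:

```python
import numpy as np

dim, n_heads, head_dim = 4096, 32, 128  # Mistral-7B query projection sizes

# A torch `nn.Linear(dim, n_heads * head_dim)` stores its weight as
# (out_features, in_features).
torch_wq = np.zeros((n_heads * head_dim, dim), dtype="float16")

# Transpose to (in_features, out_features), then split the output axis into
# (num_heads, head_dim) to match the kernel layout used by the script above.
keras_kernel = torch_wq.T.reshape(dim, n_heads, head_dim)
print(keras_kernel.shape)  # (4096, 32, 128)
```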
Lines changed: 13 additions & 0 deletions
@@ -0,0 +1,13 @@
+# Copyright 2023 The KerasNLP Authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# https://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
