Commit d7df38c
Merge branch 'master' into electra
2 parents b268e26 + 45d8bd3

36 files changed: +271 -76 lines

STYLE_GUIDE.md
Lines changed: 1 addition & 1 deletion

@@ -116,7 +116,7 @@ class PositionEmbedding(keras.layers.Layer):
     Args:
         sequence_length: The maximum length of the dynamic sequence.
 
-    Examples:
+    Example:
 
     Direct call.
     >>> layer = keras_nlp.layers.PositionEmbedding(sequence_length=10)
keras_nlp/layers/modeling/alibi_bias.py
Lines changed: 4 additions & 1 deletion

@@ -35,12 +35,15 @@ class AlibiBias(keras.layers.Layer):
             each head. The heads' slopes are a geometric sequence that starts at
             `2**(-alibi_bias_max/num_heads)` and uses that same value as its
             ratio. Defaults to 8.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
+
     Call arguments:
         attention_scores: The result of multipying the query and the key of the
             multi-head attention layer of the transformer to add alibi bias to
             it. With shape `(batch_size, num_heads, query_length, key_length)`.
 
-    Examples:
+    Example:
     ```python
     query_length = 10
     key_length = 10
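The hunk above documents that `name`, `trainable`, and `dtype` now reach `keras.layers.Layer` through `**kwargs`. A minimal sketch of the documented call signature, assuming `keras_nlp` is installed (the shapes and the `name` value are illustrative, not from the commit):

```python
import numpy as np
import keras_nlp

# Illustrative shapes from the docstring: (batch, heads, query_len, key_len).
batch_size, num_heads, query_length, key_length = 1, 8, 10, 10
attention_scores = np.random.rand(
    batch_size, num_heads, query_length, key_length
).astype("float32")

# `name` is forwarded to `keras.layers.Layer` via the documented **kwargs.
alibi = keras_nlp.layers.AlibiBias(alibi_bias_max=8, name="alibi_bias")
biased_scores = alibi(attention_scores)  # same shape, per-head linear bias added
```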

keras_nlp/layers/modeling/f_net_encoder.py
Lines changed: 4 additions & 5 deletions

@@ -47,10 +47,10 @@ class FNetEncoder(keras.layers.Layer):
         bias_initializer: "string" or `keras.initializers` initializer.
             The bias initializer for the dense layers.
             Defaults to `"zeros"`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
-    Examples:
+    Example:
 
     ```python
     # Create a single FNet encoder layer.
@@ -79,10 +79,9 @@ def __init__(
         layer_norm_epsilon=1e-5,
         kernel_initializer="glorot_uniform",
         bias_initializer="zeros",
-        name=None,
         **kwargs
     ):
-        super().__init__(name=name, **kwargs)
+        super().__init__(**kwargs)
         self.intermediate_dim = intermediate_dim
         self.dropout = dropout
         self.activation = keras.activations.get(activation)
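Dropping `name=None` from the signature works because `name` now rides along in `**kwargs` to `super().__init__`. A sketch of the unchanged user-facing behavior, completing the truncated `# Create a single FNet encoder layer.` example (dimensions are illustrative):

```python
import keras
import keras_nlp

# Create a single FNet encoder layer; `name` is no longer an explicit
# parameter, it passes through **kwargs to `keras.layers.Layer`.
encoder = keras_nlp.layers.FNetEncoder(intermediate_dim=64, name="f_net_encoder")

inputs = keras.Input(shape=(10, 64))
outputs = encoder(inputs)
model = keras.Model(inputs=inputs, outputs=outputs)
```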

keras_nlp/layers/modeling/masked_lm_head.py
Lines changed: 3 additions & 1 deletion

@@ -59,8 +59,10 @@ class MaskedLMHead(keras.layers.Layer):
         bias_initializer: string or `keras.initializers` initializer.
             The bias initializer for the dense and multiheaded
             attention layers. Defaults to `"zeros"`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
-    Examples:
+    Example:
 
     ```python
     batch_size = 16
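Completing the truncated `batch_size = 16` example, a hedged sketch of a full forward call (the mask count, tensors, and layer name are assumptions, not part of the commit):

```python
import numpy as np
import keras_nlp

batch_size, seq_length, hidden_dim, vocab_size = 16, 50, 32, 100

# Fake encoder output and the positions of five masked tokens per sample.
encoded_tokens = np.random.normal(
    size=(batch_size, seq_length, hidden_dim)
).astype("float32")
mask_positions = np.random.randint(seq_length, size=(batch_size, 5))

# `name` reaches `keras.layers.Layer` through the newly documented **kwargs.
lm_head = keras_nlp.layers.MaskedLMHead(
    vocabulary_size=vocab_size, activation="softmax", name="mlm_head"
)
preds = lm_head(encoded_tokens, mask_positions)  # (16, 5, 100)
```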

keras_nlp/layers/modeling/position_embedding.py
Lines changed: 3 additions & 1 deletion

@@ -33,6 +33,8 @@ class PositionEmbedding(keras.layers.Layer):
         initializer: The initializer to use for the embedding weights. Defaults
             to `"glorot_uniform"`.
         seq_axis: The axis of the input tensor where we add the embeddings.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to compute an embedding for, with shape
@@ -43,7 +45,7 @@ class PositionEmbedding(keras.layers.Layer):
         compute the position embedding from. This is useful during cached
         decoding, where each position is predicted separately in a loop.
 
-    Examples:
+    Example:
 
     Called directly on input.
     >>> layer = keras_nlp.layers.PositionEmbedding(sequence_length=10)
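The doctest above is truncated by the hunk; a sketch of the "called directly on input" pattern it describes (feature size and layer name are illustrative):

```python
import numpy as np
import keras_nlp

# Token embeddings with shape (batch, sequence_length, feature_dim).
token_embeddings = np.random.rand(1, 10, 16).astype("float32")

layer = keras_nlp.layers.PositionEmbedding(
    sequence_length=10,
    name="position_embedding",  # accepted via the newly documented **kwargs
)
outputs = token_embeddings + layer(token_embeddings)  # (1, 10, 16)
```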

keras_nlp/layers/modeling/reversible_embedding.py
Lines changed: 3 additions & 1 deletion

@@ -52,14 +52,16 @@ class ReversibleEmbedding(keras.layers.Embedding):
         reverse_dtype: The dtype for the reverse projection computation.
             For stability, it is usually best to use full precision even when
             working with half or mixed precision training.
+        **kwargs: other keyword arguments passed to `keras.layers.Embedding`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to the layer.
         reverse: Boolean. If `True` the layer will perform a linear projection
             from `output_dim` to `input_dim`, instead of a normal embedding
             call. Default to `False`.
 
-    Examples:
+    Example:
     ```python
     batch_size = 16
     vocab_size = 100
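The `reverse=True` call argument documented above is what makes this `keras.layers.Embedding` subclass reversible. A sketch of both directions, continuing the docstring's `batch_size = 16` / `vocab_size = 100` setup (`hidden_dim` and the layer name are assumptions):

```python
import numpy as np
import keras_nlp

batch_size, seq_length, vocab_size, hidden_dim = 16, 50, 100, 32
token_ids = np.random.randint(vocab_size, size=(batch_size, seq_length))

embedding = keras_nlp.layers.ReversibleEmbedding(
    input_dim=vocab_size,
    output_dim=hidden_dim,
    name="reversible_embedding",  # forwarded to keras.layers.Embedding via **kwargs
)
hidden_states = embedding(token_ids)             # (16, 50, 32) embedding lookup
logits = embedding(hidden_states, reverse=True)  # (16, 50, 100) projection back
```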

keras_nlp/layers/modeling/rotary_embedding.py
Lines changed: 2 additions & 0 deletions

@@ -38,6 +38,8 @@ class RotaryEmbedding(keras.layers.Layer):
         scaling_factor: float. The scaling factor used to scale frequency range.
         sequence_axis: int. Sequence axis in the input tensor.
         feature_axis: int. Feature axis in the input tensor.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to apply the embedding to. This can have
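A hedged sketch of the documented axes in use, assuming a query tensor laid out as `(batch, sequence, heads, head_dim)` (shapes and the layer name are illustrative):

```python
import numpy as np
import keras_nlp

# Query tensor shaped (batch, sequence, heads, head_dim).
query = np.random.rand(2, 10, 8, 64).astype("float32")

rope = keras_nlp.layers.RotaryEmbedding(
    sequence_axis=1,
    feature_axis=-1,
    name="rotary_embedding",  # passed through the newly documented **kwargs
)
rotated_query = rope(query)  # same shape, rotations applied along the feature axis
```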

keras_nlp/layers/modeling/sine_position_encoding.py
Lines changed: 3 additions & 1 deletion

@@ -34,6 +34,8 @@ class SinePositionEncoding(keras.layers.Layer):
         max_wavelength: The maximum angular wavelength of the sine/cosine
             curves, as described in Attention is All You Need. Defaults to
             `10000`.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
     Call arguments:
         inputs: The tensor inputs to compute an embedding for, with shape
@@ -42,7 +44,7 @@ class SinePositionEncoding(keras.layers.Layer):
         compute the encoding from. This is useful during cached decoding,
         where each position is predicted separately in a loop.
 
-    Examples:
+    Example:
     ```python
     # create a simple embedding layer with sinusoidal positional encoding
     seq_len = 100
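Completing the truncated `seq_len = 100` snippet, a sketch of the encoding added onto a token embedding (the surrounding model wiring is an assumption based on the docstring fragment):

```python
import keras
import keras_nlp

# create a simple embedding layer with sinusoidal positional encoding
seq_len, vocab_size, embedding_dim = 100, 1000, 32

inputs = keras.Input((seq_len,), dtype="int32")
embedding = keras.layers.Embedding(vocab_size, embedding_dim)(inputs)
positional_encoding = keras_nlp.layers.SinePositionEncoding(
    max_wavelength=10000, name="sine_position_encoding"
)(embedding)
outputs = embedding + positional_encoding
```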

keras_nlp/layers/modeling/token_and_position_embedding.py
Lines changed: 6 additions & 1 deletion

@@ -33,6 +33,9 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
         vocabulary_size: The size of the vocabulary.
         sequence_length: The maximum length of input sequence
         embedding_dim: The output dimension of the embedding layer
+        tie_weights: Boolean, whether or not the matrix for embedding and
+            the matrix for the `reverse` projection should share the same
+            weights.
         embeddings_initializer: The initializer to use for the Embedding
             Layers
         mask_zero: Boolean, whether or not the input value 0 is a special
@@ -43,8 +46,10 @@ class TokenAndPositionEmbedding(keras.layers.Layer):
         If mask_zero` is set to True, as a consequence, index 0 cannot be
         used in the vocabulary
         (input_dim should equal size of vocabulary + 1).
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
-    Examples:
+    Example:
     ```python
     inputs = np.ones(shape=(1, 50), dtype="int32")
     embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
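Completing the truncated constructor call with the newly documented `tie_weights` flag (the sizes and layer name are illustrative):

```python
import numpy as np
import keras_nlp

inputs = np.ones(shape=(1, 50), dtype="int32")
embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
    vocabulary_size=10000,
    sequence_length=50,
    embedding_dim=64,
    # Newly documented: share the embedding matrix with the `reverse` projection.
    tie_weights=True,
    name="token_and_position_embedding",  # via **kwargs
)
outputs = embedding_layer(inputs)  # (1, 50, 64)
```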

keras_nlp/layers/modeling/transformer_decoder.py
Lines changed: 3 additions & 3 deletions

@@ -69,10 +69,10 @@ class TransformerDecoder(keras.layers.Layer):
             (similar to GPT-2). If set to False, outputs of attention layer and
             intermediate dense layer are normalized (similar to BERT).
             Defaults to `False`.
-        name: string. The name of the layer. Defaults to `None`.
-        **kwargs: other keyword arguments.
+        **kwargs: other keyword arguments passed to `keras.layers.Layer`,
+            including `name`, `trainable`, `dtype` etc.
 
-    Examples:
+    Example:
     ```python
     # Create a single transformer decoder layer.
     decoder = keras_nlp.layers.TransformerDecoder(
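Completing the truncated constructor, a sketch with both self- and cross-attention inputs (dimensions are illustrative; `intermediate_dim` and `num_heads` are the required arguments):

```python
import numpy as np
import keras_nlp

# Create a single transformer decoder layer; `name` now flows through
# **kwargs, as in the FNetEncoder change above.
decoder = keras_nlp.layers.TransformerDecoder(
    intermediate_dim=64, num_heads=8, name="transformer_decoder"
)

decoder_input = np.random.rand(2, 10, 64).astype("float32")
encoder_input = np.random.rand(2, 10, 64).astype("float32")
outputs = decoder(decoder_input, encoder_input)  # (2, 10, 64)
```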
