From 40f0380bba1fceed1dbc470672b67b4b14d4b446 Mon Sep 17 00:00:00 2001
From: Matt Watson
Date: Fri, 8 Dec 2023 11:29:59 -0800
Subject: [PATCH] Remove lora dense for now

We are considering bringing this into core Keras, with a slightly
different design. Let's remove this version from KerasNLP so people
don't rely on it.
---
 keras_nlp/layers/__init__.py                 |   1 -
 keras_nlp/layers/modeling/lora_dense.py      | 234 --------------------
 keras_nlp/layers/modeling/lora_dense_test.py | 135 ------------
 3 files changed, 370 deletions(-)
 delete mode 100644 keras_nlp/layers/modeling/lora_dense.py
 delete mode 100644 keras_nlp/layers/modeling/lora_dense_test.py

diff --git a/keras_nlp/layers/__init__.py b/keras_nlp/layers/__init__.py
index 71c5eb0411..595c4eb661 100644
--- a/keras_nlp/layers/__init__.py
+++ b/keras_nlp/layers/__init__.py
@@ -16,7 +16,6 @@
     CachedMultiHeadAttention,
 )
 from keras_nlp.layers.modeling.f_net_encoder import FNetEncoder
-from keras_nlp.layers.modeling.lora_dense import LoraDense
 from keras_nlp.layers.modeling.masked_lm_head import MaskedLMHead
 from keras_nlp.layers.modeling.position_embedding import PositionEmbedding
 from keras_nlp.layers.modeling.reversible_embedding import ReversibleEmbedding
diff --git a/keras_nlp/layers/modeling/lora_dense.py b/keras_nlp/layers/modeling/lora_dense.py
deleted file mode 100644
index c439d86399..0000000000
--- a/keras_nlp/layers/modeling/lora_dense.py
+++ /dev/null
@@ -1,234 +0,0 @@
-# Copyright 2023 The KerasNLP Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import re
-
-from keras_nlp.api_export import keras_nlp_export
-from keras_nlp.backend import config
-from keras_nlp.backend import keras
-from keras_nlp.backend import ops
-
-
-def validate_einsum_equation(equation):
-    # For simplicity, we greatly restrict possible einsum equations. The final
-    # axis of the input must be the first axis of our kernel, and must not
-    # appear in our output.
-    left, right, output = re.split(",|->", equation)
-    valid = (
-        left[-1] == right[0]
-        and left[-1] not in output
-        and set(left[:-1]).isdisjoint(set(right[1:]))
-    )
-    if not valid:
-        raise ValueError(
-            "When passing an `EinsumDense` layer to a `LoraDense` layer, the "
-            "einsum `equation` must always have the form `*x,x*->*`, where "
-            "each `*` can be any sequence. Conceptually, the `equation` should "
-            "always represent a dense matmul on the last axis of the input. "
-            f"Received invalid equation `'{equation}'`."
-        )
-
-
-@keras_nlp_export("keras_nlp.layers.LoraDense")
-class LoraDense(keras.layers.Layer):
-    """A LoRA adapter layer for a dense input layer.
-
-    This layer implements a low-rank decomposition of a dense transformation, as
-    described in [LoRA: Low-Rank Adaptation Of Large Language Models](https://arxiv.org/pdf/2106.09685.pdf).
-    This layer can be used to replace a dense layer with a layer whose
-    parameters are mostly frozen.
-
-    By default, this layer takes in an `inner_dense` layer, freezes its
-    parameters, and builds a low-rank decomposed update to sum with the original
-    `inner_dense` output. These update parameters can be merged back into the
-    `inner_dense` kernel by calling `merge_weights()`.
-
-    Args:
-        inner_dense: A `keras.layers.Dense` or `keras.layers.EinsumDense`.
-            The inner dense layer to freeze and wrap with the `LoraDense`
-            layer. Note that for `EinsumDense` layers, the einsum equation must
-            represent a dense transformation on the last axis of the input,
-            though adding new axes to the output (e.g. a multi-head axis) is
-            allowed.
-        rank: int. The inner rank of the decomposed dense transformation. The
-            lower this number, the fewer trainable parameters the layer will
-            have.
-        alpha: float. A constant value used for scaling the lora update. The
-            lora update to the original dense transformation will be scaled by
-            `alpha / rank`.
-        lora_a_initializer: The initializer to use for the inner projection
-            from layer inputs to the inner `rank` intermediate outputs.
-        freeze_kernel: If true, the kernel of the inner dense layer will have
-            `trainable` set to `False`.
-        freeze_bias: If true, the bias of the inner dense layer will have
-            `trainable` set to `False`.
-        **kwargs: other keyword arguments.
-
-    Examples:
-
-    Wrap a `Dense` layer.
-    ```python
-    batch_size, feature_size = 4, 16
-    rank = 4
-    inputs = np.random.uniform(size=(batch_size, feature_size))
-    inner_dense = keras.layers.Dense(feature_size)
-    lora_dense = keras_nlp.layers.LoraDense(inner_dense, rank=4)
-    # Output starts out equal to the inner dense output.
-    assert np.allclose(inner_dense(inputs), lora_dense(inputs))
-
-    # Add some random updates to the lora parameters.
-    lora_dense.lora_a.assign(np.random.uniform(size=(feature_size, rank)))
-    lora_dense.lora_b.assign(np.random.uniform(size=(rank, feature_size)))
-    assert not np.allclose(inner_dense(inputs), lora_dense(inputs))
-
-    # Merge the lora update back into the inner dense layer.
-    lora_dense.merge_weights()
-    assert np.allclose(inner_dense(inputs), lora_dense(inputs))
-    ```
-
-    Wrap an `EinsumDense` layer with a multi-head projection.
-    ```python
-    batch_size, sequence_length, feature_size = 4, 10, 16
-    num_heads = 2
-    rank = 4
-    inputs = np.random.uniform(size=(batch_size, sequence_length, feature_size))
-    inner_dense = keras.layers.EinsumDense(
-        "abc,cde->abde",
-        output_shape=(sequence_length, num_heads, feature_size // num_heads),
-    )
-    lora_dense = keras_nlp.layers.LoraDense(inner_dense, rank=4)
-    # Output shape (4, 10, 2, 8)
-    lora_dense(inputs)
-    ```
-    """
-
-    def __init__(
-        self,
-        inner_dense,
-        rank=8,
-        alpha=8.0,
-        lora_a_initializer="variance_scaling",
-        freeze_kernel=True,
-        freeze_bias=True,
-        **kwargs,
-    ):
-        # Default to the same dtype as our inner layer.
-        if "dtype" not in kwargs:
-            kwargs["dtype"] = inner_dense.dtype_policy
-        super().__init__(**kwargs)
-
-        if not config.keras_3():
-            raise ValueError(
-                "Lora requires Keras 3, but Keras 2 is installed. Please "
-                "see https://github.com/keras-team/keras-nlp#installation"
-            )
-
-        if isinstance(inner_dense, keras.layers.Dense):
-            self.inner_dense = inner_dense
-        elif isinstance(inner_dense, keras.layers.EinsumDense):
-            self.inner_dense = inner_dense
-            validate_einsum_equation(inner_dense.equation)
-        else:
-            raise ValueError(
-                "Only `Dense` and `EinsumDense` inner layers are supported. "
-                f"Received: inner_dense={inner_dense}"
-            )
-
-        self.rank = rank
-        self.alpha = alpha
-        self.scale = alpha / rank
-        self.freeze_kernel = freeze_kernel
-        self.freeze_bias = freeze_bias
-        self.lora_a_initializer = keras.initializers.get(lora_a_initializer)
-
-        if inner_dense.built:
-            self.build_from_config(inner_dense.get_build_config())
-
-    def build(self, inputs_shape):
-        if not self.inner_dense.built:
-            self.inner_dense.build(inputs_shape)
-
-        if self.freeze_kernel and self.inner_dense.kernel is not None:
-            self.inner_dense.kernel.trainable = False
-
-        if self.freeze_bias and self.inner_dense.bias is not None:
-            self.inner_dense.bias.trainable = False
-
-        input_dim = inputs_shape[-1]
-        self.lora_a = self.add_weight(
-            name="lora_a",
-            shape=(input_dim, self.rank),
-            initializer=self.lora_a_initializer,
-        )
-        kernel_shape = self.inner_dense.kernel.shape
-        self.lora_b = self.add_weight(
-            name="lora_b",
-            shape=(self.rank,) + kernel_shape[1:],
-            initializer="zeros",
-        )
-        self.built = True
-
-    def merge_weights(self):
-        """Merge lora updates into the wrapped dense layer.
-
-        This function should only be called outside of any compiled context
-        (e.g. not during `fit()`, `predict()` or `evaluate()`). It will merge
-        the updates from the lora layers into the original dense layer, and
-        re-initialize the lora variables.
-        """
-        if not self.built:
-            return
-
-        # Compute matmul of lora_a and lora_b to get a kernel sized update.
-        update = ops.tensordot(self.lora_a, self.lora_b, axes=([-1], [0]))
-        update = update * ops.cast(self.scale, update.dtype)
-        # Add lora updates back into the inner dense kernel.
-        self.inner_dense.kernel.assign_add(update)
-        # Re-initialize lora weights.
-        self.lora_a.assign(
-            self.lora_a_initializer(self.lora_a.shape, self.lora_a.dtype)
-        )
-        self.lora_b.assign(ops.zeros_like(self.lora_b))
-
-    def call(self, inputs):
-        original_output = self.inner_dense(inputs)
-        # Compute the low-rank intermediate output.
-        update = ops.matmul(inputs, self.lora_a)
-        # Use the matching dense computation for a Dense or EinsumDense.
-        if isinstance(self.inner_dense, keras.layers.Dense):
-            update = ops.matmul(update, self.lora_b)
-        else:
-            update = ops.einsum(self.inner_dense.equation, update, self.lora_b)
-        # Scale and sum the lora update with the original frozen output.
-        return original_output + update * ops.cast(self.scale, update.dtype)
-
-    @classmethod
-    def from_config(cls, config):
-        config["inner_dense"] = keras.layers.deserialize(config["inner_dense"])
-        return super().from_config(config)
-
-    def get_config(self):
-        config = super().get_config()
-        config.update(
-            {
-                "inner_dense": keras.layers.serialize(self.inner_dense),
-                "rank": self.rank,
-                "alpha": self.alpha,
-                "lora_a_initializer": keras.initializers.serialize(
-                    self.lora_a_initializer
-                ),
-                "freeze_kernel": self.freeze_kernel,
-                "freeze_bias": self.freeze_bias,
-            }
-        )
-        return config
diff --git a/keras_nlp/layers/modeling/lora_dense_test.py b/keras_nlp/layers/modeling/lora_dense_test.py
deleted file mode 100644
index a15718b03a..0000000000
--- a/keras_nlp/layers/modeling/lora_dense_test.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright 2023 The KerasNLP Authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     https://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import pytest
-
-from keras_nlp.backend import keras
-from keras_nlp.backend import random
-from keras_nlp.layers.modeling.lora_dense import LoraDense
-from keras_nlp.tests.test_case import TestCase
-
-
-@pytest.mark.keras_3_only
-class LoraDenseTest(TestCase):
-    def test_layer_behaviors(self):
-        self.run_layer_test(
-            cls=LoraDense,
-            init_kwargs={
-                "inner_dense": keras.layers.Dense(16),
-                "rank": 2,
-                "alpha": 16,
-                "lora_a_initializer": "HeNormal",
-            },
-            input_data=random.uniform(shape=(2, 4, 8)),
-            expected_output_shape=(2, 4, 16),
-            expected_num_trainable_weights=2,
-            expected_num_non_trainable_weights=2,
-            expected_num_non_trainable_variables=2,
-            run_mixed_precision_check=False,
-        )
-
-    def test_layer_behaviors_einsum(self):
-        self.run_layer_test(
-            cls=LoraDense,
-            init_kwargs={
-                "inner_dense": keras.layers.EinsumDense(
-                    "abc,cde->abde",
-                    output_shape=(None, 2, 16),
-                ),
-                "lora_a_initializer": "HeNormal",
-            },
-            input_data=random.uniform(shape=(2, 4, 8)),
-            expected_output_shape=(2, 4, 2, 16),
-            expected_num_trainable_weights=2,
-            expected_num_non_trainable_weights=1,
-            expected_num_non_trainable_variables=1,
-            run_mixed_precision_check=False,
-        )
-
-    def test_merge_dense(self):
-        inner_dense = keras.layers.Dense(16)
-        layer = LoraDense(inner_dense, rank=4)
-        layer.build((2, 16))
-        layer.lora_a.assign(random.uniform(shape=(16, 4)))
-        layer.lora_b.assign(random.uniform(shape=(4, 16)))
-
-        input_data = random.uniform((2, 16))
-        lora_output = layer(input_data)
-        dense_output = inner_dense(input_data)
-        self.assertNotAllClose(lora_output, dense_output)
-
-        layer.merge_weights()
-        merged_lora_output = layer(input_data)
-        dense_output = inner_dense(input_data)
-        self.assertAllClose(lora_output, merged_lora_output)
-        self.assertAllClose(lora_output, dense_output)
-
-    def test_merge_einsum(self):
-        inner_dense = keras.layers.EinsumDense(
-            "abc,cde->abde",
-            output_shape=(None, 2, 16),
-        )
-        layer = LoraDense(inner_dense, rank=4)
-        layer.build((2, 4, 16))
-        layer.lora_a.assign(random.uniform(shape=(16, 4)))
-        layer.lora_b.assign(random.uniform(shape=(4, 2, 16)))
-
-        input_data = random.uniform((2, 4, 16))
-        lora_output = layer(input_data)
-        dense_output = inner_dense(input_data)
-        self.assertNotAllClose(lora_output, dense_output)
-
-        layer.merge_weights()
-        merged_lora_output = layer(input_data)
-        dense_output = inner_dense(input_data)
-        self.assertAllClose(lora_output, merged_lora_output)
-        self.assertAllClose(lora_output, dense_output)
-
-    def test_freezing(self):
-        inner_dense = keras.layers.Dense(16)
-        layer = LoraDense(inner_dense, freeze_bias=False)
-        layer.build((2, 16))
-        self.assertFalse(inner_dense.kernel.trainable)
-        self.assertTrue(inner_dense.bias.trainable)
-
-        inner_dense = keras.layers.Dense(16)
-        layer = LoraDense(inner_dense)
-        layer.build((2, 16))
-        self.assertFalse(inner_dense.kernel.trainable)
-        self.assertFalse(inner_dense.bias.trainable)
-
-    def test_errors_if_not_dense(self):
-        with self.assertRaises(ValueError):
-            LoraDense(keras.layers.Concatenate())
-
-    def test_errors_invalid_einsum(self):
-        with self.assertRaises(ValueError):
-            # Kernel feature dim in the wrong place.
-            einsum = keras.layers.EinsumDense("abc,dec->abde", (2, 2, 16))
-            LoraDense(einsum, rank=4)
-
-        with self.assertRaises(ValueError):
-            # Input feature dim in the wrong place.
-            einsum = keras.layers.EinsumDense("acb,cde->abde", (2, 2, 16))
-            LoraDense(einsum, rank=4)
-
-        with self.assertRaises(ValueError):
-            # Input feature dim not summed over.
-            einsum = keras.layers.EinsumDense("abc,cde->abcde", (2, 2, 2, 16))
-            LoraDense(einsum, rank=4)
-
-        with self.assertRaises(ValueError):
-            # Double summations.
-            einsum = keras.layers.EinsumDense("abcd,cde->abe", (2, 2, 16))
-            LoraDense(einsum, rank=4)