Update PaliGemma to remove include_rescaling arg (keras-team#1917)

divyashreepathihalli · ushareng · commit f56d20edceae · 2024-10-25T00:35:24.000+05:30
* update PaliGemma

* update conversion script

* fix GPU tests
diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py b/keras_hub/src/models/pali_gemma/pali_gemma_backbone.py
@@ -61,8 +61,6 @@ class PaliGemmaBackbone(Backbone):
         vit_classifier_activation: activation function. The activation that
             is used for final output classification in the vision transformer.
         vit_name: string. The name used for vision transformer layers.
-        include_rescaling: bool. If true, the image input will be rescaled from
-            the range `[0, 255]`, to the range `[0, 1]`.
         layer_norm_epsilon: float. The epsilon value user for every layer norm
             in all transformer blocks.
         dropout: float. Dropout probability for the Transformer decoder blocks.
@@ -121,7 +119,6 @@ def __init__(
         vit_pooling=None,
         vit_classifier_activation=None,
         vit_name=None,
-        include_rescaling=True,
         layer_norm_epsilon=1e-6,
         dropout=0,
         dtype=None,
@@ -145,7 +142,6 @@ def __init__(
         vit_intermediate_dim = vit_intermediate_dim or 4304
         self.vit_encoder = PaliGemmaVit(
             image_size=image_size,
-            include_rescaling=include_rescaling,
             patch_size=vit_patch_size,
             num_heads=vit_num_heads,
             hidden_dim=vit_hidden_dim,
@@ -215,7 +211,6 @@ def __init__(
         # === Config ===
         self.vocabulary_size = vocabulary_size
         self.image_size = image_size
-        self.include_rescaling = include_rescaling
         self.num_layers = num_layers
         self.num_query_heads = num_query_heads
         self.num_key_value_heads = num_key_value_heads
@@ -242,7 +237,6 @@ def get_config(self):
             {
                 "vocabulary_size": self.vocabulary_size,
                 "image_size": self.image_size,
-                "include_rescaling": self.include_rescaling,
                 "num_layers": self.num_layers,
                 "num_query_heads": self.num_query_heads,
                 "num_key_value_heads": self.num_key_value_heads,
diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit.py
@@ -410,8 +410,6 @@ class PaliGemmaVit(keras.Model):
     Args:
         image_size: int. The height/width of the image. Both height and width is
             expected to be the same.
-        include_rescaling: bool. If true, the image input will be rescaled from
-            the range `[0, 255]`, to the range `[0, 1]`.
         patch_size: int. The size of each square patch in the input image.
         num_heads: int. The number of attention heads for the vision(image)
             transformer encoder.
@@ -452,7 +450,6 @@ def __init__(
         num_layers,
         intermediate_dim,
         num_classes,
-        include_rescaling=True,
         pooling=None,
         classifier_activation=None,
         dtype=None,
@@ -463,14 +460,6 @@ def __init__(
             shape=(image_size, image_size, 3), name="images"
         )
         x = image_input  # Intermediate result.
-        # TODO we have moved this rescaling to preprocessing layers for most
-        # models. We should consider removing it here, though it would break
-        # compatibility.
-        if include_rescaling:
-            rescaling = keras.layers.Rescaling(
-                scale=1.0 / 127.5, offset=-1.0, name="rescaling"
-            )
-            x = rescaling(image_input)
         x = PaliGemmaVitEncoder(
             hidden_dim=hidden_dim,
             num_layers=num_layers,
@@ -520,7 +509,6 @@ def __init__(
         self.pooling = pooling
         self.num_classes = num_classes
         self.image_size = image_size
-        self.include_rescaling = include_rescaling
         self.patch_size = patch_size
         self.classifier_activation = keras.activations.get(
             classifier_activation
@@ -549,7 +537,6 @@ def get_config(self):
                     self.classifier_activation
                 ),
                 "image_size": self.image_size,
-                "include_rescaling": self.include_rescaling,
                 "patch_size": self.patch_size,
             }
         )
diff --git a/keras_hub/src/models/pali_gemma/pali_gemma_vit_test.py b/keras_hub/src/models/pali_gemma/pali_gemma_vit_test.py
@@ -30,23 +30,6 @@ def test_vit_encoder(self):
             output.shape, (batch_size, intermediate_dim, hidden_dim)
         )
 
-    def test_vit_rescaling(self):
-        vit_encoder = PaliGemmaVit(
-            image_size=16,
-            patch_size=4,
-            hidden_dim=8,
-            num_layers=2,
-            num_heads=2,
-            intermediate_dim=16,
-            num_classes=32,
-        )
-        self.assertIsNotNone(vit_encoder.get_layer("rescaling"))
-        with self.assertRaises(ValueError):
-            config = vit_encoder.get_config()
-            config["include_rescaling"] = False
-            vit_encoder = PaliGemmaVit.from_config(config)
-            vit_encoder.get_layer("rescaling")
-
     def test_vision_embeddings(self):
         embeddings_layer = PaliGemmaVitEmbeddings(
             image_size=16,
diff --git a/tools/checkpoint_conversion/convert_pali_gemma_checkpoints.py b/tools/checkpoint_conversion/convert_pali_gemma_checkpoints.py
@@ -3,15 +3,24 @@
 
 import numpy as np
 
 os.environ["KERAS_BACKEND"] = "jax"
 
 import keras  # noqa: E402
 from keras import ops  # noqa: E402
 
 from keras_hub.src.models.pali_gemma.pali_gemma_backbone import (  # noqa: E402
     PaliGemmaBackbone,
 )
+from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm import (  # noqa: E402
+    PaliGemmaCausalLM,
+)
+from keras_hub.src.models.pali_gemma.pali_gemma_causal_lm_preprocessor import (  # noqa: E402
+    PaliGemmaCausalLMPreprocessor,
+)
+from keras_hub.src.models.pali_gemma.pali_gemma_image_converter import (  # noqa: E402
+    PaliGemmaImageConverter,
+)
 
 # No GPU for conversion, makes memory management easier.
 os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
 
@@ -301,7 +310,20 @@ def main(args):
         "vit_hidden_dim": 1152,
         "image_size": args.image_size,
     }
-    keras_model = PaliGemmaBackbone(**pali_gemma_backbone_config)
+    # The ViT no longer rescales its input, so the image converter applies the
+    # same scale/offset the removed `Rescaling` layer used (x / 127.5 - 1).
+    pg_image_converter = PaliGemmaImageConverter(
+        image_size=(args.image_size, args.image_size),
+        scale=1.0 / 127.5,
+        offset=-1,
+    )
+    pg_preprocessor = PaliGemmaCausalLMPreprocessor(
+        image_converter=pg_image_converter
+    )
+    pg_backbone = PaliGemmaBackbone(**pali_gemma_backbone_config)
+    keras_model = PaliGemmaCausalLM(
+        preprocessor=pg_preprocessor, backbone=pg_backbone
+    )
     # This could be from kaggle or provide local dir path
     weights = np.load(args.weights_path)
     jax_weights = get_weights_as_numpy(weights, **pali_gemma_backbone_config)