From aa86f5ab85205312cde9a8568fb809e987ee99d8 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 17:14:57 -0700 Subject: [PATCH 01/10] add styleguide for code reviews --- .gemini/styleguide.md | 750 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 750 insertions(+) create mode 100644 .gemini/styleguide.md diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md new file mode 100644 index 0000000000..3c4ec8c234 --- /dev/null +++ b/.gemini/styleguide.md @@ -0,0 +1,750 @@ +## Key Principles + +- **Modularity**: Models are broken down into distinct, reusable components: Backbone, Converter (Tokenizer, ImageConverter, etc.), Preprocessor, and Task. +- **Consistency**: Strict adherence to naming and file structure conventions is crucial for predictability and maintainability. +- **Validation**: Every component must be rigorously validated against the original model's implementation. Numerical equivalence is a primary requirement, demonstrated via Colab notebooks. +- **Reusability**: Prioritize using existing layers from `keras.layers` and `keras_nlp.layers` before implementing custom logic. +- **Backend Agnostic**: All code must be backend-agnostic, supporting TensorFlow, JAX, and PyTorch backends. + +## Directory and File Structure + +All new model contributions must follow a standardized directory and file structure within `keras_hub/src/models/`. 
For a model named `MyModel`, the structure must be: + +``` +keras_hub/ +└── src/ + └── models/ + └── my_model/ + ├── __init__.py + ├── my_model_backbone.py + ├── my_model_backbone_test.py + ├── my_model_tokenizer.py # For NLP models + ├── my_model_tokenizer_test.py # For NLP models + ├── my_model_image_converter.py # For Vision models + ├── my_model_image_converter_test.py # For Vision models + ├── my_model_audio_converter.py # For Audio models + ├── my_model_audio_converter_test.py # For Audio models + ├── my_model_classifier.py # Example task + ├── my_model_classifier_test.py # Example task test + ├── my_model_preprocessor.py # Preprocessor for all tasks + ├── my_model_preprocessor_test.py + └── my_model_presets.py +``` + +Checkpoint conversion scripts have their own location: +``` +tools/ +└── checkpoint_conversion/ + └── convert_my_model_checkpoints.py +``` + +For models being ported from HuggingFace, converters should be added to: +``` +keras_hub/src/utils/transformers/ +├── convert_my_model.py +└── convert_my_model_test.py +``` + +## Naming Conventions + +### Files +- **Format**: All filenames must be lowercase with underscores (snake_case). +- **Pattern**: Follow the pattern `<model_name>_<component>.py`. +- **Examples**: `distil_bert_backbone.py`, `distil_bert_tokenizer.py`, `distil_bert_classifier_test.py`. + +### Classes +- **Format**: All class names must use CapWords (PascalCase). +- **Pattern**: Follow the pattern `<ModelName><Component>`. +- **Examples**: `DistilBertBackbone`, `DistilBertTokenizer`, `DistilBertClassifier`, `DistilBertPreprocessor`. + +### Functions and Methods +- **Format**: Use lowercase with underscores (snake_case). +- **Examples**: `from_preset()`, `call()`, `predict()`. 
+ +### Model Inputs +Use standardized names for model input arguments to ensure interoperability: +- **Text Models**: `token_ids`, `padding_mask` +- **Image Models**: `pixel_values` +- **Audio Models**: `audio_features` + +## Code Implementation Style + +### Backbone Models (`<model_name>_backbone.py`) + +**Structure**: The backbone model must be a class that inherits from `keras.Model`. + +**Implementation**: Use the Keras Functional API to define the model graph inside the class `__init__` method. + +**API**: Do not implement `from_preset()` in the initial PR for the backbone. This is added later with the presets. + +**Reusability**: Prefer using layers from `keras.layers` and `keras_nlp.layers`. Custom layers should only be implemented for significant architectural deviations not covered by existing Keras components. + +**Example Structure**: +```python +@keras_hub_export("keras_hub.models.MyModelBackbone") +class MyModelBackbone(Backbone): + """MyModel core network with hyperparameters. + + This backbone implements the base architecture for MyModel. + + Args: + vocabulary_size: int. The size of the token vocabulary. + num_layers: int. The number of transformer layers. + hidden_dim: int. The size of the transformer hidden state. + intermediate_dim: int. The output dimension of the first Dense layer. + dropout: float. Dropout probability for the Transformer encoder. + max_sequence_length: int. The maximum sequence length that this encoder + can consume. + dtype: string or `keras.mixed_precision.DTypePolicy`. The dtype to use + for model computations and weights. + + Example: + ```python + input_data = { + "token_ids": np.ones(shape=(1, 12), dtype="int32"), + "padding_mask": np.array([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0]]), + } + + # Pretrained MyModel backbone. + model = keras_hub.models.MyModelBackbone.from_preset("my_model_base") + model(input_data) + ``` + """ + + def __init__( + self, + vocabulary_size, + num_layers, + # ... 
other parameters + dtype=None, + **kwargs, + ): + super().__init__(dtype=dtype, **kwargs) + + # === Layers === + self.token_embedding = keras.layers.Embedding( + input_dim=vocabulary_size, + output_dim=hidden_dim, + name="token_embedding", + ) + # ... other layers + + # === Functional Model === + token_ids = keras.Input(shape=(None,), dtype="int32", name="token_ids") + padding_mask = keras.Input(shape=(None,), dtype="int32", name="padding_mask") + + # ... model graph definition + + super().__init__( + inputs={ + "token_ids": token_ids, + "padding_mask": padding_mask, + }, + outputs=outputs, + **kwargs, + ) +``` + +### Data Converters (`_tokenizer.py`, etc.) + +**Purpose**: Converters transform raw data (text, images, audio) into a numerical format. They handle tasks like vocabulary mapping, resizing, or feature extraction. + +**Structure**: +- Text models use a `Tokenizer` class (e.g., `MyModelTokenizer`). +- Image models use an `ImageConverter` class (e.g., `MyModelImageConverter`). +- Audio models use an `AudioConverter` class (e.g., `MyModelAudioConverter`). + +**Inheritance**: Subclass from the appropriate base class in KerasHub where available. + +**Example Tokenizer**: +```python +@keras_hub_export( + [ + "keras_hub.tokenizers.MyModelTokenizer", + "keras_hub.models.MyModelTokenizer", + ] +) +class MyModelTokenizer(WordPieceTokenizer): + """A MyModel tokenizer using WordPiece subword segmentation. + + This tokenizer class will tokenize raw strings into integer sequences and + is based on `keras_hub.tokenizers.WordPieceTokenizer`. + + Args: + vocabulary: list of strings or str. A list of strings or a string filename path. + lowercase: bool. If `True`, the input text will be first lowered before tokenization. + + Examples: + ```python + # Unbatched input. 
+ tokenizer = keras_hub.models.MyModelTokenizer.from_preset("my_model_base") + tokenizer("The quick brown fox jumped.") + ``` + """ + + backbone_cls = MyModelBackbone + + def __init__( + self, + vocabulary=None, + lowercase=False, + **kwargs, + ): + super().__init__(vocabulary=vocabulary, lowercase=lowercase, **kwargs) + + # Add special tokens + self.start_token_id = self.token_to_id("[CLS]") + self.end_token_id = self.token_to_id("[SEP]") + self.pad_token_id = self.token_to_id("[PAD]") + self.mask_token_id = self.token_to_id("[MASK]") +``` + +### Preprocessors (`_preprocessor.py`) + +**Purpose**: A preprocessor is a `keras.layers.Layer` that orchestrates the entire preprocessing pipeline, turning raw user input into model-ready tensors. + +**Structure**: It internally instantiates and uses the model's specific Converter (e.g., `MyModelTokenizer`). + +**Functionality**: It handles padding, truncation, generating attention masks, and formatting the output into a dictionary of tensors that match the backbone's input signature (e.g., `{"token_ids": ..., "padding_mask": ...}`). + +**Example Preprocessor**: +```python +@keras_hub_export("keras_hub.models.MyModelPreprocessor") +class MyModelPreprocessor(TextClassifierPreprocessor): + """MyModel preprocessing for text classification. + + This preprocessing layer will prepare inputs for text classification. + + Args: + tokenizer: `keras_hub.models.MyModelTokenizer`. A tokenizer instance. + sequence_length: int. The length of the packed inputs. 
+ + Examples: + ```python + preprocessor = keras_hub.models.MyModelPreprocessor.from_preset("my_model_base") + preprocessor("The quick brown fox jumped.") + ``` + """ + + backbone_cls = MyModelBackbone + tokenizer_cls = MyModelTokenizer + + def __init__( + self, + tokenizer, + sequence_length=512, + **kwargs, + ): + super().__init__(tokenizer=tokenizer, sequence_length=sequence_length, **kwargs) +``` + +### Task Models (`_.py`) + +**Purpose**: A task model combines a Backbone, a Preprocessor, and a task-specific head (e.g., a classification or generation head). + +**Structure**: It should be a class that inherits from `keras.Model`. + +**API**: Provide a simple, high-level API for end-users, such as `predict()`, `fit()`, and `generate()`. + +**Example Task Model**: +```python +@keras_hub_export("keras_hub.models.MyModelTextClassifier") +class MyModelTextClassifier(TextClassifier): + """MyModel text classification model. + + This model combines a MyModel backbone with a classification head. + + Args: + backbone: `keras_hub.models.MyModelBackbone`. A backbone instance. + preprocessor: `keras_hub.models.MyModelPreprocessor`. A preprocessor instance. + num_classes: int. Number of classes to predict. + + Examples: + ```python + classifier = keras_hub.models.MyModelTextClassifier.from_preset( + "my_model_base", + num_classes=2, + ) + classifier.predict(["What an amazing movie!", "A total waste of my time."]) + ``` + """ + + backbone_cls = MyModelBackbone + preprocessor_cls = MyModelPreprocessor + + def __init__( + self, + backbone, + preprocessor=None, + num_classes=2, + activation="softmax", + **kwargs, + ): + # ... implementation +``` + +### Presets (`_presets.py`) + +**Purpose**: This file defines a dictionary of preset configurations for the model. + +**Content**: Each entry includes the configuration arguments for the model (`config`), a description, and the URL to the pre-trained weights hosted on Kaggle (`weights_url`). 
+ +**Example Presets**: +```python +"""MyModel preset configurations.""" + +backbone_presets = { + "my_model_base_en": { + "metadata": { + "description": "Base MyModel model trained on English text.", + "params": 110000000, + "path": "my_model", + }, + "kaggle_handle": "kaggle://keras/my_model/keras/my_model_base_en/1", + }, + "my_model_large_en": { + "metadata": { + "description": "Large MyModel model trained on English text.", + "params": 340000000, + "path": "my_model", + }, + "kaggle_handle": "kaggle://keras/my_model/keras/my_model_large_en/1", + }, +} +``` + +## Docstrings and Type Hinting + +### Docstrings +- Use Google-style docstrings for all public classes, methods, and functions. +- The first line should be a concise summary. +- Include comprehensive examples showing usage patterns. +- Document all parameters, return values, and exceptions. + +### Type Hints +- KerasHub does not use type hints in function signatures or `__init__` methods. +- Type information is provided in the docstring Args section using the format `arg_name: type. description`. +- Focus on clear, descriptive parameter names and comprehensive docstrings. + +**Example of good docstring with type hints in Args section**: +```python +def load_vocabulary(vocab_path): + """Loads a vocabulary file into a dictionary. + + Args: + vocab_path: str. The path to the vocabulary file. Each line in the + file should contain a single token. + + Returns: + A dictionary mapping tokens to their integer IDs. + + Raises: + FileNotFoundError: If the vocabulary file does not exist. + """ + vocab = {} + with open(vocab_path, "r") as reader: + for index, token in enumerate(reader): + token = token.strip() + vocab[token] = index + return vocab +``` + +## Testing and Validation + +### Testing Requirements +Testing is a non-negotiable part of every contribution. Beyond the existence of test files, the tests themselves must follow standardized routines to ensure all core functionality is covered. 
+ +### Unit Tests +**Requirement**: Every `.py` file containing logic (backbone, tokenizer, task, etc.) must have a corresponding `_test.py` file. + +### Standardized Test Routines +KerasHub provides helper methods in the `TestCase` class that handle the standardized test routines. Users should use these methods instead of writing tests from scratch: + +#### 1. Basic Usage and Shape Inference +**Method**: Use `self.run_backbone_test()` for backbone models or `self.run_layer_test()` for layers. +**Purpose**: Verifies that the model can be instantiated and called with valid inputs, checks output shapes, and runs additional validation. + +#### 2. Variable Input Shapes +**Method**: Handled automatically by `self.run_backbone_test()` and `self.run_layer_test()`. +**Purpose**: Ensures the model works with dynamic input shapes (e.g., variable batch size or sequence length). + +#### 3. from_preset() Functionality +**Method**: Use `self.run_preset_test()` for testing preset loading. +**Purpose**: Confirms that all model presets can be loaded correctly and produce expected outputs. + +#### 4. Serialization (save() and load_model()) +**Method**: Use `self.run_model_saving_test()` for testing model serialization. +**Purpose**: Guarantees that the model can be saved and reloaded without losing its state. + +#### 5. Attached Preprocessor (for Task Models) +**Method**: Use `self.run_task_test()` for testing task models with preprocessors. +**Purpose**: Verifies the end-to-end functionality of a task model with raw inputs. 
+ +#### Available Test Helper Methods: +- `self.run_backbone_test()` - For backbone models +- `self.run_vision_backbone_test()` - For vision backbone models +- `self.run_layer_test()` - For individual layers +- `self.run_preprocessor_test()` - For preprocessors +- `self.run_task_test()` - For task models +- `self.run_preset_test()` - For testing preset loading +- `self.run_model_saving_test()` - For testing serialization + +### Example Test Structure +```python +import pytest +from keras import ops + +from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone +from keras_hub.src.tests.test_case import TestCase + + +class MyModelBackboneTest(TestCase): + def setUp(self): + self.init_kwargs = { + "vocabulary_size": 10, + "num_layers": 2, + "hidden_dim": 16, + "intermediate_dim": 32, + } + self.input_data = { + "token_ids": ops.ones((2, 5), dtype="int32"), + "padding_mask": ops.ones((2, 5), dtype="int32"), + } + + def test_backbone_basics(self): + self.run_backbone_test( + cls=MyModelBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + expected_output_shape=(2, 5, 16), + ) + + @pytest.mark.large + def test_saved_model(self): + self.run_model_saving_test( + cls=MyModelBackbone, + init_kwargs=self.init_kwargs, + input_data=self.input_data, + ) + + @pytest.mark.large + def test_smallest_preset(self): + self.run_preset_test( + cls=MyModelBackbone, + preset="my_model_base", + input_data=self.input_data, + expected_output_shape=(2, 5, 16), + expected_partial_output=ops.array([1.0, 2.0, 3.0, 4.0, 5.0]), + ) + + @pytest.mark.extra_large + def test_all_presets(self): + for preset in MyModelBackbone.presets: + self.run_preset_test( + cls=MyModelBackbone, + preset=preset, + input_data=self.input_data, + ) +``` + +## Validation Colab Notebooks + +### Requirement +Each pull request must include links to Colab notebooks that demonstrate numerical equivalence with the original model's implementation. 
+ +### Structure +Provide separate Colabs for each major component: + +1. **Backbone Validation**: Load original weights into your KerasHub backbone and show that for the same input tensor, the output tensor is numerically identical (or within a very small tolerance). + +2. **Converter/Preprocessor Validation**: Show that your preprocessor pipeline produces the same token IDs, padding masks, or pixel values as the original library's preprocessing functions. + +3. **End-to-End Validation**: Use `MyModelTask.from_preset()` to load your pre-trained model and run a full task (e.g., classification). The final output (e.g., logits, probabilities) must match the original model. + +## Import Conventions + +### Keras Imports +Prefer importing `keras` as a top-level object: +```python +import keras +from keras import ops +from keras import layers +``` + +❌ `tf.keras.activations.X` +✅ `keras.activations.X` + +❌ `layers.X` +✅ `keras.layers.X` or `keras_hub.layers.X` + +### KerasHub Imports +```python +from keras_hub.src.api_export import keras_hub_export +from keras_hub.src.models.backbone import Backbone +from keras_hub.src.tests.test_case import TestCase +``` + +## Layer Implementation Guidelines + +### Ideal Layer Style +When writing a new KerasHub layer (or tokenizer or metric), follow these guidelines: + +1. **Accept `**kwargs`** in `__init__` and forward this to the super class. +2. **Keep Python attributes** on the layer for each `__init__` argument to the layer. The name and value should match the passed value. +3. **Write a `get_config()`** which chains to super. +4. **Document thoroughly** including call behavior through a class level docstring. +5. **Include usage examples** using the full symbol location in `keras_hub`. +6. **Include reference citations** if applicable. + +### Example Layer Implementation +```python +@keras_hub_export("keras_hub.layers.MyCustomLayer") +class MyCustomLayer(keras.layers.Layer): + """A custom layer for specific functionality. 
+ + This layer implements [specific functionality] as described in + [reference paper]. It accepts [input description] and outputs + [output description]. + + Args: + param1: int. Description of parameter 1. + param2: str. Description of parameter 2. + + Example: + ```python + layer = keras_hub.layers.MyCustomLayer(param1=10, param2=20) + output = layer(input_tensor) + ``` + + Reference: + - [Author et al., Year](https://arxiv.org/abs/paper_id) + """ + + def __init__(self, param1, param2, **kwargs): + super().__init__(**kwargs) + self.param1 = param1 + self.param2 = param2 + + def build(self, input_shape): + # Layer building logic + super().build(input_shape) + + def call(self, inputs): + # Layer computation logic + return processed_output + + def get_config(self): + config = super().get_config() + config.update({ + "param1": self.param1, + "param2": self.param2, + }) + return config +``` + +## Checkpoint Conversion + +### Script Location +All checkpoint conversion scripts should be placed in `tools/checkpoint_conversion/`. 
+ +### Script Requirements +- Must be reusable and well-documented +- Should handle all presets for the model +- Must demonstrate numerical equivalence with original implementation +- Should include proper error handling and validation + +### Example Conversion Script Structure +```python +"""Convert MyModel checkpoints from original format to KerasHub.""" + +import argparse +import json +import os + +import keras +import numpy as np + +from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone + + +def convert_checkpoint(checkpoint_path, output_dir, preset_name): + """Convert a MyModel checkpoint to KerasHub format.""" + # Load original checkpoint + # Convert weights to KerasHub format + # Save in KerasHub format + # Validate numerical equivalence + pass + + +def main(): + parser = argparse.ArgumentParser(description="Convert MyModel checkpoints") + parser.add_argument("--checkpoint_path", required=True) + parser.add_argument("--output_dir", required=True) + parser.add_argument("--preset_name", required=True) + + args = parser.parse_args() + convert_checkpoint(args.checkpoint_path, args.output_dir, args.preset_name) + + +if __name__ == "__main__": + main() +``` + +## HuggingFace Converters + +### When to Add +If the model is being ported from HuggingFace, a converter must be added to `keras_hub/src/utils/transformers/`. + +### Converter Structure +Each HuggingFace converter should include: + +1. **Configuration conversion**: `convert_backbone_config()` function that maps HuggingFace config to KerasHub config +2. **Weight conversion**: `convert_weights()` function that maps HuggingFace weights to KerasHub weights +3. 
**Backbone class reference**: `backbone_cls` variable pointing to the KerasHub backbone class + +### Example HuggingFace Converter +```python +"""Convert MyModel from HuggingFace format to KerasHub.""" + +import numpy as np + +from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone +from keras_hub.src.utils.preset_utils import get_file + +backbone_cls = MyModelBackbone + + +def convert_backbone_config(transformers_config): + """Convert HuggingFace config to KerasHub config.""" + return { + "vocabulary_size": transformers_config["vocab_size"], + "num_layers": transformers_config["num_hidden_layers"], + "num_heads": transformers_config["num_attention_heads"], + "hidden_dim": transformers_config["hidden_size"], + "intermediate_dim": transformers_config["intermediate_size"], + } + + +def convert_weights(backbone, loader, transformers_config): + """Convert HuggingFace weights to KerasHub weights.""" + # Embedding layer + loader.port_weight( + keras_variable=backbone.get_layer("token_embedding").embeddings, + hf_weight_key="model.embed_tokens.weight", + ) + + # Transformer layers + for i in range(backbone.num_layers): + layer = backbone.get_layer(f"transformer_layer_{i}") + hf_prefix = f"model.layers.{i}" + + # Attention weights + loader.port_weight( + keras_variable=layer.attention.query_dense.kernel, + hf_weight_key=f"{hf_prefix}.self_attn.q_proj.weight", + hook_fn=lambda hf_tensor, keras_shape: np.transpose( + np.reshape(hf_tensor, (keras_shape[0], keras_shape[2], keras_shape[1])), + axes=(0, 2, 1), + ), + ) + # ... 
additional weight mappings +``` + +### Converter Test Requirements +- Test that the converter can load HuggingFace models correctly +- Verify class detection works for both backbone and task models +- Test with `load_weights=False` to ensure config conversion works +- Include numerical equivalence tests when possible + +### Example Converter Test +```python +"""Tests for MyModel HuggingFace converter.""" + +import pytest + +from keras_hub.src.models.backbone import Backbone +from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone +from keras_hub.src.models.my_model.my_model_text_classifier import MyModelTextClassifier +from keras_hub.src.models.text_classifier import TextClassifier +from keras_hub.src.tests.test_case import TestCase + + +class TestMyModelConverter(TestCase): + @pytest.mark.large + def test_convert_preset(self): + model = MyModelTextClassifier.from_preset( + "hf://huggingface/my-model-base", num_classes=2 + ) + prompt = "This is a test sentence." + model.predict([prompt]) + + @pytest.mark.large + def test_class_detection(self): + model = TextClassifier.from_preset( + "hf://huggingface/my-model-base", + num_classes=2, + load_weights=False, + ) + self.assertIsInstance(model, MyModelTextClassifier) + + model = Backbone.from_preset( + "hf://huggingface/my-model-base", + load_weights=False, + ) + self.assertIsInstance(model, MyModelBackbone) +``` + +## Code Quality Standards + +### Formatting +- Use `black` for code formatting +- Follow PEP 8 guidelines +- Use consistent indentation (4 spaces) + +### Error Handling +- Provide meaningful error messages +- Use appropriate exception types +- Include context in error messages + +### Performance +- Keep computation inside TensorFlow graph when possible +- Support XLA compilation where applicable +- Use efficient data structures and algorithms + +### Documentation +- All public APIs must be documented +- Include comprehensive examples +- Document edge cases and limitations +- Keep 
documentation up-to-date with code changes + +## Contributing Workflow + +### Step 1: Backbone Implementation +1. Implement the backbone model class +2. Add comprehensive unit tests +3. Create validation Colab notebook +4. Submit PR for review + +### Step 2: Tokenizer/Converter Implementation +1. Implement the tokenizer/converter class +2. Add unit tests +3. Create validation Colab notebook +4. Submit PR for review + +### Step 3: Presets and Checkpoint Conversion +1. Create presets configuration file +2. Implement checkpoint conversion script +3. **If porting from HuggingFace**: Add HuggingFace converter in `keras_hub/src/utils/transformers/` +4. Add `from_preset()` methods +5. Add comprehensive preset tests +6. Submit PR for review + +### Step 4: Task Models and Preprocessors (Optional) +1. Implement task-specific models +2. Implement preprocessors +3. Add comprehensive tests +4. Submit PR for review + +## Conclusion + +This style guide ensures consistency, maintainability, and quality across the KerasHub codebase. All contributors should follow these guidelines to maintain the high standards expected in the library. When in doubt, refer to existing implementations in the codebase for examples of proper patterns and conventions. 
\ No newline at end of file From c250ecd0411036ff8d885b841de9fa98154f1269 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 17:20:06 -0700 Subject: [PATCH 02/10] Update styleguide.md --- .gemini/styleguide.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index 3c4ec8c234..56cc2faa37 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -707,7 +707,6 @@ class TestMyModelConverter(TestCase): - Include context in error messages ### Performance -- Keep computation inside TensorFlow graph when possible - Support XLA compilation where applicable - Use efficient data structures and algorithms @@ -747,4 +746,4 @@ class TestMyModelConverter(TestCase): ## Conclusion -This style guide ensures consistency, maintainability, and quality across the KerasHub codebase. All contributors should follow these guidelines to maintain the high standards expected in the library. When in doubt, refer to existing implementations in the codebase for examples of proper patterns and conventions. \ No newline at end of file +This style guide ensures consistency, maintainability, and quality across the KerasHub codebase. All contributors should follow these guidelines to maintain the high standards expected in the library. When in doubt, refer to existing implementations in the codebase for examples of proper patterns and conventions. 
From 59228de3a0364ce68a1a55b496d6f248e7220811 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 17:21:30 -0700 Subject: [PATCH 03/10] Update styleguide.md --- .gemini/styleguide.md | 32 -------------------------------- 1 file changed, 32 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index 56cc2faa37..0a13857a68 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -715,35 +715,3 @@ class TestMyModelConverter(TestCase): - Include comprehensive examples - Document edge cases and limitations - Keep documentation up-to-date with code changes - -## Contributing Workflow - -### Step 1: Backbone Implementation -1. Implement the backbone model class -2. Add comprehensive unit tests -3. Create validation Colab notebook -4. Submit PR for review - -### Step 2: Tokenizer/Converter Implementation -1. Implement the tokenizer/converter class -2. Add unit tests -3. Create validation Colab notebook -4. Submit PR for review - -### Step 3: Presets and Checkpoint Conversion -1. Create presets configuration file -2. Implement checkpoint conversion script -3. **If porting from HuggingFace**: Add HuggingFace converter in `keras_hub/src/utils/transformers/` -4. Add `from_preset()` methods -5. Add comprehensive preset tests -6. Submit PR for review - -### Step 4: Task Models and Preprocessors (Optional) -1. Implement task-specific models -2. Implement preprocessors -3. Add comprehensive tests -4. Submit PR for review - -## Conclusion - -This style guide ensures consistency, maintainability, and quality across the KerasHub codebase. All contributors should follow these guidelines to maintain the high standards expected in the library. When in doubt, refer to existing implementations in the codebase for examples of proper patterns and conventions. 
From f2a17e33625be56afbef422447e2ee65e69ef7a3 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 17:26:25 -0700 Subject: [PATCH 04/10] Update styleguide.md --- .gemini/styleguide.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index 0a13857a68..e2d0c9a759 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -3,7 +3,7 @@ - **Modularity**: Models are broken down into distinct, reusable components: Backbone, Converter (Tokenizer, ImageConverter, etc.), Preprocessor, and Task. - **Consistency**: Strict adherence to naming and file structure conventions is crucial for predictability and maintainability. - **Validation**: Every component must be rigorously validated against the original model's implementation. Numerical equivalence is a primary requirement, demonstrated via Colab notebooks. -- **Reusability**: Prioritize using existing layers from `keras.layers` and `keras_nlp.layers` before implementing custom logic. +- **Reusability**: Prioritize using existing layers from `keras.layers` and `keras_hub.layers` before implementing custom logic. - **Backend Agnostic**: All code must be backend-agnostic, supporting TensorFlow, JAX, and PyTorch backends. ## Directory and File Structure @@ -77,7 +77,7 @@ Use standardized names for model input arguments to ensure interoperability: **API**: Do not implement `from_preset()` in the initial PR for the backbone. This is added later with the presets. -**Reusability**: Prefer using layers from `keras.layers` and `keras_nlp.layers`. Custom layers should only be implemented for significant architectural deviations not covered by existing Keras components. +**Reusability**: Prefer using layers from `keras.layers` and `keras_hub.layers`. Custom layers should only be implemented for significant architectural deviations not covered by existing Keras components. 
**Example Structure**: ```python From dd3acb466df750f53e3f8527a2448c4e8d973a58 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 19:12:17 -0700 Subject: [PATCH 05/10] Update styleguide.md --- .gemini/styleguide.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index e2d0c9a759..6c6bd70e7d 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -75,8 +75,6 @@ Use standardized names for model input arguments to ensure interoperability: **Implementation**: Use the Keras Functional API to define the model graph inside the class `__init__` method. -**API**: Do not implement `from_preset()` in the initial PR for the backbone. This is added later with the presets. - **Reusability**: Prefer using layers from `keras.layers` and `keras_hub.layers`. Custom layers should only be implemented for significant architectural deviations not covered by existing Keras components. **Example Structure**: From 93d9b977b2ebae4588c08fc806b52179e9293d36 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 19:28:54 -0700 Subject: [PATCH 06/10] Update styleguide.md --- .gemini/styleguide.md | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index 6c6bd70e7d..03c2dfeb45 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -2,9 +2,9 @@ - **Modularity**: Models are broken down into distinct, reusable components: Backbone, Converter (Tokenizer, ImageConverter, etc.), Preprocessor, and Task. - **Consistency**: Strict adherence to naming and file structure conventions is crucial for predictability and maintainability. -- **Validation**: Every component must be rigorously validated against the original model's implementation. Numerical equivalence is a primary requirement, demonstrated via Colab notebooks. 
+- **Validation**: Every component must be validated against the original model's implementation. Numerical equivalence is a primary requirement, demonstrated via Colab notebooks. - **Reusability**: Prioritize using existing layers from `keras.layers` and `keras_hub.layers` before implementing custom logic. -- **Backend Agnostic**: All code must be backend-agnostic, supporting TensorFlow, JAX, and PyTorch backends. +- **Backend Agnostic**: All code must be Keras 3 backend-agnostic, supporting TensorFlow, JAX, and PyTorch backends. ## Directory and File Structure @@ -63,9 +63,12 @@ keras_hub/src/utils/transformers/ ### Model Inputs Use standardized names for model input arguments to ensure interoperability: -- **Text Models**: `token_ids`, `padding_mask` -- **Image Models**: `pixel_values` -- **Audio Models**: `audio_features` +- **Text Models**: `token_ids`, `padding_mask`, etc. +- **Image Models**: `pixel_values`, etc. +- **Audio Models**: `audio_features`, etc. + +### Class arg names +Use standardized argument names that are consistent with other models in the repository. ## Code Implementation Style @@ -469,10 +472,8 @@ from keras import ops from keras import layers ``` -❌ `tf.keras.activations.X` ✅ `keras.activations.X` -❌ `layers.X` ✅ `keras.layers.X` or `keras_hub.layers.X` ### KerasHub Imports From 8dee218b2098982ff31fad0b13b79c4b523b1d3a Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 21:34:37 -0700 Subject: [PATCH 07/10] Update styleguide.md --- .gemini/styleguide.md | 17 ++--------------- 1 file changed, 2 insertions(+), 15 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index 03c2dfeb45..c1a3457906 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -464,18 +464,6 @@ Provide separate Colabs for each major component: ## Import Conventions -### Keras Imports -Prefer importing `keras` as a top-level object: -```python -import keras -from keras import ops -from keras import layers
-``` - -✅ `keras.activations.X` - -✅ `keras.layers.X` or `keras_hub.layers.X` - ### KerasHub Imports ```python from keras_hub.src.api_export import keras_hub_export @@ -696,9 +684,8 @@ class TestMyModelConverter(TestCase): ## Code Quality Standards ### Formatting -- Use `black` for code formatting -- Follow PEP 8 guidelines -- Use consistent indentation (4 spaces) +- Use `ruff` for code formatting +- If code format test fails, add a comment to run code formatting using `pre-commit run --all-files` ### Error Handling - Provide meaningful error messages From cefd1e91965219197d1da4bd0427e65f7819ce97 Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 21:40:21 -0700 Subject: [PATCH 08/10] use calusallm example --- .gemini/styleguide.md | 163 +++++++++++++++++++++++++++++------------- 1 file changed, 115 insertions(+), 48 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index c1a3457906..cbad065f18 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -210,33 +210,56 @@ class MyModelTokenizer(WordPieceTokenizer): **Example Preprocessor**: ```python -@keras_hub_export("keras_hub.models.MyModelPreprocessor") -class MyModelPreprocessor(TextClassifierPreprocessor): - """MyModel preprocessing for text classification. - - This preprocessing layer will prepare inputs for text classification. - +@keras_hub_export("keras_hub.models.MyModelCausalLMPreprocessor") +class MyModelCausalLMPreprocessor(CausalLMPreprocessor): + """MyModel Causal LM preprocessor. + + This preprocessing layer is meant for use with + `keras_hub.models.MyModelCausalLM`. By default, it will take in batches of + strings, and return outputs in a `(x, y, sample_weight)` format, where the + `y` label is the next token id in the `x` sequence. + + For use with generation, the layer also exposes two methods + `generate_preprocess()` and `generate_postprocess()`. 
When this preprocessor + is attached to a `keras_hub.models.MyModelCausalLM` instance, these methods + will be called implicitly in `generate()`. They can also be called + standalone (e.g. to precompute preprocessing inputs for generation in a + separate process). + Args: - tokenizer: `keras_hub.models.MyModelTokenizer`. A tokenizer instance. - sequence_length: int. The length of the packed inputs. - + tokenizer: A `keras_hub.models.MyModelTokenizer` instance. + sequence_length: The length of the packed inputs. + add_start_token: If `True`, the preprocessor will prepend the tokenizer + start token to each input sequence. + add_end_token: If `True`, the preprocessor will append the tokenizer + end token to each input sequence. + Examples: ```python - preprocessor = keras_hub.models.MyModelPreprocessor.from_preset("my_model_base") + # Load the preprocessor from a preset. + preprocessor = keras_hub.models.MyModelCausalLMPreprocessor.from_preset( + "my_model_base" + ) + + # Tokenize and pack a single sentence. preprocessor("The quick brown fox jumped.") + + # Tokenize a batch of sentences. + preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) + + # Prepare tokens for generation (no end token). + preprocessor.generate_preprocess(["The quick brown fox jumped."]) + + # Map generation outputs back to strings. 
+ preprocessor.generate_postprocess({ + 'token_ids': np.array([[2, 714, 4320, 8426, 25341, 32292, 235265, 0]]), + 'padding_mask': np.array([[ 1, 1, 1, 1, 1, 1, 1, 0]]), + }) ``` """ - + backbone_cls = MyModelBackbone tokenizer_cls = MyModelTokenizer - - def __init__( - self, - tokenizer, - sequence_length=512, - **kwargs, - ): - super().__init__(tokenizer=tokenizer, sequence_length=sequence_length, **kwargs) ``` ### Task Models (`_.py`) @@ -249,39 +272,84 @@ class MyModelPreprocessor(TextClassifierPreprocessor): **Example Task Model**: ```python -@keras_hub_export("keras_hub.models.MyModelTextClassifier") -class MyModelTextClassifier(TextClassifier): - """MyModel text classification model. - - This model combines a MyModel backbone with a classification head. - +@keras_hub_export("keras_hub.models.MyModelCausalLM") +class MyModelCausalLM(CausalLM): + """An end-to-end MyModel for causal language modeling. + + A causal language model (LM) predicts the next token based on previous + tokens. This task setup can be used to train the model unsupervised on + plain text input, or to autoregressively generate plain text similar to + the data used for training. This task can be used for pre-training or + fine-tuning a MyModel, simply by calling `fit()`. + + This model has a `generate()` method, which generates text based on a + prompt. The generation strategy used is controlled by an additional + `sampler` argument on `compile()`. You can recompile the model with + different `keras_hub.samplers` objects to control the generation. By + default, `"greedy"` sampling will be used. + + This model can optionally be configured with a `preprocessor` layer, in + which case it will automatically apply preprocessing to string inputs during + `fit()`, `predict()`, `evaluate()` and `generate()`. This is done by default + when creating the model with `from_preset()`. + Args: - backbone: `keras_hub.models.MyModelBackbone`. A backbone instance. 
- preprocessor: `keras_hub.models.MyModelPreprocessor`. A preprocessor instance. - num_classes: int. Number of classes to predict. - + backbone: A `keras_hub.models.MyModelBackbone` instance. + preprocessor: A `keras_hub.models.MyModelCausalLMPreprocessor` or `None`. + If `None`, this model will not apply preprocessing, and inputs + should be preprocessed before calling the model. + Examples: + + Use `generate()` to do text generation. ```python - classifier = keras_hub.models.MyModelTextClassifier.from_preset( - "my_model_base", - num_classes=2, - ) - classifier.predict(["What an amazing movie!", "A total waste of my time."]) + model = keras_hub.models.MyModelCausalLM.from_preset("my_model_base") + model.generate("I want to say", max_length=30) + + # Generate with batched prompts. + model.generate(["This is a", "Where are you"], max_length=30) + ``` + + Compile the `generate()` function with a custom sampler. + ```python + model = keras_hub.models.MyModelCausalLM.from_preset("my_model_base") + model.compile(sampler="top_k") + model.generate("I want to say", max_length=30) + + model.compile(sampler=keras_hub.samplers.BeamSampler(num_beams=2)) + model.generate("I want to say", max_length=30) + ``` + + Call `fit()` on a single batch. + ```python + features = ["The quick brown fox jumped.", "I forgot my homework."] + model = keras_hub.models.MyModelCausalLM.from_preset("my_model_base") + model.fit(x=features, batch_size=2) ``` """ - + backbone_cls = MyModelBackbone - preprocessor_cls = MyModelPreprocessor - + preprocessor_cls = MyModelCausalLMPreprocessor + def __init__( self, backbone, preprocessor=None, - num_classes=2, - activation="softmax", **kwargs, ): - # ... 
implementation + # === Layers === + self.backbone = backbone + self.preprocessor = preprocessor + + # === Functional Model === + inputs = backbone.input + hidden_states = backbone(inputs) + outputs = backbone.token_embedding(hidden_states, reverse=True) + super().__init__( + inputs=inputs, + outputs=outputs, + **kwargs, + ) ``` ### Presets (`_presets.py`) @@ -651,28 +719,27 @@ import pytest from keras_hub.src.models.backbone import Backbone from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone -from keras_hub.src.models.my_model.my_model_text_classifier import MyModelTextClassifier -from keras_hub.src.models.text_classifier import TextClassifier +from keras_hub.src.models.my_model.my_model_causal_lm import MyModelCausalLM +from keras_hub.src.models.causal_lm import CausalLM from keras_hub.src.tests.test_case import TestCase class TestMyModelConverter(TestCase): @pytest.mark.large def test_convert_preset(self): - model = MyModelTextClassifier.from_preset( - "hf://huggingface/my-model-base", num_classes=2 + model = MyModelCausalLM.from_preset( + "hf://huggingface/my-model-base" ) prompt = "This is a test sentence." 
- model.predict([prompt]) + model.generate(prompt, max_length=10) @pytest.mark.large def test_class_detection(self): - model = TextClassifier.from_preset( + model = CausalLM.from_preset( "hf://huggingface/my-model-base", - num_classes=2, load_weights=False, ) - self.assertIsInstance(model, MyModelTextClassifier) + self.assertIsInstance(model, MyModelCausalLM) model = Backbone.from_preset( "hf://huggingface/my-model-base", From 246c6105da77937cc0fa5ad5103b1b9c0e0a269b Mon Sep 17 00:00:00 2001 From: Divyashree Sreepathihalli Date: Mon, 18 Aug 2025 21:43:20 -0700 Subject: [PATCH 09/10] update styleguide --- .gemini/styleguide.md | 79 ++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 50 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index cbad065f18..bba8b4767a 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -210,56 +210,34 @@ class MyModelTokenizer(WordPieceTokenizer): **Example Preprocessor**: ```python -@keras_hub_export("keras_hub.models.MyModelCausalLMPreprocessor") -class MyModelCausalLMPreprocessor(CausalLMPreprocessor): - """MyModel Causal LM preprocessor. - - This preprocessing layer is meant for use with - `keras_hub.models.MyModelCausalLM`. By default, it will take in batches of - strings, and return outputs in a `(x, y, sample_weight)` format, where the - `y` label is the next token id in the `x` sequence. - - For use with generation, the layer also exposes two methods - `generate_preprocess()` and `generate_postprocess()`. When this preprocessor - is attached to a `keras_hub.models.MyModelCausalLM` instance, these methods - will be called implicitly in `generate()`. They can also be called - standalone (e.g. to precompute preprocessing inputs for generation in a - separate process). - +@keras_hub_export("keras_hub.models.MyModelPreprocessor") +class MyModelPreprocessor(TextClassifierPreprocessor): + """MyModel preprocessing for text classification. 
+ + This preprocessing layer will prepare inputs for text classification. + Args: - tokenizer: A `keras_hub.models.MyModelTokenizer` instance. - sequence_length: The length of the packed inputs. - add_start_token: If `True`, the preprocessor will prepend the tokenizer - start token to each input sequence. - add_end_token: If `True`, the preprocessor will append the tokenizer - end token to each input sequence. - + tokenizer: `keras_hub.models.MyModelTokenizer`. A tokenizer instance. + sequence_length: int. The length of the packed inputs. + Examples: ```python - # Load the preprocessor from a preset. - preprocessor = keras_hub.models.MyModelCausalLMPreprocessor.from_preset( - "my_model_base" - ) - - # Tokenize and pack a single sentence. + preprocessor = keras_hub.models.MyModelPreprocessor.from_preset("my_model_base") preprocessor("The quick brown fox jumped.") - - # Tokenize a batch of sentences. - preprocessor(["The quick brown fox jumped.", "Call me Ishmael."]) - - # Prepare tokens for generation (no end token). - preprocessor.generate_preprocess(["The quick brown fox jumped."]) - - # Map generation outputs back to strings. 
- preprocessor.generate_postprocess({ - 'token_ids': np.array([[2, 714, 4320, 8426, 25341, 32292, 235265, 0]]), - 'padding_mask': np.array([[ 1, 1, 1, 1, 1, 1, 1, 0]]), - }) ``` """ - + backbone_cls = MyModelBackbone tokenizer_cls = MyModelTokenizer + + def __init__( + self, + tokenizer, + sequence_length=512, + **kwargs, + ): + super().__init__(tokenizer=tokenizer, sequence_length=sequence_length, **kwargs) +``` ``` ### Task Models (`_.py`) @@ -329,7 +307,7 @@ class MyModelCausalLM(CausalLM): """ backbone_cls = MyModelBackbone - preprocessor_cls = MyModelCausalLMPreprocessor + preprocessor_cls = MyModelPreprocessor def __init__( self, @@ -719,27 +697,28 @@ import pytest from keras_hub.src.models.backbone import Backbone from keras_hub.src.models.my_model.my_model_backbone import MyModelBackbone -from keras_hub.src.models.my_model.my_model_causal_lm import MyModelCausalLM -from keras_hub.src.models.causal_lm import CausalLM +from keras_hub.src.models.my_model.my_model_text_classifier import MyModelTextClassifier +from keras_hub.src.models.text_classifier import TextClassifier from keras_hub.src.tests.test_case import TestCase class TestMyModelConverter(TestCase): @pytest.mark.large def test_convert_preset(self): - model = MyModelCausalLM.from_preset( - "hf://huggingface/my-model-base" + model = MyModelTextClassifier.from_preset( + "hf://huggingface/my-model-base", num_classes=2 ) prompt = "This is a test sentence." 
- model.generate(prompt, max_length=10) + model.predict([prompt]) @pytest.mark.large def test_class_detection(self): - model = CausalLM.from_preset( + model = TextClassifier.from_preset( "hf://huggingface/my-model-base", + num_classes=2, load_weights=False, ) - self.assertIsInstance(model, MyModelCausalLM) + self.assertIsInstance(model, MyModelTextClassifier) model = Backbone.from_preset( "hf://huggingface/my-model-base", From 7a7cd9748c4c133bededc42a728a9004aa060a2e Mon Sep 17 00:00:00 2001 From: divyashreepathihalli Date: Mon, 25 Aug 2025 19:12:31 +0000 Subject: [PATCH 10/10] address comments --- .gemini/styleguide.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/.gemini/styleguide.md b/.gemini/styleguide.md index bba8b4767a..b3c3d7ea42 100644 --- a/.gemini/styleguide.md +++ b/.gemini/styleguide.md @@ -334,7 +334,8 @@ class MyModelCausalLM(CausalLM): **Purpose**: This file defines a dictionary of preset configurations for the model. -**Content**: Each entry includes the configuration arguments for the model (`config`), a description, and the URL to the pre-trained weights hosted on Kaggle (`weights_url`). +**Content**: Each entry includes the configuration arguments for the model (`config`), a description, and the URL to the pre-trained weights hosted on Kaggle (`weights_url`). Preset names must be in snake_case. + **Example Presets**: ```python @@ -400,7 +401,7 @@ def load_vocabulary(vocab_path): ### Testing Requirements Testing is a non-negotiable part of every contribution. Beyond the existence of test files, the tests themselves must follow standardized routines to ensure all core functionality is covered. - +When a test is set up, the test inputs should be small to allow for fast testing. Example: test with a (16, 16, 3) image size instead of (256, 256, 3). ### Unit Tests **Requirement**: Every `.py` file containing logic (backbone, tokenizer, task, etc.) must have a corresponding `_test.py` file.
@@ -626,7 +627,9 @@ if __name__ == "__main__": ## HuggingFace Converters ### When to Add -If the model is being ported from HuggingFace, a converter must be added to `keras_hub/src/utils/transformers/`. +If the model is being ported from HuggingFace Transformers, a converter must be added to `keras_hub/src/utils/transformers/`. +If the model is an image model with presets available on Timm, a converter must be added to `keras_hub/src/utils/timm/`. + ### Converter Structure Each HuggingFace converter should include: