diff --git a/docs/source/en/model_doc/granite.md b/docs/source/en/model_doc/granite.md index ef8bb0867b6e..d968550ee80a 100644 --- a/docs/source/en/model_doc/granite.md +++ b/docs/source/en/model_doc/granite.md @@ -124,3 +124,8 @@ print(tokenizer.decode(outputs[0], skip_special_tokens=True)) [[autodoc]] GraniteForCausalLM - forward + +## GraniteForSequenceClassification + +[[autodoc]] GraniteForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/granitemoe.md b/docs/source/en/model_doc/granitemoe.md index 32616c07a289..dfbc159f404d 100644 --- a/docs/source/en/model_doc/granitemoe.md +++ b/docs/source/en/model_doc/granitemoe.md @@ -78,3 +78,8 @@ This model was contributed by [mayank-mishra](https://huggingface.co/mayank-mish [[autodoc]] GraniteMoeForCausalLM - forward + +## GraniteMoeForSequenceClassification + +[[autodoc]] GraniteMoeForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/granitemoehybrid.md b/docs/source/en/model_doc/granitemoehybrid.md index cb3db122e65d..3059a834b57d 100644 --- a/docs/source/en/model_doc/granitemoehybrid.md +++ b/docs/source/en/model_doc/granitemoehybrid.md @@ -87,3 +87,8 @@ This HF implementation is contributed by [Sukriti Sharma](https://huggingface.co [[autodoc]] GraniteMoeHybridForCausalLM - forward + +## GraniteMoeHybridForSequenceClassification + +[[autodoc]] GraniteMoeHybridForSequenceClassification + - forward diff --git a/docs/source/en/model_doc/granitemoeshared.md b/docs/source/en/model_doc/granitemoeshared.md index 9db702c9f705..22067b972aab 100644 --- a/docs/source/en/model_doc/granitemoeshared.md +++ b/docs/source/en/model_doc/granitemoeshared.md @@ -63,3 +63,8 @@ This HF implementation is contributed by [Mayank Mishra](https://huggingface.co/ [[autodoc]] GraniteMoeSharedForCausalLM - forward + +## GraniteMoeSharedForSequenceClassification + +[[autodoc]] GraniteMoeSharedForSequenceClassification + - forward diff --git a/src/transformers/models/auto/modeling_auto.py b/src/transformers/models/auto/modeling_auto.py index 952ff1da2bfa..4d207d23531f 100644 --- a/src/transformers/models/auto/modeling_auto.py +++ b/src/transformers/models/auto/modeling_auto.py @@ -1195,6 +1195,10 @@ class _BaseModelWithGenerate(PreTrainedModel, GenerationMixin): ("gpt_neox", "GPTNeoXForSequenceClassification"), ("gpt_oss", "GptOssForSequenceClassification"), ("gptj", "GPTJForSequenceClassification"), + ("granite", "GraniteForSequenceClassification"), + ("granitemoe", "GraniteMoeForSequenceClassification"), + ("granitemoehybrid", "GraniteMoeHybridForSequenceClassification"), + ("granitemoeshared", "GraniteMoeSharedForSequenceClassification"), ("helium", "HeliumForSequenceClassification"), ("hunyuan_v1_dense", "HunYuanDenseV1ForSequenceClassification"), ("hunyuan_v1_moe", "HunYuanMoEV1ForSequenceClassification"), diff --git a/src/transformers/models/granite/modeling_granite.py b/src/transformers/models/granite/modeling_granite.py index 1541f6cad55d..7e4cf50b54cd 100644 --- a/src/transformers/models/granite/modeling_granite.py +++ b/src/transformers/models/granite/modeling_granite.py @@ -30,7 +30,7 @@ from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func from ...masking_utils import create_causal_mask -from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -588,4 +588,8 @@ def forward( ) -__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"] +class GraniteForSequenceClassification(GenericForSequenceClassification, GranitePreTrainedModel): + pass + + +__all__ = ["GraniteForCausalLM", "GraniteForSequenceClassification", "GraniteModel", "GranitePreTrainedModel"] diff --git a/src/transformers/models/granite/modular_granite.py b/src/transformers/models/granite/modular_granite.py index a4e979fb2324..c965dcf3a6bb 100644 --- a/src/transformers/models/granite/modular_granite.py +++ b/src/transformers/models/granite/modular_granite.py @@ -18,6 +18,7 @@ from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast from ...processing_utils import Unpack from ...utils import TransformersKwargs, logging @@ -276,4 +277,8 @@ def forward( ) -__all__ = ["GraniteForCausalLM", "GraniteModel", "GranitePreTrainedModel"] +class GraniteForSequenceClassification(GenericForSequenceClassification, GranitePreTrainedModel): + pass + + +__all__ = ["GraniteForCausalLM", "GraniteForSequenceClassification", "GraniteModel", "GranitePreTrainedModel"] diff --git a/src/transformers/models/granitemoe/modeling_granitemoe.py b/src/transformers/models/granitemoe/modeling_granitemoe.py index e6d98911f362..0c4671d63d26 100644 --- a/src/transformers/models/granitemoe/modeling_granitemoe.py +++ b/src/transformers/models/granitemoe/modeling_granitemoe.py @@ -32,7 +32,7 @@ from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func from ...masking_utils import create_causal_mask -from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -741,4 +741,13 @@ def forward( ) -__all__ = ["GraniteMoeForCausalLM", "GraniteMoeModel", "GraniteMoePreTrainedModel"] +class GraniteMoeForSequenceClassification(GenericForSequenceClassification, GraniteMoePreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeForCausalLM", + "GraniteMoeForSequenceClassification", + "GraniteMoeModel", + "GraniteMoePreTrainedModel", +] diff --git a/src/transformers/models/granitemoe/modular_granitemoe.py b/src/transformers/models/granitemoe/modular_granitemoe.py index 88c50171096e..080122a8159c 100644 --- a/src/transformers/models/granitemoe/modular_granitemoe.py +++ b/src/transformers/models/granitemoe/modular_granitemoe.py @@ -20,6 +20,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache, DynamicCache from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import PreTrainedModel from ...processing_utils import Unpack @@ -323,4 +324,13 @@ def forward( ) -__all__ = ["GraniteMoeForCausalLM", "GraniteMoeModel", "GraniteMoePreTrainedModel"] +class GraniteMoeForSequenceClassification(GenericForSequenceClassification, GraniteMoePreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeForCausalLM", + "GraniteMoeForSequenceClassification", + "GraniteMoeModel", + "GraniteMoePreTrainedModel", +] diff --git a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py index 2e1625742cce..f76f8c520f34 100644 --- a/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modeling_granitemoehybrid.py @@ -33,7 +33,7 @@ from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func from ...integrations.hub_kernels import lazy_load_kernel from ...masking_utils import create_causal_mask -from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import BaseModelOutputWithPast, MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -1588,4 +1588,13 @@ def prepare_inputs_for_generation( return model_inputs -__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"] +class GraniteMoeHybridForSequenceClassification(GenericForSequenceClassification, GraniteMoeHybridPreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeHybridForCausalLM", + "GraniteMoeHybridForSequenceClassification", + "GraniteMoeHybridModel", + "GraniteMoeHybridPreTrainedModel", +] diff --git a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py index 9b9bd65bf9b0..50157c39ca40 100644 --- a/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py +++ b/src/transformers/models/granitemoehybrid/modular_granitemoehybrid.py @@ -20,6 +20,7 @@ from ... import initialization as init from ...cache_utils import Cache from ...masking_utils import create_causal_mask +from ...modeling_layers import GenericForSequenceClassification from ...modeling_outputs import BaseModelOutputWithPast, MoeModelOutputWithPast from ...modeling_utils import ALL_ATTENTION_FUNCTIONS from ...processing_utils import Unpack @@ -359,4 +360,13 @@ def prepare_inputs_for_generation( return model_inputs -__all__ = ["GraniteMoeHybridForCausalLM", "GraniteMoeHybridModel", "GraniteMoeHybridPreTrainedModel"] +class GraniteMoeHybridForSequenceClassification(GenericForSequenceClassification, GraniteMoeHybridPreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeHybridForCausalLM", + "GraniteMoeHybridForSequenceClassification", + "GraniteMoeHybridModel", + "GraniteMoeHybridPreTrainedModel", +] diff --git a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py index 91f6a4ed5158..233300986387 100644 --- a/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modeling_granitemoeshared.py @@ -31,7 +31,7 @@ from ...generation import GenerationMixin from ...integrations import use_kernel_forward_from_hub, use_kernel_func_from_hub, use_kernelized_func from ...masking_utils import create_causal_mask -from ...modeling_layers import GradientCheckpointingLayer +from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer from ...modeling_outputs import MoeCausalLMOutputWithPast, MoeModelOutputWithPast from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel @@ -810,4 +810,13 @@ def forward( ) -__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"] +class GraniteMoeSharedForSequenceClassification(GenericForSequenceClassification, GraniteMoeSharedPreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeSharedForCausalLM", + "GraniteMoeSharedForSequenceClassification", + "GraniteMoeSharedModel", + "GraniteMoeSharedPreTrainedModel", +] diff --git a/src/transformers/models/granitemoeshared/modular_granitemoeshared.py b/src/transformers/models/granitemoeshared/modular_granitemoeshared.py index efb03ad06a87..a089c040ee1a 100644 --- a/src/transformers/models/granitemoeshared/modular_granitemoeshared.py +++ b/src/transformers/models/granitemoeshared/modular_granitemoeshared.py @@ -19,6 +19,7 @@ from ...activations import ACT2FN from ...cache_utils import Cache +from ...modeling_layers import GenericForSequenceClassification from ...processing_utils import Unpack from ...utils import logging from ..granitemoe.modeling_granitemoe import ( @@ -153,4 +154,13 @@ def __init__(self, config: GraniteMoeSharedConfig): self.post_init() -__all__ = ["GraniteMoeSharedForCausalLM", "GraniteMoeSharedModel", "GraniteMoeSharedPreTrainedModel"] +class GraniteMoeSharedForSequenceClassification(GenericForSequenceClassification, GraniteMoeSharedPreTrainedModel): + pass + + +__all__ = [ + "GraniteMoeSharedForCausalLM", + "GraniteMoeSharedForSequenceClassification", + "GraniteMoeSharedModel", + "GraniteMoeSharedPreTrainedModel", +] diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index b1f12981d4db..ce77152be347 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -35,6 +35,7 @@ from transformers import ( GraniteForCausalLM, + GraniteForSequenceClassification, GraniteModel, ) @@ -140,6 +141,16 @@ def create_and_check_model( result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = GraniteForSequenceClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -161,6 +172,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi ( GraniteModel, GraniteForCausalLM, + GraniteForSequenceClassification, ) if is_torch_available() else () @@ -169,6 +181,7 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi { "feature-extraction": GraniteModel, "text-generation": GraniteForCausalLM, + "text-classification": GraniteForSequenceClassification, } if is_torch_available() else {} @@ -189,6 +202,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + @require_torch_accelerator class GraniteIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 6e0cd53ed6a5..73d92a1a7ec6 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -34,6 +34,7 @@ from transformers import ( GraniteMoeForCausalLM, + GraniteMoeForSequenceClassification, GraniteMoeModel, ) @@ -139,6 +140,16 @@ def create_and_check_model( result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = GraniteMoeForSequenceClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -160,6 +171,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test ( GraniteMoeModel, GraniteMoeForCausalLM, + GraniteMoeForSequenceClassification, ) if is_torch_available() else () @@ -168,6 +180,7 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test { "feature-extraction": GraniteMoeModel, "text-generation": GraniteMoeForCausalLM, + "text-classification": GraniteMoeForSequenceClassification, } if is_torch_available() else {} @@ -188,6 +201,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + @require_torch_accelerator class GraniteMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py index 32246fe0212d..70b8cf9a4f3f 100644 --- a/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py +++ b/tests/models/granitemoehybrid/test_modeling_granitemoehybrid.py @@ -38,7 +38,7 @@ from ...generation.test_utils import GenerationTesterMixin from ...models.bamba.test_modeling_bamba import BambaModelTester from ...test_configuration_common import ConfigTester -from ...test_modeling_common import ModelTesterMixin +from ...test_modeling_common import ModelTesterMixin, ids_tensor from ...test_pipeline_mixin import PipelineTesterMixin @@ -47,6 +47,7 @@ from transformers import ( GraniteMoeHybridForCausalLM, + GraniteMoeHybridForSequenceClassification, GraniteMoeHybridModel, ) from transformers.models.granitemoehybrid.modeling_granitemoehybrid import HybridMambaAttentionDynamicCache @@ -64,11 +65,13 @@ def __init__( use_cache=False, shared_intermediate_size=174, layer_types=None, + type_sequence_label_size=2, ): super().__init__(parent) self.shared_intermediate_size = shared_intermediate_size self.layer_types = layer_types self.use_cache = use_cache + self.type_sequence_label_size = type_sequence_label_size def _update_layer_configs(self): super()._update_layer_configs() @@ -83,6 +86,30 @@ def get_config(self): layer_types=self.layer_types, ) + def prepare_config_and_inputs_for_sequence_classification(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = torch.tril(torch.ones_like(input_ids).to(torch_device)) + + sequence_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + + self._update_layer_configs() + config = self.get_config() + + return config, input_ids, input_mask, sequence_labels + + def create_and_check_for_sequence_classification(self, config, input_ids, input_mask, sequence_labels): + config.num_labels = self.num_labels + model = GraniteMoeHybridForSequenceClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + @require_torch class GraniteMoeHybridModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase): @@ -91,6 +118,7 @@ class GraniteMoeHybridModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin ( GraniteMoeHybridModel, GraniteMoeHybridForCausalLM, + GraniteMoeHybridForSequenceClassification, ) if is_torch_available() else () @@ -99,6 +127,7 @@ class GraniteMoeHybridModelTest(ModelTesterMixin, GenerationTesterMixin, Pipelin { "feature-extraction": GraniteMoeHybridModel, "text-generation": GraniteMoeHybridForCausalLM, + "text-classification": GraniteMoeHybridForSequenceClassification, } if is_torch_available() else {} @@ -141,6 +170,10 @@ def test_for_causal_lm(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_for_causal_lm(*config_and_inputs) + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs_for_sequence_classification() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + def test_decoder_model_past_with_large_inputs(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_decoder_model_past_large_inputs(*config_and_inputs) diff --git a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py index c86100c4c112..bc8e66304610 100644 --- a/tests/models/granitemoeshared/test_modeling_granitemoeshared.py +++ b/tests/models/granitemoeshared/test_modeling_granitemoeshared.py @@ -34,6 +34,7 @@ from transformers import ( GraniteMoeSharedForCausalLM, + GraniteMoeSharedForSequenceClassification, GraniteMoeSharedModel, ) @@ -142,6 +143,16 @@ def create_and_check_model( result = model(input_ids) self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + def create_and_check_for_sequence_classification( + self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels + ): + config.num_labels = self.num_labels + model = GraniteMoeSharedForSequenceClassification(config=config) + model.to(torch_device) + model.eval() + result = model(input_ids, attention_mask=input_mask, labels=sequence_labels) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + def prepare_config_and_inputs_for_common(self): config_and_inputs = self.prepare_config_and_inputs() ( @@ -163,6 +174,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes ( GraniteMoeSharedModel, GraniteMoeSharedForCausalLM, + GraniteMoeSharedForSequenceClassification, ) if is_torch_available() else () @@ -171,6 +183,7 @@ class GraniteMoeSharedModelTest(ModelTesterMixin, GenerationTesterMixin, unittes { "feature-extraction": GraniteMoeSharedModel, "text-generation": GraniteMoeSharedForCausalLM, + "text-classification": GraniteMoeSharedForSequenceClassification, } if is_torch_available() else {} @@ -191,6 +204,10 @@ def test_model(self): config_and_inputs = self.model_tester.prepare_config_and_inputs() self.model_tester.create_and_check_model(*config_and_inputs) + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + @require_torch_accelerator class GraniteMoeSharedIntegrationTest(unittest.TestCase):