Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tests/config/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ def granite_3_3_hf_config():
return _load_hf_config(fixture_path)


@pytest.fixture
def granite_4_hf_dense_hybrid_config():
    """Fixture providing a real granite-4-8b-dense HF config that masquerades
    as a granitemoehybrid model; early Granite 4 dense checkpoints shipped
    configs shaped like this."""
    config_path = FIXTURES_PATH / "ibm-granite" / "granite-4-8b-dense-hybrid" / "config.json"
    return _load_hf_config(config_path)


@pytest.fixture
def granite_4_hf_config():
"""Fixture providing real granite-4-8b-dense HF config."""
Expand Down
22 changes: 20 additions & 2 deletions tests/config/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,26 @@ def test_match_granite_3_3_cb_config(self, registry, granite_3_3_hf_config):
assert configurator.device_config is not None
assert "VLLM_DT_MAX_BATCH_TKV_LIMIT" in configurator.device_config.env_vars

def test_match_granite_4_cb_config(self, registry, granite_4_hf_config):
"""Test matching granite-4-8b-dense with CB config."""
def test_match_granite_4_dense_hybrid_config(self, registry, granite_4_hf_dense_hybrid_config):
    """Test matching granite-4-8b-dense configs that have type granitemoehybrid.

    The fixture config is really a dense model that was published with
    model_type "granitemoehybrid"; it must still resolve to the dense-hybrid
    configurator entry.
    """
    vllm_config = create_vllm_config(
        hf_config=granite_4_hf_dense_hybrid_config,
        world_size=4,
        max_model_len=32768,
        max_num_seqs=32,
    )

    configurator = registry.get_configurator_for_runtime(vllm_config)

    assert configurator is not None
    assert isinstance(configurator, ModelConfigurator)
    # This is really a dense model, but it has model type "granitemoehybrid".
    # It has the same overrides as the regular dense variant.
    assert configurator.model_config.name == "ibm-granite/granite-4-8b-dense-hybrid"
    assert configurator.device_config is not None

def test_match_granite_4_dense_config(self, registry, granite_4_hf_config):
"""Test matching granite-4-8b-dense configs that aren't spoofed moe hybrid models."""
vllm_config = create_vllm_config(
hf_config=granite_4_hf_config, world_size=4, max_model_len=32768, max_num_seqs=32
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"architectures": [
"GraniteMoeHybridForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attention_multiplier": 0.0078125,
"bos_token_id": 100257,
"embedding_multiplier": 12,
"eos_token_id": 100257,
"hidden_act": "silu",
"hidden_size": 4096,
"init_method": "mup",
"initializer_range": 0.1,
"intermediate_size": 12800,
"layer_types": [
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention"
],
"logits_scaling": 16,
"mamba_chunk_size": 256,
"mamba_conv_bias": true,
"mamba_d_conv": 4,
"mamba_d_head": 64,
"mamba_d_state": 256,
"mamba_expand": 2,
"mamba_n_groups": 1,
"mamba_n_heads": 128,
"mamba_proj_bias": false,
"max_position_embeddings": 131072,
"model_type": "granitemoehybrid",
"normalization_function": "rmsnorm",
"num_attention_heads": 32,
"num_experts_per_tok": 0,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"num_local_experts": 0,
"output_router_logits": false,
"pad_token_id": 100256,
"position_embedding_type": "rope",
"residual_multiplier": 0.22,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000000,
"router_aux_loss_coef": 0.01,
"shared_intermediate_size": 12800,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.56.0",
"use_cache": true,
"vocab_size": 100352
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"architectures": [
"GraniteMoeHybridForCausalLM"
"GraniteForCausalLM"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the diff between the old and new checkpoint configs

],
"attention_bias": false,
"attention_dropout": 0.0,
Expand All @@ -10,81 +10,22 @@
"eos_token_id": 100257,
"hidden_act": "silu",
"hidden_size": 4096,
"init_method": "mup",
"initializer_range": 0.1,
"intermediate_size": 12800,
"layer_types": [
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention"
],
"logits_scaling": 16,
"mamba_chunk_size": 256,
"mamba_conv_bias": true,
"mamba_d_conv": 4,
"mamba_d_head": 64,
"mamba_d_state": 256,
"mamba_expand": 2,
"mamba_n_groups": 1,
"mamba_n_heads": 128,
"mamba_proj_bias": false,
"max_position_embeddings": 131072,
"model_type": "granitemoehybrid",
"normalization_function": "rmsnorm",
"mlp_bias": false,
"model_type": "granite",
"num_attention_heads": 32,
"num_experts_per_tok": 0,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"num_local_experts": 0,
"output_router_logits": false,
"pad_token_id": 100256,
"position_embedding_type": "rope",
"residual_multiplier": 0.22,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000000,
"router_aux_loss_coef": 0.01,
"shared_intermediate_size": 12800,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.56.0",
"transformers_version": "4.53.3",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would different transformers version cause any issues between the 2 variants?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know the answer :/

It's probably not too relevant for us as we're not using transformers to load the model

"use_cache": true,
"vocab_size": 100352
}
63 changes: 54 additions & 9 deletions vllm_spyre/config/model_configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@

# templates for reuse via YAML anchors
_templates:
# model architecture for Granite 8

granite_4_8b_architecture: &granite_4_8b_architecture
model_type: granite
num_hidden_layers: 40
max_position_embeddings: 131072
hidden_size: 4096
vocab_size: 100352
num_key_value_heads: 8
num_attention_heads: 32

granite_33_8b_architecture: &granite_33_8b_architecture
model_type: granite
num_hidden_layers: 40
Expand Down Expand Up @@ -131,17 +140,33 @@ models:
max_num_seqs: 32
device_config: *llama3_8b_tp4_device_config

# Granite 4 8B Dense Hybrid
ibm-granite/granite-4-8b-dense-hybrid:
architecture:
<<: *granite_4_8b_architecture
model_type: granitemoehybrid
num_experts_per_tok: 0

# Continuous batching configurations
continuous_batching_configs:
- tp_size: 1
max_model_len: 3072
max_num_seqs: 16
- tp_size: 1
max_model_len: 8192
max_num_seqs: 4
- tp_size: 2
max_model_len: 8192
max_num_seqs: 4
- tp_size: 4
max_model_len: 32768
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

# Granite 4 8B Dense
ibm-granite/granite-4-8b-dense:
architecture:
model_type: granitemoehybrid
num_hidden_layers: 40
num_experts_per_tok: 0 # dense model
max_position_embeddings: 131072
hidden_size: 4096
vocab_size: 100352
num_key_value_heads: 8
num_attention_heads: 32
<<: *granite_4_8b_architecture

# Continuous batching configurations
continuous_batching_configs:
Expand All @@ -159,6 +184,26 @@ models:
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

ibm-granite/granite-4-8b-dense-FP8:
architecture:
<<: *granite_4_8b_architecture
quantization_config:
format: float-quantized

# Continuous batching configurations
continuous_batching_configs:
- tp_size: 1
max_model_len: 3072
max_num_seqs: 16
- tp_size: 4
max_model_len: 16384
max_num_seqs: 4
device_config: *granite_8b_tp4_device_config
- tp_size: 4
max_model_len: 32768
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

# Mistral Small 3.2 24B Instruct
mistralai/Mistral-Small-3.2-24B-Instruct-2506:
architecture: *mistral3_24b_architecture
Expand Down