diff --git a/tests/config/conftest.py b/tests/config/conftest.py
index 46449d1e0..1e17ed2dd 100644
--- a/tests/config/conftest.py
+++ b/tests/config/conftest.py
@@ -36,6 +36,14 @@ def granite_3_3_hf_config():
     return _load_hf_config(fixture_path)
 
 
+@pytest.fixture
+def granite_4_hf_dense_hybrid_config():
+    """Fixture providing a real granite-4-8b-dense HF config that spoofs the granitemoehybrid
+    model type. Granite 4 dense configs used to look like this."""
+    fixture_path = FIXTURES_PATH / "ibm-granite" / "granite-4-8b-dense-hybrid" / "config.json"
+    return _load_hf_config(fixture_path)
+
+
 @pytest.fixture
 def granite_4_hf_config():
     """Fixture providing real granite-4-8b-dense HF config."""
diff --git a/tests/config/test_integration.py b/tests/config/test_integration.py
index 9f426d53b..638b2bbb5 100644
--- a/tests/config/test_integration.py
+++ b/tests/config/test_integration.py
@@ -50,8 +50,26 @@ def test_match_granite_3_3_cb_config(self, registry, granite_3_3_hf_config):
         assert configurator.device_config is not None
         assert "VLLM_DT_MAX_BATCH_TKV_LIMIT" in configurator.device_config.env_vars
 
-    def test_match_granite_4_cb_config(self, registry, granite_4_hf_config):
-        """Test matching granite-4-8b-dense with CB config."""
+    def test_match_granite_4_dense_hybrid_config(self, registry, granite_4_hf_dense_hybrid_config):
+        """Test matching granite-4-8b-dense configs that have model type granitemoehybrid."""
+        vllm_config = create_vllm_config(
+            hf_config=granite_4_hf_dense_hybrid_config,
+            world_size=4,
+            max_model_len=32768,
+            max_num_seqs=32,
+        )
+
+        configurator = registry.get_configurator_for_runtime(vllm_config)
+
+        assert configurator is not None
+        assert isinstance(configurator, ModelConfigurator)
+        # This is really a dense model, but it has model type "granitemoehybrid",
+        # so it gets the same overrides as the regular dense variant.
+        assert configurator.model_config.name == "ibm-granite/granite-4-8b-dense-hybrid"
+        assert configurator.device_config is not None
+
+    def test_match_granite_4_dense_config(self, registry, granite_4_hf_config):
+        """Test matching granite-4-8b-dense configs that aren't spoofed moe hybrid models."""
         vllm_config = create_vllm_config(
             hf_config=granite_4_hf_config, world_size=4, max_model_len=32768, max_num_seqs=32
         )
diff --git a/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json b/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json
new file mode 100644
index 000000000..03dbf6ed9
--- /dev/null
+++ b/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense-hybrid/config.json
@@ -0,0 +1,90 @@
+{
+  "architectures": [
+    "GraniteMoeHybridForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "attention_multiplier": 0.0078125,
+  "bos_token_id": 100257,
+  "embedding_multiplier": 12,
+  "eos_token_id": 100257,
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "init_method": "mup",
+  "initializer_range": 0.1,
+  "intermediate_size": 12800,
+  "layer_types": [
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
+    "attention",
"attention", + "attention", + "attention", + "attention", + "attention" + ], + "logits_scaling": 16, + "mamba_chunk_size": 256, + "mamba_conv_bias": true, + "mamba_d_conv": 4, + "mamba_d_head": 64, + "mamba_d_state": 256, + "mamba_expand": 2, + "mamba_n_groups": 1, + "mamba_n_heads": 128, + "mamba_proj_bias": false, + "max_position_embeddings": 131072, + "model_type": "granitemoehybrid", + "normalization_function": "rmsnorm", + "num_attention_heads": 32, + "num_experts_per_tok": 0, + "num_hidden_layers": 40, + "num_key_value_heads": 8, + "num_local_experts": 0, + "output_router_logits": false, + "pad_token_id": 100256, + "position_embedding_type": "rope", + "residual_multiplier": 0.22, + "rms_norm_eps": 1e-05, + "rope_scaling": null, + "rope_theta": 10000000, + "router_aux_loss_coef": 0.01, + "shared_intermediate_size": 12800, + "tie_word_embeddings": true, + "torch_dtype": "bfloat16", + "transformers_version": "4.56.0", + "use_cache": true, + "vocab_size": 100352 +} diff --git a/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json b/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json index 03dbf6ed9..853438d09 100644 --- a/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json +++ b/tests/fixtures/model_configs/ibm-granite/granite-4-8b-dense/config.json @@ -1,6 +1,6 @@ { "architectures": [ - "GraniteMoeHybridForCausalLM" + "GraniteForCausalLM" ], "attention_bias": false, "attention_dropout": 0.0, @@ -10,81 +10,22 @@ "eos_token_id": 100257, "hidden_act": "silu", "hidden_size": 4096, - "init_method": "mup", "initializer_range": 0.1, "intermediate_size": 12800, - "layer_types": [ - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention", - "attention" - ], "logits_scaling": 16, - "mamba_chunk_size": 256, - "mamba_conv_bias": true, - "mamba_d_conv": 4, - "mamba_d_head": 64, - "mamba_d_state": 256, - "mamba_expand": 2, - "mamba_n_groups": 1, - "mamba_n_heads": 128, - "mamba_proj_bias": false, "max_position_embeddings": 131072, - "model_type": "granitemoehybrid", - "normalization_function": "rmsnorm", + "mlp_bias": false, + "model_type": "granite", "num_attention_heads": 32, - "num_experts_per_tok": 0, "num_hidden_layers": 40, "num_key_value_heads": 8, - "num_local_experts": 0, - "output_router_logits": false, "pad_token_id": 100256, - "position_embedding_type": "rope", "residual_multiplier": 0.22, "rms_norm_eps": 1e-05, "rope_scaling": null, "rope_theta": 10000000, - "router_aux_loss_coef": 0.01, - "shared_intermediate_size": 12800, "tie_word_embeddings": true, - "torch_dtype": "bfloat16", - "transformers_version": "4.56.0", + "transformers_version": "4.53.3", "use_cache": true, "vocab_size": 100352 } diff --git a/vllm_spyre/config/model_configs.yaml b/vllm_spyre/config/model_configs.yaml index 9839a77dd..26a7ec959 100644 --- a/vllm_spyre/config/model_configs.yaml +++ b/vllm_spyre/config/model_configs.yaml @@ -9,7 +9,16 @@ # templates for reuse via YAML anchors _templates: - # model architecture for Granite 8 + + 
granite_4_8b_architecture: &granite_4_8b_architecture + model_type: granite + num_hidden_layers: 40 + max_position_embeddings: 131072 + hidden_size: 4096 + vocab_size: 100352 + num_key_value_heads: 8 + num_attention_heads: 32 + granite_33_8b_architecture: &granite_33_8b_architecture model_type: granite num_hidden_layers: 40 @@ -131,17 +140,33 @@ models: max_num_seqs: 32 device_config: *llama3_8b_tp4_device_config + # Granite 4 8B Dense Hybrid + ibm-granite/granite-4-8b-dense-hybrid: + architecture: + <<: *granite_4_8b_architecture + model_type: granitemoehybrid + num_experts_per_tok: 0 + + # Continuous batching configurations + continuous_batching_configs: + - tp_size: 1 + max_model_len: 3072 + max_num_seqs: 16 + - tp_size: 1 + max_model_len: 8192 + max_num_seqs: 4 + - tp_size: 2 + max_model_len: 8192 + max_num_seqs: 4 + - tp_size: 4 + max_model_len: 32768 + max_num_seqs: 32 + device_config: *granite_8b_tp4_device_config + # Granite 4 8B Dense ibm-granite/granite-4-8b-dense: architecture: - model_type: granitemoehybrid - num_hidden_layers: 40 - num_experts_per_tok: 0 # dense model - max_position_embeddings: 131072 - hidden_size: 4096 - vocab_size: 100352 - num_key_value_heads: 8 - num_attention_heads: 32 + <<: *granite_4_8b_architecture # Continuous batching configurations continuous_batching_configs: @@ -159,6 +184,26 @@ models: max_num_seqs: 32 device_config: *granite_8b_tp4_device_config + ibm-granite/granite-4-8b-dense-FP8: + architecture: + <<: *granite_4_8b_architecture + quantization_config: + format: float-quantized + + # Continuous batching configurations + continuous_batching_configs: + - tp_size: 1 + max_model_len: 3072 + max_num_seqs: 16 + - tp_size: 4 + max_model_len: 16384 + max_num_seqs: 4 + device_config: *granite_8b_tp4_device_config + - tp_size: 4 + max_model_len: 32768 + max_num_seqs: 32 + device_config: *granite_8b_tp4_device_config + # Mistral Small 3.2 24B Instruct mistralai/Mistral-Small-3.2-24B-Instruct-2506: architecture: *mistral3_24b_architecture