Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions tests/config/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,14 @@ def granite_3_3_hf_config():
return _load_hf_config(fixture_path)


@pytest.fixture
def granite_4_hf_dense_hybrid_config():
    """Fixture providing a real granite-4-8b-dense HF config that masquerades
    as a granitemoehybrid model; early Granite 4 dense checkpoints shipped
    configs shaped like this."""
    config_path = FIXTURES_PATH / "ibm-granite" / "granite-4-8b-dense-hybrid" / "config.json"
    return _load_hf_config(config_path)


@pytest.fixture
def granite_4_hf_config():
"""Fixture providing real granite-4-8b-dense HF config."""
Expand Down
22 changes: 20 additions & 2 deletions tests/config/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,8 +50,26 @@ def test_match_granite_3_3_cb_config(self, registry, granite_3_3_hf_config):
assert configurator.device_config is not None
assert "VLLM_DT_MAX_BATCH_TKV_LIMIT" in configurator.device_config.env_vars

def test_match_granite_4_cb_config(self, registry, granite_4_hf_config):
"""Test matching granite-4-8b-dense with CB config."""
def test_match_granite_4_dense_hybrid_config(self, registry, granite_4_hf_dense_hybrid_config):
    """Test matching granite-4-8b-dense configs that have type granitemoehybrid.

    The fixture config is really a dense model that was published with
    model_type "granitemoehybrid"; it must still resolve to the dense-hybrid
    configurator entry.
    """
    vllm_config = create_vllm_config(
        hf_config=granite_4_hf_dense_hybrid_config,
        world_size=4,
        max_model_len=32768,
        max_num_seqs=32,
    )

    configurator = registry.get_configurator_for_runtime(vllm_config)

    assert configurator is not None
    assert isinstance(configurator, ModelConfigurator)
    # This is really a dense model, but it has model type "granitemoehybrid".
    # It has the same overrides as the regular dense variant.
    assert configurator.model_config.name == "ibm-granite/granite-4-8b-dense-hybrid"
    assert configurator.device_config is not None

def test_match_granite_4_dense_config(self, registry, granite_4_hf_config):
"""Test matching granite-4-8b-dense configs that aren't spoofed moe hybrid models."""
vllm_config = create_vllm_config(
hf_config=granite_4_hf_config, world_size=4, max_model_len=32768, max_num_seqs=32
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
{
"architectures": [
"GraniteMoeHybridForCausalLM"
],
"attention_bias": false,
"attention_dropout": 0.0,
"attention_multiplier": 0.0078125,
"bos_token_id": 100257,
"embedding_multiplier": 12,
"eos_token_id": 100257,
"hidden_act": "silu",
"hidden_size": 4096,
"init_method": "mup",
"initializer_range": 0.1,
"intermediate_size": 12800,
"layer_types": [
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention"
],
"logits_scaling": 16,
"mamba_chunk_size": 256,
"mamba_conv_bias": true,
"mamba_d_conv": 4,
"mamba_d_head": 64,
"mamba_d_state": 256,
"mamba_expand": 2,
"mamba_n_groups": 1,
"mamba_n_heads": 128,
"mamba_proj_bias": false,
"max_position_embeddings": 131072,
"model_type": "granitemoehybrid",
"normalization_function": "rmsnorm",
"num_attention_heads": 32,
"num_experts_per_tok": 0,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"num_local_experts": 0,
"output_router_logits": false,
"pad_token_id": 100256,
"position_embedding_type": "rope",
"residual_multiplier": 0.22,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000000,
"router_aux_loss_coef": 0.01,
"shared_intermediate_size": 12800,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.56.0",
"use_cache": true,
"vocab_size": 100352
}
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"architectures": [
"GraniteMoeHybridForCausalLM"
"GraniteForCausalLM"
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the diff between the old and new checkpoint configs

],
"attention_bias": false,
"attention_dropout": 0.0,
Expand All @@ -10,81 +10,22 @@
"eos_token_id": 100257,
"hidden_act": "silu",
"hidden_size": 4096,
"init_method": "mup",
"initializer_range": 0.1,
"intermediate_size": 12800,
"layer_types": [
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention",
"attention"
],
"logits_scaling": 16,
"mamba_chunk_size": 256,
"mamba_conv_bias": true,
"mamba_d_conv": 4,
"mamba_d_head": 64,
"mamba_d_state": 256,
"mamba_expand": 2,
"mamba_n_groups": 1,
"mamba_n_heads": 128,
"mamba_proj_bias": false,
"max_position_embeddings": 131072,
"model_type": "granitemoehybrid",
"normalization_function": "rmsnorm",
"mlp_bias": false,
"model_type": "granite",
"num_attention_heads": 32,
"num_experts_per_tok": 0,
"num_hidden_layers": 40,
"num_key_value_heads": 8,
"num_local_experts": 0,
"output_router_logits": false,
"pad_token_id": 100256,
"position_embedding_type": "rope",
"residual_multiplier": 0.22,
"rms_norm_eps": 1e-05,
"rope_scaling": null,
"rope_theta": 10000000,
"router_aux_loss_coef": 0.01,
"shared_intermediate_size": 12800,
"tie_word_embeddings": true,
"torch_dtype": "bfloat16",
"transformers_version": "4.56.0",
"transformers_version": "4.53.3",
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would different transformers version cause any issues between the 2 variants?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know the answer :/

It's probably not too relevant for us as we're not using transformers to load the model

"use_cache": true,
"vocab_size": 100352
}
63 changes: 54 additions & 9 deletions vllm_spyre/config/model_configs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@

# templates for reuse via YAML anchors
_templates:
# model architecture for Granite 8

granite_4_8b_architecture: &granite_4_8b_architecture
model_type: granite
num_hidden_layers: 40
max_position_embeddings: 131072
hidden_size: 4096
vocab_size: 100352
num_key_value_heads: 8
num_attention_heads: 32

granite_33_8b_architecture: &granite_33_8b_architecture
model_type: granite
num_hidden_layers: 40
Expand Down Expand Up @@ -131,17 +140,33 @@ models:
max_num_seqs: 32
device_config: *llama3_8b_tp4_device_config

# Granite 4 8B Dense Hybrid
ibm-granite/granite-4-8b-dense-hybrid:
architecture:
<<: *granite_4_8b_architecture
model_type: granitemoehybrid
num_experts_per_tok: 0

# Continuous batching configurations
continuous_batching_configs:
- tp_size: 1
max_model_len: 3072
max_num_seqs: 16
- tp_size: 1
max_model_len: 8192
max_num_seqs: 4
- tp_size: 2
max_model_len: 8192
max_num_seqs: 4
- tp_size: 4
max_model_len: 32768
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

# Granite 4 8B Dense
ibm-granite/granite-4-8b-dense:
architecture:
model_type: granitemoehybrid
num_hidden_layers: 40
num_experts_per_tok: 0 # dense model
max_position_embeddings: 131072
hidden_size: 4096
vocab_size: 100352
num_key_value_heads: 8
num_attention_heads: 32
<<: *granite_4_8b_architecture

# Continuous batching configurations
continuous_batching_configs:
Expand All @@ -159,6 +184,26 @@ models:
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

ibm-granite/granite-4-8b-dense-FP8:
architecture:
<<: *granite_4_8b_architecture
quantization_config:
format: float-quantized

# Continuous batching configurations
continuous_batching_configs:
- tp_size: 1
max_model_len: 3072
max_num_seqs: 16
- tp_size: 4
max_model_len: 16384
max_num_seqs: 4
device_config: *granite_8b_tp4_device_config
- tp_size: 4
max_model_len: 32768
max_num_seqs: 32
device_config: *granite_8b_tp4_device_config

# Mistral Small 3.2 24B Instruct
mistralai/Mistral-Small-3.2-24B-Instruct-2506:
architecture: *mistral3_24b_architecture
Expand Down