diff --git a/README.md b/README.md index 93ec85b22..c9f026cdb 100644 --- a/README.md +++ b/README.md @@ -264,6 +264,8 @@ loss.backward() | OLMo2 | `liger_kernel.transformers.apply_liger_kernel_to_olmo2` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | | GLM-4 | `liger_kernel.transformers.apply_liger_kernel_to_glm4` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | | InternVL3 | `liger_kernel.transformers.apply_liger_kernel_to_internvl` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| HunyuanV1 | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_dense` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | +| HunyuanV1 MoE | `liger_kernel.transformers.apply_liger_kernel_to_hunyuan_v1_moe` | RoPE, RMSNorm, SwiGLU, CrossEntropyLoss, FusedLinearCrossEntropy | ## Low-level APIs diff --git a/src/liger_kernel/transformers/__init__.py b/src/liger_kernel/transformers/__init__.py index 39c372438..f2cb4ffcd 100644 --- a/src/liger_kernel/transformers/__init__.py +++ b/src/liger_kernel/transformers/__init__.py @@ -42,6 +42,8 @@ from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v # noqa: F401 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_glm4v_moe # noqa: F401 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_granite # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_dense # noqa: F401 + from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_hunyuan_v1_moe # noqa: F401 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_internvl # noqa: F401 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama # noqa: F401 from liger_kernel.transformers.monkey_patch import apply_liger_kernel_to_llama4 # noqa: F401 @@ -128,6 +130,8 @@ def __getattr__(name: str): "apply_liger_kernel_to_qwen3_vl_moe", "apply_liger_kernel_to_smollm3", "apply_liger_kernel_to_smolvlm", + "apply_liger_kernel_to_hunyuan_v1_dense", + "apply_liger_kernel_to_hunyuan_v1_moe", } if name in monkey_patch_symbols: @@ -202,5 +206,7 @@ def __getattr__(name: str): "apply_liger_kernel_to_qwen3_vl_moe", "apply_liger_kernel_to_smollm3", "apply_liger_kernel_to_smolvlm", + "apply_liger_kernel_to_hunyuan_v1_dense", + "apply_liger_kernel_to_hunyuan_v1_moe", ] ) diff --git a/src/liger_kernel/transformers/model/hunyuan_v1.py b/src/liger_kernel/transformers/model/hunyuan_v1.py new file mode 100644 index 000000000..652ec94db --- /dev/null +++ b/src/liger_kernel/transformers/model/hunyuan_v1.py @@ -0,0 +1,134 @@ +from typing import List +from typing import Optional +from typing import Union + +import torch + +from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss +from liger_kernel.transformers.model.loss_utils import unpack_cross_entropy_result +from liger_kernel.transformers.model.output_classes import LigerCausalLMOutputWithPast + + +def lce_forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + cache_position: Optional[torch.LongTensor] = None, + logits_to_keep: Union[int, 
torch.Tensor] = 0, + skip_logits: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, +) -> LigerCausalLMOutputWithPast: + r""" + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + logits_to_keep (`int` or `torch.Tensor`, *optional*): + If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all + `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that + token can save memory, which becomes pretty significant for long sequences or large vocabulary size. + If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension. + This is useful when using packed tensor format (single dimension for batch and sequence length). + + Returns: + + Example: + + ```python + >>> from transformers import AutoTokenizer, HunYuanDenseV1ForCausalLM + + >>> model = HunYuanDenseV1ForCausalLM.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf") + >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_dense/HunYuanDenseV1-2-7b-hf") + + >>> prompt = "Hey, are you conscious? Can you talk to me?" + >>> inputs = tokenizer(prompt, return_tensors="pt") + + >>> # Generate + >>> generate_ids = model.generate(inputs.input_ids, max_length=30) + >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you." 
+ ```""" + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn) + outputs = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + cache_position=cache_position, + **kwargs, + ) + + hidden_states = outputs[0] + # Only compute necessary logits, and do not upcast them to float if we are not computing the loss + slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep + kept_hidden_states = hidden_states[:, slice_indices, :] + + shift_labels = kwargs.pop("shift_labels", None) + logits = None + loss = None + token_accuracy = None + + if skip_logits and labels is None and shift_labels is None: + raise ValueError("skip_logits is True, but labels and shift_labels are None") + + if skip_logits is None: + # By default, if in training mode, don't materialize logits + skip_logits = self.training and (labels is not None or shift_labels is not None) + + # Compute loss + if skip_logits: + result = LigerForCausalLMLoss( + hidden_states=kept_hidden_states, + lm_head_weight=self.lm_head.weight, + labels=labels, + shift_labels=shift_labels, + hidden_size=self.config.hidden_size, + **kwargs, + ) + loss, _, token_accuracy = unpack_cross_entropy_result(result) + + else: + logits = self.lm_head(kept_hidden_states) + if labels is not None or shift_labels is not None: + loss = self.loss_function( + logits=logits, + labels=labels, + shift_labels=shift_labels, + vocab_size=self.config.vocab_size, + **kwargs, + ) + + if not return_dict: + output = (logits,) + outputs[1:] + output = ((loss,) + output) if loss is not None else output + output = output + (token_accuracy,) if token_accuracy is not None else output + return output + + # Return custom output class with accuracy field + return LigerCausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + token_accuracy=token_accuracy, + ) diff --git a/src/liger_kernel/transformers/monkey_patch.py b/src/liger_kernel/transformers/monkey_patch.py index 2845933ee..de3c93b82 100755 --- a/src/liger_kernel/transformers/monkey_patch.py +++ b/src/liger_kernel/transformers/monkey_patch.py @@ -2558,6 +2558,123 @@ def apply_liger_kernel_to_qwen3_next( _patch_swiglu_module(expert, LigerQwen3MoeSwiGLUMLP) +def apply_liger_kernel_to_hunyuan_v1_dense( + rope: bool = True, + cross_entropy: bool = False, + fused_linear_cross_entropy: bool = True, + rms_norm: bool = True, + swiglu: bool = True, + model: PreTrainedModel = None, +) -> None: + """ + Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 dense models. + """ + assert not (cross_entropy and fused_linear_cross_entropy), ( + "cross_entropy and fused_linear_cross_entropy cannot both be True." 
+    )
+
+    from transformers.models.hunyuan_v1_dense import modeling_hunyuan_v1_dense
+    from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1Model
+
+    from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_lce_forward
+    from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP
+
+    if rope:
+        modeling_hunyuan_v1_dense.apply_rotary_pos_emb = liger_rotary_pos_emb
+
+    if rms_norm:
+        modeling_hunyuan_v1_dense.HunYuanDenseV1RMSNorm = LigerRMSNorm
+
+    if cross_entropy:
+        from transformers.loss.loss_utils import nn
+
+        nn.functional.cross_entropy = liger_cross_entropy
+
+    if fused_linear_cross_entropy:
+        if model is not None:
+            model.forward = MethodType(hunyuan_v1_lce_forward, model)
+        else:
+            modeling_hunyuan_v1_dense.HunYuanDenseV1ForCausalLM.forward = hunyuan_v1_lce_forward
+
+    if swiglu:
+        modeling_hunyuan_v1_dense.HunYuanDenseV1MLP = LigerHunyuanV1SwiGLUMLP
+
+    if model is not None:
+        # The model instance already exists, so we need to additionally patch the
+        # instance variables that reference already-instantiated modules
+
+        # get the base model from the model instance
+        base_model: HunYuanDenseV1Model = getattr(model, model.base_model_prefix, model)
+
+        if rms_norm:
+            _patch_rms_norm_module(base_model.norm)
+        for decoder_layer in base_model.layers:
+            if swiglu:
+                _patch_swiglu_module(decoder_layer.mlp, LigerHunyuanV1SwiGLUMLP)
+            if rms_norm:
+                _patch_rms_norm_module(decoder_layer.input_layernorm)
+                _patch_rms_norm_module(decoder_layer.post_attention_layernorm)
+
+
+def apply_liger_kernel_to_hunyuan_v1_moe(
+    rope: bool = True,
+    cross_entropy: bool = False,
+    fused_linear_cross_entropy: bool = True,
+    rms_norm: bool = True,
+    swiglu: bool = True,
+    model: PreTrainedModel = None,
+) -> None:
+    """
+    Apply Liger kernels to replace original implementation in HuggingFace Hunyuan v1 MoE models.
+    """
+    assert not (cross_entropy and fused_linear_cross_entropy), (
+        "cross_entropy and fused_linear_cross_entropy cannot both be True."
+ ) + + from transformers.models.hunyuan_v1_moe import modeling_hunyuan_v1_moe + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1Model + + from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_moe_lce_forward + from liger_kernel.transformers.swiglu import LigerHunyuanV1SwiGLUMLP + + if rope: + modeling_hunyuan_v1_moe.apply_rotary_pos_emb = liger_rotary_pos_emb + + if rms_norm: + modeling_hunyuan_v1_moe.HunYuanMoEV1RMSNorm = LigerRMSNorm + + if cross_entropy: + from transformers.loss.loss_utils import nn + + nn.functional.cross_entropy = liger_cross_entropy + + if fused_linear_cross_entropy: + if model is not None: + model.forward = MethodType(hunyuan_v1_moe_lce_forward, model) + else: + modeling_hunyuan_v1_moe.HunYuanMoEV1ForCausalLM.forward = hunyuan_v1_moe_lce_forward + + if swiglu: + modeling_hunyuan_v1_moe.HunYuanMoEV1MLP = LigerHunyuanV1SwiGLUMLP + + if model is not None: + # The model instance already exists, so we need to additionally patch the + # instance variables that reference already-instantiated modules + + # get the base model from the model instance + base_model: HunYuanMoEV1Model = getattr(model, model.base_model_prefix, model) + + if rms_norm: + _patch_rms_norm_module(base_model.norm) + for decoder_layer in base_model.layers: + if swiglu: + for mlp_expert in decoder_layer.mlp.experts: + _patch_swiglu_module(mlp_expert, LigerHunyuanV1SwiGLUMLP) + if rms_norm: + _patch_rms_norm_module(decoder_layer.input_layernorm) + _patch_rms_norm_module(decoder_layer.post_attention_layernorm) + + # Model type corresponds to the keys defined in transformers/models/auto/modeling_auto.py MODEL_TYPE_TO_APPLY_LIGER_FN = { "gemma": apply_liger_kernel_to_gemma, @@ -2595,6 +2712,8 @@ def apply_liger_kernel_to_qwen3_next( "paligemma": apply_liger_kernel_to_paligemma, "falcon_h1": apply_liger_kernel_to_falcon_h1, "smolvlm": apply_liger_kernel_to_smolvlm, + "hunyuan_v1_dense": apply_liger_kernel_to_hunyuan_v1_dense, + "hunyuan_v1_moe": apply_liger_kernel_to_hunyuan_v1_moe, } diff --git a/src/liger_kernel/transformers/swiglu.py b/src/liger_kernel/transformers/swiglu.py index 6e8f1f77c..4cf386673 100644 --- a/src/liger_kernel/transformers/swiglu.py +++ b/src/liger_kernel/transformers/swiglu.py @@ -77,3 +77,20 @@ def __init__(self, config, intermediate_size=None): def forward(self, x): return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) + + +class LigerHunyuanV1SwiGLUMLP(nn.Module): + def __init__(self, config, layer_idx=None, is_shared_mlp=False): + super().__init__() + self.config = config + self.hidden_size = config.hidden_size + self.intermediate_size = config.intermediate_size + self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False) + self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False) + self.layer_idx = layer_idx + if config.hidden_act not in ["silu", "swish"]: + raise ValueError(f"Activation function {config.hidden_act} not supported.") + + def forward(self, x): + return self.down_proj(LigerSiLUMulFunction.apply(self.gate_proj(x), self.up_proj(x))) \ No newline at end of file diff --git a/test/convergence/bf16/test_mini_models.py b/test/convergence/bf16/test_mini_models.py index 1a2697b50..6eb042fc6 100644 --- a/test/convergence/bf16/test_mini_models.py +++ b/test/convergence/bf16/test_mini_models.py @@ -30,6 +30,8 @@ from liger_kernel.transformers import 
apply_liger_kernel_to_glm4v from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe from liger_kernel.transformers import apply_liger_kernel_to_internvl from liger_kernel.transformers import apply_liger_kernel_to_llama from liger_kernel.transformers import apply_liger_kernel_to_llama4 @@ -62,6 +64,8 @@ from test.utils import revert_liger_kernel_to_glm4v from test.utils import revert_liger_kernel_to_glm4v_moe from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe from test.utils import revert_liger_kernel_to_internvl from test.utils import revert_liger_kernel_to_llama from test.utils import revert_liger_kernel_to_llama4 @@ -261,6 +265,16 @@ except ImportError: QWEN3NEXT_AVAILABLE = False +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + from liger_kernel.utils import infer_device device = infer_device() @@ -1304,6 +1318,69 @@ ) +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config = HunYuanMoEV1Config( + vocab_size=32000, + hidden_size=128, + intermediate_size=512, + head_dim=16, + num_hidden_layers=2, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_experts=2, + moe_topk=1, + attn_implementation="sdpa", + ), + ) + + def create_model(model_name="mini_llama4"): """ Create a mini version model @@ -1885,6 +1962,44 @@ def run_mini_model( ), ], ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), 
reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.bfloat16, + 5e-2, + 5e-2, + 1e-1, # 1e-1 + 1e-1, # 1e-2 + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), ], ) def test_mini_model( diff --git a/test/convergence/bf16/test_mini_models_with_logits.py b/test/convergence/bf16/test_mini_models_with_logits.py index 8100cf980..0d5028210 100644 --- a/test/convergence/bf16/test_mini_models_with_logits.py +++ b/test/convergence/bf16/test_mini_models_with_logits.py @@ -30,6 +30,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_glm4v from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe from liger_kernel.transformers import apply_liger_kernel_to_internvl from liger_kernel.transformers import apply_liger_kernel_to_llama from liger_kernel.transformers import apply_liger_kernel_to_llama4 @@ -62,6 +64,8 @@ from test.utils import revert_liger_kernel_to_glm4v from test.utils import revert_liger_kernel_to_glm4v_moe from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe from test.utils import revert_liger_kernel_to_internvl from test.utils import revert_liger_kernel_to_llama from test.utils import revert_liger_kernel_to_llama4 @@ -254,6 +258,17 @@ QWEN3NEXT_AVAILABLE = False +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + + from liger_kernel.utils import infer_device device = infer_device() @@ -1298,6 +1313,69 @@ ) +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + 
mini_model_config = HunYuanMoEV1Config( + vocab_size=32000, + hidden_size=128, + intermediate_size=512, + head_dim=16, + num_hidden_layers=2, + num_attention_heads=8, + num_key_value_heads=2, + hidden_act="silu", + max_position_embeddings=32768, + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + pad_token_id=0, + bos_token_id=1, + eos_token_id=2, + eod_token_id=3, + sep_token_id=4, + tie_word_embeddings=False, + rope_theta=10000.0, + rope_scaling=None, + attention_bias=False, + attention_dropout=0.0, + num_experts=2, + moe_topk=1, + attn_implementation="sdpa", + ), + ) + + def create_model(model_name="mini_llama3"): """ Create a mini version model @@ -1858,6 +1936,44 @@ def run_mini_model( ), ], ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.bfloat16, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif(not supports_bfloat16(), reason="bfloat16 not supported on this GPU"), + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), ], ) def test_mini_model( diff --git a/test/convergence/fp32/test_mini_models.py b/test/convergence/fp32/test_mini_models.py index 18920639d..67350b0f8 100644 --- a/test/convergence/fp32/test_mini_models.py +++ b/test/convergence/fp32/test_mini_models.py @@ -30,6 +30,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_glm4v from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe from liger_kernel.transformers import apply_liger_kernel_to_internvl from liger_kernel.transformers import apply_liger_kernel_to_llama from liger_kernel.transformers import apply_liger_kernel_to_llama4 @@ -62,6 +64,8 @@ from test.utils import revert_liger_kernel_to_glm4v from test.utils import revert_liger_kernel_to_glm4v_moe from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe from test.utils import revert_liger_kernel_to_internvl from test.utils import revert_liger_kernel_to_llama from test.utils import revert_liger_kernel_to_llama4 @@ -259,6 +263,16 @@ except ImportError: QWEN3NEXT_AVAILABLE = False +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + from liger_kernel.utils import infer_device device = infer_device() @@ -1296,6 +1310,63 @@ ), ) +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + 
liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config = HunYuanMoEV1Config( + hidden_act="silu", + attention_dropout=0.0, + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + num_experts=8, + moe_topk=2, + use_cache=True, + attn_implementation="sdpa", + ), + ) + def create_model(model_name="mini_llama3"): """ @@ -1737,6 +1808,38 @@ def run_mini_model( ), ], ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-4, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.float32, + 1e-8, + 1e-5, + 5e-3, + 1e-5, + 5e-3, + 1e-5, + marks=pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ), ], ) def test_mini_model( diff --git a/test/convergence/fp32/test_mini_models_with_logits.py b/test/convergence/fp32/test_mini_models_with_logits.py index a667b8207..89e23ccc7 100644 --- a/test/convergence/fp32/test_mini_models_with_logits.py +++ b/test/convergence/fp32/test_mini_models_with_logits.py @@ -30,6 +30,8 @@ from liger_kernel.transformers import apply_liger_kernel_to_glm4v from liger_kernel.transformers import apply_liger_kernel_to_glm4v_moe from liger_kernel.transformers import apply_liger_kernel_to_granite +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense +from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_moe from liger_kernel.transformers import apply_liger_kernel_to_internvl from liger_kernel.transformers import apply_liger_kernel_to_llama from liger_kernel.transformers import apply_liger_kernel_to_llama4 @@ -62,6 +64,8 @@ from test.utils import revert_liger_kernel_to_glm4v from test.utils import revert_liger_kernel_to_glm4v_moe from test.utils import revert_liger_kernel_to_granite +from test.utils import revert_liger_kernel_to_hunyuan_v1 +from test.utils import revert_liger_kernel_to_hunyuan_v1_moe from test.utils import revert_liger_kernel_to_internvl from test.utils import revert_liger_kernel_to_llama from test.utils import revert_liger_kernel_to_llama4 @@ -273,6 +277,16 @@ except ImportError: QWEN3NEXT_AVAILABLE = False +try: + from transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense import HunYuanDenseV1Config + from 
transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense import HunYuanDenseV1ForCausalLM + from transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe import HunYuanMoEV1Config + from transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe import HunYuanMoEV1ForCausalLM + + HUNYUAN_V1_AVAILABLE = True +except ImportError: + HUNYUAN_V1_AVAILABLE = False + from liger_kernel.utils import infer_device device = infer_device() @@ -1316,6 +1330,64 @@ ) +if HUNYUAN_V1_AVAILABLE: + MINI_MODEL_SETUPS["mini_hunyuan_v1"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_dense, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1, + model_class=HunYuanDenseV1ForCausalLM, + mini_model_config=HunYuanDenseV1Config( + attention_dropout=0.0, + bos_token_id=1, + eos_token_id=2, + hidden_act="silu", + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + MINI_MODEL_SETUPS["mini_hunyuan_v1_moe"] = MiniModelConfig( + liger_kernel_patch_func=apply_liger_kernel_to_hunyuan_v1_moe, + liger_kernel_patch_revert_func=revert_liger_kernel_to_hunyuan_v1_moe, + model_class=HunYuanMoEV1ForCausalLM, + mini_model_config = HunYuanMoEV1Config( + hidden_act="silu", + attention_dropout=0.0, + num_hidden_layers=4, + hidden_size=896, + intermediate_size=4864, + num_attention_heads=8, + head_dim=112, + rms_norm_eps=1e-6, + tie_word_embeddings=True, + max_position_embeddings=32768, + initializer_range=0.02, + norm_eps=1e-6, + num_key_value_heads=2, + rope_theta=10000.0, + partial_rotary_factor=1.0, + vocab_size=32000, + num_experts=8, + moe_topk=2, + use_cache=True, + attn_implementation="sdpa", + ), + ) + + def create_model(model_name="mini_llama3"): """ Create a mini version model @@ -1723,6 +1795,42 @@ def run_mini_model( ), ], ), + pytest.param( + "mini_hunyuan_v1", + 32, + 1e-5, + torch.float32, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1 not available in this version of transformers", + ), + ], + ), + pytest.param( + "mini_hunyuan_v1_moe", + 32, + 1e-5, + torch.float32, + 1e-2, + 5e-2, + 1e-1, + 1e-2, + 1e-2, + 1e-2, + marks=[ + pytest.mark.skipif( + not HUNYUAN_V1_AVAILABLE, + reason="Hunyuan_v1_moe not available in this version of transformers", + ), + ], + ), ], ) def test_mini_model( diff --git a/test/transformers/test_monkey_patch.py b/test/transformers/test_monkey_patch.py index 200c9ed6e..b1446316a 100755 --- a/test/transformers/test_monkey_patch.py +++ b/test/transformers/test_monkey_patch.py @@ -206,6 +206,15 @@ def is_qwen3_next_available(): return False +def is_hunyuan_v1_available(): + try: + import transformers.models.hunyuan_v1_dense # noqa: F401 + + return True + except ImportError: + return False + + def test_import_from_root(): try: from liger_kernel.transformers import AutoLigerKernelForCausalLM # noqa: F401 @@ -2790,3 +2799,90 @@ def test_apply_liger_kernel_to_instance_for_qwen3_next(): print(dummy_model_instance) except Exception as e: pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}") + + +@pytest.mark.skipif(not is_hunyuan_v1_available(), reason="hunyuan_v1 module not available") +def 
test_apply_liger_kernel_to_instance_for_hunyuan_v1_moe(): + # Ensure any monkey patching is cleaned up for subsequent tests + with patch("transformers.models.hunyuan_v1_moe.modeling_hunyuan_v1_moe"): + from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_moe_lce_forward + + # Instantiate a dummy model + config = transformers.models.hunyuan_v1_moe.configuration_hunyuan_v1_moe.HunYuanMoEV1Config( + torch_dtype=torch.bfloat16, + rms_norm_eps=1e-5, + hidden_size=32, + intermediate_size=64, + hidden_act="silu", + num_hidden_layers=2, + head_dim=1, + ) + dummy_model_instance = AutoModelForCausalLM.from_config(config) + + # Check that model instance variables are not yet patched with Liger modules + assert inspect.getsource(dummy_model_instance.forward) != inspect.getsource(hunyuan_v1_moe_lce_forward) + assert inspect.getsource(dummy_model_instance.model.norm.forward) != inspect.getsource(LigerRMSNorm.forward) + for layer in dummy_model_instance.model.layers: + assert inspect.getsource(layer.mlp.forward) != inspect.getsource(LigerQwen3MoeSwiGLUMLP.forward) + assert inspect.getsource(layer.input_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward) + assert inspect.getsource(layer.post_attention_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward) + + # Test applying kernels to the model instance + _apply_liger_kernel_to_instance(model=dummy_model_instance) + + # Check that the model's instance variables were correctly patched with Liger modules + assert inspect.getsource(dummy_model_instance.forward) == inspect.getsource(hunyuan_v1_moe_lce_forward) + assert inspect.getsource(dummy_model_instance.model.norm.forward) == inspect.getsource(LigerRMSNorm.forward) + for layer in dummy_model_instance.model.layers: + for mlp_expert in layer.mlp.experts: + assert inspect.getsource(mlp_expert.forward) == inspect.getsource(LigerQwen3MoeSwiGLUMLP.forward) + assert inspect.getsource(layer.input_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward) + assert inspect.getsource(layer.post_attention_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward) + + try: + print(dummy_model_instance) + except Exception as e: + pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}") + + +@pytest.mark.skipif(not is_hunyuan_v1_available(), reason="hunyuan_v1_dense module not available") +def test_apply_liger_kernel_to_instance_for_hunyuan_v1_dense(): + # Ensure any monkey patching is cleaned up for subsequent tests + with patch("transformers.models.hunyuan_v1_dense.modeling_hunyuan_v1_dense"): + from liger_kernel.transformers.model.hunyuan_v1 import lce_forward as hunyuan_v1_dense_lce_forward + + # Instantiate a dummy model + config = transformers.models.hunyuan_v1_dense.configuration_hunyuan_v1_dense.HunYuanDenseV1Config( + torch_dtype=torch.bfloat16, + rms_norm_eps=1e-5, + hidden_size=32, + intermediate_size=64, + hidden_act="silu", + num_hidden_layers=2, + head_dim=1, + ) + dummy_model_instance = AutoModelForCausalLM.from_config(config) + + # Check that model instance variables are not yet patched with Liger modules + assert inspect.getsource(dummy_model_instance.forward) != inspect.getsource(hunyuan_v1_dense_lce_forward) + assert inspect.getsource(dummy_model_instance.model.norm.forward) != inspect.getsource(LigerRMSNorm.forward) + for layer in dummy_model_instance.model.layers: + assert inspect.getsource(layer.mlp.forward) != inspect.getsource(LigerSwiGLUMLP.forward) + assert inspect.getsource(layer.input_layernorm.forward) != 
inspect.getsource(LigerRMSNorm.forward) + assert inspect.getsource(layer.post_attention_layernorm.forward) != inspect.getsource(LigerRMSNorm.forward) + + # Test applying kernels to the model instance + _apply_liger_kernel_to_instance(model=dummy_model_instance) + + # Check that the model's instance variables were correctly patched with Liger modules + assert inspect.getsource(dummy_model_instance.forward) == inspect.getsource(hunyuan_v1_dense_lce_forward) + assert inspect.getsource(dummy_model_instance.model.norm.forward) == inspect.getsource(LigerRMSNorm.forward) + for layer in dummy_model_instance.model.layers: + assert inspect.getsource(layer.mlp.forward) == inspect.getsource(LigerSwiGLUMLP.forward) + assert inspect.getsource(layer.input_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward) + assert inspect.getsource(layer.post_attention_layernorm.forward) == inspect.getsource(LigerRMSNorm.forward) + + try: + print(dummy_model_instance) + except Exception as e: + pytest.fail(f"An exception occured in extra_expr: {type(e).__name__} - {e}") diff --git a/test/utils.py b/test/utils.py index e4fd271dd..efc2eb8bd 100644 --- a/test/utils.py +++ b/test/utils.py @@ -661,6 +661,30 @@ def revert_liger_kernel_to_qwen3_next(model_config: MiniModelConfig): print("Liger kernel patches have been reverted.") +def revert_liger_kernel_to_hunyuan_v1(model_config: MiniModelConfig): + """ + Revert all Liger kernel patches applied to Hunyuanv1. + """ + from transformers.models.hunyuan_v1_dense import modeling_hunyuan_v1_dense + + importlib.reload(modeling_hunyuan_v1_dense) + model_config.model_class = modeling_hunyuan_v1_dense.HunYuanDenseV1ForCausalLM + + print("Liger kernel patches have been reverted.") + + +def revert_liger_kernel_to_hunyuan_v1_moe(model_config: MiniModelConfig): + """ + Revert all Liger kernel patches applied to Hunyuanv1 MoE. + """ + from transformers.models.hunyuan_v1_moe import modeling_hunyuan_v1_moe + + importlib.reload(modeling_hunyuan_v1_moe) + model_config.model_class = modeling_hunyuan_v1_moe.HunYuanMoEV1ForCausalLM + + print("Liger kernel patches have been reverted.") + + class HFAlignmentLoss: def __init__( self,
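For reviewers who want to exercise the new entry points locally, here is a minimal usage sketch. The checkpoint path is a placeholder (not a real repo ID); the keyword arguments mirror the defaults of `apply_liger_kernel_to_hunyuan_v1_dense` added in this diff, and `apply_liger_kernel_to_hunyuan_v1_moe` is used the same way.

```python
import torch
from transformers import AutoModelForCausalLM

from liger_kernel.transformers import apply_liger_kernel_to_hunyuan_v1_dense

# Option 1: patch the HunYuan V1 dense modeling module before the model is built,
# so any model constructed afterwards picks up the Liger kernels.
apply_liger_kernel_to_hunyuan_v1_dense(
    rope=True,
    rms_norm=True,
    swiglu=True,
    fused_linear_cross_entropy=True,  # mutually exclusive with cross_entropy=True
)
model = AutoModelForCausalLM.from_pretrained(
    "path/to/hunyuan-v1-dense-checkpoint",  # placeholder path, not a real repo ID
    torch_dtype=torch.bfloat16,
)

# Option 2: patch an already-instantiated model in place via the `model` argument,
# which also swaps the instance's RMSNorm and MLP modules.
# apply_liger_kernel_to_hunyuan_v1_dense(model=model)
```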