From 896e7a1695493408674cd76bff8693c2b673a9bb Mon Sep 17 00:00:00 2001
From: bigximik
Date: Fri, 28 Nov 2025 14:09:39 +0000
Subject: [PATCH 01/11] fix qwen converter to correctly load qkv biases

---
 fast_llm/models/gpt/conversion/qwen2.py | 32 +++++++++++++++++++++++++
 1 file changed, 32 insertions(+)

diff --git a/fast_llm/models/gpt/conversion/qwen2.py b/fast_llm/models/gpt/conversion/qwen2.py
index a8bc3345..57c9614b 100644
--- a/fast_llm/models/gpt/conversion/qwen2.py
+++ b/fast_llm/models/gpt/conversion/qwen2.py
@@ -2,6 +2,7 @@
 
 from fast_llm.engine.checkpoint.config import CheckpointFormat
 from fast_llm.layers.attention.config import AttentionConfig
+from fast_llm.layers.decoder.mlp.config import MLPConfig
 from fast_llm.models.gpt.conversion.config import Qwen2CheckpointFormat
 from fast_llm.models.gpt.conversion.llama import (
     LlamaAttentionConverter,
@@ -10,6 +11,7 @@
     LlamaDecoderConverter,
     LlamaHeadConverter,
     LlamaHuggingfaceCheckpointHandler,
+    LlamaMLPConverter,
 )
 from fast_llm.utils import Assert
 
@@ -17,6 +19,22 @@
 class Qwen2AttentionConverter(LlamaAttentionConverter):
     # TODO: Support sliding window with max_window_layers (need 2 kinds of block?)
 
+    @classmethod
+    def import_config(cls, config: dict) -> dict:
+        config["attention_bias"] = True
+        out = super().import_config(config)
+        out["query_layer"] = {"bias": {"enabled": True}}
+        out["key_layer"] = {"bias": {"enabled": True}}
+        out["value_layer"] = {"bias": {"enabled": True}}
+        out["dense_layer"] = {"bias": {"enabled": False}}
+        return out
+
+    @classmethod
+    def export_config(cls, config: AttentionConfig) -> dict:
+        out = super().export_config(config)
+        del out["attention_bias"]
+        return out
+
     @classmethod
     def _check_config(cls, config: AttentionConfig) -> None:
         Assert.is_(type(config), AttentionConfig)
@@ -33,8 +51,22 @@ def _check_config(cls, config: AttentionConfig) -> None:
         Assert.incl(config.dense_layer.bias.enabled, (None, False))
 
 
+class Qwen2MLPConverter(LlamaMLPConverter):
+    @classmethod
+    def import_config(cls, config: dict) -> dict:
+        config["mlp_bias"] = False
+        return super().import_config(config)
+
+    @classmethod
+    def export_config(cls, config: MLPConfig) -> dict:
+        out = super().export_config(config)
+        del out["mlp_bias"]
+        return out
+
+
 class Qwen2BlockConverter(LlamaBlockConverter):
     mixer_converter_class: typing.ClassVar[type[Qwen2AttentionConverter]] = Qwen2AttentionConverter
+    mlp_converter_class: typing.ClassVar[type[Qwen2MLPConverter]] = Qwen2MLPConverter
 
 
 class Qwen2DecoderConverter(LlamaDecoderConverter):

From fcc81161213533ed059a48407fb3d39228aa15ba Mon Sep 17 00:00:00 2001
From: bigximik
Date: Mon, 1 Dec 2025 08:22:58 +0000
Subject: [PATCH 02/11] enabled qwen tests

---
 tests/utils/model_configs.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index f7797e3c..a81df07c 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -477,10 +477,10 @@ def _update_and_add_testing_config(
     checkpoint_format=Qwen2CheckpointFormat,
     # TODO: Add back generate as `normal` when stable.
groups={ - ModelTestingGroup.basic: ModelTestingGroupAction.broken, - ModelTestingGroup.checkpoint: ModelTestingGroupAction.broken, - ModelTestingGroup.convert: ModelTestingGroupAction.broken, - ModelTestingGroup.generate: ModelTestingGroupAction.broken, + ModelTestingGroup.basic: ModelTestingGroupAction.normal, + ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal, + ModelTestingGroup.convert: ModelTestingGroupAction.normal, + ModelTestingGroup.generate: ModelTestingGroupAction.normal, ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented, ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant, }, From 1a0ec4a9b6a4fc656ae8f47d252d3b87c2bf67c5 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 1 Dec 2025 09:56:44 +0000 Subject: [PATCH 03/11] generate test fix --- tests/models/test_generate.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index bce77d4f..e6d1040c 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -10,6 +10,7 @@ from fast_llm.models.gpt.config import PretrainedGPTModelConfig from fast_llm.models.gpt.conversion.config import LlamaCheckpointFormat from fast_llm.models.gpt.huggingface import HuggingfaceGPTModelForCausalLM +from tests.utils.distributed_configs import DistributedTestingConfig from tests.utils.model_configs import ModelTestingGroup from tests.utils.utils import requires_cuda @@ -244,13 +245,19 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi # Not really testing, anything, but handles dependencies more easily than a fixture. if model_testing_config.checkpoint_format is None: pytest.skip(f"Conversion not supported for {model_testing_config.name}") - run_test_script_for_all_models( - [ + if torch.cuda.device_count() < 1: + pytest.skip(f"Not enough gpus to run the test") + + distr_config = DistributedTestingConfig( + name=model_testing_config.name, + config_args=[ "training.train_iters=1", f"training.export.format={model_testing_config.checkpoint_format.name}", "training.export.interval=1", ], + num_gpus=1, ) + run_test_script_for_all_models(distr_config) @pytest.mark.slow From a4e1b5782f7a852f558be54a0fd768dca3f257a7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 1 Dec 2025 09:57:34 +0000 Subject: [PATCH 04/11] qwen test model config fix --- tests/utils/model_configs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py index a81df07c..64a26a07 100644 --- a/tests/utils/model_configs.py +++ b/tests/utils/model_configs.py @@ -470,7 +470,10 @@ def _update_and_add_testing_config( "qwen_2", # TODO: replace updates={ - ("model", "base_model", "decoder", "block", "add_linear_biases"): "only_attn_qkv", + ("model", "base_model", "decoder", "block", "mixer", "query_layer", "bias", "enabled"): True, + ("model", "base_model", "decoder", "block", "mixer", "key_layer", "bias", "enabled"): True, + ("model", "base_model", "decoder", "block", "mixer", "value_layer", "bias", "enabled"): True, + ("model", "base_model", "decoder", "block", "mixer", "dense_layer", "bias", "enabled"): False, }, # Megatron doesn't support per sub layer biases. 
megatron_args=None, From c23ed861b64d1316ed0ef7ab83d7453bc0bd40b1 Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 1 Dec 2025 09:58:11 +0000 Subject: [PATCH 05/11] lm_eval evaluator setup fix --- fast_llm/engine/evaluation/lm_eval/evaluator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fast_llm/engine/evaluation/lm_eval/evaluator.py b/fast_llm/engine/evaluation/lm_eval/evaluator.py index 5bfb544e..af714b07 100644 --- a/fast_llm/engine/evaluation/lm_eval/evaluator.py +++ b/fast_llm/engine/evaluation/lm_eval/evaluator.py @@ -60,7 +60,7 @@ def setup( self._flm_wrapper = FastLLMLmEvalWrapper( model=self._hf_model, - tokenizer=self._config.tokenizer.get_tokenizer(), + tokenizer=self._config.tokenizer.get_tokenizer().tokenizer, truncation=self._config.truncation, logits_cache=self._config.logits_cache, add_bos_token=self._config.add_bos_token, From 871ba78af5b8b95bb31a4f65f8a274552ca44eee Mon Sep 17 00:00:00 2001 From: bigximik Date: Mon, 1 Dec 2025 09:58:53 +0000 Subject: [PATCH 06/11] lm_eval config fix --- tests/models/test_lm_eval.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/models/test_lm_eval.py b/tests/models/test_lm_eval.py index 8011b5bb..45bde11c 100644 --- a/tests/models/test_lm_eval.py +++ b/tests/models/test_lm_eval.py @@ -34,8 +34,8 @@ def do_get_lm_eval_config(base_path): task_dir = pathlib.Path(lm_eval.tasks.__file__).parent.resolve() return [ - f"data.tokenizer.path={tokenizer_path}", - f"model.base_model.vocab_size=49157", + f"training.evaluators.evaluation_test.evaluator.tokenizer.path={tokenizer_path}", + f"model.base_model.embeddings.vocab_size=49157", "training.evaluators.evaluation_test.interval=2", "training.evaluators.evaluation_test.evaluator.type=lm_eval", "training.evaluators.evaluation_test.evaluator.cli_args=" From 072eb922703282f6432664e258e7b26fc5c4b5e5 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 2 Dec 2025 12:09:03 +0000 Subject: [PATCH 07/11] fix converters --- fast_llm/models/gpt/conversion/qwen2.py | 37 +++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/fast_llm/models/gpt/conversion/qwen2.py b/fast_llm/models/gpt/conversion/qwen2.py index 57c9614b..4ebf18c3 100644 --- a/fast_llm/models/gpt/conversion/qwen2.py +++ b/fast_llm/models/gpt/conversion/qwen2.py @@ -1,10 +1,12 @@ import typing from fast_llm.engine.checkpoint.config import CheckpointFormat +from fast_llm.engine.checkpoint.external import WeightConverter from fast_llm.layers.attention.config import AttentionConfig from fast_llm.layers.decoder.mlp.config import MLPConfig from fast_llm.models.gpt.conversion.config import Qwen2CheckpointFormat from fast_llm.models.gpt.conversion.llama import ( + KeyValueWeightConverter, LlamaAttentionConverter, LlamaBaseModelConverter, LlamaBlockConverter, @@ -12,6 +14,8 @@ LlamaHeadConverter, LlamaHuggingfaceCheckpointHandler, LlamaMLPConverter, + QueryWeightConverter, + get_weight_and_bias_converters, ) from fast_llm.utils import Assert @@ -50,6 +54,39 @@ def _check_config(cls, config: AttentionConfig) -> None: Assert.is_(config.value_layer.bias.enabled, True) Assert.incl(config.dense_layer.bias.enabled, (None, False)) + @classmethod + def get_converters( + cls, + config: AttentionConfig, + fast_llm_prefix: str, + hf_prefix: str, + drop_on_export: bool = False, + ) -> list[WeightConverter]: + return [ + *get_weight_and_bias_converters( + f"{fast_llm_prefix}.query", + f"{hf_prefix}.q_proj", + True, + QueryWeightConverter, + config, + drop_on_export=drop_on_export, + ), + 
*get_weight_and_bias_converters( + f"{fast_llm_prefix}.key_value", + (f"{hf_prefix}.k_proj", f"{hf_prefix}.v_proj"), + True, + KeyValueWeightConverter, + config, + drop_on_export=drop_on_export, + ), + *get_weight_and_bias_converters( + f"{fast_llm_prefix}.dense", + f"{hf_prefix}.o_proj", + False, + drop_on_export=drop_on_export, + ), + ] + class Qwen2MLPConverter(LlamaMLPConverter): @classmethod From eb858a9718b19cb2db96d255bf73c7d166e785e4 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 2 Dec 2025 12:09:57 +0000 Subject: [PATCH 08/11] fix generate tests --- tests/models/test_generate.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index e6d1040c..5aa92afc 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -153,7 +153,9 @@ def _test_for_batches( if tokenizer is not None: inputs = _prepare_data(tokenizer, use_batch_size2=False) else: - inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=False) + inputs = _prepare_rand_data( + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=False + ) outputs = _generate( inputs, hf_model, @@ -165,7 +167,9 @@ def _test_for_batches( if tokenizer is not None: inputs = _prepare_data(tokenizer, use_batch_size2=True) else: - inputs = _prepare_rand_data(fast_llm_model.config.fast_llm_config.base_model.vocab_size, use_batch_size2=True) + inputs = _prepare_rand_data( + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size, use_batch_size2=True + ) outputs = _generate( inputs, hf_model, @@ -249,7 +253,7 @@ def test_export_for_generate(run_test_script_for_all_models, model_testing_confi pytest.skip(f"Not enough gpus to run the test") distr_config = DistributedTestingConfig( - name=model_testing_config.name, + name="test_export_for_generate", config_args=[ "training.train_iters=1", f"training.export.format={model_testing_config.checkpoint_format.name}", @@ -347,7 +351,7 @@ def _test_forward_return_hidden_states( inputs_ids = torch.randint( 1, - fast_llm_model.config.fast_llm_config.base_model.vocab_size if vocab_size is None else vocab_size, + fast_llm_model.config.fast_llm_config.base_model.embeddings.vocab_size if vocab_size is None else vocab_size, [1, 10], dtype=torch.int64, generator=torch.Generator().manual_seed(42), From 225db233bc57ef3ad3d6bfa1ae94c24082a7f2d6 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 2 Dec 2025 14:58:37 +0000 Subject: [PATCH 09/11] fix can generate --- fast_llm/engine/inference/huggingface.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fast_llm/engine/inference/huggingface.py b/fast_llm/engine/inference/huggingface.py index 3ffed453..ced13681 100644 --- a/fast_llm/engine/inference/huggingface.py +++ b/fast_llm/engine/inference/huggingface.py @@ -244,3 +244,7 @@ def stop_workers(self): def inner_forward(*args, **kwargs) -> tuple | transformers.utils.generic.ModelOutput: # Meant to be overridden in derived classes raise NotImplementedError() + + @classmethod + def can_generate(cls) -> bool: + return True From ba8a92fc8018e0c1ce5f98235ed4b625655f0ba7 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 2 Dec 2025 15:00:48 +0000 Subject: [PATCH 10/11] new forward signature --- fast_llm/models/gpt/huggingface.py | 66 ++++++++++++++--------- fast_llm/models/multimodal/huggingface.py | 18 ++++--- 2 files changed, 50 insertions(+), 34 deletions(-) diff --git a/fast_llm/models/gpt/huggingface.py 
b/fast_llm/models/gpt/huggingface.py index a418c3fb..76a2f4a0 100644 --- a/fast_llm/models/gpt/huggingface.py +++ b/fast_llm/models/gpt/huggingface.py @@ -4,7 +4,9 @@ import typing import torch +import transformers.cache_utils import transformers.modeling_outputs +import transformers.utils from fast_llm.data.sample.language_model import LanguageModelBatch from fast_llm.data.sample.token import TokenBatch @@ -36,13 +38,13 @@ def inner_forward( input_ids: torch.Tensor | None = None, attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, - past_key_values=None, + past_key_values: transformers.cache_utils.Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, + labels: torch.Tensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + cache_position: torch.Tensor | None = None, + logits_to_keep: int | torch.Tensor = 0, + **kwargs: typing.Unpack[transformers.utils.TransformersKwargs], ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: return self._inner_forward( self._get_batch(input_ids, attention_mask, position_ids), @@ -50,9 +52,9 @@ def inner_forward( inputs_embeds, labels, use_cache, - output_attentions, - output_hidden_states, - return_dict, + cache_position, + logits_to_keep, + **kwargs, ) def _get_batch( @@ -82,20 +84,26 @@ def _get_batch( def _inner_forward( self, batch: LanguageModelBatch, - past_key_values=None, + past_key_values: transformers.cache_utils.Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, + labels: torch.Tensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: list[str | re.Pattern] | bool | None = None, - return_dict: bool | None = None, + cache_position: torch.Tensor | None = None, + logits_to_keep: int | torch.Tensor = 0, + **kwargs: typing.Unpack[transformers.utils.TransformersKwargs], ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: # TODO: Most of this is generalizable. - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_attentions = ( + kwargs["output_attentions"] + if "output_attentions" in kwargs and kwargs["output_attentions"] is not None + else self.config.output_attentions + ) output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + kwargs["output_hidden_states"] + if "output_hidden_states" in kwargs and kwargs["output_hidden_states"] is not None + else self.config.output_hidden_states ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict + return_dict = kwargs["return_dict"] if "return_dict" in kwargs else self.config.use_return_dict use_cache = use_cache if use_cache is not None else self.config.use_cache if output_attentions: @@ -104,6 +112,12 @@ def _inner_forward( raise NotImplementedError() if labels is not None: raise NotImplementedError() + # TODO: seems cache_position are always provided even if use_cache is false + # check if it is the case and implement support to it. 
+ # if cache_position is not None: + # raise NotImplementedError() + if isinstance(logits_to_keep, torch.Tensor) or logits_to_keep > 0: + raise NotImplementedError() # Iteration serves as a random seed, using random module because it's not seeded by Fast LLM iteration = random.randint(0, 2**32) @@ -122,33 +136,33 @@ def _inner_forward( # kwargs is shallow-copied so changes will propagate back to the main namespace. kwargs_meta[BlockKwargs.output_hidden_states] = [re.compile(pattern) for pattern in output_hidden_states] - ((input_, kwargs),) = self.fast_llm_base_model.preprocess_batch( + ((input_, batch_kwargs),) = self.fast_llm_base_model.preprocess_batch( batch, [(input_meta, kwargs_meta)], phase=PhaseType.inference, iteration=iteration ) if past_key_values is not None: # The transformers will use the past keys and values to this list. - kwargs[AttentionKwargs.past_key_values] = past_key_values + batch_kwargs[AttentionKwargs.past_key_values] = past_key_values # TODO: preprocess needs to know about the past. raise NotImplementedError() if use_cache: # The transformers will save the present keys and values to this list. - kwargs[AttentionKwargs.presents] = [] + batch_kwargs[AttentionKwargs.presents] = [] - kwargs["global_logits"] = True + batch_kwargs["global_logits"] = True - self._inference_runner.forward(input_, kwargs, iteration=iteration) + self._inference_runner.forward(input_, batch_kwargs, iteration=iteration) # TODO: Make a proper way of returning the model output. - if kwargs[AttentionKwargs.sequence_first]: - logits = kwargs["logits"].transpose(0, 1) + if batch_kwargs[AttentionKwargs.sequence_first]: + logits = batch_kwargs["logits"].transpose(0, 1) else: - logits = kwargs["logits"] + logits = batch_kwargs["logits"] if output_hidden_states: hidden_states = { key: tensor if meta is None else meta.local_to_global(tensor)[0] - for key, (meta, tensor) in kwargs["hidden_states"].items() + for key, (meta, tensor) in batch_kwargs["hidden_states"].items() } else: hidden_states = None @@ -167,5 +181,5 @@ def _inner_forward( return transformers.modeling_outputs.CausalLMOutputWithPast( logits=logits, hidden_states=hidden_states, - past_key_values=kwargs[AttentionKwargs.presents], + past_key_values=batch_kwargs[AttentionKwargs.presents], ) diff --git a/fast_llm/models/multimodal/huggingface.py b/fast_llm/models/multimodal/huggingface.py index 8b085999..a9db4293 100644 --- a/fast_llm/models/multimodal/huggingface.py +++ b/fast_llm/models/multimodal/huggingface.py @@ -2,7 +2,9 @@ import typing import torch +import transformers.cache_utils import transformers.modeling_outputs +import transformers.utils from fast_llm.data.preprocessing.image_patch import ImagePatchConfig from fast_llm.data.sample.patch import PatchBatch @@ -51,13 +53,13 @@ def inner_forward( attention_mask: torch.Tensor | None = None, position_ids: torch.Tensor | None = None, image_sizes: torch.Tensor | None = None, - past_key_values=None, + past_key_values: transformers.cache_utils.Cache | None = None, inputs_embeds: torch.FloatTensor | None = None, - labels: torch.LongTensor | None = None, + labels: torch.Tensor | None = None, use_cache: bool | None = None, - output_attentions: bool | None = None, - output_hidden_states: bool | None = None, - return_dict: bool | None = None, + cache_position: torch.Tensor | None = None, + logits_to_keep: int | torch.Tensor = 0, + **kwargs: typing.Unpack[transformers.utils.TransformersKwargs], ) -> tuple | transformers.modeling_outputs.CausalLMOutputWithPast: return self._inner_forward( 
self._get_batch(input_ids, pixel_values, attention_mask, position_ids, image_sizes), @@ -65,9 +67,9 @@ def inner_forward( inputs_embeds, labels, use_cache, - output_attentions, - output_hidden_states, - return_dict, + cache_position, + logits_to_keep, + **kwargs, ) def _get_batch( From 12bbaddf0a7f40efee5b43cbceac05ecfc397419 Mon Sep 17 00:00:00 2001 From: bigximik Date: Tue, 2 Dec 2025 15:01:19 +0000 Subject: [PATCH 11/11] update test --- tests/models/test_generate.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/models/test_generate.py b/tests/models/test_generate.py index 5aa92afc..50178a82 100644 --- a/tests/models/test_generate.py +++ b/tests/models/test_generate.py @@ -362,8 +362,7 @@ def _test_forward_return_hidden_states( input_ids=inputs_ids, output_hidden_states=True, return_dict=True, use_cache=False ) - # hidden_states include embeddings layer - assert len(res_fast_llm.hidden_states) - 1 == len(fast_llm_model.config.fast_llm_config.base_model.decoder) + assert len(res_fast_llm.hidden_states) == fast_llm_model.config.fast_llm_config.base_model.decoder.num_blocks @pytest.mark.extra_slow