From 63da6278a989e2b3f221ce73a3437a4b9d6f8864 Mon Sep 17 00:00:00 2001 From: vovanphuc Date: Thu, 8 Jan 2026 10:29:10 +0700 Subject: [PATCH 1/8] [model] add LFM2AudioPlugin for LFM2.5-Audio support --- src/llamafactory/data/mm_plugin.py | 149 +++++++++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/src/llamafactory/data/mm_plugin.py b/src/llamafactory/data/mm_plugin.py index b0a345e1c2..d47373dc09 100644 --- a/src/llamafactory/data/mm_plugin.py +++ b/src/llamafactory/data/mm_plugin.py @@ -2159,6 +2159,154 @@ def process_messages( return messages +@dataclass +class LFM2AudioPlugin(BasePlugin): + r"""Plugin for LFM2.5-Audio models. + + LFM2.5-Audio Architecture: + - FastConformer audio encoder (16kHz input, 8x subsampling) + - Audio markers: <|audio_start|> ... <|text_start|> + - Uses liquid_audio package for feature extraction (optional) + + Token Structure: + - <|audio_start|> (token 128): Audio region start + - <|text_start|> (token 129): Audio region end / text start + - audio_token: Placeholder token repeated for sequence length + """ + + audio_bos_token: str = "<|audio_start|>" + audio_eos_token: str = "<|text_start|>" + + @override + def _validate_input( + self, + processor: Optional["MMProcessor"], + images: list["ImageInput"], + videos: list["VideoInput"], + audios: list["AudioInput"], + ) -> None: + r"""Validate inputs. Allow audio without standard HF feature_extractor. + + LFM2.5-Audio uses liquid_audio package for audio processing, not standard + HuggingFace feature_extractor. We skip the audio validation here. + """ + # Only validate images/videos, skip audio feature_extractor check + if len(images) != 0 or len(videos) != 0: + super()._validate_input(processor, images, videos, []) + + @override + def _get_mm_inputs( + self, + images: list["ImageInput"], + videos: list["VideoInput"], + audios: list["AudioInput"], + processor: Optional["MMProcessor"], + ) -> dict[str, "torch.Tensor"]: + r"""Extract audio features using liquid_audio or HF processor. + + LFM2.5-Audio uses custom liquid_audio processor, not standard HuggingFace. + This method tries to extract features if a compatible processor is available. 
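+
+        Worked example (hedged; assumes a 10 ms mel hop, which liquid_audio
+        does not guarantee): a 10 s, 16 kHz clip yields ~1000 log-mel frames,
+        so the FastConformer output length is
+
+            seq_len = (1000 - 1) // 8 + 1  # -> 125 positions
+
+        Note that the liquid_audio path derives one seq_len from the feature
+        tensor's last dimension, so batch-padded clips all share that length.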
+ """ + mm_inputs: dict[str, torch.Tensor] = {} + + if len(audios) == 0 or processor is None: + return mm_inputs + + # Try liquid_audio processor first (has audio_processor attribute) + if hasattr(processor, "audio_processor") and processor.audio_processor is not None: + audio_processor = processor.audio_processor + audios_regularized = self._regularize_audios(audios, sampling_rate=16000)["audios"] + # liquid_audio returns log-mel features + features = audio_processor(audios_regularized, sampling_rate=16000) + mm_inputs["audio_features"] = features + # Calculate sequence lengths from feature shapes (8x subsampling in FastConformer) + if hasattr(features, "shape"): + seq_len = (features.shape[-1] - 1) // 8 + 1 + mm_inputs["audio_seq_lengths"] = [seq_len] * len(audios) + # Fallback: standard HF feature_extractor + elif hasattr(processor, "feature_extractor") and processor.feature_extractor is not None: + feature_extractor: SequenceFeatureExtractor = processor.feature_extractor + audios_regularized = self._regularize_audios(audios, sampling_rate=16000)["audios"] + mm_inputs.update( + feature_extractor( + audios_regularized, + sampling_rate=16000, + return_attention_mask=True, + padding="max_length", + return_tensors="pt", + ) + ) + mm_inputs["feature_attention_mask"] = mm_inputs.pop("attention_mask", None) + + return mm_inputs + + @override + def process_messages( + self, + messages: list[dict[str, str]], + images: list["ImageInput"], + videos: list["VideoInput"], + audios: list["AudioInput"], + processor: Optional["MMProcessor"], + ) -> list[dict[str, str]]: + r"""Replace audio placeholders with boundary-wrapped tokens. + + Produces: <|audio_start|>{audio_token * seqlen}<|text_start|> + """ + self._validate_input(processor, images, videos, audios) + self._validate_messages(messages, images, videos, audios) + + num_audio_tokens = 0 + messages = deepcopy(messages) + + # Calculate audio sequence lengths if processor is available + audio_seqlens: list[int] = [] + if self.expand_mm_tokens and processor is not None: + mm_inputs = self._get_mm_inputs([], [], audios, processor) + if "audio_seq_lengths" in mm_inputs: + # liquid_audio path + audio_seqlens = mm_inputs["audio_seq_lengths"] + elif "feature_attention_mask" in mm_inputs and mm_inputs["feature_attention_mask"] is not None: + # HF path - calculate from attention mask (8x subsampling) + input_lengths = mm_inputs["feature_attention_mask"].sum(-1).numpy() + audio_seqlens = [(int(length) - 1) // 8 + 1 for length in input_lengths] + + for message in messages: + content = message["content"] + while AUDIO_PLACEHOLDER in content: + # Get audio sequence length + if self.expand_mm_tokens and num_audio_tokens < len(audio_seqlens): + audio_seqlen = audio_seqlens[num_audio_tokens] + else: + audio_seqlen = 1 # Fallback: single token + + # Build: <|audio_start|>{audio_token * seqlen}<|text_start|> + audio_tokens = self.audio_token * audio_seqlen if self.audio_token else "" + replacement = f"{self.audio_bos_token}{audio_tokens}{self.audio_eos_token}" + + content = content.replace(AUDIO_PLACEHOLDER, replacement, 1) + num_audio_tokens += 1 + + message["content"] = content + + return messages + + @override + def get_mm_inputs( + self, + images: list["ImageInput"], + videos: list["VideoInput"], + audios: list["AudioInput"], + imglens: list[int], + vidlens: list[int], + audlens: list[int], + batch_ids: list[list[int]], + processor: Optional["MMProcessor"], + ) -> dict[str, Union[list[int], "torch.Tensor"]]: + self._validate_input(processor, images, videos, 
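+            # Hedged illustration: the features returned below must line up with
+            # the tokens process_messages expanded earlier, e.g. for seq_len=125:
+            #   <|audio_start|> + audio_token * 125 + <|text_start|>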
audios) + return self._get_mm_inputs(images, videos, audios, processor) + + PLUGINS = { "base": BasePlugin, "ernie_vl": ErnieVLPlugin, @@ -2172,6 +2320,7 @@ def process_messages( "llava_next": LlavaNextPlugin, "llava_next_video": LlavaNextVideoPlugin, "lfm2_vl": LFMVLPlugin, + "lfm2_audio": LFM2AudioPlugin, "minicpm_v": MiniCPMVPlugin, "mllama": MllamaPlugin, "paligemma": PaliGemmaPlugin, From d9c7a61443a50914d10d7006629c8bfaec233784 Mon Sep 17 00:00:00 2001 From: vovanphuc Date: Thu, 8 Jan 2026 10:29:32 +0700 Subject: [PATCH 2/8] [feature] add lfm2_audio template and model registration --- src/llamafactory/data/template.py | 26 ++++++++++++++++++++++++++ src/llamafactory/extras/constants.py | 11 +++++++++++ 2 files changed, 37 insertions(+) diff --git a/src/llamafactory/data/template.py b/src/llamafactory/data/template.py index ef1c5db698..b88124ed04 100644 --- a/src/llamafactory/data/template.py +++ b/src/llamafactory/data/template.py @@ -1371,6 +1371,32 @@ def get_template_and_fix_tokenizer(tokenizer: "PreTrainedTokenizer", data_args: ) +register_template( + name="lfm2_audio", + format_user=StringFormatter(slots=["<|im_start|>user\n{{content}}<|im_end|>\n<|im_start|>assistant\n"]), + format_assistant=StringFormatter(slots=["{{content}}<|im_end|>\n"]), + format_system=StringFormatter(slots=["<|im_start|>system\n{{content}}<|im_end|>\n"]), + format_function=FunctionFormatter(slots=["{{content}}<|im_end|>\n"], tool_format="lfm2"), + format_observation=StringFormatter( + slots=[ + "<|im_start|>tool\n<|tool_response_start|>{{content}}<|tool_response_end|><|im_end|>\n" + "<|im_start|>assistant\n" + ] + ), + format_tools=ToolFormatter(tool_format="lfm2"), + default_system="You are a helpful audio assistant by Liquid AI.", + stop_words=["<|im_end|>"], + tool_call_words=("<|tool_call_start|>", "<|tool_call_end|>"), + replace_eos=True, + mm_plugin=get_mm_plugin( + name="lfm2_audio", + audio_token="<|reserved_1|>", # Token ID 17 - placeholder between markers + audio_bos_token="<|audio_start|>", # Token ID 128 + audio_eos_token="<|text_start|>", # Token ID 129 + ), +) + + register_template( name="llama2", format_user=StringFormatter(slots=[{"bos_token"}, "[INST] {{content}} [/INST]"]), diff --git a/src/llamafactory/extras/constants.py b/src/llamafactory/extras/constants.py index 0b4d35ef18..9648f0f253 100644 --- a/src/llamafactory/extras/constants.py +++ b/src/llamafactory/extras/constants.py @@ -1517,6 +1517,17 @@ def register_model_group( ) +register_model_group( + models={ + "LFM2.5-Audio-1.5B": { + DownloadSource.DEFAULT: "LiquidAI/LFM2.5-Audio-1.5B", + }, + }, + template="lfm2_audio", + multimodal=True, +) + + register_model_group( models={ "Llama-7B": { From 130996f81274e26b5f4e19a8881a2bcc3028c54b Mon Sep 17 00:00:00 2001 From: vovanphuc Date: Thu, 8 Jan 2026 10:29:42 +0700 Subject: [PATCH 3/8] [model] add LFM2.5-Audio model loader with liquid_audio integration --- src/llamafactory/extras/packages.py | 4 + src/llamafactory/model/loader.py | 13 + .../model/model_utils/lfm2_audio.py | 347 ++++++++++++++++++ 3 files changed, 364 insertions(+) create mode 100644 src/llamafactory/model/model_utils/lfm2_audio.py diff --git a/src/llamafactory/extras/packages.py b/src/llamafactory/extras/packages.py index c6328a7b02..d259ca0461 100644 --- a/src/llamafactory/extras/packages.py +++ b/src/llamafactory/extras/packages.py @@ -122,3 +122,7 @@ def is_uvicorn_available(): def is_vllm_available(): return _is_package_available("vllm") + + +def is_liquid_audio_available(): + return 
_is_package_available("liquid_audio") diff --git a/src/llamafactory/model/loader.py b/src/llamafactory/model/loader.py index 8c24622381..44ca540639 100644 --- a/src/llamafactory/model/loader.py +++ b/src/llamafactory/model/loader.py @@ -33,6 +33,7 @@ from ..extras.packages import is_torch_version_greater_than from .adapter import init_adapter from .model_utils.ktransformers import load_kt_pretrained_model +from .model_utils.lfm2_audio import is_lfm2_audio_model, load_lfm2_audio_pretrained_model from .model_utils.liger_kernel import apply_liger_kernel from .model_utils.misc import register_autoclass from .model_utils.mod import convert_pretrained_model_to_mod, load_mod_pretrained_model @@ -127,6 +128,14 @@ def load_tokenizer(model_args: "ModelArguments") -> "TokenizerModule": def load_config(model_args: "ModelArguments") -> "PretrainedConfig": r"""Load model config.""" init_kwargs = _get_init_kwargs(model_args) + + # Special handling for LFM2.5-Audio models + if is_lfm2_audio_model(model_args.model_name_or_path): + from .model_utils.lfm2_audio import LFM2AudioConfig + + logger.info_rank0("Detected LFM2.5-Audio model, using custom config loader.") + return LFM2AudioConfig() + return AutoConfig.from_pretrained(model_args.model_name_or_path, **init_kwargs) @@ -155,6 +164,10 @@ def load_model( lazy_load = True elif is_trainable: model = load_unsloth_pretrained_model(config, model_args, finetuning_args) + elif is_lfm2_audio_model(model_args.model_name_or_path): + # Load LFM2.5-Audio model using liquid_audio package + logger.info_rank0("Loading LFM2.5-Audio model with liquid_audio package...") + model = load_lfm2_audio_pretrained_model(model_args, **init_kwargs) if model is None and not lazy_load: init_kwargs["config"] = config diff --git a/src/llamafactory/model/model_utils/lfm2_audio.py b/src/llamafactory/model/model_utils/lfm2_audio.py new file mode 100644 index 0000000000..04d5d7d89a --- /dev/null +++ b/src/llamafactory/model/model_utils/lfm2_audio.py @@ -0,0 +1,347 @@ +# Copyright 2025 the LlamaFactory team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Custom model loader for LFM2.5-Audio models using liquid_audio package. + +LFM2.5-Audio models use a custom architecture that requires the liquid_audio package +for proper model loading and audio processing. 
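+
+A minimal usage sketch, using only names defined in this module (the loader in
+loader.py wires this up automatically; shown here for illustration):
+
+    from llamafactory.model.model_utils.lfm2_audio import (
+        is_lfm2_audio_model,
+        load_lfm2_audio_pretrained_model,
+    )
+
+    if is_lfm2_audio_model(model_args.model_name_or_path):
+        model = load_lfm2_audio_pretrained_model(model_args, torch_dtype="auto")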
+"""
+
+from typing import TYPE_CHECKING, Any, Optional
+
+import torch
+from transformers import GenerationConfig, PretrainedConfig, PreTrainedModel
+from transformers.generation import GenerationMixin
+from transformers.modeling_outputs import CausalLMOutputWithPast
+
+from ...extras import logging
+from ...extras.packages import is_liquid_audio_available
+
+
+if TYPE_CHECKING:
+    from ...hparams import ModelArguments
+
+
+logger = logging.get_logger(__name__)
+
+
+class LFM2AudioConfig(PretrainedConfig):
+    """Config class for LFM2.5-Audio models to enable HuggingFace compatibility."""
+
+    model_type = "lfm2_audio"
+
+    def __init__(
+        self,
+        vocab_size: int = 65536,
+        hidden_size: int = 2048,
+        num_hidden_layers: int = 16,
+        num_attention_heads: int = 32,
+        num_key_value_heads: int = 8,
+        codebooks: int = 8,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.codebooks = codebooks
+        super().__init__(**kwargs)
+
+
+class LFM2AudioModelForCausalLM(PreTrainedModel, GenerationMixin):
+    """HuggingFace-compatible wrapper for LFM2AudioModel from liquid_audio.
+
+    This wrapper enables LFM2.5-Audio models to be used with LLaMA-Factory's
+    training pipeline while leveraging the liquid_audio package for model loading.
+    """
+
+    config_class = LFM2AudioConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Lfm2DecoderLayer", "ConformerBlock"]
+    main_input_name = "input_ids"
+    _supports_cache_class = True
+
+    def __init__(self, config: LFM2AudioConfig):
+        super().__init__(config)
+        self._liquid_model = None
+        self._is_loaded = False
+        # Initialize generation_config for HuggingFace compatibility.
+        # PretrainedConfig always defines eos/pad_token_id (default None), so test
+        # the value instead of hasattr(); otherwise the fallback ids never apply.
+        self.generation_config = GenerationConfig(
+            eos_token_id=config.eos_token_id if config.eos_token_id is not None else 7,
+            pad_token_id=config.pad_token_id if config.pad_token_id is not None else 0,
+        )
+
+    @classmethod
+    def from_pretrained(
+        cls,
+        pretrained_model_name_or_path: str,
+        *model_args,
+        config: Optional[LFM2AudioConfig] = None,
+        torch_dtype: Optional[torch.dtype] = None,
+        device_map: Optional[str] = None,
+        **kwargs,
+    ) -> "LFM2AudioModelForCausalLM":
+        """Load LFM2.5-Audio model using liquid_audio package."""
+        if not is_liquid_audio_available():
+            raise ImportError(
+                "liquid-audio package is required for LFM2.5-Audio models. 
" + "Please install it with: pip install liquid-audio" + ) + + from liquid_audio import LFM2AudioModel + + # Determine dtype + if torch_dtype is None or torch_dtype == "auto": + torch_dtype = torch.bfloat16 + + # Determine device - liquid_audio expects string or torch.device, not dict + if device_map is None or device_map == "auto": + device = "cuda" if torch.cuda.is_available() else "cpu" + elif isinstance(device_map, dict): + # Handle dict device_map like {'': device(type='cuda', index=0)} + if "" in device_map: + dev = device_map[""] + device = str(dev) if hasattr(dev, "type") else "cuda" + else: + device = "cuda" if torch.cuda.is_available() else "cpu" + elif isinstance(device_map, str): + device = device_map + else: + device = "cuda" if torch.cuda.is_available() else "cpu" + + logger.info_rank0(f"Loading LFM2.5-Audio model from {pretrained_model_name_or_path}") + logger.info_rank0(f"Using dtype={torch_dtype}, device={device}") + + # Load using liquid_audio + liquid_model = LFM2AudioModel.from_pretrained( + pretrained_model_name_or_path, + dtype=torch_dtype, + device=device, + revision=kwargs.get("revision"), + ) + + # Create config from liquid model + lfm_config = liquid_model.conf.lfm + if config is None: + config = LFM2AudioConfig( + vocab_size=lfm_config.vocab_size, + hidden_size=lfm_config.hidden_size, + num_hidden_layers=lfm_config.num_hidden_layers, + num_attention_heads=lfm_config.num_attention_heads, + num_key_value_heads=lfm_config.num_key_value_heads, + codebooks=liquid_model.conf.codebooks, + torch_dtype=torch_dtype, + ) + + # Create wrapper instance + wrapper = cls(config) + wrapper._liquid_model = liquid_model + wrapper._is_loaded = True + + return wrapper + + @property + def model(self): + """Return the underlying liquid_audio model.""" + return self._liquid_model + + @property + def lfm(self): + """Return the LFM2 backbone (HuggingFace Lfm2Model).""" + if self._liquid_model is not None: + return self._liquid_model.lfm + return None + + def get_input_embeddings(self): + """Get text embeddings from the LFM backbone.""" + if self.lfm is not None: + return self.lfm.embed_tokens + return None + + def set_input_embeddings(self, value): + """Set text embeddings in the LFM backbone.""" + if self.lfm is not None: + self.lfm.embed_tokens = value + + def get_output_embeddings(self): + """LFM2 uses tied embeddings, return the same as input.""" + return self.get_input_embeddings() + + def set_output_embeddings(self, new_embeddings): + """Set output embeddings (tied with input).""" + self.set_input_embeddings(new_embeddings) + + def forward( + self, + input_ids: Optional[torch.LongTensor] = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[Any] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + **kwargs, + ) -> CausalLMOutputWithPast: + """Forward pass for training. + + For training, we use the LFM backbone directly for text-only forward pass. + The audio processing is handled by the mm_plugin during data preprocessing. + """ + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + if self._liquid_model is None: + raise RuntimeError("Model not loaded. 
Call from_pretrained first.") + + # Use the LFM backbone for forward pass + lfm = self._liquid_model.lfm + + # Get embeddings + if inputs_embeds is None and input_ids is not None: + inputs_embeds = lfm.embed_tokens(input_ids) + + # Forward through LFM backbone + outputs = lfm( + inputs_embeds=inputs_embeds, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=True, + ) + + hidden_states = outputs.last_hidden_state + + # Compute logits using tied embeddings + logits = torch.nn.functional.linear(hidden_states, lfm.embed_tokens.weight) + + loss = None + if labels is not None: + # Shift for next token prediction + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + + loss_fct = torch.nn.CrossEntropyLoss() + shift_logits = shift_logits.view(-1, self.config.vocab_size) + shift_labels = shift_labels.view(-1) + shift_labels = shift_labels.to(shift_logits.device) + loss = loss_fct(shift_logits, shift_labels) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return CausalLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def prepare_inputs_for_generation( + self, + input_ids: torch.LongTensor, + past_key_values: Optional[Any] = None, + attention_mask: Optional[torch.Tensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> dict[str, Any]: + """Prepare inputs for generation.""" + if past_key_values is not None: + input_ids = input_ids[:, -1:] + + model_inputs = { + "input_ids": input_ids, + "past_key_values": past_key_values, + "use_cache": kwargs.get("use_cache"), + "attention_mask": attention_mask, + } + + if inputs_embeds is not None and past_key_values is None: + model_inputs["inputs_embeds"] = inputs_embeds + model_inputs["input_ids"] = None + + return model_inputs + + def _reorder_cache(self, past_key_values, beam_idx): + """Reorder cache for beam search.""" + if self.lfm is not None and hasattr(self.lfm, "_reorder_cache"): + return self.lfm._reorder_cache(past_key_values, beam_idx) + return past_key_values + + @classmethod + def can_generate(cls) -> bool: + """Return True to indicate this model can generate sequences.""" + return True + + @property + def device(self) -> torch.device: + """Return the device of the model.""" + if self.lfm is not None: + return next(self.lfm.parameters()).device + return torch.device("cpu") + + @property + def dtype(self) -> torch.dtype: + """Return the dtype of the model.""" + if self.lfm is not None: + return next(self.lfm.parameters()).dtype + return torch.float32 + + +def is_lfm2_audio_model(model_name_or_path: str) -> bool: + """Check if the model is an LFM2.5-Audio model.""" + lfm2_audio_patterns = [ + "LFM2.5-Audio", + "lfm2.5-audio", + "lfm2-audio", + "LFM2-Audio", + ] + return any(pattern.lower() in model_name_or_path.lower() for pattern in lfm2_audio_patterns) + + +def load_lfm2_audio_pretrained_model( + model_args: "ModelArguments", + **kwargs, +) -> "LFM2AudioModelForCausalLM": + """Load LFM2.5-Audio model using liquid_audio package. + + Args: + model_args: Model arguments containing model path and configuration. + **kwargs: Additional arguments passed to from_pretrained. 
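+            Note: only torch_dtype, device_map and revision reach the
+            liquid_audio loader; cache_dir, token and trust_remote_code are
+            currently accepted but unused.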
+ + Returns: + LFM2AudioModelForCausalLM: Loaded model wrapper. + """ + if not is_liquid_audio_available(): + raise ImportError( + "LFM2.5-Audio models require the liquid-audio package. Please install it with: pip install liquid-audio" + ) + + return LFM2AudioModelForCausalLM.from_pretrained( + model_args.model_name_or_path, + torch_dtype=kwargs.get("torch_dtype", "auto"), + device_map=kwargs.get("device_map"), + revision=model_args.model_revision, + cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + trust_remote_code=model_args.trust_remote_code, + ) From 2e2062caa4e91a5658ad39a48bedd9fddbd18928 Mon Sep 17 00:00:00 2001 From: vovanphuc Date: Thu, 8 Jan 2026 10:29:52 +0700 Subject: [PATCH 4/8] [test] add LFM2.5-Audio plugin tests --- tests/data/test_mm_plugin.py | 41 ++++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/data/test_mm_plugin.py b/tests/data/test_mm_plugin.py index 3187004aa5..18a6810b3c 100644 --- a/tests/data/test_mm_plugin.py +++ b/tests/data/test_mm_plugin.py @@ -431,3 +431,44 @@ def test_lfm2_vl_plugin(): assert lfm2_vl_plugin.video_token is None assert lfm2_vl_plugin.audio_token is None assert lfm2_vl_plugin.__class__.__name__ == "LFMVLPlugin" + + +@pytest.mark.runs_on(["cpu", "mps"]) +def test_lfm2_audio_plugin(): + """Test LFM2.5-Audio plugin instantiation.""" + # Test plugin can be instantiated with correct tokens + lfm2_audio_plugin = get_mm_plugin( + name="lfm2_audio", + audio_token="<|reserved_1|>", + audio_bos_token="<|audio_start|>", + audio_eos_token="<|text_start|>", + ) + assert lfm2_audio_plugin is not None + assert lfm2_audio_plugin.audio_token == "<|reserved_1|>" + assert lfm2_audio_plugin.audio_bos_token == "<|audio_start|>" + assert lfm2_audio_plugin.audio_eos_token == "<|text_start|>" + assert lfm2_audio_plugin.image_token is None + assert lfm2_audio_plugin.video_token is None + assert lfm2_audio_plugin.__class__.__name__ == "LFM2AudioPlugin" + + +@pytest.mark.runs_on(["cpu", "mps"]) +def test_lfm2_audio_plugin_process_messages(): + """Test LFM2.5-Audio placeholder replacement.""" + lfm2_audio_plugin = get_mm_plugin( + name="lfm2_audio", + audio_token="<|reserved_1|>", + audio_bos_token="<|audio_start|>", + audio_eos_token="<|text_start|>", + ) + messages = [{"content": "Transcribe this: