From 7bd0dd7aa4058e7f4e0d736b6a972b4a681a3cd3 Mon Sep 17 00:00:00 2001
From: Sy03 <1370724210@qq.com>
Date: Thu, 2 Apr 2026 03:32:18 +0800
Subject: [PATCH] [Perf][Fish Speech] Free unused DAC codec components to save ~1.2 GiB VRAM

Fish Speech S2 Pro loads the full DAC codec (encoder + quantizer + decoder)
into GPU in both stages, but each stage only uses a subset:

- Encoder stage (dac_encoder.py): only uses encoder + quantizer.forward()
  -> decoder is unused, wasting ~208 MiB
- Decoder stage (fish_speech_dac_decoder.py): only uses quantizer.decode()
  + decoder -> encoder, quantizer.pre_module, and quantizer.downsample
  are unused, wasting ~1,067 MiB

Free the unused components before moving to device so they are never
allocated on GPU.

Verified bit-identical output and successful e2e encode/decode with real
codec.pth weights on H20.

Signed-off-by: Sy03 <1370724210@qq.com>
---
 vllm_omni/model_executor/models/fish_speech/dac_encoder.py | 3 +++
 .../models/fish_speech/fish_speech_dac_decoder.py          | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
index e89815ab43..a634d3e7a2 100644
--- a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
+++ b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
@@ -54,6 +54,9 @@ def _load_dac_codec(
     if "generator" in state_dict:
         state_dict = state_dict["generator"]
     codec.load_state_dict(state_dict, strict=False)
+    # Encoder path only uses encoder + quantizer.forward(); prune the
+    # decoder before moving to device to avoid unnecessary GPU allocation.
+    codec.decoder = None
     codec = codec.to(device=device, dtype=dtype)
     codec.eval()
 
diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
index 3a8042eb2e..5bab942440 100644
--- a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
+++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
@@ -141,6 +141,13 @@ def _ensure_codec_loaded(self) -> None:
         self._bake_weight_norm(codec)
         self._cache_attention_masks(codec)
 
+        # Decode path only uses quantizer.decode() + decoder; prune
+        # encode-only components before moving to device to avoid
+        # unnecessary GPU allocation.
+        codec.encoder = None
+        codec.quantizer.pre_module = None
+        codec.quantizer.downsample = None
+
         device = self.vllm_config.device_config.device
         codec = codec.to(device=device, dtype=torch.float32)
         codec.eval()