From 7bd0dd7aa4058e7f4e0d736b6a972b4a681a3cd3 Mon Sep 17 00:00:00 2001
From: Sy03 <1370724210@qq.com>
Date: Thu, 2 Apr 2026 03:32:18 +0800
Subject: [PATCH] [Perf][Fish Speech] Free unused DAC codec components to save
 ~1.2 GiB VRAM

Fish Speech S2 Pro loads the full DAC codec (encoder + quantizer + decoder)
onto the GPU in both stages, but each stage only uses a subset:

- Encoder stage (dac_encoder.py): only uses encoder + quantizer.forward()
  -> decoder is unused, wasting ~208 MiB
- Decoder stage (fish_speech_dac_decoder.py): only uses quantizer.decode()
  + decoder -> encoder, quantizer.pre_module, and quantizer.downsample are
  unused, wasting ~1,067 MiB

Set the unused components to None before moving the codec to the device so
they are never allocated on the GPU. Verified bit-identical output and
successful e2e encode/decode with real codec.pth weights on H20.

Signed-off-by: Sy03 <1370724210@qq.com>
---
 vllm_omni/model_executor/models/fish_speech/dac_encoder.py | 3 +++
 .../models/fish_speech/fish_speech_dac_decoder.py          | 7 +++++++
 2 files changed, 10 insertions(+)

diff --git a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
index e89815ab43..a634d3e7a2 100644
--- a/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
+++ b/vllm_omni/model_executor/models/fish_speech/dac_encoder.py
@@ -54,6 +54,9 @@ def _load_dac_codec(
     if "generator" in state_dict:
         state_dict = state_dict["generator"]
     codec.load_state_dict(state_dict, strict=False)
+    # Encoder path only uses encoder + quantizer.forward(); prune the
+    # decoder before moving to device to avoid unnecessary GPU allocation.
+    codec.decoder = None
     codec = codec.to(device=device, dtype=dtype)
     codec.eval()
 
diff --git a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
index 3a8042eb2e..5bab942440 100644
--- a/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
+++ b/vllm_omni/model_executor/models/fish_speech/fish_speech_dac_decoder.py
@@ -141,6 +141,13 @@ def _ensure_codec_loaded(self) -> None:
         self._bake_weight_norm(codec)
         self._cache_attention_masks(codec)
 
+        # Decode path only uses quantizer.decode() + decoder; prune
+        # encode-only components before moving to device to avoid
+        # unnecessary GPU allocation.
+        codec.encoder = None
+        codec.quantizer.pre_module = None
+        codec.quantizer.downsample = None
+
         device = self.vllm_config.device_config.device
         codec = codec.to(device=device, dtype=torch.float32)
         codec.eval()