Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
6f98251
reload model weight for fp8 rollout
AniZpZ Aug 26, 2025
fdd69af
fmt
AniZpZ Aug 26, 2025
d44f067
fix
AniZpZ Aug 26, 2025
55ef3d1
fix
AniZpZ Aug 26, 2025
ec05302
fix
AniZpZ Aug 26, 2025
1aebe87
Runnable
Hecate0821 Oct 23, 2025
5fe8cac
Fix
Hecate0821 Oct 23, 2025
22041c9
Add fast path
Hecate0821 Oct 23, 2025
b58f208
Fix
Hecate0821 Nov 4, 2025
d413773
Separate verl and sglang
Hecate0821 Nov 6, 2025
0c0c604
Update
Hecate0821 Nov 6, 2025
e658b17
Use flashRL's approach
Hecate0821 Nov 6, 2025
1e58dc1
Follow flashRL sglang-patch
Hecate0821 Nov 7, 2025
c574672
Fix
Hecate0821 Nov 7, 2025
d92d248
Remove dimension adjustment
Hecate0821 Nov 7, 2025
869e08a
fp8 initial commit
Hecate0821 Nov 7, 2025
fd6a572
Fix
Hecate0821 Nov 8, 2025
7d5eaa0
debug stack
Hecate0821 Nov 8, 2025
908c419
Draft
Hecate0821 Nov 8, 2025
50ffe46
Update
Hecate0821 Nov 8, 2025
5b0b99c
Clean comments
Hecate0821 Nov 8, 2025
771a082
fix: update weight_scale during fp8 reload
eternally-z Nov 19, 2025
67708bc
format
AniZpZ Nov 19, 2025
c57f3fb
upd
AniZpZ Nov 19, 2025
a93f206
Merge branch 'main' into quant_rollout
AniZpZ Nov 19, 2025
96e1771
upd
AniZpZ Nov 19, 2025
2f5ae9c
minor fix
AniZpZ Nov 21, 2025
114072f
minor
AniZpZ Nov 21, 2025
46ee442
Merge branch 'main' into quant_rollout
Wilboludriver Dec 2, 2025
30f0bc2
Merge branch 'main' into quant_rollout
Wilboludriver Dec 3, 2025
412d81e
Merge branch 'main' into quant_rollout
AniZpZ Dec 3, 2025
4e07bd5
minor fix
AniZpZ Dec 5, 2025
a7840d7
minor fix
AniZpZ Dec 5, 2025
408c0ee
minor
AniZpZ Dec 5, 2025
670dd43
minor fix
Wilboludriver Dec 5, 2025
7592a04
Merge branch 'main' into quant_rollout
AniZpZ Dec 8, 2025
7a569a9
fix:fix scale update with tp
eternally-z Dec 8, 2025
87cf419
minor fix
eternally-z Dec 9, 2025
0a2f20d
fmt
AniZpZ Dec 9, 2025
69fafaf
Merge branch 'main' into quant_rollout
ispobock Dec 9, 2025
6d4dae0
Merge branch 'main' into quant_rollout
AniZpZ Dec 10, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions python/sglang/srt/configs/load_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class LoadFormat(str, enum.Enum):
BITSANDBYTES = "bitsandbytes"
MISTRAL = "mistral"
LAYERED = "layered"
FLASH_RL = "flash_rl" # For RL training with quantized models
JAX = "jax"
REMOTE = "remote"
REMOTE_INSTANCE = "remote_instance"
Expand All @@ -46,6 +47,8 @@ class LoadConfig:
"dummy" will initialize the weights with random values, which is
mainly for profiling.
"bitsandbytes" will load nf4 type weights.
"flash_rl" will load weights with support for RL training
with quantized models, enabling efficient weight reloading.
ignore_patterns: The list of patterns to ignore when loading the model.
Default to "original/**/*" to avoid repeated loading of llama's
checkpoints.
Expand Down Expand Up @@ -78,6 +81,11 @@ class LoadConfig:
# ModelOpt configuration object
modelopt_config: Optional[ModelOptConfig] = None

# QuantizedRL-specific options (for FlashRL-style quantization)
rl_quant_profile: Optional[str] = (
None # Path to rollout quantization profile (e.g., /root/profile.7b.pt)
)

def __post_init__(self):
model_loader_extra_config = self.model_loader_extra_config or {}
if isinstance(model_loader_extra_config, str):
Expand Down
22 changes: 20 additions & 2 deletions python/sglang/srt/layers/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -419,7 +419,16 @@ def weight_loader_v2(self, param: Parameter, loaded_weight: torch.Tensor):
else:
# FIXME: This branch is needed to load deepseek v3 awq.
# However, we should fix this and avoid the branching here.
param.load_column_parallel_weight(loaded_weight)
# After QuantizedRL reload, params might still need tp_rank
try:
param.load_column_parallel_weight(
loaded_weight,
tp_rank=self.tp_rank,
use_presharded_weights=self.use_presharded_weights,
)
except TypeError:
# Fallback for parameters that don't accept additional args
param.load_column_parallel_weight(loaded_weight)

def forward(self, input_):
bias = self.bias if not self.skip_bias_add else None
Expand Down Expand Up @@ -1360,7 +1369,16 @@ def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor
else:
# `params` is defined in `vllm/model_executor/parameter.py`,
# It does not support additional parameters.
param.load_row_parallel_weight(loaded_weight)
# However, after QuantizedRL reload, params might still need tp_rank
try:
param.load_row_parallel_weight(
loaded_weight,
tp_rank=self.tp_rank,
use_presharded_weights=self.use_presharded_weights,
)
except TypeError:
# Fallback for parameters that don't accept additional args
param.load_row_parallel_weight(loaded_weight)

def forward(self, input_, skip_all_reduce=False):
if self.input_is_parallel:
Expand Down
1 change: 1 addition & 0 deletions python/sglang/srt/model_executor/model_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,6 +764,7 @@ def load_model(self):
remote_instance_weight_loader_seed_instance_service_port=self.server_args.remote_instance_weight_loader_seed_instance_service_port,
remote_instance_weight_loader_send_weights_group_ports=self.server_args.remote_instance_weight_loader_send_weights_group_ports,
modelopt_config=modelopt_config,
rl_quant_profile=self.server_args.rl_quant_profile,
)
if self.device == "cpu":
self.model_config = adjust_config_with_unaligned_cpu_tp(
Expand Down
Loading
Loading