@@ -75,7 +75,8 @@ def __init__(
         self.available = True
         self.disabled = False

-        logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())
+        if self.rank == 0:
+            logger.info("sglang is using nccl==%s", self.nccl.ncclGetVersion())

         if self.rank == 0:
             # get the unique id from NCCL
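For context on why the guard matters: every tensor-parallel rank constructs its own communicator and runs this __init__, so an unguarded log line is printed once per GPU. A standalone sketch of the pattern, not sglang code; it assumes the launcher (e.g. torchrun) sets RANK in the environment:

    import logging
    import os

    logging.basicConfig(level=logging.INFO)
    logger = logging.getLogger(__name__)

    # Assumption: RANK is set by the launcher; default to 0 for a single process.
    rank = int(os.environ.get("RANK", "0"))

    logger.info("NCCL communicator ready")       # unguarded: one line per rank
    if rank == 0:
        logger.info("NCCL communicator ready")   # guarded: one line per job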
@@ -29,6 +29,7 @@
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )

 _is_hip = is_hip()
@@ -945,7 +946,9 @@ def get_moe_configs(
             # For example, updating the Triton version might cause all old configs to become suboptimal.
             # To achieve the best performance, consider re-tuning the Triton fused MOE kernel in your environment.
             # For the tuning method, refer to: https://github.com/sgl-project/sglang/tree/main/benchmark/kernels/fused_moe_triton
-            logger.info("Using MoE kernel config from %s.", config_file_path)
+            log_info_on_rank0(
+                logger, f"Using MoE kernel config from {config_file_path}."
+            )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}

6 changes: 2 additions & 4 deletions python/sglang/srt/layers/quantization/fp8.py
@@ -64,6 +64,7 @@ def dummy_func(*args, **kwargs):
     get_bool_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     print_warning_once,
     set_weight_attrs,
 )
@@ -97,10 +98,7 @@ def __init__(
     ) -> None:
         self.is_checkpoint_fp8_serialized = is_checkpoint_fp8_serialized
         if is_checkpoint_fp8_serialized:
-            logger.warning(
-                "Detected fp8 checkpoint. Please note that the "
-                "format is experimental and subject to change."
-            )
+            log_info_on_rank0(logger, "Detected fp8 checkpoint.")
         if activation_scheme not in ACTIVATION_SCHEMES:
             raise ValueError(f"Unsupported activation scheme {activation_scheme}")
         self.activation_scheme = activation_scheme
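Note the level change in this hunk: the old message was a logger.warning, while log_info_on_rank0 (added to python/sglang/srt/utils.py at the end of this diff) only logs at info level. If a rank-0 warning were wanted elsewhere, a parallel helper could look like the following hypothetical sketch, which is not part of the PR:

    # Hypothetical counterpart to log_info_on_rank0; NOT included in this PR.
    def log_warning_on_rank0(logger, msg):
        from sglang.srt.distributed import get_tensor_model_parallel_rank

        if get_tensor_model_parallel_rank() == 0:
            logger.warning(msg)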
7 changes: 4 additions & 3 deletions python/sglang/srt/layers/quantization/fp8_kernel.py
@@ -29,6 +29,7 @@
     get_device_name,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
     supports_custom_op,
 )

@@ -685,9 +686,9 @@ def get_w8a8_block_fp8_configs(
     )
     if os.path.exists(config_file_path):
         with open(config_file_path) as f:
-            logger.info(
-                "Using configuration from %s for W8A8 Block FP8 kernel.",
-                config_file_path,
+            log_info_on_rank0(
+                logger,
+                f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.",
             )
             # If a configuration has been found, return it
             return {int(key): val for key, val in json.load(f).items()}
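A side effect visible in this hunk (an observation about Python logging, not part of the PR): the %-style arguments were interpolated lazily, only when the record was actually emitted, whereas the f-string is built eagerly on every call before the rank check inside the helper runs. The cost is usually negligible; a minimal illustration:

    import logging

    logging.basicConfig(level=logging.WARNING)
    logger = logging.getLogger(__name__)

    config_file_path = "/path/to/config.json"

    # %-style: formatting is skipped entirely because INFO is below the WARNING threshold.
    logger.info("Using configuration from %s for W8A8 Block FP8 kernel.", config_file_path)

    # f-string: the message is built at the call site before logging decides to drop it.
    logger.info(f"Using configuration from {config_file_path} for W8A8 Block FP8 kernel.")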
61 changes: 35 additions & 26 deletions python/sglang/srt/model_executor/model_runner.py
@@ -278,9 +278,10 @@ def model_specific_adjustment(self):
                     server_args.attention_backend = "fa3"
                 else:
                     server_args.attention_backend = "triton"
-                logger.info(
-                    f"Attention backend not set. Use {server_args.attention_backend} backend by default."
-                )
+                if self.should_log:
+                    logger.info(
+                        f"Attention backend not set. Use {server_args.attention_backend} backend by default."
+                    )
             elif self.use_mla_backend:
                 if server_args.device != "cpu":
                     if server_args.attention_backend in [
@@ -290,9 +291,10 @@ def model_specific_adjustment(self):
                         "flashmla",
                         "cutlass_mla",
                     ]:
-                        logger.info(
-                            f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
-                        )
+                        if self.should_log:
+                            logger.info(
+                                f"MLA optimization is turned on. Use {server_args.attention_backend} backend."
+                            )
                     else:
                         raise ValueError(
                             f"Invalid attention backend for MLA: {server_args.attention_backend}"
@@ -311,9 +313,10 @@ def model_specific_adjustment(self):
                 server_args.attention_backend = "triton"

         if server_args.enable_double_sparsity:
-            logger.info(
-                "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
-            )
+            if self.should_log:
+                logger.info(
+                    "Double sparsity optimization is turned on. Use triton backend without CUDA graph."
+                )
             server_args.attention_backend = "triton"
             server_args.disable_cuda_graph = True
             if server_args.ds_heavy_channel_type is None:
@@ -324,23 +327,26 @@ def model_specific_adjustment(self):

         if self.is_multimodal:
             self.mem_fraction_static *= 0.90
-            logger.info(
-                f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
-                f"because this is a multimodal model."
-            )
-            logger.info(
-                "Automatically turn off --chunked-prefill-size for multimodal model."
-            )
+            if self.should_log:
+                logger.info(
+                    f"Automatically reduce --mem-fraction-static to {self.mem_fraction_static:.3f} "
+                    f"because this is a multimodal model."
+                )
+                logger.info(
+                    "Automatically turn off --chunked-prefill-size for multimodal model."
+                )
             server_args.chunked_prefill_size = -1

         if not self.use_mla_backend:
             server_args.disable_chunked_prefix_cache = True
         elif self.page_size > 1:
-            logger.info("Disable chunked prefix cache when page size > 1.")
+            if self.should_log:
+                logger.info("Disable chunked prefix cache when page size > 1.")
             server_args.disable_chunked_prefix_cache = True

         if not server_args.disable_chunked_prefix_cache:
-            logger.info("Chunked prefix cache is turned on.")
+            if self.should_log:
+                logger.info("Chunked prefix cache is turned on.")

     def init_torch_distributed(self):
         logger.info("Init torch distributed begin.")
@@ -433,9 +439,10 @@ def load_model(self):
         torch.set_num_threads(1)
         if self.device == "cuda":
             if torch.cuda.get_device_capability()[0] < 8:
-                logger.info(
-                    "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
-                )
+                if self.should_log:
+                    logger.info(
+                        "Compute capability below sm80. Use float16 due to lack of bfloat16 support."
+                    )
                 self.server_args.dtype = "float16"
                 self.model_config.dtype = torch.float16
                 if torch.cuda.get_device_capability()[1] < 5:
@@ -471,10 +478,11 @@ def load_model(self):
                     self.model.load_kv_cache_scales(
                         self.server_args.quantization_param_path
                     )
-                    logger.info(
-                        "Loaded KV cache scaling factors from %s",
-                        self.server_args.quantization_param_path,
-                    )
+                    if self.should_log:
+                        logger.info(
+                            "Loaded KV cache scaling factors from %s",
+                            self.server_args.quantization_param_path,
+                        )
                 else:
                     raise RuntimeError(
                         "Using FP8 KV cache and scaling factors provided but "
@@ -1021,7 +1029,8 @@ def init_cuda_graphs(self):
         )

     def apply_torch_tp(self):
-        logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
+        if self.should_log:
+            logger.info(f"Enabling torch tensor parallelism on {self.tp_size} devices.")
         from sglang.srt.model_parallel import tensor_parallel

         device_mesh = torch.distributed.init_device_mesh(self.device, (self.tp_size,))
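These hunks gate the logs on `self.should_log`, whose definition is outside the lines shown here. A minimal sketch of the assumed semantics, hypothetical rather than the PR's actual code:

    # Hypothetical sketch: the real attribute is set elsewhere in this PR.
    # Assumption: should_log simply means "this worker is TP rank 0", so each
    # adjustment message is printed once per server instead of once per rank.
    class ModelRunner:
        def __init__(self, tp_rank: int):
            self.tp_rank = tp_rank
            self.should_log = tp_rank == 0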
11 changes: 7 additions & 4 deletions python/sglang/srt/models/deepseek_v2.py
@@ -88,6 +88,7 @@
     get_int_env_var,
     is_cuda,
     is_hip,
+    log_info_on_rank0,
 )

 _is_hip = is_hip()
@@ -1487,8 +1488,9 @@ def determine_n_share_experts_fusion(
             ):
                 self.n_share_experts_fusion = 0
                 global_server_args_dict["n_share_experts_fusion"] = 0
-                logger.info(
-                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled."
+                log_info_on_rank0(
+                    logger,
+                    "Only Deepseek V3/R1 can use shared experts fusion optimization. Shared experts fusion optimization is disabled.",
                 )
             else:
                 assert (
@@ -1503,8 +1505,9 @@ def determine_n_share_experts_fusion(
             ):
                 self.n_share_experts_fusion = self.tp_size
                 global_server_args_dict["n_share_experts_fusion"] = self.tp_size
-                logger.info(
-                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled."
+                log_info_on_rank0(
+                    logger,
+                    "Deepseek V3/R1 with fp8 can use shared experts fusion optimization when SM version >=90. Shared experts fusion optimization is enabled.",
                 )

     def get_input_embeddings(self) -> nn.Embedding:
7 changes: 7 additions & 0 deletions python/sglang/srt/utils.py
@@ -2096,3 +2096,10 @@ def allocate(self, size: int):
         output = self._buffer[self._pointer : self._pointer + size]
         self._pointer += size
         return output
+
+
+def log_info_on_rank0(logger, msg):
+    from sglang.srt.distributed import get_tensor_model_parallel_rank
+
+    if get_tensor_model_parallel_rank() == 0:
+        logger.info(msg)
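A usage sketch for the new helper, mirroring the call sites in the hunks above. The one assumption is that sglang's tensor-model-parallel group is already initialized when it is called, since the rank is queried lazily inside the function; the local import presumably also avoids a circular dependency between utils and distributed. The path below is a placeholder:

    import logging

    from sglang.srt.utils import log_info_on_rank0

    logger = logging.getLogger(__name__)

    # Emitted by TP rank 0 only; all other ranks return without logging.
    log_info_on_rank0(logger, "Using MoE kernel config from /path/to/config.json.")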