49 changes: 49 additions & 0 deletions custom_ops/gpu_ops/machete/machete_mm.cu
@@ -86,3 +86,52 @@ std::vector<paddle::Tensor> MacheteMMKernel(
maybe_schedule);
return {out};
}

std::vector<std::vector<int64_t>> MacheteMMKernelInferShape(
std::vector<int64_t> const& A_shape,
std::vector<int64_t> const& B_shape,
paddle::optional<std::vector<int64_t>> const& maybe_group_scales_shape,
paddle::optional<std::vector<int64_t>> const& maybe_group_zeros_shape,
paddle::optional<std::vector<int64_t>> const& maybe_channel_scales_shape,
paddle::optional<std::vector<int64_t>> const& maybe_token_scales_shape,
std::string const& b_type_str,
std::string const& maybe_out_type_str,
int64_t const& maybe_group_size,
std::string const& maybe_schedule) {
return {{A_shape[0], B_shape[1]}};
}

std::vector<paddle::DataType> MacheteMMKernelInferDtype(
paddle::DataType const& A_dtype,
paddle::DataType const& B_dtype,
paddle::optional<paddle::DataType> const& maybe_group_scales_dtype,
paddle::optional<paddle::DataType> const& maybe_group_zeros_dtype,
paddle::optional<paddle::DataType> const& maybe_channel_scales_dtype,
paddle::optional<paddle::DataType> const& maybe_token_scales_dtype,
std::string const& b_type_str,
std::string const& maybe_out_type_str,
int64_t const& maybe_group_size,
std::string const& maybe_schedule) {

paddle::DataType maybe_out_type;
if (maybe_out_type_str == "float16") {
maybe_out_type = paddle::DataType::FLOAT16;
} else if (maybe_out_type_str == "bfloat16") {
maybe_out_type = paddle::DataType::BFLOAT16;
} else {
maybe_out_type = A_dtype;
}
return {maybe_out_type};
}

PD_BUILD_STATIC_OP(machete_mm)
.Inputs({"A", "B",
paddle::Optional("maybe_group_scales"),
paddle::Optional("maybe_group_zeros"),
paddle::Optional("maybe_channel_scales"),
paddle::Optional("maybe_token_scales")})
.Outputs({"out"})
.Attrs({"b_type_str:std::string", "maybe_out_type_str:std::string", "maybe_group_size:int64_t", "maybe_schedule:std::string"})
.SetKernelFn(PD_KERNEL(MacheteMMKernel))
.SetInferShapeFn(PD_INFER_SHAPE(MacheteMMKernelInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MacheteMMKernelInferDtype));
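
With the inference functions and PD_BUILD_STATIC_OP registration above, machete_mm becomes an ordinary Paddle custom op. The sketch below is illustrative only: the import path is the one FastDeploy typically uses for its compiled GPU ops, and the "uint4b8" type string and placeholder weight layout are assumptions, not values taken from this diff.

import paddle
from fastdeploy.model_executor.ops.gpu import machete_mm  # assumed module path

M, K, N = 16, 4096, 8192
A = paddle.ones([M, K], dtype="bfloat16")               # activations, shape [M, K]
B = paddle.zeros([K // 8, N], dtype="int32")            # placeholder for a prepacked wint4 weight
group_scales = paddle.ones([K // 128, N], dtype="bfloat16")

out = machete_mm(
    A, B,
    group_scales, None, None, None,  # maybe_group_scales / group_zeros / channel_scales / token_scales
    "uint4b8",                       # b_type_str (assumed type string)
    "bfloat16",                      # maybe_out_type_str -> BFLOAT16 via InferDtype above
    128,                             # maybe_group_size
    "",                              # maybe_schedule (empty string lets the kernel pick)
)
# InferShape above reports the result as [A_shape[0], B_shape[1]], i.e. [M, N].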
20 changes: 20 additions & 0 deletions custom_ops/gpu_ops/machete/machete_prepack_B.cu
@@ -71,3 +71,23 @@ std::vector<paddle::Tensor> MachetePrepackBKernel(
return {B_prepacked};

}

std::vector<std::vector<int64_t>> MachetePrepackBKernelInferShape(
std::vector<int64_t> const& B_shape, std::string const& a_type_str, std::string const& b_type_str,
std::string const& maybe_group_scales_type_str) {
return {{B_shape[1], B_shape[0]}};
}

std::vector<paddle::DataType> MachetePrepackBKernelInferDtype(
paddle::DataType const& B_dtype, std::string const& a_type_str, std::string const& b_type_str,
std::string const& maybe_group_scales_type_str) {
return {B_dtype};
}

PD_BUILD_STATIC_OP(machete_prepack_B)
.Inputs({"B"})
.Outputs({"B_prepacked"})
.Attrs({"a_type_str:std::string", "b_type_str:std::string", "maybe_group_scales_type_str:std::string"})
.SetKernelFn(PD_KERNEL(MachetePrepackBKernel))
.SetInferShapeFn(PD_INFER_SHAPE(MachetePrepackBKernelInferShape))
.SetInferDtypeFn(PD_INFER_DTYPE(MachetePrepackBKernelInferDtype));
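
machete_prepack_B gets the same static-op treatment; its inference functions report the input dtype unchanged and a shape with the two leading dims of B swapped ([B_shape[1], B_shape[0]]). A minimal sketch under the same assumptions as above (module path, type strings, packed layout):

import paddle
from fastdeploy.model_executor.ops.gpu import machete_prepack_B  # assumed module path

K, N = 4096, 8192
B_quant = paddle.zeros([K // 8, N], dtype="int32")  # placeholder for an already-quantized wint4 weight

B_prepacked = machete_prepack_B(
    B_quant,
    "bfloat16",   # a_type_str: activation dtype
    "uint4b8",    # b_type_str (assumed type string)
    "bfloat16",   # maybe_group_scales_type_str
)
# Same dtype as B_quant; in practice this call is wrapped by machete_quantize_and_pack below.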
2 changes: 1 addition & 1 deletion docs/usage/environment_variables.md
@@ -78,7 +78,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))),

# Whether to use Machete for wint4 dense GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"),
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
5 changes: 2 additions & 3 deletions docs/zh/usage/environment_variables.md
@@ -78,7 +78,7 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_ENABLE_MODEL_LOAD_CACHE": lambda: bool(int(os.getenv("FD_ENABLE_MODEL_LOAD_CACHE", "0"))),

# Whether to use the Machete backend for wint4 GEMM.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"),
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),

# Used to truncate the string inserted during thinking when reasoning in a model. (</think> for ernie4_5_vl, \n</think>\n\n for ernie_x1)
"FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR": lambda: os.getenv("FD_LIMIT_THINKING_CONTENT_TRUNCATE_STR", "</think>"),
@@ -87,6 +87,5 @@ environment_variables: dict[str, Callable[[], Any]] = {
"FD_CACHE_PROC_EXIT_TIMEOUT": lambda: int(os.getenv("FD_CACHE_PROC_EXIT_TIMEOUT", "600")),

# Consecutive-error threshold for lingering cache_transfer_manager processes
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),
}
"FD_CACHE_PROC_ERROR_COUNT": lambda: int(os.getenv("FD_CACHE_PROC_ERROR_COUNT", "10")),}
```
2 changes: 1 addition & 1 deletion fastdeploy/envs.py
@@ -55,7 +55,7 @@
# Set moe backend. "cutlass", "marlin" and "triton" can be set currently.
"FD_MOE_BACKEND": lambda: os.getenv("FD_MOE_BACKEND", "cutlass"),
# Whether to use Machete for wint4 dense gemm.
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "0"),
"FD_USE_MACHETE": lambda: os.getenv("FD_USE_MACHETE", "1"),
# Set whether to disable recompute the request when the KV cache is full.
"FD_DISABLED_RECOVER": lambda: os.getenv("FD_DISABLED_RECOVER", "0"),
# Set triton kernel JIT compilation directory.
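
This flips the default so Machete is used for wint4 dense GEMM whenever the compiled ops are available. Deployments that want the previous behaviour can opt out explicitly, which is exactly what the test updates below do:

# Opt out of the new default, either in the shell...
#   export FD_USE_MACHETE=0
# ...or in Python before FastDeploy reads its environment variables:
import os
os.environ["FD_USE_MACHETE"] = "0"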
@@ -167,7 +167,7 @@ def machete_quantize_and_pack(
atype,
quant_type,
scale_type,
)[0]
)
return w_q_prepack, w_s


@@ -194,5 +194,5 @@ def machete_wint_mm(
out_dtype, # out_dtype
group_size, # group_size
scheduler, # scheduler
)[0]
)
return out
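
With the static-op registration above, the underlying custom ops return a single tensor instead of a one-element list, so both wrappers drop the trailing [0]. A minimal usage sketch; the positional argument order, the "uint4b8" type string, and the group size of 128 (the value weight_only.py uses) are inferred from the surrounding diff rather than a documented signature:

import paddle
from fastdeploy.model_executor.layers.quantization.ops import (
    machete_quantize_and_pack,
    machete_wint_mm,
)

x = paddle.ones([16, 4096], dtype="bfloat16")    # activations, shape [M, K]
w = paddle.ones([4096, 8192], dtype="bfloat16")  # dense weight, shape [K, N]

# Quantize to wint4 with per-group scales and prepack into Machete's layout.
w_q_prepack, w_s = machete_quantize_and_pack(
    w,            # dense weight to quantize
    x.dtype,      # activation dtype
    "uint4b8",    # quant type (assumed string)
    "",           # scale type
    128,          # group size, matching the weight_only.py usage below
)

# Mixed-precision GEMM on the prepacked weight; the output tensor comes back
# directly, no [0] needed after this change.
out = machete_wint_mm(
    x,
    w_q_prepack,       # prepacked wint4 weight
    w_s,               # per-group scales
    None, None, None,  # unused zeros / channel scales / token scales
    "uint4b8",         # weight type string (assumed)
    "bfloat16",        # out_dtype
    128,               # group_size
    "",                # scheduler
)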
17 changes: 9 additions & 8 deletions fastdeploy/model_executor/layers/quantization/weight_only.py
@@ -38,10 +38,18 @@
else:
from paddle.nn.quant import weight_only_linear

from fastdeploy.model_executor.layers.quantization.ops.machete_mm import _ENABLE_MACHETE

from ..moe import FusedMoE
from ..utils import get_tensor
from .quant_base import QuantConfigBase, QuantMethodBase

if _ENABLE_MACHETE:
from fastdeploy.model_executor.layers.quantization.ops import (
machete_quantize_and_pack,
machete_wint_mm,
)


class WeightOnlyConfig(QuantConfigBase):
"""
@@ -154,14 +162,11 @@ def get_quant_method(self, layer) -> Optional[QuantMethodBase]:
else:
raise ValueError(f"Unsupported MOE backend {layer.use_method}")
else:
from fastdeploy.model_executor.layers.quantization.ops.machete_mm import (
_ENABLE_MACHETE,
)

if (
_ENABLE_MACHETE
and envs.FD_USE_MACHETE == "1"
and not layer.is_quantized
and not layer.fd_config.load_config.dynamic_load_weight
and layer.weight_shape[1]
and layer.weight_shape[1] % 128 == 0
):
@@ -406,9 +411,6 @@ def process_prequanted_weights(self, layer, state_dict) -> None:
raise NotImplementedError("Machete kernel doesn't support prequant. Please set FD_USE_MACHETE to 0.")

def process_loaded_weights(self, layer, weight) -> None:
from fastdeploy.model_executor.layers.quantization.ops import (
machete_quantize_and_pack,
)

# Using group scale for machete, group size is 128
quanted_weight_tensor, weight_scale_tensor = machete_quantize_and_pack(
@@ -421,7 +423,6 @@ def process_loaded_weights(self, layer, weight) -> None:
layer.weight_scale.set_value(weight_scale_tensor.astype(paddle.get_default_dtype()))

def apply(self, layer, x):
from fastdeploy.model_executor.layers.quantization.ops import machete_wint_mm

# Using group scale for machete, group size is 128
linear_out = machete_wint_mm(
2 changes: 2 additions & 0 deletions tests/ci_use/EB_VL_Lite/test_EB_VL_Lite_serving.py
@@ -34,6 +34,8 @@
# List of ports to clean before and after tests
PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT]

os.environ["FD_USE_MACHETE"] = "0"


def is_port_open(host: str, port: int, timeout=1.0):
"""
2 changes: 2 additions & 0 deletions tests/e2e/test_EB_VL_Lite_serving.py
@@ -35,6 +35,8 @@
# List of ports to clean before and after tests
PORTS_TO_CLEAN = [FD_API_PORT, FD_ENGINE_QUEUE_PORT, FD_METRICS_PORT, FD_CACHE_QUEUE_PORT]

os.environ["FD_USE_MACHETE"] = "0"


def is_port_open(host: str, port: int, timeout=1.0):
"""