diff --git a/.ci/scripts/test_llama.sh b/.ci/scripts/test_llama.sh
index 47394cbdba0..fa9f463b8c2 100644
--- a/.ci/scripts/test_llama.sh
+++ b/.ci/scripts/test_llama.sh
@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
 MODEL_NAME=$1 # stories110M.pt
 BUILD_TOOL=$2 # buck2 or cmake
 DTYPE=$3 # fp16 or fp32
-MODE=${4:-"xnnpack"} # portable or xnnpack
+MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
 if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
   echo "Expecting atleast 4 positional arguments"
   echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
   exit 1
 fi

-if [[ "${MODE}" =~ xnnpack.* ]]; then
+if [[ "${MODE}" =~ .*xnnpack.* ]]; then
   XNNPACK=ON
 else
   XNNPACK=OFF
@@ -49,6 +49,12 @@ else
   CUSTOM=OFF
 fi

+if [[ "${MODE}" =~ .*qe.* ]]; then
+  QE=ON
+else
+  QE=OFF
+fi
+
 if [[ -z "${BUCK:-}" ]]; then
   BUCK=buck2
 fi
@@ -84,7 +90,6 @@ cmake_build_llama_runner() {
         -DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
         -DEXECUTORCH_BUILD_OPTIMIZED=ON \
         -DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-        -DEXECUTORCH_BUILD_OPTIMIZED=ON \
         -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
         -Bcmake-out/${dir} \
         ${dir}
@@ -126,9 +131,15 @@ fi
 # Export model.
 EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
 echo "Exporting ${EXPORTED_MODEL_NAME}"
-EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
-if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
-  EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
+EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
+if [[ "${XNNPACK}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
+fi
+if [[ "${CUSTOM}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
+fi
+if [[ "${QE}" == "ON" ]]; then
+  EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
 fi
 # Add dynamically linked library location
 $PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
diff --git a/.ci/scripts/test_quantized_aot_lib.sh b/.ci/scripts/test_quantized_aot_lib.sh
index ed9c789c5e4..0ab9ceb81a7 100755
--- a/.ci/scripts/test_quantized_aot_lib.sh
+++ b/.ci/scripts/test_quantized_aot_lib.sh
@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
     && retry cmake -DBUCK2=buck2 \
           -DCMAKE_BUILD_TYPE=Release \
          -DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-          -DEXECUTORCH_BUILD_QUANTIZED=ON \
+          -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
          -DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)
  cmake --build ${CMAKE_OUTPUT_DIR} -j4
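A minimal sketch of how the updated CI entry point would be invoked with the new mode strings (model, build tool, and dtype are the values the script already documents; the surrounding CI environment is assumed to be set up as usual):

    bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom
    bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom+qe

The second form sets QE=ON, so export_llama additionally receives --embedding-quantize 8,1024.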
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index f2cc83693c7..f650fc79209 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -90,7 +90,7 @@ jobs:
       matrix:
         dtype: [fp32]
         build-tool: [buck2, cmake]
-        mode: [portable, xnnpack+kv+custom]
+        mode: [portable, xnnpack+custom, xnnpack+custom+qe]
       fail-fast: false
     with:
       runner: linux.2xlarge
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e963f0aaadd..03b87d5656d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -164,8 +164,6 @@ option(EXECUTORCH_BUILD_QNN "Build the Qualcomm backend" OFF)

 option(EXECUTORCH_BUILD_OPTIMIZED "Build the optimized kernels" OFF)

-option(EXECUTORCH_BUILD_QUANTIZED "Build the quantized kernels" OFF)
-
 option(EXECUTORCH_BUILD_SDK "Build the ExecuTorch SDK")

 option(EXECUTORCH_BUILD_SIZE_TEST "Build the size test" OFF)
@@ -413,9 +411,7 @@ if(EXECUTORCH_BUILD_OPTIMIZED)
   add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/optimized)
 endif()

-if(EXECUTORCH_BUILD_QUANTIZED)
-  add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)
-endif()
+add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/kernels/quantized)

 add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/configurations)

@@ -445,7 +441,7 @@ cmake_dependent_option(
   EXECUTORCH_BUILD_HOST_TARGETS OFF)
 if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
   # Baseline libraries that executor_runner will link against.
-  set(_executor_runner_libs executorch gflags)
+  set(_executor_runner_libs executorch gflags quantized_ops_lib)

   if(EXECUTORCH_BUILD_OPTIMIZED)
     list(APPEND _executor_runner_libs optimized_native_cpu_ops_lib)
@@ -453,11 +449,6 @@ if(EXECUTORCH_BUILD_EXECUTOR_RUNNER)
     list(APPEND _executor_runner_libs portable_ops_lib)
   endif()

-  # Generate lib to register quantized ops
-  if(EXECUTORCH_BUILD_QUANTIZED)
-    list(APPEND _executor_runner_libs quantized_ops_lib)
-  endif()
-
   add_executable(executor_runner ${_executor_runner__srcs})
   if(CMAKE_BUILD_TYPE STREQUAL "Release" AND NOT APPLE)
     target_link_options(executor_runner PRIVATE "LINKER:--gc-sections")
diff --git a/build/Utils.cmake b/build/Utils.cmake
index 39fa7317da8..7705a9add4c 100644
--- a/build/Utils.cmake
+++ b/build/Utils.cmake
@@ -74,8 +74,6 @@ function(executorch_print_configuration_summary)
     STATUS "  EXECUTORCH_BUILD_QNN               : ${EXECUTORCH_BUILD_QNN}")
   message(STATUS "  EXECUTORCH_BUILD_OPTIMIZED        : "
                  "${EXECUTORCH_BUILD_OPTIMIZED}")
-  message(STATUS "  EXECUTORCH_BUILD_QUANTIZED        : "
-                 "${EXECUTORCH_BUILD_QUANTIZED}")
   message(
     STATUS "  EXECUTORCH_BUILD_SDK               : ${EXECUTORCH_BUILD_SDK}")
   message(
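With EXECUTORCH_BUILD_QUANTIZED gone, the quantized kernels are configured unconditionally and executor_runner always links quantized_ops_lib. A sketch of a plain host configure that now picks them up without any extra flag (directory layout and the empty -DBUCK2= mirror the command quoted later in quant_lib.py):

    (rm -rf cmake-out && mkdir cmake-out && cd cmake-out && cmake -DBUCK2= -DCMAKE_BUILD_TYPE=Release ..)
    cmake --build cmake-out -j4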
diff --git a/build/build_apple_frameworks.sh b/build/build_apple_frameworks.sh
index 0b6adae0a7f..f2be98c5ce0 100755
--- a/build/build_apple_frameworks.sh
+++ b/build/build_apple_frameworks.sh
@@ -22,7 +22,7 @@ CUSTOM=OFF
 MPS=OFF
 OPTIMIZED=OFF
 PORTABLE=OFF
-QUANTIZED=OFF
+QUANTIZED=ON
 XNNPACK=OFF
 HEADERS_PATH="include"
 EXECUTORCH_FRAMEWORK="executorch:libexecutorch.a,libexecutorch_no_prim_ops.a,libextension_apple.a,libextension_data_loader.a,libextension_module.a:$HEADERS_PATH"
@@ -51,7 +51,6 @@ usage() {
   echo "  --mps        Include this flag to build the Metal Performance Shaders backend."
   echo "  --optimized  Include this flag to build the Optimized backend."
   echo "  --portable   Include this flag to build the Portable backend."
-  echo "  --quantized  Include this flag to build the Quantized backend."
   echo "  --xnnpack    Include this flag to build the XNNPACK backend."
   echo
   echo "Example:"
@@ -74,7 +73,6 @@ for arg in "$@"; do
       --mps) MPS=ON ;;
       --optimized) OPTIMIZED=ON ;;
       --portable) PORTABLE=ON ;;
-      --quantized) QUANTIZED=ON ;;
       --xnnpack) XNNPACK=ON ;;
       *)
       if [[ -z "$SOURCE_ROOT_DIR" ]]; then
@@ -137,7 +135,6 @@ cmake_build() {
     -DEXECUTORCH_BUILD_CUSTOM=$CUSTOM \
     -DEXECUTORCH_BUILD_MPS=$MPS \
     -DEXECUTORCH_BUILD_OPTIMIZED=$OPTIMIZED \
-    -DEXECUTORCH_BUILD_QUANTIZED=$QUANTIZED \
     -DEXECUTORCH_BUILD_XNNPACK=$XNNPACK \
     ${platform_flag:+-DIOS_PLATFORM=$platform_flag}
   cmake --build . --config $MODE
@@ -181,7 +178,7 @@ append_framework_flag "$CUSTOM" "$CUSTOM_FRAMEWORK"
 append_framework_flag "$MPS" "$MPS_FRAMEWORK"
 append_framework_flag "$OPTIMIZED" "$OPTIMIZED_FRAMEWORK"
 append_framework_flag "$PORTABLE" "$PORTABLE_FRAMEWORK"
-append_framework_flag "$QUANTIZED" "$QUANTIZED_FRAMEWORK"
+append_framework_flag "ON" "$QUANTIZED_FRAMEWORK"
 append_framework_flag "$XNNPACK" "$XNNPACK_FRAMEWORK"

 "$SOURCE_ROOT_DIR"/build/create_frameworks.sh "${FRAMEWORK_FLAGS[@]}"
diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake
index acf8b6779d5..60c8ebda5e6 100644
--- a/build/executorch-config.cmake
+++ b/build/executorch-config.cmake
@@ -38,7 +38,7 @@ set(lib_list
   etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
   qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
   XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
-  optimized_ops_lib optimized_native_cpu_ops_lib
+  optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
 )
 foreach(lib ${lib_list})
   # Name of the variable which stores result of the find_library search
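Since the quantized framework is now always bundled, a sketch of the frameworks build after this change, assuming the script is otherwise invoked as before (the --quantized flag simply no longer exists; only flags shown in the diff above are used):

    ./build/build_apple_frameworks.sh --optimized --xnnpack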
diff --git a/examples/models/llama2/CMakeLists.txt b/examples/models/llama2/CMakeLists.txt
index 0735b5331e8..fa3b7cff7e7 100644
--- a/examples/models/llama2/CMakeLists.txt
+++ b/examples/models/llama2/CMakeLists.txt
@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
 set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

 include(${EXECUTORCH_ROOT}/build/Utils.cmake)
+include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

 if(NOT PYTHON_EXECUTABLE)
   resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
 if(EXECUTORCH_USE_TIKTOKEN)
   # find RE2 for tokenizer
   set(ABSL_ENABLE_INSTALL ON)
+  set(ABSL_PROPAGATE_CXX_STD ON)
   set(_pic_flag ${CMAKE_POSITION_INDEPENDENT_CODE})
   set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,26 @@ else()
   target_link_options_shared_lib(portable_ops_lib)
 endif()

+# quantized ops yaml file operation
+merge_yaml(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
+  FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
+  OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})
+
+gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
+generate_bindings_for_kernels(
+  FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
+message("Generated files ${gen_command_sources}")
+
+# quantized_merge_ops_lib: Register quantized op kernels into the runtime
+gen_operators_lib(
+  "quantized_merge_ops_lib"
+  KERNEL_LIBS quantized_kernels
+  DEPS executorch)
+target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
+target_link_options_shared_lib(quantized_merge_ops_lib)
+list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
+
 if(EXECUTORCH_BUILD_CUSTOM)
   target_link_options_shared_lib(custom_ops)
   list(APPEND link_libraries custom_ops)
diff --git a/examples/models/llama2/ops/quantized.yaml b/examples/models/llama2/ops/quantized.yaml
index 8e435169e17..6708510908f 100644
--- a/examples/models/llama2/ops/quantized.yaml
+++ b/examples/models/llama2/ops/quantized.yaml
@@ -1,10 +1,10 @@
-- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
       kernel_name: torch::executor::quantized_embedding_byte_out

-- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
+- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
   variants: function
   kernels:
     - arg_meta: null
diff --git a/examples/models/llama2/ops/quantized_ops.py b/examples/models/llama2/ops/quantized_ops.py
index 5d13856442d..0ad80233626 100644
--- a/examples/models/llama2/ops/quantized_ops.py
+++ b/examples/models/llama2/ops/quantized_ops.py
@@ -15,22 +15,22 @@
     "llama_quantized", "DEF"
 )  # to not be confused with torch.ops.quantized.* ops.
 quantized_lib.define(
-    "embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
 )

 quantized_lib.define(
-    "embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
 )

 quantized_lib.define(
-    "embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
     "int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
 )
@@ -66,7 +66,9 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
     ), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"


-@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib, "DEPRECATED_DO_NOT_USE_embedding_byte", "CompositeExplicitAutograd"
+)
 def embedding_byte(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -92,7 +94,7 @@ def embedding_byte(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out")
 def embedding_byte_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -112,7 +114,11 @@ def embedding_byte_out_meta(
     )


-@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd")
+@impl(
+    quantized_lib,
+    "DEPRECATED_DO_NOT_USE_embedding_byte.dtype",
+    "CompositeExplicitAutograd",
+)
 def embedding_byte_dtype(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
@@ -140,7 +146,7 @@ def embedding_byte_dtype(
     return torch.ops.aten.embedding.default(weight, indices)


-@impl_abstract("llama_quantized::embedding_byte.dtype_out")
+@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out")
 def embedding_byte_dtype_out_meta(
     weight: torch.Tensor,
     weight_scales: torch.Tensor,
diff --git a/examples/models/llama2/quant_lib.py b/examples/models/llama2/quant_lib.py
index 226f10421b9..c7453248b7d 100644
--- a/examples/models/llama2/quant_lib.py
+++ b/examples/models/llama2/quant_lib.py
@@ -105,7 +105,7 @@ def check_embedding_byte_registered():
             'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
             "Set that as TORCH_PACKAGE_DIR.\n"
             "Then from root executorch dir do the following:\n"
-            "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
+            "rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n"
             'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
             "Then specify the said library via -s [...]
diff --git [...]
@@ [...] @@ def forward(self, indices: torch.Tensor) -> torch.Tensor:
-        return torch.ops.llama_quantized.embedding_byte.dtype(
+        return torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
             self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
         )
diff --git a/examples/models/llama2/runner/targets.bzl b/examples/models/llama2/runner/targets.bzl
index 5e1a324ce54..2a0dbe8dab0 100644
--- a/examples/models/llama2/runner/targets.bzl
+++ b/examples/models/llama2/runner/targets.bzl
@@ -4,7 +4,7 @@ def _get_operator_lib(aten = False):
     if aten:
         return ["//executorch/kernels/aten:generated_lib"]
     elif runtime.is_oss:
-        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops"]
+        return ["//executorch/kernels/portable:generated_lib", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]
     else:
         return ["//executorch/configurations:optimized_native_cpu_ops", "//executorch/examples/models/llama2/custom_ops:custom_ops", "//executorch/examples/models/llama2/ops:generated_lib"]

diff --git a/kernels/quantized/CMakeLists.txt b/kernels/quantized/CMakeLists.txt
index 7be9e73827f..b34ba75ae29 100644
--- a/kernels/quantized/CMakeLists.txt
+++ b/kernels/quantized/CMakeLists.txt
@@ -10,6 +10,9 @@
 # ~~~
 cmake_minimum_required(VERSION 3.19)

+option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT
+       "Build the optimized ops library for AOT export usage" OFF)
+
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 if(NOT CMAKE_CXX_STANDARD)
   set(CMAKE_CXX_STANDARD 17)
@@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}")
 # quantized_ops_aot_lib quantized_ops_lib but none of these is a common
 # dependency of the other(s). This is not allowed by the Xcode "new build
 # system".
-if(NOT CMAKE_GENERATOR STREQUAL "Xcode")
+if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT)
   # Build a AOT library to register quantized ops into PyTorch. This is a hack.
   set(_quantized_sources
       ${_quantized_kernels__srcs}
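A sketch of exercising the renamed AOT option, mirroring the command quoted in quant_lib.py above (TORCH_PACKAGE_DIR is assumed to point at the installed torch package, as that message describes):

    rm -rf cmake-out && mkdir cmake-out
    (cd cmake-out && cmake -DBUCK2= -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..)
    cmake --build cmake-out -j16
    find cmake-out -name "libquantized_ops_aot_lib*"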