.ci/scripts/test_llama.sh (25 changes: 19 additions & 6 deletions)

@@ -12,7 +12,7 @@ source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"
MODEL_NAME=$1 # stories110M.pt
BUILD_TOOL=$2 # buck2 or cmake
DTYPE=$3 # fp16 or fp32
MODE=${4:-"xnnpack"} # portable or xnnpack
MODE=${4:-"xnnpack+custom"} # portable or xnnpack+custom or xnnpack+custom+qe
if [[ $# -lt 4 ]]; then # Assuming 4 mandatory args
echo "Expecting atleast 4 positional arguments"
echo "Usage: [...]"
@@ -37,7 +37,7 @@ if [[ -z "${MODE:-}" ]]; then
exit 1
fi

if [[ "${MODE}" =~ xnnpack.* ]]; then
if [[ "${MODE}" =~ .*xnnpack.* ]]; then
XNNPACK=ON
else
XNNPACK=OFF
@@ -49,6 +49,12 @@ else
CUSTOM=OFF
fi

if [[ "${MODE}" =~ .*qe.* ]]; then
QE=ON
else
QE=OFF
fi
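
For context, the script now selects features by substring-matching the MODE argument, so the new CI modes compose the three flags. A minimal, hedged usage sketch (the positional arguments MODEL, BUILD_TOOL, DTYPE, MODE come from the header of this script; the CUSTOM check is collapsed in this hunk and is assumed to match "custom" the same way):

```bash
# Hypothetical local invocations of the updated script and the flags each
# MODE string would enable via substring matching.
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 portable           # XNNPACK=OFF CUSTOM=OFF QE=OFF
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom     # XNNPACK=ON  CUSTOM=ON  QE=OFF
bash .ci/scripts/test_llama.sh stories110M.pt cmake fp32 xnnpack+custom+qe  # XNNPACK=ON  CUSTOM=ON  QE=ON
```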

if [[ -z "${BUCK:-}" ]]; then
BUCK=buck2
fi
@@ -69,6 +75,7 @@ cmake_install_executorch_libraries() {
-DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=ON \
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_QUANTIZED="$QE" \
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out .
@@ -84,7 +91,7 @@ cmake_build_llama_runner() {
-DEXECUTORCH_BUILD_CUSTOM="$CUSTOM" \
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_XNNPACK="$XNNPACK" \
-DEXECUTORCH_BUILD_OPTIMIZED=ON \
-DEXECUTORCH_BUILD_QUANTIZED="$QE" \
Contributor: Can we not always build? Having so many build options feels like an additional burden for users. Maybe make it opt-in by default?

-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" \
-Bcmake-out/${dir} \
${dir}
@@ -126,9 +133,15 @@ fi
# Export model.
EXPORTED_MODEL_NAME="${EXPORTED_MODEL_NAME}.pte"
echo "Exporting ${EXPORTED_MODEL_NAME}"
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME}"
if [[ "${MODE}" == "xnnpack+kv+custom" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -kv --use_sdpa_with_kv_cache -X -qmode 8da4w -G 128"
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
if [[ "${XNNPACK}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} -X -qmode 8da4w -G 128"
Contributor: nit: does the += operator work here?

fi
if [[ "${CUSTOM}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} --use_sdpa_with_kv_cache"
fi
if [[ "${QE}" == "ON" ]]; then
EXPORT_ARGS="${EXPORT_ARGS} --embedding-quantize 8,1024"
Contributor: Thanks for adding tests!

fi
# Add dynamically linked library location
$PYTHON_EXECUTABLE -m examples.models.llama2.export_llama ${EXPORT_ARGS}
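
On the += nit above: bash does support appending to a string variable with +=, so the flag assembly could be written that way. A sketch under that assumption (the flags mirror the diff; note the leading space in each appended fragment):

```bash
# Equivalent flag assembly using bash's string += operator.
EXPORT_ARGS="-c stories110M.pt -p ${PARAMS} -d ${DTYPE} -n ${EXPORTED_MODEL_NAME} -kv"
if [[ "${XNNPACK}" == "ON" ]]; then
  EXPORT_ARGS+=" -X -qmode 8da4w -G 128"
fi
if [[ "${CUSTOM}" == "ON" ]]; then
  EXPORT_ARGS+=" --use_sdpa_with_kv_cache"
fi
if [[ "${QE}" == "ON" ]]; then
  EXPORT_ARGS+=" --embedding-quantize 8,1024"
fi
```
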
.ci/scripts/test_quantized_aot_lib.sh (2 changes: 1 addition & 1 deletion)

@@ -24,7 +24,7 @@ build_cmake_quantized_aot_lib() {
&& retry cmake -DBUCK2=buck2 \
-DCMAKE_BUILD_TYPE=Release \
-DCMAKE_PREFIX_PATH="$CMAKE_PREFIX_PATH" \
-DEXECUTORCH_BUILD_QUANTIZED=ON \
-DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON \
-DPYTHON_EXECUTABLE="$PYTHON_EXECUTABLE" ..)

cmake --build ${CMAKE_OUTPUT_DIR} -j4
.github/workflows/pull.yml (2 changes: 1 addition & 1 deletion)

@@ -90,7 +90,7 @@ jobs:
matrix:
dtype: [fp32]
build-tool: [buck2, cmake]
mode: [portable, xnnpack+kv+custom]
mode: [portable, xnnpack+custom, xnnpack+custom+qe]
fail-fast: false
with:
runner: linux.2xlarge
build/executorch-config.cmake (2 changes: 1 addition & 1 deletion)

@@ -38,7 +38,7 @@ set(lib_list
etdump bundled_program extension_data_loader ${FLATCCRT_LIB} mpsdelegate
qnn_executorch_backend portable_ops_lib extension_module xnnpack_backend
XNNPACK cpuinfo pthreadpool vulkan_backend optimized_kernels cpublas eigen_blas
optimized_ops_lib optimized_native_cpu_ops_lib
optimized_ops_lib optimized_native_cpu_ops_lib quantized_kernels quantized_ops_lib
)
foreach(lib ${lib_list})
# Name of the variable which stores result of the find_library search
examples/models/llama2/CMakeLists.txt (25 changes: 25 additions & 0 deletions)

@@ -44,6 +44,7 @@ set(EXECUTORCH_ROOT ${CMAKE_CURRENT_SOURCE_DIR}/../../..)
set(TORCH_ROOT ${EXECUTORCH_ROOT}/third-party/pytorch)

include(${EXECUTORCH_ROOT}/build/Utils.cmake)
include(${EXECUTORCH_ROOT}/build/Codegen.cmake)

if(NOT PYTHON_EXECUTABLE)
resolve_python_executable()
@@ -91,6 +92,7 @@ add_subdirectory(runner)
if(EXECUTORCH_USE_TIKTOKEN)
# find RE2 for tokenizer
set(ABSL_ENABLE_INSTALL ON)
set(ABSL_PROPAGATE_CXX_STD ON)
Contributor: Oh, we depend on abseil for tiktoken?

Author: Yeah, tiktoken -> re2 -> abseil.

Contributor: No tests exercising this path yet, right?

Author: Not yet.

set(_pic_flag
${CMAKE_POSITION_INDEPENDENT_CODE})
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
@@ -118,6 +120,29 @@ else()
target_link_options_shared_lib(portable_ops_lib)
endif()

if(EXECUTORCH_BUILD_QUANTIZED)
# TODO(larryliu0820): after we delete llama_quantized ops we should be able to reuse
# quantized_kernels and quantized_ops_lib directly.
merge_yaml(
FUNCTIONS_YAML ${CMAKE_CURRENT_SOURCE_DIR}/ops/quantized.yaml
FALLBACK_YAML ${EXECUTORCH_ROOT}/kernels/quantized/quantized.yaml
OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR})

gen_selected_ops("${CMAKE_CURRENT_BINARY_DIR}/merged.yaml" "" "")
generate_bindings_for_kernels(
FUNCTIONS_YAML ${CMAKE_CURRENT_BINARY_DIR}/merged.yaml)
message("Generated files ${gen_command_sources}")

# quantized_merge_ops_lib: Register quantized op kernels into the runtime
gen_operators_lib(
"quantized_merge_ops_lib"
KERNEL_LIBS quantized_kernels
DEPS executorch)
target_include_directories(quantized_merge_ops_lib PUBLIC ${_common_include_directories})
target_link_options_shared_lib(quantized_merge_ops_lib)
list(APPEND link_libraries quantized_kernels quantized_merge_ops_lib)
endif()

if(EXECUTORCH_BUILD_CUSTOM)
target_link_options_shared_lib(custom_ops)
list(APPEND link_libraries custom_ops)
examples/models/llama2/ops/quantized.yaml (4 changes: 2 additions & 2 deletions)

@@ -1,10 +1,10 @@
- func: llama_quantized::embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
kernel_name: torch::executor::quantized_embedding_byte_out

- func: llama_quantized::embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
- func: llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, int weight_quant_min, int weight_quant_max, Tensor indices, ScalarType? dtype=None, *, Tensor(a!) out) -> Tensor(a!)
variants: function
kernels:
- arg_meta: null
examples/models/llama2/ops/quantized_ops.py (22 changes: 14 additions & 8 deletions)

@@ -15,22 +15,22 @@
"llama_quantized", "DEF"
) # to not be confused with torch.ops.quantized.* ops.
quantized_lib.define(
"embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices) -> Tensor",
)

quantized_lib.define(
"embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, Tensor(a!) out) -> Tensor(a!)",
)

quantized_lib.define(
"embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None) -> Tensor",
)

quantized_lib.define(
"embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out(Tensor weight, Tensor weight_scales, Tensor? weight_zero_points, "
"int weight_quant_min, int weight_quant_max, Tensor indices, *, ScalarType? dtype=None, Tensor(a!) out) -> Tensor(a!)",
)

@@ -66,7 +66,9 @@ def embedding_byte_weight_checks(weight, weight_scales, weight_zero_points):
), f"Expecting weight_zero_points tensor to be None or have same number of rows as weights, but found {weight.size()} and {weight_zero_points.size()}"


@impl(quantized_lib, "embedding_byte", "CompositeExplicitAutograd")
@impl(
quantized_lib, "DEPRECATED_DO_NOT_USE_embedding_byte", "CompositeExplicitAutograd"
)
def embedding_byte(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -92,7 +94,7 @@ def embedding_byte(
return torch.ops.aten.embedding.default(weight, indices)


@impl_abstract("llama_quantized::embedding_byte.out")
@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.out")
def embedding_byte_out_meta(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -112,7 +114,11 @@ def embedding_byte_out_meta(
)


@impl(quantized_lib, "embedding_byte.dtype", "CompositeExplicitAutograd")
@impl(
quantized_lib,
"DEPRECATED_DO_NOT_USE_embedding_byte.dtype",
"CompositeExplicitAutograd",
)
def embedding_byte_dtype(
weight: torch.Tensor,
weight_scales: torch.Tensor,
@@ -140,7 +146,7 @@ def embedding_byte_dtype(
return torch.ops.aten.embedding.default(weight, indices)


@impl_abstract("llama_quantized::embedding_byte.dtype_out")
@impl_abstract("llama_quantized::DEPRECATED_DO_NOT_USE_embedding_byte.dtype_out")
def embedding_byte_dtype_out_meta(
weight: torch.Tensor,
weight_scales: torch.Tensor,
examples/models/llama2/quant_lib.py (2 changes: 1 addition & 1 deletion)

@@ -105,7 +105,7 @@ def check_embedding_byte_registered():
'Use `python -c "import torch as _; print(_.__path__)"` to find where torch package is installed.\n'
"Set that as TORCH_PACKAGE_DIR.\n"
"Then from root executorch dir do the following:\n"
"rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED=ON ..) && cmake --build . -j16\n"
"rm -rf cmake-out && mkdir cmake-out && (cd cmake-out && cmake -DBUCK2=<path-to-buck2> -DCMAKE_PREFIX_PATH=$TORCH_PACKAGE_DIR -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) && cmake --build . -j16\n"
'To find the location of the lib: find cmake-out -name "libquantized_ops_aot_lib*"\n'
"Then specify the said library via -s <path to libquantized_ops_aot_lib.so\n"
)
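
The updated error message spells out the rebuild steps; consolidated here into a hedged shell sketch (BUCK2_BIN is a placeholder, the torch path lookup is adapted from the message, and passing the library to export_llama via -s follows the message's instruction):

```bash
# Build the quantized AOT ops library with the renamed CMake option, locate
# the resulting shared library, and hand it to the exporter via -s.
TORCH_PACKAGE_DIR="$(python -c 'import torch; print(torch.__path__[0])')"
rm -rf cmake-out && mkdir cmake-out \
  && (cd cmake-out && cmake -DBUCK2="${BUCK2_BIN}" \
        -DCMAKE_PREFIX_PATH="${TORCH_PACKAGE_DIR}" \
        -DEXECUTORCH_BUILD_QUANTIZED_OPS_AOT=ON ..) \
  && cmake --build cmake-out -j16
QUANT_OPS_LIB="$(find cmake-out -name 'libquantized_ops_aot_lib*' | head -n 1)"
python -m examples.models.llama2.export_llama ${EXPORT_ARGS} -s "${QUANT_OPS_LIB}"
```
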
examples/models/llama2/quantize.py (2 changes: 1 addition & 1 deletion)

@@ -377,7 +377,7 @@ def __init__(

@torch.no_grad()
def forward(self, indices: torch.Tensor) -> torch.Tensor:
return torch.ops.llama_quantized.embedding_byte.dtype(
return torch.ops.llama_quantized.DEPRECATED_DO_NOT_USE_embedding_byte.dtype(
self.weight, self.scales, None, 0, 0, indices, dtype=self.dtype
)

kernels/quantized/CMakeLists.txt (5 changes: 4 additions & 1 deletion)

@@ -10,6 +10,9 @@
# ~~~
cmake_minimum_required(VERSION 3.19)

option(EXECUTORCH_BUILD_QUANTIZED_OPS_AOT
"Build the optimized ops library for AOT export usage" OFF)

set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
if(NOT CMAKE_CXX_STANDARD)
set(CMAKE_CXX_STANDARD 17)
@@ -49,7 +52,7 @@ message("Generated files ${gen_command_sources}")
# quantized_ops_aot_lib quantized_ops_lib but none of these is a common
# dependency of the other(s). This is not allowed by the Xcode "new build
# system".
if(NOT CMAKE_GENERATOR STREQUAL "Xcode")
if(NOT CMAKE_GENERATOR STREQUAL "Xcode" AND EXECUTORCH_BUILD_QUANTIZED_OPS_AOT)
# Build a AOT library to register quantized ops into PyTorch. This is a hack.
set(_quantized_sources
${_quantized_kernels__srcs}