From 227d4bfece6bb00f0365642ad1d7afd509079ba4 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Tue, 20 Aug 2024 16:34:12 -0700 Subject: [PATCH 01/12] Bug fixes Differential Revision: D60773448 Pull Request resolved: https://github.com/pytorch/ao/pull/717 --- ...se_lowbit_weight_1x4x16_f32_neondot-impl.h | 16 +- ...se_lowbit_weight_1x8x16_f32_neondot-impl.h | 30 ++- .../cpu/aarch64/reduction/compute_sum.cpp | 5 +- .../aarch64/reduction/find_min_and_max.cpp | 28 ++- .../kernels/cpu/aarch64/tests/CMakeLists.txt | 9 + .../cpu/aarch64/tests/build_and_run_tests.sh | 9 +- .../kernels/cpu/aarch64/tests/test_linear.cpp | 233 ++++++++---------- .../cpu/aarch64/tests/test_reduction.cpp | 56 +++++ .../linear/examples/build_and_run_examples.sh | 3 + .../cpu/linear/tests/test_linear_operator.cpp | 5 +- 10 files changed, 246 insertions(+), 148 deletions(-) create mode 100644 torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h index efcce4bb2e..e7fd56cf9b 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h @@ -218,7 +218,21 @@ void kernel_impl( if constexpr (has_clamp) { res = clamp(res, clamp_min, clamp_max); } - vst1q_f32(output + m_idx * output_m_stride + n_idx, res); + + // Store result + int remaining = n - n_idx; + float* store_loc = output + m_idx * output_m_stride + n_idx; + if (remaining >= 4) { + vst1q_f32(store_loc, res); + } else if (remaining >= 3) { + vst1_f32(store_loc, vget_low_f32(res)); + *(store_loc + 2) = res[2]; + } else if (remaining >= 2) { + vst1_f32(store_loc, vget_low_f32(res)); + } else { + *(store_loc) = res[0]; + } + } // n_idx activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr); } // m_idx diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h index 37f254c983..74ed288044 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h @@ -290,8 +290,34 @@ void kernel_impl( res_0123 = vec_clamp(res_0123, vec_min, vec_max); res_4567 = vec_clamp(res_4567, vec_min, vec_max); } - vst1q_f32(output + m_idx * output_m_stride + n_idx, res_0123); - vst1q_f32(output + m_idx * output_m_stride + n_idx + 4, res_4567); + + // Store result + int remaining = n - n_idx; + float* store_loc = output + m_idx * output_m_stride + n_idx; + if (remaining >= 8) { + vst1q_f32(store_loc, res_0123); + vst1q_f32(store_loc + 4, res_4567); + } else if (remaining >= 7) { + vst1q_f32(store_loc, res_0123); + vst1_f32(store_loc + 4, vget_low_f32(res_4567)); + *(store_loc + 6) = res_4567[2]; + } else if (remaining >= 6) { + vst1q_f32(store_loc, res_0123); + vst1_f32(store_loc + 4, vget_low_f32(res_4567)); + } else if (remaining >= 5) { + 
vst1q_f32(store_loc, res_0123); + *(store_loc + 4) = res_4567[0]; + } else if (remaining >= 4) { + vst1q_f32(store_loc, res_0123); + } else if (remaining >= 3) { + vst1_f32(store_loc, vget_low_f32(res_0123)); + *(store_loc + 2) = res_0123[2]; + } else if (remaining >= 2) { + vst1_f32(store_loc, vget_low_f32(res_0123)); + } else { + *store_loc = res_0123[0]; + } + } // n_idx activation_data_byte_ptr += (activation_ptr - activation_data_byte_ptr); } // m_idx diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp index ab1f26180d..4fe0cb2e8f 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp @@ -1,15 +1,18 @@ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. #include +#include int32_t torchao::kernels::cpu::aarch64::reduction::compute_sum( const int8_t* vals, int size) { + assert(size >= 1); + int32_t res = 0; int i = 0; #pragma unroll(4) - for (; i < size; i += 16) { + for (; i + 15 < size; i += 16) { int8x16_t vec_vals = vld1q_s8(vals + i); res += (int)(vaddlvq_s8(vec_vals)); } diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp index ed7ca01bb4..d30940ca60 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp @@ -1,23 +1,33 @@ // (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. #include +#include void torchao::kernels::cpu::aarch64::reduction::find_min_and_max( float32_t& min, float32_t& max, const float32_t* vals, int size) { - float32x4_t mins = vdupq_n_f32(0.0); - float32x4_t maxes = vdupq_n_f32(0.0); + assert(size > 0); + + // Needed in case size < 4 so we don't compare to + // uninitialized min/max values + min = vals[0]; + max = min; + int i = 0; - for (; i < size; i += 8) { - float32x4_t v1 = vld1q_f32(vals + i); - float32x4_t v2 = vld1q_f32(vals + i + 4); - mins = vminq_f32(v1, v2); - maxes = vmaxq_f32(v1, v2); + if (i + 3 < size) { + float32x4_t mins = vld1q_f32(vals + i); + float32x4_t maxes = mins; + i += 4; + for (; i + 3 < size; i += 4) { + float32x4_t v = vld1q_f32(vals + i); + mins = vminq_f32(mins, v); + maxes = vmaxq_f32(maxes, v); + } + min = vminvq_f32(mins); + max = vmaxvq_f32(maxes); } - min = vminvq_f32(mins); - max = vmaxvq_f32(maxes); // Remainder while (i < size) { diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt index 1b78f25b9c..4273c16785 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt @@ -35,6 +35,14 @@ target_link_libraries( dep ) +add_executable(test_reduction test_reduction.cpp) +target_link_libraries( + test_reduction + PRIVATE + GTest::gtest_main + dep +) + add_executable(test_bitpacking test_bitpacking.cpp) target_link_libraries( test_bitpacking @@ -61,6 +69,7 @@ target_link_libraries( include(GoogleTest) gtest_discover_tests(test_quantization) +gtest_discover_tests(test_reduction) gtest_discover_tests(test_bitpacking) gtest_discover_tests(test_linear) gtest_discover_tests(test_valpacking) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh 
b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh index 308455206c..6eddc520eb 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh +++ b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh @@ -7,7 +7,8 @@ cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} -S ${TORCHAO_LIBRARIES}/torchao/e cmake --build ${CMAKE_OUT} # Run - ${CMAKE_OUT}/test_quantization - ${CMAKE_OUT}/test_bitpacking - ${CMAKE_OUT}/test_linear - ${CMAKE_OUT}/test_valpacking +${CMAKE_OUT}/test_quantization +${CMAKE_OUT}/test_reduction +${CMAKE_OUT}/test_bitpacking +${CMAKE_OUT}/test_linear +${CMAKE_OUT}/test_valpacking diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp index 4b61c162e0..39db050c61 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp @@ -10,12 +10,11 @@ float kTol = 0.0001; template -void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot() { - int m = 7; - int k = 128; - int n = 13; - int group_size = 32; - +void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot( + int m, + int k, + int n, + int group_size) { auto test_case = torchao:: channelwise_8bit_activation_groupwise_lowbit_weight_test_case::generate( m, @@ -50,7 +49,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot test_case.weight_scales.data(), /*weight_zeros=*/test_case.weight_zeros.data()); - std::vector output(m * k); + std::vector output(m * n); kernel( output.data(), /*output_m_stride=*/n, @@ -72,70 +71,53 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot, Standard) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/128, /*n=*/13, /*group_size=*/32); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot, HasWeightZeros) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = true; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + true /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/128, /*n=*/13, /*group_size=*/32); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot, HasBias) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = true; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + true /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/128, /*n=*/13, /*group_size=*/32); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot, HasClamp) { - constexpr int weight_nbit = 4; - constexpr bool 
has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = true; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + true /*has_clamp*/>( + /*m=*/7, /*k=*/128, /*n=*/13, /*group_size=*/32); } template -void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot() { - int m = 7; - int k = 64; - int n = 13; - int group_size = 16; - +void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot( + int m, + int k, + int n, + int group_size) { auto test_case = torchao:: channelwise_8bit_activation_groupwise_lowbit_weight_test_case::generate( m, @@ -170,7 +152,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot test_case.weight_scales.data(), /*weight_zeros=*/test_case.weight_zeros.data()); - std::vector output(m * k); + std::vector output(m * n); kernel( output.data(), /*output_m_stride=*/n, @@ -192,70 +174,66 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot, Standard) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot, HasWeightZeros) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = true; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + true /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot, HasBias) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = true; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + true /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot, HasClamp) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = true; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + true /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } -template -void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot() { - int m = 7; - int k = 64; - int n = 13; - int group_size = 16; +TEST( + test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot, + 
NLessThan4) { + for (int n = 1; n < 4; n++) { + test_channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot< + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + true /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16); + } +} +template +void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot( + int m, + int k, + int n, + int group_size) { auto test_case = torchao:: channelwise_8bit_activation_groupwise_lowbit_weight_test_case::generate( m, @@ -290,7 +268,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot test_case.weight_scales.data(), /*weight_zeros=*/test_case.weight_zeros.data()); - std::vector output(m * k); + std::vector output(m * n); kernel( output.data(), /*output_m_stride=*/n, @@ -312,59 +290,56 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot, Standard) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot, HasWeightZeros) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = true; - constexpr bool has_bias = false; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + true /*has_weight_zeros*/, + false /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot, HasBias) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = true; - constexpr bool has_clamp = false; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + true /*has_bias*/, + false /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); } TEST( test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot, HasClamp) { - constexpr int weight_nbit = 4; - constexpr bool has_weight_zeros = false; - constexpr bool has_bias = false; - constexpr bool has_clamp = true; - test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot< - weight_nbit, - has_weight_zeros, - has_bias, - has_clamp>(); + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + true /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/13, /*group_size=*/16); +} + +TEST( + test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot, + NLessThan8) { + for (int n = 1; n < 8; n++) { + test_channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot< + 4 /*weight_nbit*/, + false /*has_weight_zeros*/, + false /*has_bias*/, + true /*has_clamp*/>( + /*m=*/7, /*k=*/64, /*n=*/n, /*group_size=*/16); + } } diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp 
b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp new file mode 100644 index 0000000000..72da9a6e62 --- /dev/null +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp @@ -0,0 +1,56 @@ +// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. + +#include +#include +#include +#include +#include +#include + +TEST(test_find_min_and_sum, SizeHasRemainderAfterDivideBy4) { + auto vals = torchao::get_random_vector(19, -1.0, 1.0); + float vmin, vmax; + torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + vmin, vmax, vals.data(), vals.size()); + + auto expected_vmin = *std::min_element(vals.begin(), vals.end()); + auto expected_vmax = *std::max_element(vals.begin(), vals.end()); + EXPECT_EQ(vmin, expected_vmin); + EXPECT_EQ(vmax, expected_vmax); +} + +TEST(test_find_min_and_sum, SizeSmallerThan4) { + auto vals = torchao::get_random_vector(3, -1.0, 1.0); + float vmin, vmax; + torchao::kernels::cpu::aarch64::reduction::find_min_and_max( + vmin, vmax, vals.data(), vals.size()); + + auto expected_vmin = *std::min_element(vals.begin(), vals.end()); + auto expected_vmax = *std::max_element(vals.begin(), vals.end()); + EXPECT_EQ(vmin, expected_vmin); + EXPECT_EQ(vmax, expected_vmax); +} + +TEST(test_compute_sum, ExpectedOutput) { + auto vals = torchao::get_random_lowbit_vector(/*size=*/19, /*int8*/ 3); + int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + (int8_t*)vals.data(), vals.size()); + int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); + EXPECT_EQ(sum, expected_sum); +} + +TEST(test_compute_sum, SizeHasRemainderAfterDivideBy16) { + auto vals = torchao::get_random_lowbit_vector(/*size=*/17, /*int8*/ 3); + int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + (int8_t*)vals.data(), vals.size()); + int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); + EXPECT_EQ(sum, expected_sum); +} + +TEST(test_compute_sum, SizeSmallerThan16) { + auto vals = torchao::get_random_lowbit_vector(/*size=*/3, /*int8*/ 3); + int sum = torchao::kernels::cpu::aarch64::reduction::compute_sum( + (int8_t*)vals.data(), vals.size()); + int expected_sum = std::accumulate(vals.begin(), vals.end(), 0); + EXPECT_EQ(sum, expected_sum); +} diff --git a/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh b/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh index 2d1083058c..56c6ebbf25 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh +++ b/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh @@ -4,8 +4,11 @@ SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../.. 
+export CMAKE_PREFIX_PATH="$(python -c 'import torch.utils; print(torch.utils.cmake_prefix_path)')" +echo "CMAKE_PREFIX_PATH: ${CMAKE_PREFIX_PATH}" export CMAKE_OUT=/tmp/cmake-out/torch_ao/examples cmake -DTORCHAO_LIBRARIES=${TORCHAO_LIBRARIES} \ + -DCMAKE_PREFIX_PATH=${CMAKE_PREFIX_PATH} \ -S ${TORCHAO_LIBRARIES}/torchao/experimental/kernels/cpu/linear/examples \ -B ${CMAKE_OUT} \ -DOpenMP_ROOT=$(brew --prefix libomp) diff --git a/torchao/experimental/kernels/cpu/linear/tests/test_linear_operator.cpp b/torchao/experimental/kernels/cpu/linear/tests/test_linear_operator.cpp index c6f0da0acf..5408e426bf 100644 --- a/torchao/experimental/kernels/cpu/linear/tests/test_linear_operator.cpp +++ b/torchao/experimental/kernels/cpu/linear/tests/test_linear_operator.cpp @@ -30,7 +30,8 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight( has_weight_zeros, has_bias, has_clamp); - float output[m * n]; + + auto output = std::vector(m * n); for (auto linear_scheduling_policy : {LinearTileSchedulingPolicy::single_mc_parallel_nc, @@ -82,7 +83,7 @@ void test_channelwise_8bit_activation_groupwise_lowbit_weight( linear_tiling_params, linear_scheduling_policy, activation_data_buffer.get(), - output, + output.data(), m, n, k, From d683ba33e466107a2f33a8784199b319f1314423 Mon Sep 17 00:00:00 2001 From: Scott Roy <161522778+metascroy@users.noreply.github.com> Date: Wed, 21 Aug 2024 10:33:50 -0700 Subject: [PATCH 02/12] Update licenses in torchao/experimental Differential Revision: D61548961 Pull Request resolved: https://github.com/pytorch/ao/pull/720 --- .../kernels/cpu/aarch64/benchmarks/CMakeLists.txt | 6 +++++- .../cpu/aarch64/benchmarks/benchmark_bitpacking.cpp | 6 +++++- .../cpu/aarch64/benchmarks/benchmark_linear.cpp | 6 +++++- .../aarch64/benchmarks/benchmark_quantization.cpp | 6 +++++- .../aarch64/benchmarks/build_and_run_benchmarks.sh | 6 ++++++ .../kernels/cpu/aarch64/bitpacking/bitpack.h | 6 +++++- .../kernels/cpu/aarch64/bitpacking/macro.h | 6 ++++++ .../kernels/cpu/aarch64/bitpacking/uint3.h | 6 +++++- .../kernels/cpu/aarch64/bitpacking/uint4.h | 6 +++++- ...roupwise_lowbit_weight_1x1x32_f32_neondot-impl.h | 6 +++++- ...roupwise_lowbit_weight_1x4x16_f32_neondot-impl.h | 6 +++++- ...roupwise_lowbit_weight_1x8x16_f32_neondot-impl.h | 6 +++++- ...ctivation_prepare_activation_data_1xk_f32-impl.h | 6 +++++- .../kernels/cpu/aarch64/linear/linear.h | 6 +++++- .../kernels/cpu/aarch64/quantization/quantize.cpp | 6 +++++- .../kernels/cpu/aarch64/quantization/quantize.h | 6 +++++- .../kernels/cpu/aarch64/reduction/compute_sum.cpp | 6 +++++- .../cpu/aarch64/reduction/find_min_and_max.cpp | 6 +++++- .../kernels/cpu/aarch64/reduction/reduction.h | 6 +++++- .../kernels/cpu/aarch64/tests/CMakeLists.txt | 6 +++++- .../cpu/aarch64/tests/build_and_run_tests.sh | 6 ++++++ .../kernels/cpu/aarch64/tests/test_bitpacking.cpp | 6 +++++- .../kernels/cpu/aarch64/tests/test_linear.cpp | 6 +++++- .../kernels/cpu/aarch64/tests/test_quantization.cpp | 6 +++++- .../kernels/cpu/aarch64/tests/test_reduction.cpp | 6 +++++- .../kernels/cpu/aarch64/tests/test_utils.h | 6 +++++- .../kernels/cpu/aarch64/tests/test_valpacking.cpp | 6 +++++- .../kernels/cpu/aarch64/valpacking/interleave.cpp | 6 +++++- .../kernels/cpu/aarch64/valpacking/valpack.h | 6 +++++- .../kernels/cpu/linear/benchmarks/CMakeLists.txt | 6 +++++- .../linear/benchmarks/benchmark_linear_operator.cpp | 6 +++++- .../linear/benchmarks/build_and_run_benchmarks.sh | 6 +++++- ...e_8bit_activation_groupwise_lowbit_weight-impl.h | 6 +++++- 
...elwise_8bit_activation_groupwise_lowbit_weight.h | 6 +++++- .../kernels/cpu/linear/examples/CMakeLists.txt | 6 +++++- ...tActivationGroupwiseLowbitWeightLinearOperator.h | 6 +++++- .../cpu/linear/examples/build_and_run_examples.sh | 6 +++++- .../linear/examples/separate_function_wrappers.cpp | 6 +++++- .../cpu/linear/examples/stateful_class_wrapper.cpp | 6 +++++- torchao/experimental/kernels/cpu/macro.h | 6 +++++- torchao/experimental/kernels/cpu/memory.h | 13 +++++++------ torchao/experimental/kernels/cpu/parallel.h | 6 +++++- 42 files changed, 215 insertions(+), 44 deletions(-) diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/benchmarks/CMakeLists.txt index 1c1a779dbe..5227ff1090 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/CMakeLists.txt @@ -1,4 +1,8 @@ -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.19) project(benchmarks) diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp index d03a3bfca8..926d475239 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_bitpacking.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp index 631bab42d4..8e3ec0516f 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_linear.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp index 942855c017..868f01648d 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/benchmark_quantization.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/benchmarks/build_and_run_benchmarks.sh b/torchao/experimental/kernels/cpu/aarch64/benchmarks/build_and_run_benchmarks.sh index 1c38bc39e2..08f8358365 100644 --- a/torchao/experimental/kernels/cpu/aarch64/benchmarks/build_and_run_benchmarks.sh +++ b/torchao/experimental/kernels/cpu/aarch64/benchmarks/build_and_run_benchmarks.sh @@ -1,4 +1,10 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + # Call script with sh build_and_run_benchmarks.sh {BENCHAMRK} SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h index 8f6fe4a5b7..fce5abba42 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/bitpack.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h index 6bd06e0dfe..4861edbee7 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/macro.h @@ -1,3 +1,9 @@ +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. + #pragma once #define TORCHAO_ALWAYS_INLINE __attribute__((always_inline)) diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h index b747148092..b76b146bad 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint3.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h index c30949d72b..d4d3f391f8 100644 --- a/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h +++ b/torchao/experimental/kernels/cpu/aarch64/bitpacking/uint4.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h index 626bff3487..19d4fe5bd0 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x1x32_f32_neondot-impl.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h index e7fd56cf9b..2fcd8d1310 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x4x16_f32_neondot-impl.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h index 74ed288044..4974e909d4 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_groupwise_lowbit_weight_1x8x16_f32_neondot-impl.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h index 7c2ba7d070..a67e2b0d14 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/channelwise_8bit_activation_prepare_activation_data_1xk_f32-impl.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h index 2607a2371a..cf3af21b5a 100644 --- a/torchao/experimental/kernels/cpu/aarch64/linear/linear.h +++ b/torchao/experimental/kernels/cpu/aarch64/linear/linear.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp index 3aed6d0192..523fd9360e 100644 --- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h index af49836596..a8214cc449 100644 --- a/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h +++ b/torchao/experimental/kernels/cpu/aarch64/quantization/quantize.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp index 4fe0cb2e8f..3aa7f4a5d3 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/compute_sum.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp index d30940ca60..1516f3ceff 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/find_min_and_max.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h index 25110f4f34..f027c85304 100644 --- a/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h +++ b/torchao/experimental/kernels/cpu/aarch64/reduction/reduction.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt index 4273c16785..8e281ed79e 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/aarch64/tests/CMakeLists.txt @@ -1,4 +1,8 @@ -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.19) project(tests) diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh index 6eddc520eb..ce8861ac65 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh +++ b/torchao/experimental/kernels/cpu/aarch64/tests/build_and_run_tests.sh @@ -1,4 +1,10 @@ #!/bin/bash +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../.. export CMAKE_OUT=/tmp/cmake-out/torch_ao/tests diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp index 9e530da8e5..28a46f8e06 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_bitpacking.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp index 39db050c61..22a2ed0f8f 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_linear.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp index 6fac44244a..74fc5ef527 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_quantization.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp index 72da9a6e62..16eb87fbb7 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_reduction.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h index 2782095877..4e5083d9ef 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_utils.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/aarch64/tests/test_valpacking.cpp b/torchao/experimental/kernels/cpu/aarch64/tests/test_valpacking.cpp index 5497a62f72..02be12a675 100644 --- a/torchao/experimental/kernels/cpu/aarch64/tests/test_valpacking.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/tests/test_valpacking.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp b/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp index ace1d1697a..8cbf036957 100644 --- a/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp +++ b/torchao/experimental/kernels/cpu/aarch64/valpacking/interleave.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#include #include diff --git a/torchao/experimental/kernels/cpu/aarch64/valpacking/valpack.h b/torchao/experimental/kernels/cpu/aarch64/valpacking/valpack.h index ecfb16ac83..383f717805 100644 --- a/torchao/experimental/kernels/cpu/aarch64/valpacking/valpack.h +++ b/torchao/experimental/kernels/cpu/aarch64/valpacking/valpack.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once diff --git a/torchao/experimental/kernels/cpu/linear/benchmarks/CMakeLists.txt b/torchao/experimental/kernels/cpu/linear/benchmarks/CMakeLists.txt index 72aa539d26..61e5eeae27 100644 --- a/torchao/experimental/kernels/cpu/linear/benchmarks/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/linear/benchmarks/CMakeLists.txt @@ -1,4 +1,8 @@ -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. cmake_minimum_required(VERSION 3.19) project(benchmarks) diff --git a/torchao/experimental/kernels/cpu/linear/benchmarks/benchmark_linear_operator.cpp b/torchao/experimental/kernels/cpu/linear/benchmarks/benchmark_linear_operator.cpp index 48df081f5b..ad6563eabe 100644 --- a/torchao/experimental/kernels/cpu/linear/benchmarks/benchmark_linear_operator.cpp +++ b/torchao/experimental/kernels/cpu/linear/benchmarks/benchmark_linear_operator.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/linear/benchmarks/build_and_run_benchmarks.sh b/torchao/experimental/kernels/cpu/linear/benchmarks/build_and_run_benchmarks.sh index a5451777e5..18da0e992d 100644 --- a/torchao/experimental/kernels/cpu/linear/benchmarks/build_and_run_benchmarks.sh +++ b/torchao/experimental/kernels/cpu/linear/benchmarks/build_and_run_benchmarks.sh @@ -1,5 +1,9 @@ #!/bin/bash -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. # Call script with sh build_and_run_benchmarks.sh {BENCHAMRK} diff --git a/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h b/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h index 177ea57729..6196e69fa9 100644 --- a/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h +++ b/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight-impl.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#pragma once #include diff --git a/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight.h b/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight.h index c006b45ce6..73ca5e073c 100644 --- a/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight.h +++ b/torchao/experimental/kernels/cpu/linear/channelwise_8bit_activation_groupwise_lowbit_weight.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once diff --git a/torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt b/torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt index a86005b73f..73314651c5 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt +++ b/torchao/experimental/kernels/cpu/linear/examples/CMakeLists.txt @@ -1,4 +1,8 @@ -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. project(examples) diff --git a/torchao/experimental/kernels/cpu/linear/examples/Channelwise8BitActivationGroupwiseLowbitWeightLinearOperator.h b/torchao/experimental/kernels/cpu/linear/examples/Channelwise8BitActivationGroupwiseLowbitWeightLinearOperator.h index 1bb9500bcb..06e3bfc433 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/Channelwise8BitActivationGroupwiseLowbitWeightLinearOperator.h +++ b/torchao/experimental/kernels/cpu/linear/examples/Channelwise8BitActivationGroupwiseLowbitWeightLinearOperator.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once #include diff --git a/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh b/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh index 56c6ebbf25..9c244e54cc 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh +++ b/torchao/experimental/kernels/cpu/linear/examples/build_and_run_examples.sh @@ -1,5 +1,9 @@ #!/bin/bash -# (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &> /dev/null && pwd) export TORCHAO_LIBRARIES=${SCRIPT_DIR}/../../../../../.. diff --git a/torchao/experimental/kernels/cpu/linear/examples/separate_function_wrappers.cpp b/torchao/experimental/kernels/cpu/linear/examples/separate_function_wrappers.cpp index 00783d4b37..ba3e5b29b3 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/separate_function_wrappers.cpp +++ b/torchao/experimental/kernels/cpu/linear/examples/separate_function_wrappers.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. 
and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/linear/examples/stateful_class_wrapper.cpp b/torchao/experimental/kernels/cpu/linear/examples/stateful_class_wrapper.cpp index 5f106fcee9..5fb24c683d 100644 --- a/torchao/experimental/kernels/cpu/linear/examples/stateful_class_wrapper.cpp +++ b/torchao/experimental/kernels/cpu/linear/examples/stateful_class_wrapper.cpp @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #include #include diff --git a/torchao/experimental/kernels/cpu/macro.h b/torchao/experimental/kernels/cpu/macro.h index 441c8b8cee..62c73f1f34 100644 --- a/torchao/experimental/kernels/cpu/macro.h +++ b/torchao/experimental/kernels/cpu/macro.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once diff --git a/torchao/experimental/kernels/cpu/memory.h b/torchao/experimental/kernels/cpu/memory.h index 55f0e6bf09..cf3220f0e3 100644 --- a/torchao/experimental/kernels/cpu/memory.h +++ b/torchao/experimental/kernels/cpu/memory.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. #pragma once @@ -9,12 +13,9 @@ namespace torchao { -using aligned_byte_ptr = - std::unique_ptr; +using aligned_byte_ptr = std::unique_ptr; -aligned_byte_ptr make_aligned_byte_ptr( - size_t alignment, - size_t size) { +aligned_byte_ptr make_aligned_byte_ptr(size_t alignment, size_t size) { // Adjust size to next multiple of alignment >= size size_t adjusted_size = ((size + alignment - 1) / alignment) * alignment; diff --git a/torchao/experimental/kernels/cpu/parallel.h b/torchao/experimental/kernels/cpu/parallel.h index b61223c760..b1fe4dea6d 100644 --- a/torchao/experimental/kernels/cpu/parallel.h +++ b/torchao/experimental/kernels/cpu/parallel.h @@ -1,4 +1,8 @@ -// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary. +// Copyright (c) Meta Platforms, Inc. and affiliates. +// All rights reserved. +// +// This source code is licensed under the license found in the +// LICENSE file in the root directory of this source tree. 
#pragma once From 9cc81e6385a51e415ef89397c3fe51f2a8d7453c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Thu, 22 Aug 2024 02:15:42 +0800 Subject: [PATCH 03/12] Enable `to(device=device_name)` for `Uintx` (#722) enable to device for Uintx Signed-off-by: yiliu30 --- test/dtypes/test_uintx.py | 15 +++++++++++++++ torchao/dtypes/uintx/Uintx.py | 29 +++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+) diff --git a/test/dtypes/test_uintx.py b/test/dtypes/test_uintx.py index 387e11e8b0..15b85942bb 100644 --- a/test/dtypes/test_uintx.py +++ b/test/dtypes/test_uintx.py @@ -36,6 +36,21 @@ def __init__(self, scale, device): def forward(self, x): return self.net(x) +@pytest.mark.parametrize("bit_width", bit_widths) +@pytest.mark.parametrize("group_size", group_sizes) +@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available") +@pytest.mark.skipif(not TORCH_VERSION_AT_LEAST_2_5, reason="only works with fix in the nightly build") +def test_uintx_quant_on_cpu_then_move_to_cuda(bit_width, group_size): + scale = 512 + fp16_mod_on_cpu = Linear16(scale, "cpu") + quantize_(fp16_mod_on_cpu, uintx_weight_only(bit_width, group_size=group_size)) + test_input_on_cpu = torch.randn(scale*2, dtype=torch.float16, device="cpu") + output_on_cpu = fp16_mod_on_cpu(test_input_on_cpu) + fp16_mod_on_cuda = fp16_mod_on_cpu.to("cuda") + test_input_on_cuda = test_input_on_cpu.to("cuda") + output_on_cuda = fp16_mod_on_cuda(test_input_on_cuda) + assert torch.allclose(output_on_cpu, output_on_cuda.cpu(), atol=1.0e-3), "The output of the model on CPU and CUDA should be close" + @pytest.mark.parametrize("bit_width", bit_widths) @pytest.mark.parametrize("group_size", group_sizes) @pytest.mark.parametrize("device", devices) diff --git a/torchao/dtypes/uintx/Uintx.py b/torchao/dtypes/uintx/Uintx.py index 9fdaab0f43..12187f8d82 100644 --- a/torchao/dtypes/uintx/Uintx.py +++ b/torchao/dtypes/uintx/Uintx.py @@ -105,6 +105,35 @@ def from_uint8(cls, int_data: torch.Tensor, bit_width, pack_dim: int = -1): return cls(shards, int_data.shape, bit_width, pack_dim) + def _get_to_kwargs(self, *args, **kwargs): + device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) + device = self.device if device is None else device + dtype = self.dtype if dtype is None else dtype + memory_format = ( + memory_format if memory_format is not None else torch.preserve_format + ) + kwargs = { + "device": device, + "dtype": dtype, + "memory_format": memory_format, + } + return kwargs + + def to(self, *args, **kwargs): + if "copy" in kwargs: + return super().to(*args, **kwargs) + kwargs = self._get_to_kwargs(*args, **kwargs) + if "device" in kwargs: + return self.__class__( + list(shard.to(kwargs["device"]) for shard in self.get_shards()), + self.packed_shape, + self.bit_width, + self.pack_dim, + ) + return super().to(*args, **kwargs) + + + implements = UintxTensor.implements From 5c0e060a1a204d51ce6bc09901bc6f0574d2e38c Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Wed, 21 Aug 2024 15:59:30 -0400 Subject: [PATCH 04/12] torchao-nightly -> --pre torchao in README (#723) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f7522fa1e7..0043df22da 100644 --- a/README.md +++ b/README.md @@ -148,7 +148,7 @@ pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # f Nightly Release ```Shell -pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124 +pip install --pre 
torchao --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124 ``` From source From cdb25a45aff7c1475e6922fa89d0c6193eed1a47 Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Thu, 22 Aug 2024 12:38:45 +0900 Subject: [PATCH 05/12] cast `local_scale_tensor` to fp32 for precompute of fp8 dynamic scaling (#713) * cast local_scale_tensor to fp32 Signed-off-by: Masaki Kozuki * test, but numerics not looking nice Signed-off-by: Masaki Kozuki * `Optional[torch.dtype]`, not `dtype | None` Signed-off-by: Masaki Kozuki * Update test/float8/test_fsdp2/test_fsdp2.py --------- Signed-off-by: Masaki Kozuki Co-authored-by: Mark Saroufim --- test/float8/test_fsdp2/test_fsdp2.py | 10 +++++++--- torchao/float8/fsdp_utils.py | 2 +- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/test/float8/test_fsdp2/test_fsdp2.py b/test/float8/test_fsdp2/test_fsdp2.py index 30aa735480..cf5bd8fa03 100644 --- a/test/float8/test_fsdp2/test_fsdp2.py +++ b/test/float8/test_fsdp2/test_fsdp2.py @@ -3,7 +3,7 @@ import pytest import threading import unittest -from typing import Any, List +from typing import Any, List, Optional from torchao.utils import TORCH_VERSION_AT_LEAST_2_5 @@ -59,7 +59,7 @@ def init_multi_module(self) -> nn.Module: self.broadcast_module(module) return module - def init_transformer(self, weight_tying: bool) -> nn.Module: + def init_transformer(self, weight_tying: bool, dtype: Optional[torch.dtype] = None) -> nn.Module: torch.manual_seed(42) args = ModelArgs( n_layers=3, @@ -70,6 +70,8 @@ def init_transformer(self, weight_tying: bool) -> nn.Module: vocab_size=32, ) module = Transformer(args).cuda() + if dtype is not None: + module = module.to(dtype=dtype) self.broadcast_module(module) return module @@ -96,6 +98,7 @@ def test_transformer_parity(self): ScalingType.DELAYED, ], "compile_transformer_block": [False, True], + "dtype": [torch.float32, torch.bfloat16], }, self._test_transformer_parity, ) @@ -106,6 +109,7 @@ def _test_transformer_parity( precompute: bool, scaling_type_weight: ScalingType, compile_transformer_block: bool, + dtype: Optional[torch.dtype] = None, ): if not enable_fsdp_float8_all_gather and precompute: return @@ -117,7 +121,7 @@ def _test_transformer_parity( # latter uses fp8 compute. With fp8 all-gather, FSDP would pre-cast to # fp8 for that tied weight, incorrectly using fp8 for the embedding. 
weight_tying = not enable_fsdp_float8_all_gather - module = self.init_transformer(weight_tying=weight_tying).cuda() + module = self.init_transformer(weight_tying=weight_tying, dtype=dtype) ref_module = copy.deepcopy(module) float8_linear_config1 = Float8LinearConfig( cast_config_weight=CastConfig(scaling_type=scaling_type_weight), diff --git a/torchao/float8/fsdp_utils.py b/torchao/float8/fsdp_utils.py index 3f10784f7f..bbb954eca1 100644 --- a/torchao/float8/fsdp_utils.py +++ b/torchao/float8/fsdp_utils.py @@ -67,7 +67,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None: scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor # Replicate if amax_tensor.dtype is torch.float16: scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max) - local_scale_tensor = scale_tensor.to_local() + local_scale_tensor = scale_tensor.to_local().to(dtype=torch.float32) for i, float8_linear in enumerate(float8_linears): float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i] From 99644e9a9645f8d330f9df92a70134cb51976077 Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Wed, 21 Aug 2024 22:24:11 -0700 Subject: [PATCH 06/12] Fix affine quantized tensor to device calls (#726) * Fix affine quantized tensor to device calls Summary: Fixes: https://github.com/pytorch/ao/issues/698 Also added `TorchAOBaseTensor` addressing part of https://github.com/pytorch/ao/issues/710 Test Plan: python test/dtypes/test_affine_quantized.py Reviewers: Subscribers: Tasks: Tags: --- test/dtypes/test_affine_quantized.py | 17 ++++++ torchao/dtypes/affine_quantized_tensor.py | 58 +++++++------------ .../linear_activation_quantized_tensor.py | 21 ++----- torchao/utils.py | 25 ++++++++ 4 files changed, 68 insertions(+), 53 deletions(-) diff --git a/test/dtypes/test_affine_quantized.py b/test/dtypes/test_affine_quantized.py index 5260c7d55d..d1e11ab822 100644 --- a/test/dtypes/test_affine_quantized.py +++ b/test/dtypes/test_affine_quantized.py @@ -51,6 +51,23 @@ def test_weights_only(self): else: _ = torch.load(f, weights_only=False) + @unittest.skipIf(not torch.cuda.is_available(), "Need CUDA available") + def test_to_device(self): + from torchao.quantization import quantize_ + for apply_quant in [int8_weight_only(), int8_dynamic_activation_int4_weight(), int8_dynamic_activation_int8_weight()]: + l = torch.nn.Linear(128, 256, dtype=torch.bfloat16) + ql = apply_quant(l) + ql.to("cuda") + + l = torch.nn.Linear(128, 256, dtype=torch.bfloat16) + ql = apply_quant(l) + ql.to(device="cuda") + + l = torch.nn.Linear(128, 256, dtype=torch.bfloat16) + ql = apply_quant(l) + ql.cuda() + + if __name__ == "__main__": run_tests() diff --git a/torchao/dtypes/affine_quantized_tensor.py b/torchao/dtypes/affine_quantized_tensor.py index ef96f11e71..2f0b113198 100644 --- a/torchao/dtypes/affine_quantized_tensor.py +++ b/torchao/dtypes/affine_quantized_tensor.py @@ -16,7 +16,6 @@ pack_tinygemm_scales_and_zeros, ) from torch.utils._python_dispatch import return_and_correct_aliasing -from torchao.utils import find_multiple from torchao.dtypes.utils import ( _implements, _dispatch__torch_function__, @@ -29,14 +28,18 @@ ) from torch.utils._python_dispatch import is_traceable_wrapper_subclass from dataclasses import dataclass -from torchao.utils import TORCH_VERSION_AT_LEAST_2_5 +from torchao.utils import ( + find_multiple, + TorchAOBaseTensor, + TORCH_VERSION_AT_LEAST_2_5, +) aten = torch.ops.aten ############################### # Base Layout Tensor Subclass # ############################### 
-class AQTLayout(torch.Tensor): +class AQTLayout(TorchAOBaseTensor): """ Base class for the layout tensor for `AffineQuantizedTensor` """ @@ -61,19 +64,6 @@ def __repr__(self): layout_type = self.get_layout_type() return f"{self.__class__.__name__}(int_data={int_data}, scale={scale}, zero_point={zero_point}, layout_type={layout_type})" - def _get_to_kwargs(self, *args, **kwargs): - device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) - device = self.device if device is None else device - dtype = self.dtype if dtype is None else dtype - memory_format = ( - memory_format if memory_format is not None else torch.preserve_format - ) - kwargs = { - "device": device, - "dtype": dtype, - "memory_format": memory_format, - } - return kwargs ############################## # Tensor Subclass Definition # @@ -83,7 +73,7 @@ def _get_to_kwargs(self, *args, **kwargs): def _register_quantized_linear_dispatch(dispatch_condition, impl): _QLINEAR_DISPATCH_TABLE[dispatch_condition] = impl -class AffineQuantizedTensor(torch.Tensor): +class AffineQuantizedTensor(TorchAOBaseTensor): """ Affine quantized tensor subclass. Affine quantization means we quantize the floating point tensor with an affine transformation: quantized_tensor = float_tensor / scale + zero_point @@ -223,7 +213,7 @@ def from_float( input_float = layout_type.pre_process(input_float) scale, zero_point = choose_qparams_affine(input_float, mapping_type, block_size, target_dtype, quant_min, quant_max, eps, scale_dtype, zero_point_dtype, preserve_zero, zero_point_domain) int_data = quantize_affine(input_float, block_size, scale, zero_point, target_dtype, quant_min, quant_max, zero_point_domain) - + int_data = layout_type.post_process(int_data) layout_tensor_ctr = get_layout_tensor_constructor(type(layout_type)) layout_tensor = layout_tensor_ctr(int_data, scale, zero_point, layout_type) @@ -273,25 +263,9 @@ def from_float_static( def layout_type(self) -> LayoutType: return self.layout_tensor.layout_type - def _get_to_kwargs(self, *args, **kwargs): - device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) - device = self.device if device is None else device - dtype = self.dtype if dtype is None else dtype - memory_format = ( - memory_format if memory_format is not None else torch.preserve_format - ) - kwargs = { - "device": device, - "dtype": dtype, - "memory_format": memory_format, - } - return kwargs - def to(self, *args, **kwargs): kwargs = self._get_to_kwargs(*args, **kwargs) device = kwargs.pop("device") - # not supported yet - kwargs.pop("memory_format") return self.__class__( self.layout_tensor.to(device), self.block_size, @@ -446,6 +420,11 @@ def __torch_dispatch__(cls, func, types, args, kwargs): func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) ) + if func is aten.clone.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) + ) + if func is aten.t.default: tensor = args[0] new = tensor.__class__( @@ -576,10 +555,10 @@ def from_plain( scale: torch.Tensor, zero_point: torch.Tensor, layout_type: LayoutType - ): - + ): + assert isinstance(layout_type, TensorCoreTiledLayoutType) - + if TORCH_VERSION_AT_LEAST_2_5: int_data = (int_data[::, ::2] << 4 | int_data[::, 1::2]).to(torch.uint8) assert int_data.dtype == torch.uint8, "torch.ops.aten._convert_weight_to_int4pack in torch 2.5 expects `uint8` dtype" @@ -617,6 +596,11 @@ def __torch_dispatch__(cls, func, types, args, kwargs): func, args, kwargs, args[0]._apply_fn_to_data(torch.detach) ) + if func 
is aten.clone.default: + return return_and_correct_aliasing( + func, args, kwargs, args[0]._apply_fn_to_data(torch.clone) + ) + if func is aten.t.default: """we don't need to repack the weight and just rely on external shape being changed and record the status of transpose/no-transpose diff --git a/torchao/quantization/linear_activation_quantized_tensor.py b/torchao/quantization/linear_activation_quantized_tensor.py index d3faa5d4c3..c8ecaa60d1 100644 --- a/torchao/quantization/linear_activation_quantized_tensor.py +++ b/torchao/quantization/linear_activation_quantized_tensor.py @@ -6,7 +6,10 @@ ) from typing import Callable from torch.utils._python_dispatch import return_and_correct_aliasing -from torchao.utils import TORCH_VERSION_AT_LEAST_2_5 +from torchao.utils import ( + TorchAOBaseTensor, + TORCH_VERSION_AT_LEAST_2_5, +) __all__ = [ "LinearActivationQuantizedTensor", @@ -15,7 +18,7 @@ aten = torch.ops.aten -class LinearActivationQuantizedTensor(torch.Tensor): +class LinearActivationQuantizedTensor(TorchAOBaseTensor): """ Applies activation quantization for linear operator """ @@ -74,20 +77,6 @@ def _apply_fn_to_data(self, fn): self.input_quant_func, ) - def _get_to_kwargs(self, *args, **kwargs): - device, dtype, _, memory_format = torch._C._nn._parse_to(*args, **kwargs) - device = self.device if device is None else device - dtype = self.dtype if dtype is None else dtype - memory_format = ( - memory_format if memory_format is not None else torch.preserve_format - ) - kwargs = { - "device": device, - "dtype": dtype, - "memory_format": memory_format, - } - return kwargs - def to(self, *args, **kwargs): kwargs = self._get_to_kwargs(*args, **kwargs) return self.__class__( diff --git a/torchao/utils.py b/torchao/utils.py index 61b1f5d426..9239fc999f 100644 --- a/torchao/utils.py +++ b/torchao/utils.py @@ -20,6 +20,7 @@ "_register_custom_op", "get_model_size_in_bytes", "unwrap_tensor_subclass", + "TorchAOBaseTensor", "TORCH_VERSION_AT_LEAST_2_2", "TORCH_VERSION_AT_LEAST_2_3", "TORCH_VERSION_AT_LEAST_2_4", @@ -281,6 +282,30 @@ def unwrap_tensor_subclass(model, filter_fn=None): unwrap_tensor_subclass(child) return model +class TorchAOBaseTensor(torch.Tensor): + """A util tensor subclass that provides commonly used functions + """ + def _get_to_kwargs(self, *args, **kwargs): + # `torch._C._nn._parse_to` can't handle `layout` argument + for arg in args: + if isinstance(arg, torch.layout): + args.remove(arg) + if "layout" in kwargs: + kwargs.pop("layout") + # ignoring `non_blocking` and `memory_format` args since these are not + # very useful for most of the tensor subclasses + # if in the future there are use cases that need these, we'd recommend + # to override `_get_to_kwargs` and return these args + device, dtype, _, _ = torch._C._nn._parse_to(*args, **kwargs) + device = self.device if device is None else device + dtype = self.dtype if dtype is None else dtype + kwargs = { + "device": device, + "dtype": dtype, + } + return kwargs + + def parse_version(version_string): # Extract just the X.Y.Z part from the version string From 8002099e48a511834de614224cced90e2a9e4edd Mon Sep 17 00:00:00 2001 From: Thien Tran Date: Fri, 23 Aug 2024 00:41:41 +0800 Subject: [PATCH 07/12] [Llama] Make torchao's Llama trainable (#728) * initial change * skip safetensors weights * update quantized training script * add activation checkpointing --- .../quantized_training/pretrain_llama2.py | 38 +++++++++++++------ scripts/download.py | 2 +- torchao/_models/llama/model.py | 24 ++++++++---- 3 files changed, 43 
insertions(+), 21 deletions(-) diff --git a/benchmarks/quantized_training/pretrain_llama2.py b/benchmarks/quantized_training/pretrain_llama2.py index 344a3a71af..de3ed04e8f 100644 --- a/benchmarks/quantized_training/pretrain_llama2.py +++ b/benchmarks/quantized_training/pretrain_llama2.py @@ -1,5 +1,5 @@ # pre-train a mini Llama2 on TinyStories with INT8 quantized training -# pip install transformers sentencepiece wandb +# pip install huggingface_hub sentencepiece wandb # # BF16 baseline: python benchmarks/quantized_training/pretrain_llama2.py --seed 2024 --n_steps 10_000 --compile # INT8 QT: python benchmarks/quantized_training/pretrain_llama2.py --seed 2024 --n_steps 10_000 --compile --quantize int8_weight_only @@ -9,21 +9,33 @@ os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" import argparse +from functools import partial from pathlib import Path import numpy as np import torch import wandb +from torch.utils.checkpoint import checkpoint from tqdm import tqdm -from transformers import LlamaConfig, LlamaForCausalLM +from torchao._models.llama.model import ModelArgs, Transformer from torchao.prototype import low_bit_optim from torchao.prototype.quantized_training import int8_weight_only_quantized_training from torchao.quantization.quant_api import quantize_ -def get_loss(model: LlamaForCausalLM, batch: torch.Tensor): - return model(batch, labels=batch).loss +# hack from fairseq +# https://github.com/facebookresearch/fairseq/blob/920a548ca770fb1a951f7f4289b4d3a0c1bc226f/fairseq/modules/checkpoint_activations.py +def enable_activation_checkpointing(m: torch.nn.Module): + assert not hasattr(m, "_forward") + m._forward = m.forward + m.forward = partial(checkpoint, m.forward) + + +def get_loss(model: Transformer, batch: torch.Tensor): + logits = model(batch)[:, :-1].flatten(0, 1) + labels = batch[:, 1:].flatten() + return torch.nn.functional.cross_entropy(logits, labels) def get_tinystories(): @@ -91,17 +103,19 @@ def get_tinystories(): if args.seed is not None: torch.manual_seed(args.seed) - config = LlamaConfig( - hidden_size=args.d_model, + config = ModelArgs( + block_size=args.seq_len, + n_layer=args.depth, + n_head=args.d_model // args.head_dim, + dim=args.d_model, intermediate_size=args.ffn_size, - num_hidden_layers=args.depth, - num_attention_heads=args.d_model // args.head_dim, - max_position_embeddings=args.seq_len, - use_cache=False, ) - model = LlamaForCausalLM(config).bfloat16().cuda() + model = Transformer(config).bfloat16().cuda() + with torch.device("cuda"): + model.setup_caches(args.batch_size, args.seq_len, training=True) if args.activation_checkpointing: - model.gradient_checkpointing_enable() + for layer in model.layers: + enable_activation_checkpointing(layer) if args.quantize == "int8_weight_only": quantize_(model, int8_weight_only_quantized_training(), set_inductor_config=False) elif args.quantize is not None: diff --git a/scripts/download.py b/scripts/download.py index 571e03adb5..3fc89e7126 100644 --- a/scripts/download.py +++ b/scripts/download.py @@ -15,7 +15,7 @@ def hf_download(repo_id: Optional[str] = None, hf_token: Optional[str] = None) - from huggingface_hub import snapshot_download os.makedirs(f"checkpoints/{repo_id}", exist_ok=True) try: - snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token) + snapshot_download(repo_id, local_dir=f"checkpoints/{repo_id}", local_dir_use_symlinks=False, token=hf_token, ignore_patterns="*.safetensors") except HTTPError as e: if e.response.status_code == 401: 
print("You need to pass a valid `--hf_token=...` to download private checkpoints.") diff --git a/torchao/_models/llama/model.py b/torchao/_models/llama/model.py index 58a1709642..eaaccd7a53 100644 --- a/torchao/_models/llama/model.py +++ b/torchao/_models/llama/model.py @@ -150,7 +150,7 @@ def __init__(self, config: ModelArgs) -> None: self.max_batch_size = -1 self.max_seq_length = -1 - def setup_caches(self, max_batch_size, max_seq_length): + def setup_caches(self, max_batch_size, max_seq_length, training: bool = False): if self.max_seq_length >= max_seq_length and self.max_batch_size >= max_batch_size: return head_dim = self.config.dim // self.config.n_head @@ -163,16 +163,21 @@ def setup_caches(self, max_batch_size, max_seq_length): dtype = self.output.scales.dtype elif hasattr(self.output, "scales_and_zeros"): dtype = self.output.scales_and_zeros.dtype - for b in self.layers: - b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype) + if not training: + for b in self.layers: + b.attention.kv_cache = KVCache(max_batch_size, max_seq_length, self.config.n_local_heads, head_dim, dtype) self.freqs_cis = precompute_freqs_cis(self.config.block_size, self.config.dim // self.config.n_head, self.config.rope_base, dtype) self.causal_mask = torch.tril(torch.ones(self.max_seq_length, self.max_seq_length, dtype=torch.bool)) def forward(self, idx: Tensor, input_pos: Optional[Tensor] = None) -> Tensor: assert self.freqs_cis is not None, "Caches must be initialized first" - mask = self.causal_mask[None, None, input_pos] - freqs_cis = self.freqs_cis[input_pos] + if input_pos is not None: + mask = self.causal_mask[None, None, input_pos] + freqs_cis = self.freqs_cis[input_pos] + else: + mask = None + freqs_cis = self.freqs_cis[:idx.shape[1]] x = self.tok_embeddings(idx) for i, layer in enumerate(self.layers): @@ -194,7 +199,7 @@ def __init__(self, config: ModelArgs) -> None: self.ffn_norm = RMSNorm(config.dim, config.norm_eps) self.attention_norm = RMSNorm(config.dim, config.norm_eps) - def forward(self, x: Tensor, input_pos: Tensor, freqs_cis: Tensor, mask: Tensor) -> Tensor: + def forward(self, x: Tensor, input_pos: Optional[Tensor], freqs_cis: Tensor, mask: Optional[Tensor]) -> Tensor: h = x + self.attention(self.attention_norm(x), freqs_cis, mask, input_pos) out = h + self.feed_forward(self.ffn_norm(h)) return out @@ -224,7 +229,7 @@ def load_hook(self, state_dict, prefix, *args): wv = state_dict.pop(prefix + "wv.weight") state_dict[prefix + "wqkv.weight"] = torch.cat([wq, wk, wv]) - def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optional[Tensor] = None) -> Tensor: + def forward(self, x: Tensor, freqs_cis: Tensor, mask: Optional[Tensor], input_pos: Optional[Tensor] = None) -> Tensor: bsz, seqlen, _ = x.shape kv_size = self.n_local_heads * self.head_dim @@ -244,7 +249,10 @@ def forward(self, x: Tensor, freqs_cis: Tensor, mask: Tensor, input_pos: Optiona k = k.repeat_interleave(self.n_head // self.n_local_heads, dim=1) v = v.repeat_interleave(self.n_head // self.n_local_heads, dim=1) - y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + if mask is not None: + y = F.scaled_dot_product_attention(q, k, v, attn_mask=mask, dropout_p=0.0) + else: + y = F.scaled_dot_product_attention(q, k, v, dropout_p=0.0, is_causal=True) y = y.transpose(1, 2).contiguous().view(bsz, seqlen, self.dim) From c0b0731d819d621698673e10d33ae585de56f1df Mon Sep 17 00:00:00 2001 From: Masaki Kozuki Date: Fri, 23 Aug 2024 02:59:58 
+0900 Subject: [PATCH 08/12] Specify output dtype to `torch.float32` in `_foreach_norm` (#727) one less kernel Signed-off-by: Masaki Kozuki --- torchao/float8/fsdp_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/torchao/float8/fsdp_utils.py b/torchao/float8/fsdp_utils.py index bbb954eca1..81859de4bb 100644 --- a/torchao/float8/fsdp_utils.py +++ b/torchao/float8/fsdp_utils.py @@ -59,7 +59,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None: return # inf-norm is equivalent to max(abs(w)) - max_weights = torch._foreach_norm(weights, ord=math.inf) # Partial + max_weights = torch._foreach_norm(weights, ord=math.inf, dtype=torch.float32) # Partial amax_tensor = torch.stack(max_weights) # Partial # clamp is dispatched through DTensor # it will issue a single all-reduce @@ -67,7 +67,7 @@ def precompute_float8_dynamic_scale_for_fsdp(module: nn.Module) -> None: scale_tensor = torch.finfo(torch.float8_e4m3fn).max / amax_tensor # Replicate if amax_tensor.dtype is torch.float16: scale_tensor = torch.clamp(scale_tensor, max=torch.finfo(torch.float16).max) - local_scale_tensor = scale_tensor.to_local().to(dtype=torch.float32) + local_scale_tensor = scale_tensor.to_local() for i, float8_linear in enumerate(float8_linears): float8_linear.weight._local_tensor._precomputed_scale = local_scale_tensor[i] From b09307abbe0c46a282a14bcf69f1173d18eaba1c Mon Sep 17 00:00:00 2001 From: Vasiliy Kuznetsov Date: Thu, 22 Aug 2024 11:36:11 -0700 Subject: [PATCH 09/12] f8 roofline: make utils reusable (#731) Summary: Moves the float8 roofline gemm and memory traffic utils to `torchao.float8.roofline_utils`, so they can be reused in other places. For now, I want to use this in ao_benchmarks. Test Plan: ``` python benchmarks/float8/float8_roofline.py ~/local/tmp/test.txt --gemm_time_strategy roofline ``` Reviewers: Subscribers: Tasks: Tags: --- benchmarks/float8/float8_roofline.py | 222 ++------------------------- torchao/float8/roofline_utils.py | 220 ++++++++++++++++++++++++++ 2 files changed, 230 insertions(+), 212 deletions(-) create mode 100644 torchao/float8/roofline_utils.py diff --git a/benchmarks/float8/float8_roofline.py b/benchmarks/float8/float8_roofline.py index a5a05f0414..91d344ac19 100644 --- a/benchmarks/float8/float8_roofline.py +++ b/benchmarks/float8/float8_roofline.py @@ -1,3 +1,9 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. 
+ """ This is a script to estimate the benefit from converting a `torch.nn.Linear` layer to float8, by estimating the difference in e2e GPU kernel time between: @@ -45,26 +51,10 @@ import torch import torch.utils.benchmark as benchmark -BYTES_PER_EL_FLOAT8 = 1 -BYTES_PER_EL_BF16 = 2 - -# https://www.nvidia.com/en-us/data-center/h100/, divide by 2 because no sparsity -H100_BF16_PEAK_TOPS = 989e12 -H100_FP8_PEAK_TOPS = 1979e12 - -# 2.4 TB per second, custom to Meta's H100 variant -H100_PEAK_MEM_BW_BYTES_SEC = 2.4e12 - -# based on quick experimental observation with sample large inputs -H100_PCT_ACHIEVABLE_GEMM_TOPS = 0.6 - -# based on previous experience looking at pointwise triton kernels with large inputs, -# which would hit about 2.2k GBPS on Meta's H100 variant -H100_PCT_ACHIEVABLE_MEM_BW = 0.92 - -# Source: run a triton kernel with a single element read/write on an H100 and -# measure GPU time from the trace -TRITON_KERNEL_1_ELEMENT_TIME_SEC = 0.002 * 0.001 +from torchao.float8.roofline_utils import ( + get_gemm_time_sympy, + get_float8_mem_sympy, +) def benchmark_fn_in_sec(f, *args, **kwargs): @@ -78,90 +68,6 @@ def benchmark_fn_in_sec(f, *args, **kwargs): return measurement.mean -def get_tensor_memory_traffic_bytes( - dim0, - dim1, - scaling_type: str, - fuse_with_prev=False, - model_torch_compile_limitations=False, -): - # assumes input bf16, output f8 - numel = dim0 * dim1 - - if scaling_type == "dynamic": - # x_bf16 = ... - # kernel 1: x_bf16 -> max_abs_stage_1 -> tmp - # kernel 2 (not modeled): tmp -> max_abs_stage_2 -> max_abs - # kernel 3: x_bf16, max_abs -> to_float8 -> x_fp8 - - if fuse_with_prev: - kernel_1_rw = 0 - else: - # kernel 1: read numel, write 0 (assume size(tmp) ~ 0) - kernel_1_rw = BYTES_PER_EL_BF16 * numel - - # kernel 3: read in bf16, write twice in float8 (row-major and col-major) - kernel_3_rw = BYTES_PER_EL_BF16 * numel + 2 * BYTES_PER_EL_FLOAT8 * numel - - if model_torch_compile_limitations: - # today, the kernel to do cast_to_fp8_row_major_and_col_major(input_bf16, ...) - # has an extra memory read of the input in fp8 - # context: https://github.com/pytorch/pytorch/issues/130015 - tc_adjustment = numel * BYTES_PER_EL_FLOAT8 - else: - tc_adjustment = 0 - - return kernel_1_rw + kernel_3_rw + tc_adjustment - - else: - assert scaling_type == "delayed", "unsupported" - # x_bf16 = ... - # kernel 1: x_bf16 -> max_abs_stage_1_and_to_float8 -> x_float8, tmp - # kernel 2 (not modeled): tmp -> max_abs_stage_2 -> max_abs - # kernel 3 (not modeled): scale -> reciprocal -> inv_scale - - if fuse_with_prev: - kernel_1_r = 0 - else: - kernel_1_r = numel * BYTES_PER_EL_BF16 - # write twice: once in row major, once in col-major - kernel_1_w = numel * BYTES_PER_EL_FLOAT8 * 2 - - if model_torch_compile_limitations: - # today, the kernel to do cast_to_fp8_row_major_and_col_major(input_bf16, ...) - # has an extra memory read of the input in fp8 - # context: https://github.com/pytorch/pytorch/issues/130015 - tc_adjustment = numel * BYTES_PER_EL_FLOAT8 - - # https://github.com/pytorch/pytorch/issues/128063 - # instead of - # kernel 1: x_bf16 -> max(abs(x)), x_fp8 - # kernel 2: not modeled - # kernel 3: not modeled - # we get - # kernel 1: x_bf16 -> max(abs(x)) - # reads: same as before - # writes: 0 - # ... - # kernel 4: x_bf16, scale -> x_fp8 - # reads: numel * BYTES_PER_EL_BF16 - # writes: 2 * numel * BYTES_PER_EL_FLOAT8 - # Note that assuming worst case, this issue brings the memory - # traffic for delayed scaling to be equal to that of dynamic scaling. 
- tc_adjustment += ( - # subtract writes from kernel 1 - -1 * 2 * numel * BYTES_PER_EL_FLOAT8 - # add reads for kernel 4 - + numel * BYTES_PER_EL_BF16 - # add writes for kernel 4 - + 2 * numel * BYTES_PER_EL_FLOAT8 - ) - else: - tc_adjustment = 0 - - return kernel_1_r + kernel_1_w + tc_adjustment - - def get_gemm_times_cache(gemm_benchmarks_file: str): cache = {} with open(gemm_benchmarks_file, 'r') as f: @@ -176,114 +82,6 @@ def get_gemm_times_cache(gemm_benchmarks_file: str): return cache -def get_gemm_time_sympy(M, K, N, dtype): - gemm_ops = 2 * M * K * N + 2 * M * N * K + 2 * K * M * N - if dtype is torch.bfloat16: - peak_tops = H100_BF16_PEAK_TOPS - elif dtype in (torch.float8_e4m3fn, torch.float8_e5m2): - peak_tops = H100_FP8_PEAK_TOPS - gemm_time_s = gemm_ops / peak_tops / H100_PCT_ACHIEVABLE_GEMM_TOPS - return gemm_time_s - - -def get_float8_mem_sympy( - M, - K, - N, - model_torch_compile_limitations: bool = False, - scaling_type_input: str = "dynamic", - scaling_type_weight: str = "dynamic", - scaling_type_grad_output: str = "dynamic", -): - - assert scaling_type_input in ("dynamic", "delayed"), "unsupported" - assert scaling_type_weight in ("dynamic", "delayed"), "unsupported" - assert scaling_type_grad_output in ("dynamic", "delayed"), "unsupported" - - # there are three gemms in the fwd/bwd of a linear: - # - # input @ weight_t = output - # MxK @ KxN => MxN - # - # grad_output @ weight = grad_input - # MxN @ NxK => MxK - # - # input_t @ grad_output = grad_weight - # KxM @ MxN => KxN - - # - # forward - output - # - fwd_fp8_input_mem = get_tensor_memory_traffic_bytes( - M, K, scaling_type_input, fuse_with_prev=True, - model_torch_compile_limitations=model_torch_compile_limitations) - fwd_fp8_weight_mem = get_tensor_memory_traffic_bytes( - K, N, scaling_type_weight, fuse_with_prev=False, - model_torch_compile_limitations=model_torch_compile_limitations) - fwd_fp8_total_mem = fwd_fp8_input_mem + fwd_fp8_weight_mem - - # - # backward - grad_input - # - gi_fp8_grad_output_mem = get_tensor_memory_traffic_bytes( - M, N, scaling_type_grad_output, fuse_with_prev=True, - model_torch_compile_limitations=model_torch_compile_limitations) - # already casted, assuming that we save weight from fw to bw - # TODO: model this if FSDP float8 all-gather is on - # TODO: model this if we don't save weight from fw to bw, and recompute instead - gi_fp8_weight_mem = 0 - - # - # backward - grad_weight - # - # TODO: model this if we don't save fp8 input from fw to bw - gw_fp8_input_t_mem = 0 # already casted - # this should be always 0 - gw_fp8_grad_output_mem = 0 # already casted - - bwd_fp8_total_mem = \ - gi_fp8_grad_output_mem + gi_fp8_weight_mem + \ - gw_fp8_input_t_mem + gw_fp8_grad_output_mem - fp8_total_mem = fwd_fp8_total_mem + bwd_fp8_total_mem - fp8_mem_time_s = ( - fp8_total_mem / H100_PEAK_MEM_BW_BYTES_SEC / H100_PCT_ACHIEVABLE_MEM_BW - ) - - # Adjust final estimate for small kernel launches - # note that we do this adjustment here because we are assuming a minimal - # kernel overhead in the units of seconds, and the per-gemm-input memory - # estimations are in the units of bytes. 
- num_extra_kernels = 0 - if scaling_type_input == "dynamic": - # second stage of max-abs reduction - num_extra_kernels += 1 - elif scaling_type_input == "delayed": - # second stage of max-abs reduction - num_extra_kernels += 1 - # reciprocal of scale - num_extra_kernels += 1 - if scaling_type_weight == "dynamic": - # second stage of max-abs reduction - num_extra_kernels += 1 - elif scaling_type_weight == "delayed": - # second stage of max-abs reduction - num_extra_kernels += 1 - # reciprocal of scale - num_extra_kernels += 1 - if scaling_type_grad_output == "dynamic": - # second stage of max-abs reduction - num_extra_kernels += 1 - elif scaling_type_grad_output == "delayed": - # second stage of max-abs reduction - num_extra_kernels += 1 - # reciprocal of scale - num_extra_kernels += 1 - - extra_kernel_overhead_s = num_extra_kernels * TRITON_KERNEL_1_ELEMENT_TIME_SEC - - return fp8_mem_time_s + extra_kernel_overhead_s - - def run( outfile: str, gemm_time_strategy: str = "benchmarks", diff --git a/torchao/float8/roofline_utils.py b/torchao/float8/roofline_utils.py new file mode 100644 index 0000000000..490435fbf9 --- /dev/null +++ b/torchao/float8/roofline_utils.py @@ -0,0 +1,220 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD 3-Clause license found in the +# LICENSE file in the root directory of this source tree. + +import torch + +BYTES_PER_EL_FLOAT8 = 1 +BYTES_PER_EL_BF16 = 2 + +# https://www.nvidia.com/en-us/data-center/h100/, divide by 2 because no sparsity +H100_BF16_PEAK_TOPS = 989e12 +H100_FP8_PEAK_TOPS = 1979e12 + +# 2.4 TB per second, custom to Meta's H100 variant +H100_PEAK_MEM_BW_BYTES_SEC = 2.4e12 + +# based on quick experimental observation with sample large inputs +H100_PCT_ACHIEVABLE_GEMM_TOPS = 0.6 + +# based on previous experience looking at pointwise triton kernels with large inputs, +# which would hit about 2.2k GBPS on Meta's H100 variant +H100_PCT_ACHIEVABLE_MEM_BW = 0.92 + +# Source: run a triton kernel with a single element read/write on an H100 and +# measure GPU time from the trace +TRITON_KERNEL_1_ELEMENT_TIME_SEC = 0.002 * 0.001 + + +def get_tensor_memory_traffic_bytes( + dim0, + dim1, + scaling_type: str, + fuse_with_prev=False, + model_torch_compile_limitations=False, +): + # assumes input bf16, output f8 + numel = dim0 * dim1 + + if scaling_type == "dynamic": + # x_bf16 = ... + # kernel 1: x_bf16 -> max_abs_stage_1 -> tmp + # kernel 2 (not modeled): tmp -> max_abs_stage_2 -> max_abs + # kernel 3: x_bf16, max_abs -> to_float8 -> x_fp8 + + if fuse_with_prev: + kernel_1_rw = 0 + else: + # kernel 1: read numel, write 0 (assume size(tmp) ~ 0) + kernel_1_rw = BYTES_PER_EL_BF16 * numel + + # kernel 3: read in bf16, write twice in float8 (row-major and col-major) + kernel_3_rw = BYTES_PER_EL_BF16 * numel + 2 * BYTES_PER_EL_FLOAT8 * numel + + if model_torch_compile_limitations: + # today, the kernel to do cast_to_fp8_row_major_and_col_major(input_bf16, ...) + # has an extra memory read of the input in fp8 + # context: https://github.com/pytorch/pytorch/issues/130015 + tc_adjustment = numel * BYTES_PER_EL_FLOAT8 + else: + tc_adjustment = 0 + + return kernel_1_rw + kernel_3_rw + tc_adjustment + + else: + assert scaling_type == "delayed", "unsupported" + # x_bf16 = ... 
+ # kernel 1: x_bf16 -> max_abs_stage_1_and_to_float8 -> x_float8, tmp + # kernel 2 (not modeled): tmp -> max_abs_stage_2 -> max_abs + # kernel 3 (not modeled): scale -> reciprocal -> inv_scale + + if fuse_with_prev: + kernel_1_r = 0 + else: + kernel_1_r = numel * BYTES_PER_EL_BF16 + # write twice: once in row major, once in col-major + kernel_1_w = numel * BYTES_PER_EL_FLOAT8 * 2 + + if model_torch_compile_limitations: + # today, the kernel to do cast_to_fp8_row_major_and_col_major(input_bf16, ...) + # has an extra memory read of the input in fp8 + # context: https://github.com/pytorch/pytorch/issues/130015 + tc_adjustment = numel * BYTES_PER_EL_FLOAT8 + + # https://github.com/pytorch/pytorch/issues/128063 + # instead of + # kernel 1: x_bf16 -> max(abs(x)), x_fp8 + # kernel 2: not modeled + # kernel 3: not modeled + # we get + # kernel 1: x_bf16 -> max(abs(x)) + # reads: same as before + # writes: 0 + # ... + # kernel 4: x_bf16, scale -> x_fp8 + # reads: numel * BYTES_PER_EL_BF16 + # writes: 2 * numel * BYTES_PER_EL_FLOAT8 + # Note that assuming worst case, this issue brings the memory + # traffic for delayed scaling to be equal to that of dynamic scaling. + tc_adjustment += ( + # subtract writes from kernel 1 + -1 * 2 * numel * BYTES_PER_EL_FLOAT8 + # add reads for kernel 4 + + numel * BYTES_PER_EL_BF16 + # add writes for kernel 4 + + 2 * numel * BYTES_PER_EL_FLOAT8 + ) + else: + tc_adjustment = 0 + + return kernel_1_r + kernel_1_w + tc_adjustment + + +def get_gemm_time_sympy(M, K, N, dtype): + gemm_ops = 2 * M * K * N + 2 * M * N * K + 2 * K * M * N + if dtype is torch.bfloat16: + peak_tops = H100_BF16_PEAK_TOPS + elif dtype in (torch.float8_e4m3fn, torch.float8_e5m2): + peak_tops = H100_FP8_PEAK_TOPS + gemm_time_s = gemm_ops / peak_tops / H100_PCT_ACHIEVABLE_GEMM_TOPS + return gemm_time_s + + +def get_float8_mem_sympy( + M, + K, + N, + model_torch_compile_limitations: bool = False, + scaling_type_input: str = "dynamic", + scaling_type_weight: str = "dynamic", + scaling_type_grad_output: str = "dynamic", +): + + assert scaling_type_input in ("dynamic", "delayed"), "unsupported" + assert scaling_type_weight in ("dynamic", "delayed"), "unsupported" + assert scaling_type_grad_output in ("dynamic", "delayed"), "unsupported" + + # there are three gemms in the fwd/bwd of a linear: + # + # input @ weight_t = output + # MxK @ KxN => MxN + # + # grad_output @ weight = grad_input + # MxN @ NxK => MxK + # + # input_t @ grad_output = grad_weight + # KxM @ MxN => KxN + + # + # forward - output + # + fwd_fp8_input_mem = get_tensor_memory_traffic_bytes( + M, K, scaling_type_input, fuse_with_prev=True, + model_torch_compile_limitations=model_torch_compile_limitations) + fwd_fp8_weight_mem = get_tensor_memory_traffic_bytes( + K, N, scaling_type_weight, fuse_with_prev=False, + model_torch_compile_limitations=model_torch_compile_limitations) + fwd_fp8_total_mem = fwd_fp8_input_mem + fwd_fp8_weight_mem + + # + # backward - grad_input + # + gi_fp8_grad_output_mem = get_tensor_memory_traffic_bytes( + M, N, scaling_type_grad_output, fuse_with_prev=True, + model_torch_compile_limitations=model_torch_compile_limitations) + # already casted, assuming that we save weight from fw to bw + # TODO: model this if FSDP float8 all-gather is on + # TODO: model this if we don't save weight from fw to bw, and recompute instead + gi_fp8_weight_mem = 0 + + # + # backward - grad_weight + # + # TODO: model this if we don't save fp8 input from fw to bw + gw_fp8_input_t_mem = 0 # already casted + # this should be always 0 + 
gw_fp8_grad_output_mem = 0 # already casted + + bwd_fp8_total_mem = \ + gi_fp8_grad_output_mem + gi_fp8_weight_mem + \ + gw_fp8_input_t_mem + gw_fp8_grad_output_mem + fp8_total_mem = fwd_fp8_total_mem + bwd_fp8_total_mem + fp8_mem_time_s = ( + fp8_total_mem / H100_PEAK_MEM_BW_BYTES_SEC / H100_PCT_ACHIEVABLE_MEM_BW + ) + + # Adjust final estimate for small kernel launches + # note that we do this adjustment here because we are assuming a minimal + # kernel overhead in the units of seconds, and the per-gemm-input memory + # estimations are in the units of bytes. + num_extra_kernels = 0 + if scaling_type_input == "dynamic": + # second stage of max-abs reduction + num_extra_kernels += 1 + elif scaling_type_input == "delayed": + # second stage of max-abs reduction + num_extra_kernels += 1 + # reciprocal of scale + num_extra_kernels += 1 + if scaling_type_weight == "dynamic": + # second stage of max-abs reduction + num_extra_kernels += 1 + elif scaling_type_weight == "delayed": + # second stage of max-abs reduction + num_extra_kernels += 1 + # reciprocal of scale + num_extra_kernels += 1 + if scaling_type_grad_output == "dynamic": + # second stage of max-abs reduction + num_extra_kernels += 1 + elif scaling_type_grad_output == "delayed": + # second stage of max-abs reduction + num_extra_kernels += 1 + # reciprocal of scale + num_extra_kernels += 1 + + extra_kernel_overhead_s = num_extra_kernels * TRITON_KERNEL_1_ELEMENT_TIME_SEC + + return fp8_mem_time_s + extra_kernel_overhead_s From 986019488130e4f9cb737e45505d357425100bfd Mon Sep 17 00:00:00 2001 From: Jerry Zhang Date: Thu, 22 Aug 2024 15:43:36 -0700 Subject: [PATCH 10/12] Small fix for micro benchmark code (#711) Summary: There seems to be some problems running benchmark_aq.py: ``` torch._dynamo.exc.Unsupported: torch.* op returned non-Tensor int call_function ``` when we run the benchmark with multiple shapes sometimes. 
But the problem will be gone if we reset the dynamo caches before each benchmark run Test Plan: python benchmarks/benchmark_aq.py Reviewers: Subscribers: Tasks: Tags: --- benchmarks/benchmark_aq.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/benchmarks/benchmark_aq.py b/benchmarks/benchmark_aq.py index 174038d206..ebf9e1e738 100644 --- a/benchmarks/benchmark_aq.py +++ b/benchmarks/benchmark_aq.py @@ -17,6 +17,7 @@ _replace_with_custom_fn_if_matches_filter, ) import copy +from torchao.utils import unwrap_tensor_subclass def _int8wo_api(mod, **kwargs): if TORCH_VERSION_AT_LEAST_2_4: @@ -133,15 +134,17 @@ def _bench_quantized_tensor_subclass_perf(api, ref_api, M, N, K, kwargs=None): WARMUP = 20 RUNS = 100 + torch._dynamo.reset() m_ref = torch.compile(m_ref, mode='max-autotune', fullgraph=True) benchmark_model(m_ref, WARMUP, example_inputs) ref_elapsed_time = benchmark_model(m_ref, RUNS, example_inputs) + torch._dynamo.reset() m = torch.compile(m, mode='max-autotune', fullgraph=True) benchmark_model(m, WARMUP, example_inputs) elapsed_time = benchmark_model(m, RUNS, example_inputs) - + torch._dynamo.reset() m_bf16 = torch.compile(m_bf16, mode='max-autotune', fullgraph=True) benchmark_model(m_bf16, WARMUP, example_inputs) bf16_elapsed_time = benchmark_model(m_bf16, RUNS, example_inputs) From 68e46436bfa0ee3fffbc3c3e134485b51ff76a40 Mon Sep 17 00:00:00 2001 From: Sergii Dymchenko Date: Thu, 22 Aug 2024 20:01:44 -0700 Subject: [PATCH 11/12] Fix docstring args names (#735) --- torchao/float8/float8_utils.py | 2 +- torchao/float8/inference.py | 2 +- torchao/ops.py | 4 ++-- .../mixed_precision/scripts/naive_intNwo.py | 2 +- torchao/quantization/quant_primitives.py | 12 ++++++------ 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/torchao/float8/float8_utils.py b/torchao/float8/float8_utils.py index 868d4f52a6..54613e5b05 100644 --- a/torchao/float8/float8_utils.py +++ b/torchao/float8/float8_utils.py @@ -209,7 +209,7 @@ def pad_tensor_for_matmul( Args: tensor: The tensor to pad. - both: Whether to pad both dimensions or just the second dimension. + dims: Dimensions to pad. Returns: torch.Tensor: The padded tensor. 
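For reference, a minimal usage sketch of `pad_tensor_for_matmul` with the `dims` argument named in the corrected docstring above. This is illustrative only: it assumes `dims` may be passed as an iterable of dimension indices and that each listed dimension is padded up to the alignment `torch._scaled_mm` expects (multiples of 16); the shapes are made up.

```python
# Minimal sketch (assumption: each dimension listed in `dims` is padded up
# to the next multiple of 16).
import torch
from torchao.float8.float8_utils import pad_tensor_for_matmul

a = torch.randn(123, 677, dtype=torch.bfloat16)   # neither dim is 16-aligned
a_padded = pad_tensor_for_matmul(a, dims=(0, 1))  # pad both dimensions
print(a_padded.shape)                             # e.g. torch.Size([128, 688])
```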
diff --git a/torchao/float8/inference.py b/torchao/float8/inference.py index f441009c48..ccf83d7cef 100644 --- a/torchao/float8/inference.py +++ b/torchao/float8/inference.py @@ -151,7 +151,7 @@ def from_float( Create an nn.Linear with fp8 compute from another nn.Linear Args: - mod (torch.nn.Linear): nn.Linear to convert + module (torch.nn.Linear): nn.Linear to convert quant_config (QuantConfig): Configuration for the weight and activation casting """ forward_config = ScaledMMConfig( diff --git a/torchao/ops.py b/torchao/ops.py index 4fcc8681a6..cb337aabbe 100644 --- a/torchao/ops.py +++ b/torchao/ops.py @@ -115,7 +115,7 @@ def dequantize_tensor_core_tiled_layout(packed_w: Tensor, scales_and_zeros: Tens Args: packed_w: torch.tensor: 4D tensor with shape `(N / 8) x (K / (inner_k_tiles * 16)) x 32 x inner_k_tiles / 2`, dtype is torch.int32 scales_and_zeros: torch.tensor: 3D tensor with shape `numQGroups x N x 2`, dtype is torch.bfloat16 where numQGroups is K / qGroupSize - qGroupSize: int + group_size: int inner_k_tiles: int Returns: @@ -158,4 +158,4 @@ def _(packed_w: Tensor, scales_and_zeros: Tensor, group_size: int, inner_k_tiles torch._check(scales_and_zeros.size(1) == N, lambda: "scales_and_zeros must have N at dim 1") torch._check(scales_and_zeros.size(2) == 2, lambda: "scales_and_zeros must have 2 at dim 2") - return torch.empty((N, K), dtype=torch.bfloat16, device=packed_w.device) \ No newline at end of file + return torch.empty((N, K), dtype=torch.bfloat16, device=packed_w.device) diff --git a/torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py b/torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py index 6ebe458a46..6ec933435b 100644 --- a/torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py +++ b/torchao/quantization/prototype/mixed_precision/scripts/naive_intNwo.py @@ -12,7 +12,7 @@ def intN_weight_only(group_size=32, n=8, symmetric=False): ''' Apply int N-bit weight only quantization to a linear layer. Args: - `groupsize`: parameter for quantization, controls the granularity of quantization, smaller size is more fine grained, choices are [512, 256, 128, 64, 32] + `group_size`: parameter for quantization, controls the granularity of quantization, smaller size is more fine grained, choices are [512, 256, 128, 64, 32] `n`: number of bits to quantize to, choices are [8, 6, 5, 4, 3, 2] Usage: from torchao.quantization import quantize_ diff --git a/torchao/quantization/quant_primitives.py b/torchao/quantization/quant_primitives.py index 1ac97de3c6..bd4656f6c5 100644 --- a/torchao/quantization/quant_primitives.py +++ b/torchao/quantization/quant_primitives.py @@ -33,7 +33,7 @@ class MappingType(Enum): """How floating point number is mapped to integer number - symmetric mapping means floating point range is symetrically mapped to integer range + symmetric mapping means floating point range is symmetrically mapped to integer range let's say we have floating point range (-3.5, 10.2) and integer range (-8, 7) (int4) we'll use (-10.2, 10.2) as the range for floating point and map that to (-8, 7) e.g. scale = (10.2 - (-10.2)) / (7 - (-8)) @@ -167,7 +167,7 @@ def quantize_affine( output_dtype (torch.dtype): requested dtype (e.g. 
torch.uint8) for output Tensor quant_min (Optional[int]): minimum quantized value for output Tensor, if not specified, it will be derived from dtype quant_max (Optional[int]): maximum quantized value for output Tensor, if not specified, it will be derived from dtype - zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be eitehr integer or float + zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float if zero_point is in integer domain, zero point is added to the quantized integer value during quantization if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized) @@ -287,11 +287,11 @@ def dequantize_affine( e.g. when size is the same as the input tensor dimension, we are using per tensor quantization scale (Tensor): quantization parameter for affine quantization zero_point (Tensor): quantization parameter for affine quantization - dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor + input_dtype (torch.dtype): requested dtype (e.g. torch.uint8) for output Tensor quant_min (Optional[int]): minimum quantized value for input Tensor quant_max (Optional[int]): maximum quantized value for input Tensor output_dtype (torch.dtype): dtype for output Tensor, default is fp32 - zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be eitehr integer or float + zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float if zero_point is in integer domain, zero point is added to the quantized integer value during quantization if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized) @@ -413,7 +413,7 @@ def fake_quantize_affine( quant_dtype (torch.dtype): desired quantized dtype for determining and validating quant_min and quant_max values. 
quant_min (Optional[int]): minimum quantized value for output Tensor, if not specified, it will be derived from dtype quant_max (Optional[int]): maximum quantized value for output Tensor, if not specified, it will be derived from dtype - zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be eitehr integer or float + zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float if zero_point is in integer domain, zero point is added to the quantized integer value during quantization if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized) @@ -549,7 +549,7 @@ def choose_qparams_affine( If we don't need zero to be exactly representable, we won't do rounding and clamping for zero_point - zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be eitehr integer or float + zero_point_domain (ZeroPointDomain): the domain that zero_point is in, should be either integer or float if zero_point is in integer domain, zero point is added to the quantized integer value during quantization if zero_point is in floating point domain, zero point is subtracted from the floating point (unquantized) From 0ed30902e3879b98c7a985915ee37ccb6048a2cb Mon Sep 17 00:00:00 2001 From: ebsmothers Date: Thu, 22 Aug 2024 22:51:25 -0700 Subject: [PATCH 12/12] Move non-NF4 tensor to device prior to quantization on copy (#737) --- torchao/dtypes/nf4tensor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/torchao/dtypes/nf4tensor.py b/torchao/dtypes/nf4tensor.py index 90516ea199..b386f85ae0 100644 --- a/torchao/dtypes/nf4tensor.py +++ b/torchao/dtypes/nf4tensor.py @@ -339,7 +339,7 @@ def copy_(func, *args, **kwargs): # Convert Non NF4Tensor into NF4 for copy in if not isinstance(copy_in, NF4Tensor): copy_in_nf4 = NF4Tensor.from_tensor( - copy_in, original.block_size, original.scaler_block_size + copy_in.to(original.device), original.block_size, original.scaler_block_size ) return original.copy_(copy_in_nf4)
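To illustrate the behavior the last patch addresses, a minimal sketch (assuming a CUDA device is available and `to_nf4`'s default block sizes fit the example shape): with the source tensor moved to the NF4 tensor's device before quantization, `copy_` works even when the incoming plain tensor lives on a different device.

```python
# Illustrative sketch: copy_ from a plain CPU tensor into a CUDA NF4Tensor;
# the source is moved to the destination's device before it is quantized.
import torch
from torchao.dtypes.nf4tensor import to_nf4

weight = torch.randn(256, 256, dtype=torch.bfloat16, device="cuda")
nf4_weight = to_nf4(weight)                                # NF4Tensor on CUDA

new_weight = torch.randn(256, 256, dtype=torch.bfloat16)   # plain CPU tensor
nf4_weight.copy_(new_weight)                               # re-quantized on CUDA
```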