Skip to content

Commit

Permalink
x86 sse2/xop/avx/avx2/avx512/vnni/vnniint8 optimization for gemm int8 (
Browse files Browse the repository at this point in the history
…#5763)

* skip round problem
* sde on ubuntu24
  • Loading branch information
nihui authored Dec 17, 2024
1 parent cdcbb3b commit 44e0d95
Show file tree
Hide file tree
Showing 20 changed files with 18,053 additions and 1,484 deletions.
57 changes: 0 additions & 57 deletions .github/workflows/linux-x64-cpu-gcc-sde.yml

This file was deleted.

85 changes: 85 additions & 0 deletions .github/workflows/linux-x64-sde.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
name: linux-x64-sde
on:
push:
branches: [master]
paths:
- '.github/workflows/linux-x64-sde.yml'
- 'CMakeLists.txt'
- 'cmake/**'
- 'src/*'
- 'src/layer/*'
- 'src/layer/x86/**'
- 'tests/**'
- 'tools/**'
- '!tools/pnnx/**'
- 'examples/**'
pull_request:
branches: [master]
paths:
- '.github/workflows/linux-x64-sde.yml'
- 'CMakeLists.txt'
- 'cmake/**'
- 'src/*'
- 'src/layer/*'
- 'src/layer/x86/**'
- 'tests/**'
- 'tools/**'
- '!tools/pnnx/**'
- 'examples/**'
concurrency:
group: linux-x64-sde-${{ github.ref }}
cancel-in-progress: true
permissions:
contents: read

jobs:
gcc-sde:
runs-on: ubuntu-24.04
steps:
- uses: actions/checkout@v4
- name: update
run: sudo apt-get update
- name: gcc14
run: sudo apt-get install gcc-14 g++-14
- name: Setup SDE binaries
uses: petarpetrovt/[email protected]
- name: build
env:
CC: gcc-14
CXX: g++-14
run: |
mkdir build && cd build
cmake -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j $(nproc)
- name: test-p4p
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-p4p;--" ctest --output-on-failure -j $(nproc)
- name: test-snb
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-snb;--" ctest --output-on-failure -j $(nproc)
- name: test-hsw
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-hsw;--" ctest --output-on-failure -j $(nproc)
- name: test-adl
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-adl;--" ctest --output-on-failure -j $(nproc)
- name: test-arl
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-arl;--" ctest --output-on-failure -j $(nproc)
- name: test-skx
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-skx;--" ctest --output-on-failure -j $(nproc)
- name: test-spr
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j $(nproc)
- name: test-gnr
run: |
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-gnr;--" ctest --output-on-failure -j $(nproc)
69 changes: 40 additions & 29 deletions .github/workflows/test-coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ jobs:
LD_LIBRARY_PATH: /data/action/install/lib64
run: |
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_VULKAN=ON -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_VULKAN=ON -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_XOP=OFF -DNCNN_AVXVNNI=OFF -DNCNN_AVXNECONVERT=OFF -DNCNN_AVX512=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j 4
- name: test
env:
Expand All @@ -54,61 +54,72 @@ jobs:
lcov --list lcov.info
- name: codecov
id: codecov
continue-on-error: true
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
disable_search: true
plugins: noop
files: build/lcov.info
- name: set the status
if: always()
run: |
if ${{ steps.codecov.outcome=='success' }}; then
echo fine
else
exit 1
fi

linux-gcc-x64-avx512-spr:
runs-on: ubuntu-22.04
linux-gcc-x64-sde:
name: linux-gcc-sde-${{ matrix.cpu }}
runs-on: ubuntu-24.04
strategy:
fail-fast: false
matrix:
include:
- { cpu: hsw, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
- { cpu: adl, AVX2: ON, AVXVNNI: ON, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
- { cpu: arl, AVX2: ON, AVXVNNI: ON, AVXVNNIINT8: ON, AVXNECONVERT: ON, AVX512: OFF, AVX512VNNI: OFF, AVX512BF16: OFF, AVX512FP16: OFF }
- { cpu: spr, AVX2: ON, AVXVNNI: OFF, AVXVNNIINT8: OFF, AVXNECONVERT: OFF, AVX512: ON, AVX512VNNI: ON, AVX512BF16: ON, AVX512FP16: ON }
steps:
- uses: actions/checkout@v4
- name: update
run: sudo apt-get update
- name: gcc12
run: sudo apt-get install gcc-12 g++-12
- name: gcc14
run: sudo apt-get install gcc-14 g++-14
- name: lcov
run: sudo apt-get install lcov
- name: Setup SDE binaries
uses: petarpetrovt/[email protected]
- name: build-avx512-spr
- name: build
env:
CC: gcc-12
CXX: g++-12
CC: gcc-14
CXX: g++-14
run: |
mkdir build-avx512-spr && cd build-avx512-spr
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF -DNCNN_AVX2=ON -DNCNN_AVX512=ON -DNCNN_AVX512VNNI=ON -DNCNN_AVX512BF16=ON -DNCNN_AVX512FP16=ON -DNCNN_XOP=OFF -DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j 2
- name: test-avx512-spr
mkdir build && cd build
cmake -DCMAKE_BUILD_TYPE=debug -DNCNN_COVERAGE=ON -DNCNN_RUNTIME_CPU=OFF \
-DNCNN_AVX=ON \
-DNCNN_F16C=ON \
-DNCNN_XOP=OFF \
-DNCNN_AVX2=${{ matrix.AVX2 }} \
-DNCNN_AVXVNNI=${{ matrix.AVXVNNI }} \
-DNCNN_AVXVNNIINT8=${{ matrix.AVXVNNIINT8 }} \
-DNCNN_AVXNECONVERT=${{ matrix.AVXNECONVERT }} \
-DNCNN_AVX512=${{ matrix.AVX512 }} \
-DNCNN_AVX512VNNI=${{ matrix.AVX512VNNI }} \
-DNCNN_AVX512BF16=${{ matrix.AVX512BF16 }} \
-DNCNN_AVX512FP16=${{ matrix.AVX512FP16 }} \
-DNCNN_OPENMP=OFF -DNCNN_BUILD_TOOLS=OFF -DNCNN_BUILD_EXAMPLES=OFF -DNCNN_BUILD_TESTS=ON ..
cmake --build . -j $(nproc)
- name: test
run: |
cd build-avx512-spr
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-spr;--" ctest --output-on-failure -j 2
cd build
TESTS_EXECUTABLE_LOADER=$SDE_PATH/sde64 TESTS_EXECUTABLE_LOADER_ARGUMENTS="-${{ matrix.cpu }};--" ctest --output-on-failure -j $(nproc)
- name: lcov-collect
run: |
cd build-avx512-spr
lcov --gcov-tool gcov-12 -d ./src -c -o lcov.info
cd build
lcov --gcov-tool gcov-14 -d ./src -c -o lcov.info
lcov -r lcov.info '/usr/*' -o lcov.info
lcov -r lcov.info '*/build-avx512-spr/*' -o lcov.info
lcov -r lcov.info '*/build/*' -o lcov.info
lcov --list lcov.info
- name: codecov-avx512-spr
- name: codecov
uses: codecov/codecov-action@v5
with:
token: ${{ secrets.CODECOV_TOKEN }}
disable_search: true
plugins: noop
files: build-avx512-spr/lcov.info
files: build/lcov.info

linux-gcc-riscv64-rvv:
strategy:
Expand Down
4 changes: 2 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -508,7 +508,7 @@ else()
check_cxx_compiler_flag("/arch:AVX512" NCNN_COMPILER_SUPPORT_X86_AVX512)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX2")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpbssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
Expand Down Expand Up @@ -545,7 +545,7 @@ else()
check_cxx_compiler_flag("/arch:AVX512 -mfma -mf16c -mavx512cd -mavx512bw -mavx512dq -mavx512vl" NCNN_COMPILER_SUPPORT_X86_AVX512)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpwssd_avx_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI)

set(CMAKE_REQUIRED_FLAGS "/arch:AVX2 -mfma -mf16c -mavxvnni -mavxvnniint8")
check_cxx_source_compiles("#include <immintrin.h>\nint main() { __m256i _s, _a, _b; _s = _mm256_dpbssd_epi32(_s, _a, _b); return 0; }" NCNN_COMPILER_SUPPORT_X86_AVX_VNNI_INT8)
Expand Down
11 changes: 1 addition & 10 deletions src/layer/gemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -220,9 +220,7 @@ static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A
const int N = BT_int8.h;
const int K = A_int8.w; // assert A_int8.w == BT_int8.w

// NCNN_LOGE("naive ds %f %f", A_int8_scales[0], BT_int8_scale);

// #pragma omp parallel for num_threads(opt.num_threads)
#pragma omp parallel for num_threads(opt.num_threads)
for (int i = 0; i < M; i++)
{
const int out_hstep = top_blob.dims == 3 ? (int)top_blob.cstep : top_blob.w;
Expand All @@ -232,16 +230,13 @@ static void gemm_transB_int8(const Mat& A_int8, const Mat& BT_int8, const Mat& A

const float descale = 1.f / (A_int8_scales[i] * BT_int8_scale);

// NCNN_LOGE("descale %f", descale);

for (int j = 0; j < N; j++)
{
const signed char* ptrBT = BT_int8.row<const signed char>(j);

int sum = 0;
for (int k = 0; k < K; k++)
{
// NCNN_LOGE("ptrA[%d] %d", k, ptrA[k]);
sum += ptrA[k] * ptrBT[k];
}

Expand Down Expand Up @@ -501,8 +496,6 @@ int Gemm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
absmax = std::max(absmax, (float)fabs(ptr[k]));
}

// NCNN_LOGE("A[%d] absmax %f", i, absmax);

float A_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax;
A_int8_scales[i] = A_int8_scale;

Expand Down Expand Up @@ -534,8 +527,6 @@ int Gemm::forward_int8(const std::vector<Mat>& bottom_blobs, std::vector<Mat>& t
}
}

// NCNN_LOGE("B0 absmax %f", absmax);

B_int8_scale = absmax == 0.f ? 1.f : 127.f / absmax;

for (int i = 0; i < B0_int8.h; i++)
Expand Down
Loading

0 comments on commit 44e0d95

Please sign in to comment.