From 5c6dce43ac54072b9ea8de2d0636dd0315769060 Mon Sep 17 00:00:00 2001 From: amdkila <47991923+amdkila@users.noreply.github.com> Date: Fri, 13 Sep 2019 13:51:09 -0600 Subject: [PATCH 01/31] Changed timeout from hours to minutes (#699) --- Jenkinsfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Jenkinsfile b/Jenkinsfile index f988ab19b..a173609a6 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -63,7 +63,7 @@ rocBLASCI: platform.runCommand(this, command) } - rocblas.timeout.test = 10 + rocblas.timeout.test = 600 def testCommand = { From ad53fd932fc09cde4110ef68b5c846512e36913d Mon Sep 17 00:00:00 2001 From: amcamd Date: Fri, 13 Sep 2019 18:53:17 -0500 Subject: [PATCH 02/31] set clang include directory, fix for centos build error --- clients/benchmarks/CMakeLists.txt | 9 ++++++++- clients/gtest/CMakeLists.txt | 9 ++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index b27746611..a6b8194c1 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -68,7 +68,14 @@ set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) if( EXISTS /etc/redhat-release) set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) - set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + + if(EXISTS /opt/rocm/hcc/lib/clang/10.0.0/include/immintrin.h) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/10.0.0/include ) + elseif (EXISTS /opt/rocm/hcc/lib/clang/9.0.0/include/immintrin.h) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + else() + error("cannot find immintrin.h") + endif() # External header includes included as system files target_include_directories( rocblas-bench diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 0ad325a4b..4b3e211b6 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -104,7 +104,14 @@ set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) if( EXISTS /etc/redhat-release) set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) - set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + + if(EXISTS /opt/rocm/hcc/lib/clang/10.0.0/include/immintrin.h) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/10.0.0/include ) + elseif (EXISTS /opt/rocm/hcc/lib/clang/9.0.0/include/immintrin.h) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + else() + error("cannot find immintrin.h") + endif() # External header includes included as system files target_include_directories( rocblas-test From bb0806242ae7136bc3b8cf8583a9d7171444665d Mon Sep 17 00:00:00 2001 From: zaliu <35415350+zaliu@users.noreply.github.com> Date: Mon, 16 Sep 2019 17:08:10 -0700 Subject: [PATCH 03/31] hot fix to restore loading of DGEMM replacement kernels (#701) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c872ca3d..7c549db0b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -140,7 +140,7 @@ include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) # Versioning via rocm-cmake -set ( VERSION_STRING "2.8.0" ) +set ( VERSION_STRING "2.8.1" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for convenience) @@ -182,12 +182,12 @@ if( BUILD_WITH_TENSILE ) else() # Use the virtual-env setup and download package from specified repot: set( tensile_fork "ROCmSoftwarePlatform" CACHE STRING "Tensile fork to use" ) - set( tensile_tag 015477ad8c8ea0ef6f59b8d49a65015b46b8a48e CACHE STRING "Tensile tag to download" ) + set( tensile_tag 3f69739ed495346e243beef51b1f3f3ebe6d09a5 CACHE STRING "Tensile tag to download" ) virtualenv_install("git+https://github.com/${tensile_fork}/Tensile.git@${tensile_tag}") message (STATUS "using GIT Tensile fork=${tensile_fork} from branch=${tensile_tag}") endif() list(APPEND CMAKE_PREFIX_PATH ${VIRTUALENV_HOME_DIR}) - find_package(Tensile 4.11.0 EXACT REQUIRED HIP LLVM OpenMP PATHS "${INSTALLED_TENSILE_PATH}") + find_package(Tensile 4.12.0 EXACT REQUIRED HIP LLVM OpenMP PATHS "${INSTALLED_TENSILE_PATH}") endif() # Find HCC/HIP dependencies From 6bf3b50726d7fd22a1d3c7fb92f51ea675a61bb7 Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Tue, 17 Sep 2019 17:08:56 -0600 Subject: [PATCH 04/31] SLES support (#704) * Merging master with SLES commit * Specifying GPU architecture for ubuntu and sles (#695) * Fixing Timeout --- CMakeLists.txt | 6 ++++ Jenkinsfile | 19 +++++++---- clients/CMakeLists.txt | 2 +- clients/benchmarks/CMakeLists.txt | 20 ++++++++---- clients/gtest/CMakeLists.txt | 15 ++++++--- cmake/os-detection.cmake | 24 ++++++++++++++ docker/dockerfile-build-sles | 53 +++++++++++++++++++++++++++++++ docker/dockerfile-install-sles | 5 +++ install.sh | 47 ++++++++++++++++++++++----- 9 files changed, 166 insertions(+), 25 deletions(-) create mode 100644 cmake/os-detection.cmake create mode 100644 docker/dockerfile-build-sles create mode 100644 docker/dockerfile-install-sles diff --git a/CMakeLists.txt b/CMakeLists.txt index 7c549db0b..4b6ef0711 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -33,6 +33,8 @@ if ( NOT DEFINED CMAKE_Fortran_COMPILER AND NOT DEFINED ENV{FC} ) set( CMAKE_Fortran_COMPILER "gfortran" ) endif() + + project( rocblas LANGUAGES CXX ) set(THREADS_PREFER_PTHREAD_FLAG ON) @@ -139,6 +141,10 @@ include( ROCMInstallTargets ) include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) +include (cmake/os-detection.cmake) +get_os_id(OS_ID) +message (STATUS "OS detected is ${OS_ID}") + # Versioning via rocm-cmake set ( VERSION_STRING "2.8.1" ) rocm_setup_version( VERSION ${VERSION_STRING} ) diff --git a/Jenkinsfile b/Jenkinsfile index a173609a6..a7ba98197 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -32,7 +32,7 @@ rocBLASCI: rocblas.paths.build_command = './install.sh -lasm_ci -c' // Define test architectures, optional rocm version argument is available - def nodes = new dockerNodes(['gfx900 && ubuntu', 'gfx906 && ubuntu', 'gfx900 && centos7', 'gfx906 && centos7'], rocblas) + def nodes = new dockerNodes(['ubuntu && gfx900', 'gfx900 && centos7', 'gfx906 && centos7', 'sles && gfx906'], rocblas) boolean formatCheck = true @@ -52,6 +52,15 @@ rocBLASCI: LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=/opt/rocm/bin/hipcc ${project.paths.build_command} --hip-clang """ } + else if(platform.jenkinsLabel.contains('sles')) + { + command = """#!/usr/bin/env bash + set -x + cd ${project.paths.project_build_prefix} + LD_LIBRARY_PATH=/opt/rocm/hcc/lib CXX=/opt/rocm/bin/hipcc sudo ${project.paths.build_command} + """ + } + else { command = """#!/usr/bin/env bash @@ -63,15 +72,13 @@ rocBLASCI: platform.runCommand(this, command) } - rocblas.timeout.test = 600 - def testCommand = { platform, project-> def command - if(platform.jenkinsLabel.contains('centos')) + if(platform.jenkinsLabel.contains('centos') || platform.jenkinsLabel.contains('sles')) { if(auxiliary.isJobStartedByTimer()) { @@ -92,7 +99,7 @@ rocBLASCI: LD_LIBRARY_PATH=/opt/rocm/hcc/lib ./example-sscal LD_LIBRARY_PATH=/opt/rocm/hcc/lib GTEST_LISTENER=NO_PASS_LINE_IN_LOG sudo ./rocblas-test --gtest_output=xml --gtest_color=yes --gtest_filter=*quick*:*pre_checkin*-*known_bug* #--gtest_filter=*checkin* """ - + platform.runCommand(this, command) junit "${project.paths.project_build_prefix}/build/release/clients/staging/*.xml" } @@ -145,7 +152,7 @@ rocBLASCI: platform.runCommand(this, command) platform.archiveArtifacts(this, """${project.paths.project_build_prefix}/build/release/package/*.rpm""") } - else if(platform.jenkinsLabel.contains('hip-clang')) + else if(platform.jenkinsLabel.contains('hip-clang') || platform.jenkinsLabel.contains('sles')) { packageCommand = null } diff --git a/clients/CMakeLists.txt b/clients/CMakeLists.txt index 9d8ebcb94..72d664eaf 100755 --- a/clients/CMakeLists.txt +++ b/clients/CMakeLists.txt @@ -24,7 +24,7 @@ endif() # This project may compile dependencies for clients project( rocblas-clients LANGUAGES CXX ) -if(EXISTS /etc/redhat-release) +if(OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp=libgomp -pthread") else() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index a6b8194c1..7d1cddbca 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -65,9 +65,11 @@ target_include_directories( rocblas-bench set( BLIS_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/build/deps/blis/include/blis ) set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) -if( EXISTS /etc/redhat-release) + +if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) + if( OS_ID_rhel OR OS_ID_centos) set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) - set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) if(EXISTS /opt/rocm/hcc/lib/clang/10.0.0/include/immintrin.h) set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/10.0.0/include ) @@ -76,18 +78,24 @@ if( EXISTS /etc/redhat-release) else() error("cannot find immintrin.h") endif() + else() + #SLES + set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) + set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + endif() # External header includes included as system files target_include_directories( rocblas-bench SYSTEM PRIVATE - $ + $ $ $ $ $ $ - $ - ) + $ + ) if(LINK_BLIS) target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} ${BLIS_LIBRARY} ${OPENMP_LIBRARY} cblas lapack roc::rocblas ) else() @@ -104,7 +112,7 @@ else() $ $ ) - + if(LINK_BLIS) target_link_libraries( rocblas-bench PRIVATE ${Boost_LIBRARIES} ${BLIS_LIBRARY} cblas lapack roc::rocblas ) else() diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 4b3e211b6..3754d52a4 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -101,7 +101,8 @@ target_include_directories( rocblas-test set( BLIS_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/build/deps/blis/include/blis ) set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) -if( EXISTS /etc/redhat-release) +if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) + if( OS_ID_rhel OR OS_ID_centos) set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) @@ -112,18 +113,24 @@ if( EXISTS /etc/redhat-release) else() error("cannot find immintrin.h") endif() + else() + #SLES + set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) + set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) + set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) + endif() # External header includes included as system files target_include_directories( rocblas-test SYSTEM PRIVATE - $ + $ $ $ $ $ $ $ - $ + $ ) if(LINK_BLIS) @@ -142,7 +149,7 @@ else() $ $ ) - + if(LINK_BLIS) target_link_libraries( rocblas-test PRIVATE ${GTEST_LIBRARIES} ${Boost_LIBRARIES} ${BLIS_LIBRARY} cblas lapack roc::rocblas ) else() diff --git a/cmake/os-detection.cmake b/cmake/os-detection.cmake new file mode 100644 index 000000000..723d7f1a6 --- /dev/null +++ b/cmake/os-detection.cmake @@ -0,0 +1,24 @@ +# ######################################################################## +# Copyright 2019 Advanced Micro Devices, Inc. +# ######################################################################## + +function (get_os_id OS_ID) + set(_os_id "unknown") + if (EXISTS "/etc/os-release") + read_key("ID" _os_id) + endif() + set(${OS_ID} ${_os_id} PARENT_SCOPE) + set(${OS_ID}_${_os_id} TRUE PARENT_SCOPE) +endfunction() + +function (read_key KEYVALUE OUTPUT) + #finds the line with the keyvalue + file (STRINGS /etc/os-release _keyvalue_line REGEX "^${KEYVALUE}=") + + #remove keyvalue= + string (REGEX REPLACE "^${KEYVALUE}=\"?(.*)" "\\1" _output "${_keyvalue_line}") + + #remove trailing quote + string (REGEX REPLACE "\"$" "" _output "${_output}") + set(${OUTPUT} ${_output} PARENT_SCOPE) +endfunction () diff --git a/docker/dockerfile-build-sles b/docker/dockerfile-build-sles new file mode 100644 index 000000000..1f645f27f --- /dev/null +++ b/docker/dockerfile-build-sles @@ -0,0 +1,53 @@ +# Parameters related to building rocBLAS +ARG base_image + +FROM ${base_image} +LABEL maintainer="rocblas-maintainer@amd.com" + +ARG user_uid + +# Install dependent packages +# Dependencies: +# * hcc-config.cmake: pkg-config +# * tensile: python2.7, python-yaml +# * rocblas-test: gfortran, googletest +# * rocblas-bench: libboost-program-options-dev +# * libhsakmt.so: libnuma1 +RUN zypper -n update && zypper -n install\ + rock-dkms \ + sudo \ + ca-certificates \ + git \ + gcc-c++ \ + gcc-fortran \ + make \ + cmake \ + fftw3-devel \ + rpm-build \ + dpkg \ + python2-PyYAML \ + libboost_program_options1_66_0-devel\ + libcxxtools9 \ + libnuma1 \ + llvm7-devel \ + python3-pip + +RUN pip3 install wheel && pip3 install tox pyyaml + +# docker pipeline runs containers with particular uid +# create a jenkins user with this specific uid so it can use sudo priviledges +# Grant any member of sudo group password-less sudo privileges +RUN useradd --create-home -u ${user_uid} -o -G video --shell /bin/bash jenkins && \ + echo '%video ALL=(ALL) NOPASSWD:ALL' | tee /etc/sudoers.d/sudo-nopasswd && \ + chmod 400 /etc/sudoers.d/sudo-nopasswd + +ARG ROCBLAS_SRC_ROOT=/usr/local/src/rocBLAS + +# Clone rocblas repo +# Build client dependencies and install into /usr/local (LAPACK & GTEST) +RUN mkdir -p ${ROCBLAS_SRC_ROOT} && cd ${ROCBLAS_SRC_ROOT} && \ + git clone -b develop --depth=1 https://github.com/ROCmSoftwarePlatform/rocBLAS . && \ + mkdir -p build/deps && cd build/deps && \ + cmake -DBUILD_BOOST=OFF ${ROCBLAS_SRC_ROOT}/deps && \ + make -j $(nproc) install && \ + rm -rf ${ROCBLAS_SRC_ROOT} diff --git a/docker/dockerfile-install-sles b/docker/dockerfile-install-sles new file mode 100644 index 000000000..09e9b7f25 --- /dev/null +++ b/docker/dockerfile-install-sles @@ -0,0 +1,5 @@ +# Parameters related to building rocblas +ARG base_image + +FROM ${base_image} +LABEL maintainer="rocblas-maintainer@amd.com" diff --git a/install.sh b/install.sh index a72f1bf14..93cfff8fa 100755 --- a/install.sh +++ b/install.sh @@ -37,7 +37,7 @@ supported_distro( ) fi case "${ID}" in - ubuntu|centos|rhel|fedora) + ubuntu|centos|rhel|fedora|sles) true ;; *) printf "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora\n" @@ -104,6 +104,17 @@ install_dnf_packages( ) done } +install_zypper_packages( ) +{ + package_dependencies=("$@") + for package in "${package_dependencies[@]}"; do + if [[ $(rpm -q ${package} &> /dev/null; echo $? ) -ne 0 ]]; then + printf "\033[32mInstalling \033[33m${package}\033[32m from distro package manager\033[0m\n" + elevate_if_not_root zypper install -y ${package} + fi + done +} + # Take an array of packages as input, and delegate the work to the appropriate distro installer # prereq: ${ID} must be defined before calling # prereq: ${build_clients} must be defined before calling @@ -133,17 +144,21 @@ install_packages( ) "python34" "PyYAML" "python3*-PyYAML" "gcc-c++" "libcxx-devel" "libgomp" "hip_hcc" "rocm_smi64" "zlib-devel" ) + local library_dependencies_sles=( "make" "cmake" "python3-PyYAM" + "hip_hcc" "gcc-c++" "libcxxtools9" "rpm-build" ) if [[ "${build_cuda}" == true ]]; then # Ideally, this could be cuda-cublas-dev, but the package name has a version number in it library_dependencies_ubuntu+=( "cuda" ) library_dependencies_centos+=( "" ) # how to install cuda on centos? library_dependencies_fedora+=( "" ) # how to install cuda on fedora? + library_dependencies_sles+=( "" ) fi local client_dependencies_ubuntu=( "gfortran" "libboost-program-options-dev" "libomp-dev") local client_dependencies_centos=( "gcc-gfortran" "boost-devel" "libgomp") local client_dependencies_fedora=( "gcc-gfortran" "boost-devel" "libgomp") + local client_dependencies_sles=( "gcc-fortran" "boost-devel" "libboost_program_options1_66_0-devel" "libgomp1") case "${ID}" in ubuntu) @@ -174,6 +189,14 @@ install_packages( ) install_dnf_packages "${client_dependencies_fedora[@]}" fi ;; + + sles) + install_zypper_packages "${client_dependencies_sles[@]}" + + if [[ "${build_clients}" == true ]]; then + install_zypper_packages "${client_dependencies_sles[@]}" + fi + ;; *) echo "This script is currently supported on Ubuntu, CentOS, RHEL and Fedora" exit 2 @@ -347,13 +370,18 @@ fi if [[ "${cpu_ref_lib}" == blis ]] && [[ ! -f "${build_dir}/deps/blis/lib/libblis.so" ]]; then git submodule update --init cd extern/blis - if [[ -e "/etc/redhat-release" ]]; then - echo 'CentOS detected' - ./configure --prefix=../../${build_dir}/deps/blis --enable-threading=openmp auto - else - echo 'Ubuntu detected' - ./configure --prefix=../../${build_dir}/deps/blis --enable-threading=openmp CC=/opt/rocm/hcc/bin/clang auto - fi + case "${ID}" in + centos|rhel|sles) + ./configure --prefix=../../${build_dir}/deps/blis --enable-threading=openmp auto + ;; + ubuntu) + ./configure --prefix=../../${build_dir}/deps/blis --enable-threading=openmp CC=/opt/rocm/hcc/bin/clang auto + ;; + *) + echo "Unsupported OS for this script" + ./configure --prefix=../../${build_dir}/deps/blis --enable-threading=openmp auto + ;; + esac make install cd ../.. fi @@ -442,6 +470,9 @@ esac fedora) elevate_if_not_root dnf install rocblas-*.rpm ;; + sles) + elevate_if_not_root zypper --no-gpg-checks in -y install rocblas-*.rpm + ;; esac fi From 54713296100bb5c190a6bd31c206ffb588126a10 Mon Sep 17 00:00:00 2001 From: zaliu <35415350+zaliu@users.noreply.github.com> Date: Thu, 19 Sep 2019 08:24:36 -0700 Subject: [PATCH 05/31] BF16 replacement kernels (#705) * hot fix to restore loading of DGEMM replacement kernels * Revert "Switch to using separate D for gemm_ex benchmark calls (#667)" This reverts commit 402d231cb24502132b71fe3ec093188ce4f32a69. * bf16 kernels for gfx908 * use bf16 UseBeta=0 replacement kernels * update tensile_tag to use bf16 UseBeta=0 replacement kernels --- CMakeLists.txt | 4 +- clients/include/testing_gemm_ex.hpp | 12 +- .../testing_gemm_strided_batched_ex.hpp | 16 +- .../asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml | 1731 +++++++++++++++++ 4 files changed, 1747 insertions(+), 16 deletions(-) create mode 100644 library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml diff --git a/CMakeLists.txt b/CMakeLists.txt index 4b6ef0711..7768e1f8a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -146,7 +146,7 @@ get_os_id(OS_ID) message (STATUS "OS detected is ${OS_ID}") # Versioning via rocm-cmake -set ( VERSION_STRING "2.8.1" ) +set ( VERSION_STRING "2.8.2" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for convenience) @@ -188,7 +188,7 @@ if( BUILD_WITH_TENSILE ) else() # Use the virtual-env setup and download package from specified repot: set( tensile_fork "ROCmSoftwarePlatform" CACHE STRING "Tensile fork to use" ) - set( tensile_tag 3f69739ed495346e243beef51b1f3f3ebe6d09a5 CACHE STRING "Tensile tag to download" ) + set( tensile_tag 71c62d1d1f63ced18688c481d1399ee9f56abd4b CACHE STRING "Tensile tag to download" ) virtualenv_install("git+https://github.com/${tensile_fork}/Tensile.git@${tensile_tag}") message (STATUS "using GIT Tensile fork=${tensile_fork} from branch=${tensile_tag}") endif() diff --git a/clients/include/testing_gemm_ex.hpp b/clients/include/testing_gemm_ex.hpp index a22cd52df..50dcab7f5 100644 --- a/clients/include/testing_gemm_ex.hpp +++ b/clients/include/testing_gemm_ex.hpp @@ -589,9 +589,9 @@ void testing_gemm_ex(const Arguments& arg) dC, arg.c_type, ldc, - dD, - arg.d_type, - ldd, + dC, + arg.c_type, + ldc, arg.compute_type, algo, solution_index, @@ -618,9 +618,9 @@ void testing_gemm_ex(const Arguments& arg) dC, arg.c_type, ldc, - dD, - arg.d_type, - ldd, + dC, + arg.c_type, + ldc, arg.compute_type, algo, solution_index, diff --git a/clients/include/testing_gemm_strided_batched_ex.hpp b/clients/include/testing_gemm_strided_batched_ex.hpp index dc7dbd52f..283d3fa89 100644 --- a/clients/include/testing_gemm_strided_batched_ex.hpp +++ b/clients/include/testing_gemm_strided_batched_ex.hpp @@ -759,10 +759,10 @@ void testing_gemm_strided_batched_ex(const Arguments& arg) arg.c_type, ldc, stride_c, - dD, - arg.d_type, - ldd, - stride_d, + dC, + arg.c_type, + ldc, + stride_c, batch_count, arg.compute_type, algo, @@ -794,10 +794,10 @@ void testing_gemm_strided_batched_ex(const Arguments& arg) arg.c_type, ldc, stride_c, - dD, - arg.d_type, - ldd, - stride_d, + dC, + arg.c_type, + ldc, + stride_c, batch_count, arg.compute_type, algo, diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml new file mode 100644 index 000000000..d72dcf81e --- /dev/null +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml @@ -0,0 +1,1731 @@ +- {MinimumRequiredVersion: 4.12.0} +- arcturus +- gfx908 +- [Device 7380, Device 7388, Device 738c] +- AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] +- - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 32 + MacroTileA: 128 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 8 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 0 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT128x32x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 2] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 2048 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 128 + MacroTile1: 64 + MacroTileA: 128 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 8 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 8 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 1 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT128x64x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [8, 4] + ThreadTile0: 8 + ThreadTile1: 4 + ThreadTileA: 8 + ThreadTileB: 4 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 2560 + LdsOffsetA: 0 + LdsOffsetB: 512 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 32 + MacroTile1: 128 + MacroTileA: 32 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 2 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 2 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT32x128x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 8] + ThreadTile0: 2 + ThreadTile1: 8 + ThreadTileA: 2 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 3072 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 32 + NumLoadsA: 4 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 3 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x128x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 16 + LVPB: 16 + LdcEqualsLdd: true + LdsNumElements: 6144 + LdsOffsetA: 0 + LdsOffsetB: 4096 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 256 + MacroTile1: 128 + MacroTileA: 256 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 128 + NumGlobalWriteVectorsPerThread: 128 + NumLoadsA: 16 + NumLoadsB: 8 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 8 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 4 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT256x128x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [16, 8] + ThreadTile0: 16 + ThreadTile1: 8 + ThreadTileA: 16 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 8 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 4 + GlobalLoadVectorWidthB: 4 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 4 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 4 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 8] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 32 + LSPB: 32 + LVCA: 8 + LVCB: 8 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 14336 + LdsNumElementsAlignedA: 2048 + LdsNumElementsAlignedB: 4096 + LdsOffsetA: 0 + LdsOffsetA_Blk: 8192 + LdsOffsetB: 2048 + LdsOffsetB_Blk: 10240 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 64 + MacroTile1: 128 + MacroTileA: 64 + MacroTileB: 128 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 32 + NumGlobalWriteVectorsPerThread: 8 + NumLoadsA: 2 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 4 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 5 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x128x32_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [4, 8] + ThreadTile0: 4 + ThreadTile1: 8 + ThreadTileA: 4 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 4 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 1024 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 64 + MacroTile1: 16 + MacroTileA: 64 + MacroTileB: 16 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 16 + NumLoadsB: 4 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 16 + NumLoadsPerpendicularB: 4 + NumThreads: 64 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 6 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x16x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [8, 2] + ThreadTile0: 8 + ThreadTile1: 2 + ThreadTileA: 8 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 1 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 0 + AssertSummationElementMultiple: 1 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: false + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 16 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: false + FractionalLoad: false + GlobalLoadVectorWidthA: 1 + GlobalLoadVectorWidthB: 1 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 1 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 1 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [0, 0, 0] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Source + LSCA: 16 + LSCB: 16 + LSPA: 4 + LSPB: 4 + LVCA: 16 + LVCB: 16 + LVPA: 4 + LVPB: 4 + LdcEqualsLdd: true + LdsNumElements: 1280 + LdsOffsetA: 0 + LdsOffsetB: 256 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 16 + MacroTile0: 16 + MacroTile1: 64 + MacroTileA: 16 + MacroTileB: 64 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 16 + NumGlobalWriteVectorsPerThread: 16 + NumLoadsA: 4 + NumLoadsB: 16 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 4 + NumLoadsPerpendicularB: 16 + NumThreads: 64 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: false + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: false + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT16x64x16_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 8 + SubGroup1: 8 + SubGroupA: 8 + SubGroupB: 8 + SuppressNoLoadLoop: false + ThreadTile: [2, 8] + ThreadTile0: 2 + ThreadTile1: 8 + ThreadTileA: 2 + ThreadTileB: 8 + UnrollMemFence: false + UseSgprForGRO: 0 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 1 + WorkGroup: [8, 8, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 3 + - AggressivePerfMode: 1 + AssertFree0ElementMultiple: 8 + AssertFree1ElementMultiple: 1 + AssertMinApproxSize: 3 + AssertSummationElementMultiple: 8 + AssignedDerivedParameters: false + AssignedProblemIndependentDerivedParameters: true + BufferLoad: true + BufferStore: true + CheckDimOverflow: 0 + CheckTensorDimAsserts: false + DepthU: 32 + DirectToLds: false + DirectToLdsA: false + DirectToLdsB: false + DisableKernelPieces: 0 + EdgeType: ShiftPtr + ExpandPointerSwap: true + FractionalLoad: 0 + GlobalLoadVectorWidthA: 2 + GlobalLoadVectorWidthB: 2 + GlobalRead2A: true + GlobalRead2B: true + GlobalReadCoalesceGroupA: true + GlobalReadCoalesceGroupB: true + GlobalReadCoalesceVectorA: true + GlobalReadCoalesceVectorB: true + GlobalReadVectorWidth: 2 + GlobalSplitU: 1 + GlobalSplitUSummationAssignmentRoundRobin: true + GlobalSplitUWorkGroupMappingRoundRobin: false + GlobalWriteVectorWidth: 2 + GuaranteeNoPartialA: true + GuaranteeNoPartialB: true + ISA: [9, 0, 8] + InnerUnroll: 1 + InterleaveAlpha: 0 + KernelLanguage: Assembly + LSCA: 32 + LSCB: 32 + LSPA: 16 + LSPB: 16 + LVCA: 16 + LVCB: 16 + LVPA: 8 + LVPB: 8 + LdcEqualsLdd: true + LdsNumElements: 4096 + LdsNumElementsAlignedA: 1024 + LdsNumElementsAlignedB: 1024 + LdsOffsetA: 0 + LdsOffsetA_Blk: 2048 + LdsOffsetB: 1024 + LdsOffsetB_Blk: 3072 + LdsPadA: 0 + LdsPadB: 0 + LocalDotLayout: 1 + LocalRead2A: true + LocalRead2B: true + LocalSplitU: 1 + LocalWrite2A: true + LocalWrite2B: true + LocalWriteUseSgprA: false + LocalWriteUseSgprB: false + LoopDoWhile: false + LoopTail: true + LoopUnroll: 32 + MacroTile0: 32 + MacroTile1: 32 + MacroTileA: 32 + MacroTileB: 32 + MacroTileShapeMax: 64 + MacroTileShapeMin: 1 + MaxOccupancy: 40 + MaxVgprNumber: 256 + MinVgprNumber: 0 + NonTemporalA: 0 + NonTemporalB: 0 + NonTemporalC: 0 + NumElementsPerThread: 4 + NumGlobalWriteVectorsPerThread: 2 + NumLoadsA: 2 + NumLoadsB: 2 + NumLoadsCoalescedA: 1 + NumLoadsCoalescedB: 1 + NumLoadsPerpendicularA: 2 + NumLoadsPerpendicularB: 2 + NumThreads: 256 + OptNoLoadLoop: 1 + PackBatchDims: 0 + PackFreeDims: 1 + PackGranularity: 2 + PackedC0IdxChars: [I] + PackedC0IndicesX: [0] + PackedC1IdxChars: [J] + PackedC1IndicesX: [1] + PerformanceSyncLocation: -1 + PerformanceWaitCount: -1 + PerformanceWaitLocation: -1 + PersistentKernel: 0 + PrefetchAcrossPersistent: 0 + PrefetchGlobalRead: true + PrefetchLocalRead: true + ProblemType: + AssignedDerivedParameters: true + Batched: true + ComplexConjugateA: false + ComplexConjugateB: false + ComputeDataType: 0 + DataType: 7 + DestDataType: 7 + HighPrecisionAccumulate: true + Index0: 0 + Index01A: 0 + Index01B: 1 + Index1: 1 + IndexAssignmentsA: [3, 0, 2] + IndexAssignmentsB: [3, 1, 2] + IndexAssignmentsLD: [4, 5, 6, 7] + IndexUnroll: 3 + IndexUnrollA: 0 + IndexUnrollB: 0 + IndicesBatch: [2] + IndicesFree: [0, 1] + IndicesSummation: [3] + NumIndicesBatch: 1 + NumIndicesC: 3 + NumIndicesFree: 2 + NumIndicesLD: 4 + NumIndicesSummation: 1 + OperationType: GEMM + SetConstStrideA: [] + SilentHighPrecisionAccumulate: false + TLUA: false + TLUB: false + Tensor0: 0 + Tensor1: 1 + TileA: 0 + TileAwareSelection: false + TileB: 1 + TotalIndices: 4 + TransposeA: true + TransposeB: false + UseBeta: false + UseInitialStrides: false + ZeroPadA: [] + ZeroPadB: [] + ReplacementKernel: true + ScheduleGlobalRead: 1 + ScheduleIterAlg: 1 + ScheduleLocalWrite: 1 + SolutionIndex: 8 + SolutionNameMin: Cijk_Alik_Bljk_BH_MT32x32x32_SE_ + StaggerU: 32 + StaggerUMapping: 0 + StaggerUStride: 256 + SubGroup0: 16 + SubGroup1: 16 + SubGroupA: 16 + SubGroupB: 16 + SuppressNoLoadLoop: false + ThreadTile: [2, 2] + ThreadTile0: 2 + ThreadTile1: 2 + ThreadTileA: 2 + ThreadTileB: 2 + UnrollMemFence: false + UseSgprForGRO: 1 + Valid: true + VectorAtomicWidth: 1 + VectorStore: true + VectorWidth: 2 + WorkGroup: [16, 16, 1] + WorkGroupMapping: 8 + WorkGroupMappingType: B + _staggerStrideShift: 2 +- [2, 3, 0, 1] +- - - [959, 1024, 1, 1024] + - [3, 1069.96] + - - [960, 1023, 1, 1024] + - [3, 1077.18] + - - [960, 1024, 1, 1023] + - [3, 1072.31] + - - [960, 1024, 1, 1025] + - [3, 1085.42] + - - [960, 1025, 1, 1024] + - [2, 822.091] + - - [961, 1024, 1, 1024] + - [2, 816.019] + - - [1023, 1024, 1, 1024] + - [0, 870.244] + - - [1024, 1023, 1, 1024] + - [0, 872.396] + - - [1024, 1024, 1, 1023] + - [2, 871.035] + - - [1024, 1024, 1, 1025] + - [0, 869.969] + - - [1024, 1025, 1, 1024] + - [0, 872.002] + - - [1025, 1024, 1, 1024] + - [0, 870.588] + - - [2039, 2048, 1, 2048] + - [1, 1385.94] + - - [2040, 2047, 1, 2048] + - [1, 1386.89] + - - [2040, 2048, 1, 2047] + - [1, 1374.9] + - - [2040, 2048, 1, 2049] + - [1, 1392.64] + - - [2040, 2049, 1, 2048] + - [1, 1389.45] + - - [2041, 2048, 1, 2048] + - [1, 1386.7] + - - [2047, 2048, 1, 2048] + - [1, 1389.61] + - - [2048, 2047, 1, 2048] + - [1, 1392.28] + - - [2048, 2048, 1, 2047] + - [1, 1382.82] + - - [2048, 2048, 1, 2049] + - [1, 1396.64] + - - [2048, 2049, 1, 2048] + - [1, 1393.76] + - - [2049, 2048, 1, 2048] + - [1, 1390.61] + - - [2999, 3072, 1, 3072] + - [1, 1618.27] + - - [3000, 3071, 1, 3072] + - [1, 1619.77] + - - [3000, 3072, 1, 3071] + - [1, 1606.4] + - - [3000, 3072, 1, 3073] + - [1, 1613.78] + - - [3000, 3073, 1, 3072] + - [1, 1618.01] + - - [3001, 3072, 1, 3072] + - [1, 1619.37] + - - [3071, 3072, 1, 3072] + - [1, 1655.81] + - - [3072, 3071, 1, 3072] + - [1, 1655.8] + - - [3072, 3072, 1, 3071] + - [1, 1641.6] + - - [3072, 3072, 1, 3073] + - [1, 1651.49] + - - [3072, 3073, 1, 3072] + - [1, 1655.85] + - - [3073, 3072, 1, 3072] + - [1, 1656.62] + - - [4079, 4096, 1, 4096] + - [1, 1605.54] + - - [4080, 4095, 1, 4096] + - [1, 1605.01] + - - [4080, 4096, 1, 4095] + - [4, 1614.56] + - - [4080, 4096, 1, 4097] + - [1, 1624.91] + - - [4080, 4097, 1, 4096] + - [1, 1605.97] + - - [4081, 4096, 1, 4096] + - [1, 1603.2] + - - [4095, 4096, 1, 4096] + - [1, 1607.15] + - - [4096, 4095, 1, 4096] + - [1, 1606.4] + - - [4096, 4096, 1, 4095] + - [4, 1618.29] + - - [4096, 4096, 1, 4097] + - [1, 1626.69] + - - [4096, 4097, 1, 4096] + - [1, 1607.94] + - - [4097, 4096, 1, 4096] + - [1, 1606.33] + - - [960, 1024, 1, 1024] + - [5, 9231.82] + - - [1024, 1024, 1, 1024] + - [5, 7523.44] + - - [2040, 2048, 1, 2048] + - [5, 11226.5] + - - [2048, 2048, 1, 2048] + - [5, 14214.3] + - - [3000, 3072, 1, 3072] + - [5, 12725.0] + - - [3072, 3072, 1, 3072] + - [5, 12566.4] + - - [4080, 4096, 1, 4096] + - [5, 16766.1] + - - [4096, 4096, 1, 4096] + - [5, 16883.5] + - - [63, 1024, 1, 1024] + - [6, 112.839] + - - [64, 1023, 1, 1024] + - [6, 114.974] + - - [64, 1024, 1, 1023] + - [6, 116.736] + - - [64, 1024, 1, 1025] + - [6, 116.251] + - - [64, 1025, 1, 1024] + - [6, 114.274] + - - [65, 1024, 1, 1024] + - [7, 107.545] + - - [64, 1024, 1, 1024] + - [8, 1823.61] +- null From ab003e41921c57a83d0c2604a653261df958bf27 Mon Sep 17 00:00:00 2001 From: zaliu <35415350+zaliu@users.noreply.github.com> Date: Thu, 19 Sep 2019 10:54:49 -0700 Subject: [PATCH 06/31] Restore usebeta1 logic (#707) * restore UseBeta=1 logic for arcturus BF16 TN --- ....yaml => arcturus_Cijk_Alik_Bljk_BBH.yaml} | 334 +++++------------- 1 file changed, 81 insertions(+), 253 deletions(-) rename library/src/blas3/Tensile/Logic/asm_full/{arcturus_Cijk_Alik_Bljk_BH.yaml => arcturus_Cijk_Alik_Bljk_BBH.yaml} (85%) diff --git a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml similarity index 85% rename from library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml rename to library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml index d72dcf81e..79e819a61 100644 --- a/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BH.yaml +++ b/library/src/blas3/Tensile/Logic/asm_full/arcturus_Cijk_Alik_Bljk_BBH.yaml @@ -41,7 +41,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -184,7 +184,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -193,7 +193,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 0 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT128x32x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT128x32x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -356,7 +356,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -365,7 +365,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 1 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT128x64x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT128x64x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -528,7 +528,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -537,7 +537,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 2 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT32x128x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT32x128x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -700,7 +700,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -709,7 +709,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 3 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x128x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT64x128x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -872,7 +872,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -881,7 +881,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 4 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT256x128x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT256x128x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -1048,7 +1048,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -1057,7 +1057,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 5 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x128x32_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT64x128x32_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -1220,7 +1220,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -1229,7 +1229,7 @@ ScheduleIterAlg: 1 ScheduleLocalWrite: 1 SolutionIndex: 6 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT64x16x16_SE_ + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT64x16x16_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -1253,178 +1253,6 @@ WorkGroupMapping: 8 WorkGroupMappingType: B _staggerStrideShift: 3 - - AggressivePerfMode: 1 - AssertFree0ElementMultiple: 1 - AssertFree1ElementMultiple: 1 - AssertMinApproxSize: 0 - AssertSummationElementMultiple: 1 - AssignedDerivedParameters: false - AssignedProblemIndependentDerivedParameters: true - BufferLoad: false - BufferStore: true - CheckDimOverflow: 0 - CheckTensorDimAsserts: false - DepthU: 16 - DirectToLds: false - DirectToLdsA: false - DirectToLdsB: false - DisableKernelPieces: 0 - EdgeType: ShiftPtr - ExpandPointerSwap: false - FractionalLoad: false - GlobalLoadVectorWidthA: 1 - GlobalLoadVectorWidthB: 1 - GlobalRead2A: true - GlobalRead2B: true - GlobalReadCoalesceGroupA: true - GlobalReadCoalesceGroupB: true - GlobalReadCoalesceVectorA: true - GlobalReadCoalesceVectorB: true - GlobalReadVectorWidth: 1 - GlobalSplitU: 1 - GlobalSplitUSummationAssignmentRoundRobin: true - GlobalSplitUWorkGroupMappingRoundRobin: false - GlobalWriteVectorWidth: 1 - GuaranteeNoPartialA: true - GuaranteeNoPartialB: true - ISA: [0, 0, 0] - InnerUnroll: 1 - InterleaveAlpha: 0 - KernelLanguage: Source - LSCA: 16 - LSCB: 16 - LSPA: 4 - LSPB: 4 - LVCA: 16 - LVCB: 16 - LVPA: 4 - LVPB: 4 - LdcEqualsLdd: true - LdsNumElements: 1280 - LdsOffsetA: 0 - LdsOffsetB: 256 - LdsPadA: 0 - LdsPadB: 0 - LocalDotLayout: 1 - LocalRead2A: true - LocalRead2B: true - LocalSplitU: 1 - LocalWrite2A: true - LocalWrite2B: true - LocalWriteUseSgprA: false - LocalWriteUseSgprB: false - LoopDoWhile: false - LoopTail: true - LoopUnroll: 16 - MacroTile0: 16 - MacroTile1: 64 - MacroTileA: 16 - MacroTileB: 64 - MacroTileShapeMax: 64 - MacroTileShapeMin: 1 - MaxOccupancy: 40 - MaxVgprNumber: 256 - MinVgprNumber: 0 - NonTemporalA: 0 - NonTemporalB: 0 - NonTemporalC: 0 - NumElementsPerThread: 16 - NumGlobalWriteVectorsPerThread: 16 - NumLoadsA: 4 - NumLoadsB: 16 - NumLoadsCoalescedA: 1 - NumLoadsCoalescedB: 1 - NumLoadsPerpendicularA: 4 - NumLoadsPerpendicularB: 16 - NumThreads: 64 - OptNoLoadLoop: 1 - PackBatchDims: 0 - PackFreeDims: 1 - PackGranularity: 2 - PackedC0IdxChars: [I] - PackedC0IndicesX: [0] - PackedC1IdxChars: [J] - PackedC1IndicesX: [1] - PerformanceSyncLocation: -1 - PerformanceWaitCount: -1 - PerformanceWaitLocation: -1 - PersistentKernel: 0 - PrefetchAcrossPersistent: 0 - PrefetchGlobalRead: false - PrefetchLocalRead: true - ProblemType: - AssignedDerivedParameters: true - Batched: true - ComplexConjugateA: false - ComplexConjugateB: false - ComputeDataType: 0 - DataType: 7 - DestDataType: 7 - HighPrecisionAccumulate: true - Index0: 0 - Index01A: 0 - Index01B: 1 - Index1: 1 - IndexAssignmentsA: [3, 0, 2] - IndexAssignmentsB: [3, 1, 2] - IndexAssignmentsLD: [4, 5, 6, 7] - IndexUnroll: 3 - IndexUnrollA: 0 - IndexUnrollB: 0 - IndicesBatch: [2] - IndicesFree: [0, 1] - IndicesSummation: [3] - NumIndicesBatch: 1 - NumIndicesC: 3 - NumIndicesFree: 2 - NumIndicesLD: 4 - NumIndicesSummation: 1 - OperationType: GEMM - SetConstStrideA: [] - SilentHighPrecisionAccumulate: false - TLUA: false - TLUB: false - Tensor0: 0 - Tensor1: 1 - TileA: 0 - TileAwareSelection: false - TileB: 1 - TotalIndices: 4 - TransposeA: true - TransposeB: false - UseBeta: false - UseInitialStrides: false - ZeroPadA: [] - ZeroPadB: [] - ReplacementKernel: false - ScheduleGlobalRead: 1 - ScheduleIterAlg: 1 - ScheduleLocalWrite: 1 - SolutionIndex: 7 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT16x64x16_SE_ - StaggerU: 32 - StaggerUMapping: 0 - StaggerUStride: 256 - SubGroup0: 8 - SubGroup1: 8 - SubGroupA: 8 - SubGroupB: 8 - SuppressNoLoadLoop: false - ThreadTile: [2, 8] - ThreadTile0: 2 - ThreadTile1: 8 - ThreadTileA: 2 - ThreadTileB: 8 - UnrollMemFence: false - UseSgprForGRO: 0 - Valid: true - VectorAtomicWidth: 1 - VectorStore: true - VectorWidth: 1 - WorkGroup: [8, 8, 1] - WorkGroupMapping: 8 - WorkGroupMappingType: B - _staggerStrideShift: 3 - AggressivePerfMode: 1 AssertFree0ElementMultiple: 8 AssertFree1ElementMultiple: 1 @@ -1568,7 +1396,7 @@ TotalIndices: 4 TransposeA: true TransposeB: false - UseBeta: false + UseBeta: true UseInitialStrides: false ZeroPadA: [] ZeroPadB: [] @@ -1576,8 +1404,8 @@ ScheduleGlobalRead: 1 ScheduleIterAlg: 1 ScheduleLocalWrite: 1 - SolutionIndex: 8 - SolutionNameMin: Cijk_Alik_Bljk_BH_MT32x32x32_SE_ + SolutionIndex: 7 + SolutionNameMin: Cijk_Alik_Bljk_BBH_MT32x32x32_SE_ StaggerU: 32 StaggerUMapping: 0 StaggerUStride: 256 @@ -1603,129 +1431,129 @@ _staggerStrideShift: 2 - [2, 3, 0, 1] - - - [959, 1024, 1, 1024] - - [3, 1069.96] + - [3, 1055.49] - - [960, 1023, 1, 1024] - - [3, 1077.18] + - [3, 1071.67] - - [960, 1024, 1, 1023] - - [3, 1072.31] + - [3, 1069.84] - - [960, 1024, 1, 1025] - - [3, 1085.42] + - [3, 1077.07] - - [960, 1025, 1, 1024] - - [2, 822.091] + - [2, 822.52] - - [961, 1024, 1, 1024] - - [2, 816.019] + - [2, 813.489] - - [1023, 1024, 1, 1024] - - [0, 870.244] + - [0, 865.973] - - [1024, 1023, 1, 1024] - - [0, 872.396] + - [0, 865.805] - - [1024, 1024, 1, 1023] - - [2, 871.035] + - [2, 866.42] - - [1024, 1024, 1, 1025] - - [0, 869.969] + - [0, 865.43] - - [1024, 1025, 1, 1024] - - [0, 872.002] + - [0, 866.658] - - [1025, 1024, 1, 1024] - - [0, 870.588] + - [0, 866.155] - - [2039, 2048, 1, 2048] - - [1, 1385.94] + - [1, 1376.9] - - [2040, 2047, 1, 2048] - - [1, 1386.89] + - [1, 1378.17] - - [2040, 2048, 1, 2047] - - [1, 1374.9] + - [1, 1372.13] - - [2040, 2048, 1, 2049] - - [1, 1392.64] + - [1, 1387.02] - - [2040, 2049, 1, 2048] - - [1, 1389.45] + - [1, 1380.06] - - [2041, 2048, 1, 2048] - - [1, 1386.7] + - [1, 1379.05] - - [2047, 2048, 1, 2048] - - [1, 1389.61] + - [1, 1384.61] - - [2048, 2047, 1, 2048] - - [1, 1392.28] + - [1, 1383.95] - - [2048, 2048, 1, 2047] - - [1, 1382.82] + - [1, 1374.29] - - [2048, 2048, 1, 2049] - - [1, 1396.64] + - [1, 1391.34] - - [2048, 2049, 1, 2048] - - [1, 1393.76] + - [1, 1384.69] - - [2049, 2048, 1, 2048] - - [1, 1390.61] + - [1, 1383.0] - - [2999, 3072, 1, 3072] - - [1, 1618.27] + - [1, 1614.3] - - [3000, 3071, 1, 3072] - - [1, 1619.77] + - [1, 1614.82] - - [3000, 3072, 1, 3071] - - [1, 1606.4] + - [1, 1600.23] - - [3000, 3072, 1, 3073] - - [1, 1613.78] + - [1, 1608.61] - - [3000, 3073, 1, 3072] - - [1, 1618.01] + - [1, 1614.31] - - [3001, 3072, 1, 3072] - - [1, 1619.37] + - [1, 1616.45] - - [3071, 3072, 1, 3072] - - [1, 1655.81] + - [1, 1651.37] - - [3072, 3071, 1, 3072] - - [1, 1655.8] + - [1, 1652.19] - - [3072, 3072, 1, 3071] - - [1, 1641.6] + - [1, 1636.33] - - [3072, 3072, 1, 3073] - - [1, 1651.49] + - [1, 1645.1] - - [3072, 3073, 1, 3072] - - [1, 1655.85] + - [1, 1651.42] - - [3073, 3072, 1, 3072] - - [1, 1656.62] + - [1, 1652.4] - - [4079, 4096, 1, 4096] - - [1, 1605.54] + - [1, 1600.96] - - [4080, 4095, 1, 4096] - - [1, 1605.01] + - [1, 1600.47] - - [4080, 4096, 1, 4095] - - [4, 1614.56] + - [4, 1604.11] - - [4080, 4096, 1, 4097] - - [1, 1624.91] + - [1, 1621.16] - - [4080, 4097, 1, 4096] - - [1, 1605.97] + - [1, 1602.61] - - [4081, 4096, 1, 4096] - - [1, 1603.2] + - [1, 1599.52] - - [4095, 4096, 1, 4096] - - [1, 1607.15] + - [1, 1604.71] - - [4096, 4095, 1, 4096] - - [1, 1606.4] + - [1, 1604.32] - - [4096, 4096, 1, 4095] - - [4, 1618.29] + - [4, 1609.64] - - [4096, 4096, 1, 4097] - - [1, 1626.69] + - [1, 1625.9] - - [4096, 4097, 1, 4096] - - [1, 1607.94] + - [1, 1605.8] - - [4097, 4096, 1, 4096] - - [1, 1606.33] + - [1, 1603.85] - - [960, 1024, 1, 1024] - - [5, 9231.82] + - [5, 9007.14] - - [1024, 1024, 1, 1024] - - [5, 7523.44] + - [5, 7604.43] - - [2040, 2048, 1, 2048] - - [5, 11226.5] + - [5, 11066.3] - - [2048, 2048, 1, 2048] - - [5, 14214.3] + - [5, 14117.1] - - [3000, 3072, 1, 3072] - - [5, 12725.0] + - [5, 12651.4] - - [3072, 3072, 1, 3072] - - [5, 12566.4] + - [5, 12416.1] - - [4080, 4096, 1, 4096] - - [5, 16766.1] + - [5, 16848.0] - - [4096, 4096, 1, 4096] - - [5, 16883.5] + - [5, 16911.1] - - [63, 1024, 1, 1024] - - [6, 112.839] + - [6, 111.559] - - [64, 1023, 1, 1024] - - [6, 114.974] + - [6, 113.219] - - [64, 1024, 1, 1023] - - [6, 116.736] + - [6, 114.051] - - [64, 1024, 1, 1025] - - [6, 116.251] + - [6, 114.243] - - [64, 1025, 1, 1024] - - [6, 114.274] + - [6, 112.937] - - [65, 1024, 1, 1024] - - [7, 107.545] + - [6, 109.143] - - [64, 1024, 1, 1024] - - [8, 1823.61] + - [7, 1762.34] - null From 45dce72985fadcc9e9ca7c52a64311f26b506cfb Mon Sep 17 00:00:00 2001 From: saadrahim <44449863+saadrahim@users.noreply.github.com> Date: Thu, 19 Sep 2019 17:36:26 -0600 Subject: [PATCH 07/31] Supporting clang10 for SLES (#708) --- clients/benchmarks/CMakeLists.txt | 17 ++++++++--------- clients/gtest/CMakeLists.txt | 17 ++++++++--------- 2 files changed, 16 insertions(+), 18 deletions(-) diff --git a/clients/benchmarks/CMakeLists.txt b/clients/benchmarks/CMakeLists.txt index 7d1cddbca..801775bef 100644 --- a/clients/benchmarks/CMakeLists.txt +++ b/clients/benchmarks/CMakeLists.txt @@ -67,9 +67,14 @@ set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) - if( OS_ID_rhel OR OS_ID_centos) - set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) - set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + if( OS_ID_rhel OR OS_ID_centos) + set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) + set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + else() + #SLES + set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) + set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) + endif() if(EXISTS /opt/rocm/hcc/lib/clang/10.0.0/include/immintrin.h) set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/10.0.0/include ) @@ -78,12 +83,6 @@ if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) else() error("cannot find immintrin.h") endif() - else() - #SLES - set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) - set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) - set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) - endif() # External header includes included as system files target_include_directories( rocblas-bench diff --git a/clients/gtest/CMakeLists.txt b/clients/gtest/CMakeLists.txt index 3754d52a4..d1ba7b2d0 100644 --- a/clients/gtest/CMakeLists.txt +++ b/clients/gtest/CMakeLists.txt @@ -102,9 +102,14 @@ set( BLIS_INCLUDE_DIR ${CMAKE_SOURCE_DIR}/build/deps/blis/include/blis ) set( BLIS_LIBRARY ${CMAKE_SOURCE_DIR}/build/deps/blis/lib/libblis.so ) if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) - if( OS_ID_rhel OR OS_ID_centos) - set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) - set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + if( OS_ID_rhel OR OS_ID_centos) + set( OPENMP_INCLUDE_DIR /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/include ) + set( OPENMP_LIBRARY /opt/rh/devtoolset-7/root/usr/lib/gcc/x86_64-redhat-linux/7/libgomp.so ) + else() + #SLES + set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) + set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) + endif() if(EXISTS /opt/rocm/hcc/lib/clang/10.0.0/include/immintrin.h) set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/10.0.0/include ) @@ -113,12 +118,6 @@ if( OS_ID_rhel OR OS_ID_centos OR OS_ID_sles) else() error("cannot find immintrin.h") endif() - else() - #SLES - set( OPENMP_INCLUDE_DIR /usr/lib64/gcc/x86_64-suse-linux/7/include/ ) - set( OPENMP_LIBRARY /usr/lib64/gcc/x86_64-suse-linux/7/libgomp.so ) - set( CLANG_INCLUDE_DIR /opt/rocm/hcc/lib/clang/9.0.0/include ) - endif() # External header includes included as system files target_include_directories( rocblas-test From f19f5ae2b614a6ce06b04975ddbaa0a80354b982 Mon Sep 17 00:00:00 2001 From: Torre Zuk <42548444+TorreZuk@users.noreply.github.com> Date: Fri, 4 Oct 2019 14:37:32 -0600 Subject: [PATCH 08/31] Batched syr (#727) * adding syr batched and strided batched * rocblas_stride, reusable template pattern work * WIP * fixes testing and format * restore dependency * adds minimal batches & bad arg * fix bad arg testing * constify ptrs, spelling * add alpha vector support, PR feedback --- clients/gtest/rocblas_gtest.yaml | 2 +- clients/gtest/syr_gtest.cpp | 116 ++++++-- clients/gtest/syr_gtest.yaml | 75 ++++- clients/include/rocblas.hpp | 38 +++ clients/include/rocblas_template.yaml | 4 + clients/include/testing_syr.hpp | 2 +- clients/include/testing_syr_batched.hpp | 268 ++++++++++++++++++ .../include/testing_syr_strided_batched.hpp | 259 +++++++++++++++++ library/include/rocblas-functions.h | 163 +++++++++-- library/src/CMakeLists.txt | 2 + library/src/blas2/rocblas_syr.cpp | 89 +----- library/src/blas2/rocblas_syr.hpp | 80 ++++++ library/src/blas2/rocblas_syr_batched.cpp | 142 ++++++++++ library/src/blas2/rocblas_syr_batched.hpp | 97 +++++++ .../src/blas2/rocblas_syr_strided_batched.cpp | 168 +++++++++++ .../src/blas2/rocblas_syr_strided_batched.hpp | 105 +++++++ 16 files changed, 1489 insertions(+), 121 deletions(-) create mode 100644 clients/include/testing_syr_batched.hpp create mode 100644 clients/include/testing_syr_strided_batched.hpp create mode 100644 library/src/blas2/rocblas_syr.hpp create mode 100644 library/src/blas2/rocblas_syr_batched.cpp create mode 100644 library/src/blas2/rocblas_syr_batched.hpp create mode 100644 library/src/blas2/rocblas_syr_strided_batched.cpp create mode 100644 library/src/blas2/rocblas_syr_strided_batched.hpp diff --git a/clients/gtest/rocblas_gtest.yaml b/clients/gtest/rocblas_gtest.yaml index 3cbf2a1ac..67770c925 100644 --- a/clients/gtest/rocblas_gtest.yaml +++ b/clients/gtest/rocblas_gtest.yaml @@ -3,7 +3,7 @@ include: gemv_gtest.yaml include: gemv_batched_gtest.yaml include: gemv_strided_batched_gtest.yaml include: gemm_gtest.yaml -include :gemm_batched_gtest.yaml +include: gemm_batched_gtest.yaml include: gemm_strided_batched_gtest.yaml include: symv_gtest.yaml include: syr_gtest.yaml diff --git a/clients/gtest/syr_gtest.cpp b/clients/gtest/syr_gtest.cpp index 5f80fa3ce..7b9b38acd 100644 --- a/clients/gtest/syr_gtest.cpp +++ b/clients/gtest/syr_gtest.cpp @@ -7,6 +7,8 @@ #include "rocblas_datatype2string.hpp" #include "rocblas_test.hpp" #include "testing_syr.hpp" +#include "testing_syr_batched.hpp" +#include "testing_syr_strided_batched.hpp" #include "type_dispatch.hpp" #include #include @@ -14,7 +16,73 @@ namespace { - // By default, this test does not apply to any types. + // possible test cases + enum syr_test_type + { + SYR, + SYR_BATCHED, + SYR_STRIDED_BATCHED, + }; + + //syr test template + template