Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}")

option(CK_TIME_KERNEL "Turning off will disable kernel timing globally" ON)

## OpenMP
if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
# workaround issue hipcc in rocm3.5 cannot find openmp
Expand Down Expand Up @@ -72,8 +74,9 @@ message(STATUS "Build with HIP ${HIP_VERSION}")


rocm_create_package(
NAME CK-${CK_BACKEND}
NAME composablekernel
DESCRIPTION "High Performance Composable Kernel for AMD GPUs"
MAINTAINER "MIOpen Kernels Dev Team <dl.MIOpen@amd.com>"
LDCONFIG
)

Expand Down Expand Up @@ -226,15 +229,14 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/lib)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/bin)

configure_file("${PROJECT_SOURCE_DIR}/include/ck/hip_version.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/hip_version.hpp")
configure_file("${PROJECT_SOURCE_DIR}/include/ck/options.hpp.in" "${PROJECT_BINARY_DIR}/include/ck/options.hpp")

include_directories(BEFORE
${PROJECT_SOURCE_DIR}/include
${PROJECT_BINARY_DIR}/include
${PROJECT_SOURCE_DIR}/library/include
)

include(googletest)

SET(BUILD_DEV ON CACHE BOOL "BUILD_DEV")
if(BUILD_DEV)
Expand All @@ -247,3 +249,25 @@ add_subdirectory(library)
add_subdirectory(example)
add_subdirectory(test)
add_subdirectory(profiler)

#Create an interface target for the include only files and call it "composablekernels"
include(CMakePackageConfigHelpers)

set(version 1.0.0)
write_basic_package_version_file(
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
VERSION "${version}"
COMPATIBILITY AnyNewerVersion
)

configure_package_config_file(${CMAKE_CURRENT_SOURCE_DIR}/Config.cmake.in
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
NO_CHECK_REQUIRED_COMPONENTS_MACRO
)

install(FILES
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfig.cmake"
"${CMAKE_CURRENT_BINARY_DIR}/composable_kernelConfigVersion.cmake"
DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/composable_kernel
)
11 changes: 11 additions & 0 deletions Config.cmake.in
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
@PACKAGE_INIT@

set(_composable_kernel_supported_components device_operations host_tensor)

foreach(_comp ${composable_kernel_FIND_COMPONENTS})
if(NOT _comp IN_LIST _composable_kernel_supported_components)
set(composable_kernel_FOUND False)
set(composable_kernel_NOT_FOUND_MESSAGE "Unsupported component: ${_comp}")
endif()
include("${CMAKE_CURRENT_LIST_DIR}/composable_kernel${_comp}Targets.cmake")
endforeach()
15 changes: 1 addition & 14 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -11,32 +11,23 @@ ARG DEB_ROCM_REPO=http://repo.radeon.com/rocm/apt/.apt_$ROCMVERSION/
RUN apt-get update
RUN apt-get install -y wget gnupg
RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | apt-key add -
RUN if ! [ -z $OSDB_BKC_VERSION ]; then \
echo "Using BKC VERISION: $OSDB_BKC_VERSION";\
sh -c "echo deb [arch=amd64 trusted=yes] http://compute-artifactory.amd.com/artifactory/list/rocm-osdb-deb/ compute-rocm-dkms-no-npi-hipclang ${OSDB_BKC_VERSION} > /etc/apt/sources.list.d/rocm.list" ;\
cat /etc/apt/sources.list.d/rocm.list;\
else \
sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list" ;\
fi
RUN sh -c "echo deb [arch=amd64] $DEB_ROCM_REPO ubuntu main > /etc/apt/sources.list.d/rocm.list"
RUN wget --no-check-certificate -qO - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | apt-key add -
RUN sh -c "echo deb https://apt.kitware.com/ubuntu/ bionic main | tee -a /etc/apt/sources.list"

# ADD requirements.txt requirements.txt
# Install dependencies
RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
apt-utils \
sshpass \
build-essential \
cmake-data=3.15.1-0kitware1 \
cmake=3.15.1-0kitware1 \
curl \
doxygen \
g++ \
gdb \
git \
hip-rocclr \
jq \
lcov \
libelf-dev \
libncurses5-dev \
libnuma-dev \
Expand All @@ -62,8 +53,6 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
apt-get clean && \
rm -rf /var/lib/apt/lists/*

# RUN pip3 install --default-timeout=100000 -r requirements.txt

# Setup ubsan environment to printstacktrace
RUN ln -s /usr/bin/llvm-symbolizer-3.8 /usr/local/bin/llvm-symbolizer
ENV UBSAN_OPTIONS=print_stacktrace=1
Expand Down Expand Up @@ -92,5 +81,3 @@ ADD rbuild.ini /rbuild.ini
ADD dev-requirements.txt dev-requirements.txt
RUN rbuild prepare -s develop -d $PREFIX
RUN groupadd -f render
# RUN cget install -f min-requirements.txt
# RUN CXXFLAGS='-isystem $PREFIX/include' cget install -f ./mlir-requirements.txt
19 changes: 18 additions & 1 deletion Jenkinsfile
Original file line number Diff line number Diff line change
Expand Up @@ -320,7 +320,7 @@ pipeline {
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " -DBUILD_DEV=On """
setup_args = """ -D CMAKE_CXX_FLAGS=" --offload-arch=gfx900 --offload-arch=gfx906 --offload-arch=gfx908 --offload-arch=gfx90a -O3 " -DBUILD_DEV=On """
Comment thread
aosewski marked this conversation as resolved.
}
steps{
buildHipClangJobAndReboot(setup_args:setup_args, config_targets: "check", no_reboot:true, build_type: 'Release')
Expand All @@ -341,6 +341,23 @@ pipeline {

}
}
stage("Client App")
{
parallel
{
stage("Run Client App")
{
agent{ label rocmnode("gfx908")}
environment{
setup_args = """ -D -DBUILD_DEV=Off -DCMAKE_INSTALL_PREFIX=../install CMAKE_CXX_FLAGS="--offload-arch=gfx908 -O3 " """
execute_args = """ cd ../test/client_app && rm -rf build && mkdir build && cd build && cmake -DCMAKE_PREFIX_PATH="${env.WORKSPACE}/install;/opt/rocm" .. && make """
}
steps{
buildHipClangJobAndReboot(setup_args: setup_args, config_targets: "install", no_reboot:true, build_type: 'Release', execute_cmd: execute_args, prefixpath: '/usr/local')
}
}
}
}
stage("Performance Tests")
{
parallel
Expand Down
10 changes: 10 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -43,3 +43,13 @@ Instructions for running each individual examples are under ```example/```
make -j ckProfiler
```
Instructions for running ckProfiler are under ```profiler/```


## Caveat
### Kernel Timing and Verification
CK's own kernel timer will warn up kernel once, and then run it multiple times
to get average kernel time. For some kernels that use atomic add, this will cause
output buffer to be accumulated multiple times, causing verfication failure.
To work around it, do not use CK's own timer and do verification at the same time.
CK's own timer and verification in each example and ckProfiler can be enabled or
disabled from command line.
3 changes: 2 additions & 1 deletion cmake/googletest.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ list(APPEND GTEST_CMAKE_CXX_FLAGS
-Wno-zero-as-null-pointer-constant
-Wno-unused-member-function
-Wno-comma
-Wno-old-style-cast
)
message(STATUS "Suppressing googltest warnings with flags: ${GTEST_CMAKE_CXX_FLAGS}")

Expand All @@ -35,4 +36,4 @@ FetchContent_MakeAvailable(googletest)
target_compile_options(gtest PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gtest_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
target_compile_options(gmock PRIVATE ${GTEST_CMAKE_CXX_FLAGS})

target_compile_options(gmock_main PRIVATE ${GTEST_CMAKE_CXX_FLAGS})
14 changes: 7 additions & 7 deletions example/01_gemm/gemm_xdl_bf16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::

int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;

// GEMM shape
ck::index_t M = 3840;
Expand All @@ -105,13 +105,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);

M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
Expand All @@ -125,7 +125,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
Expand Down Expand Up @@ -198,7 +198,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}

float ave_time = invoker.Run(argument, nrepeat);
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
Expand Down
14 changes: 7 additions & 7 deletions example/01_gemm/gemm_xdl_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::

int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;

// GEMM shape
ck::index_t M = 3840;
Expand All @@ -73,13 +73,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);

M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
Expand All @@ -93,7 +93,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
Expand Down Expand Up @@ -171,7 +171,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}

float ave_time = invoker.Run(argument, nrepeat);
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
Expand Down
14 changes: 7 additions & 7 deletions example/01_gemm/gemm_xdl_int8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,9 +83,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::

int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;

// GEMM shape
ck::index_t M = 3840;
Expand All @@ -100,13 +100,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);

M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
Expand All @@ -120,7 +120,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
Expand Down Expand Up @@ -194,7 +194,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}

float ave_time = invoker.Run(argument, nrepeat);
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
Expand Down
16 changes: 8 additions & 8 deletions example/02_gemm_alpha_beta/gemm_xdl_alpha_beta.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -86,9 +86,9 @@ using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemmBias2D<AD

int main(int argc, char* argv[])
{
bool do_verification = 0;
int init_method = 0;
int nrepeat = 5;
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;

// GEMM shape
ck::index_t M = 3840;
Expand All @@ -106,13 +106,13 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 6)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);

alpha = std::stof(argv[4]);
beta = std::stof(argv[5]);
Expand All @@ -121,7 +121,7 @@ int main(int argc, char* argv[])
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
nrepeat = std::stoi(argv[3]);
time_kernel = std::stoi(argv[3]);

M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
Expand All @@ -138,7 +138,7 @@ int main(int argc, char* argv[])
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg3: time kernel (0=n0, 1=yes)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC, alpha, beta\n");
exit(0);
}
Expand Down Expand Up @@ -216,7 +216,7 @@ int main(int argc, char* argv[])
"not support this GEMM problem");
}

float ave_time = invoker.Run(argument, nrepeat);
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});

std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
Expand Down
Loading