Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 14 additions & 21 deletions cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -859,6 +859,10 @@ set(ONNXRUNTIME_PROVIDER_NAMES cpu)
set(ORT_PROVIDER_FLAGS)

if (onnxruntime_USE_CUDA)
include(cuda_configuration)
setup_cuda_compiler()
setup_cuda_architectures()

enable_language(CUDA)
message( STATUS "CMAKE_CUDA_COMPILER_VERSION: ${CMAKE_CUDA_COMPILER_VERSION}")

Expand All @@ -878,9 +882,6 @@ if (onnxruntime_USE_CUDA)
set(onnxruntime_USE_FLASH_ATTENTION OFF)
endif()

if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4")
endif()
if (WIN32)
message( STATUS "Lean Attention unsupported in Windows")
set(onnxruntime_USE_LEAN_ATTENTION OFF)
Expand Down Expand Up @@ -1590,25 +1591,17 @@ if (onnxruntime_USE_CUDA)
file(TO_CMAKE_PATH CUDAToolkit_ROOT ${onnxruntime_CUDA_HOME})
endif()
find_package(CUDAToolkit REQUIRED)
if (NOT CMAKE_CUDA_ARCHITECTURES)
# Note that we generate SASS+PTX code for specified cuda architectures by assigning "xy"
# To add SASS only, assign "xy-real"
# To add PTX only, assign "xy-virtual"
if (CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu")
# Support for Jetson/Tegra ARM devices
set(CMAKE_CUDA_ARCHITECTURES "53-real;62-real;72-real;87") # TX1/Nano, TX2, Xavier, Orin
else()
if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
# 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version.
set(CMAKE_CUDA_ARCHITECTURES "37-real;50-real;52-real;60-real;70-real;75-real;80-real;86-real;89")
elseif (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
set(CMAKE_CUDA_ARCHITECTURES "52-real;60-real;70-real;75-real;80-real;86-real;89-real;90")
else()
# https://cmake.org/cmake/help/latest/prop_tgt/CUDA_ARCHITECTURES.html
set(CMAKE_CUDA_ARCHITECTURES "all") # Supporting all, including latest Blackwell B series & RTX 50 series
endif()
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 11.8)
add_definitions("-DENABLE_FP8")
message(STATUS "CUDA Toolkit version is greater or equal than 11.8, enable -DENABLE_FP8 flag")
endif()

if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
add_definitions("-DENABLE_FP4")
message(STATUS "CUDA Toolkit version is greater or equal than 12.8, enable -DENABLE_FP4 flag")
endif()

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -Xfatbin=-compress-all")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --Werror default-stream-launch")
Expand Down
172 changes: 172 additions & 0 deletions cmake/external/cuda_configuration.cmake
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
#
# SPDX-FileCopyrightText: Copyright (c) 1993-2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#

macro(setup_cuda_compiler)
  # Locate the CUDA compiler and determine its version BEFORE enabling the
  # CUDA language. check_language(CUDA) clears CMAKE_CUDA_HOST_COMPILER when
  # CMAKE_CUDA_COMPILER is not set, so back up the host compiler and restore
  # it after the first probe.
  include(CheckLanguage)
  if(NOT CMAKE_CUDA_COMPILER AND CMAKE_CUDA_HOST_COMPILER)
    set(CMAKE_CUDA_HOST_COMPILER_BACKUP ${CMAKE_CUDA_HOST_COMPILER})
  endif()
  check_language(CUDA)
  if(CMAKE_CUDA_HOST_COMPILER_BACKUP)
    # Restore the host compiler that check_language() clobbered and re-probe.
    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CUDA_HOST_COMPILER_BACKUP})
    check_language(CUDA)
    # This is a macro: unset temporaries so they do not leak into the caller.
    unset(CMAKE_CUDA_HOST_COMPILER_BACKUP)
  endif()
  if(NOT CMAKE_CUDA_COMPILER)
    message(FATAL_ERROR "No CUDA compiler found")
  endif()
  message(STATUS "CUDA compiler: ${CMAKE_CUDA_COMPILER}")

  # Extract the "Vmajor.minor.patch" token from `nvcc --version` with CMake's
  # own regex engine. This works identically on all platforms and removes the
  # previous hard dependency on bash/grep/cut for the Linux path. Note the
  # dots are escaped; the old grep pattern `V[0-9]+.[0-9]+.[0-9]+` matched any
  # character in place of the dots.
  execute_process(
    COMMAND ${CMAKE_CUDA_COMPILER} --version
    OUTPUT_VARIABLE _cuda_version_output
    RESULT_VARIABLE _cuda_version_result
    OUTPUT_STRIP_TRAILING_WHITESPACE)
  if(_cuda_version_result EQUAL 0 AND _cuda_version_output MATCHES "V([0-9]+\\.[0-9]+\\.[0-9]+)")
    set(CMAKE_CUDA_COMPILER_VERSION "${CMAKE_MATCH_1}")
  else()
    message(FATAL_ERROR "Failed to determine CUDA version")
  endif()
  unset(_cuda_version_output)
  unset(_cuda_version_result)

  # Enforce the minimum CUDA toolkit version supported by this build.
  set(CUDA_REQUIRED_VERSION "11.4")
  if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS CUDA_REQUIRED_VERSION)
    message(FATAL_ERROR "CUDA version ${CMAKE_CUDA_COMPILER_VERSION} must be at least ${CUDA_REQUIRED_VERSION}")
  endif()
endmacro()

macro(setup_cuda_architectures)
  # cmake-format: off
  # Initialize and normalize CMAKE_CUDA_ARCHITECTURES before enabling CUDA.
  # Special values:
  # (1) `native` is resolved to HIGHEST available architecture. Fallback to `all` if detection failed.
  # (2) `all` / `all-major` / unset is resolved to a default set of architectures we optimized and compiler supports.
  # Numerical architectures:
  # * For `-virtual` architectures, the last one is kept as it is, and the others are ignored.
  # * `-real` suffix is automatically added for other cases.
  # * Always use accelerated (`-a` suffix) target for supported real architectures.
  # cmake-format: on

  if(CMAKE_CUDA_ARCHITECTURES STREQUAL "native")
    # Build a small helper binary that prints the best compute capability of
    # the GPUs visible at configure time.
    # NOTE(review): CMAKE_SOURCE_DIR points at the top of the whole build tree;
    # this breaks if the project is consumed as a subproject — confirm intended.
    set(OUTPUTFILE ${PROJECT_BINARY_DIR}/detect_cuda_arch)
    set(CUDAFILE ${CMAKE_SOURCE_DIR}/utils/detect_cuda_arch.cu)
    message(VERBOSE "Detecting native CUDA compute capability")
    execute_process(
      COMMAND ${CMAKE_CUDA_COMPILER} -lcuda ${CUDAFILE} -o ${OUTPUTFILE}
      RESULT_VARIABLE CUDA_COMPILE_RETURN_CODE)
    if(NOT CUDA_COMPILE_RETURN_CODE EQUAL 0)
      # Compilation failed: fall back to the optimized defaults instead of
      # attempting to execute a binary that was never produced.
      message(WARNING "Building the CUDA compute capability detector failed, "
                      "compiling for all optimized architectures")
      unset(CMAKE_CUDA_ARCHITECTURES)
    else()
      execute_process(
        COMMAND ${OUTPUTFILE}
        RESULT_VARIABLE CUDA_RETURN_CODE
        OUTPUT_VARIABLE CUDA_ARCH_OUTPUT)
      if(NOT ${CUDA_RETURN_CODE} EQUAL 0)
        message(WARNING "Detecting native CUDA compute capability - fail")
        message(WARNING "CUDA compute capability detection failed, compiling for all optimized architectures")
        unset(CMAKE_CUDA_ARCHITECTURES)
      else()
        message(STATUS "Detecting native CUDA compute capability - done")
        set(CMAKE_CUDA_ARCHITECTURES "${CUDA_ARCH_OUTPUT}")
      endif()
    endif()
  elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "all")
    unset(CMAKE_CUDA_ARCHITECTURES)
    message(STATUS "Setting CMAKE_CUDA_ARCHITECTURES to all enables a list of architectures OnnxRuntime optimized for, "
                   "not all architectures CUDA compiler supports.")
  elseif(CMAKE_CUDA_ARCHITECTURES STREQUAL "all-major")
    unset(CMAKE_CUDA_ARCHITECTURES)
    message(
      STATUS "Setting CMAKE_CUDA_ARCHITECTURES to all-major enables a list of architectures OnnxRuntime optimized for, "
             "not all major architectures CUDA compiler supports.")
  else()
    message(STATUS "Original CMAKE_CUDA_ARCHITECTURES : ${CMAKE_CUDA_ARCHITECTURES}")
  endif()

  # Unset at this point means: pick the optimized default set for this
  # toolchain (Jetson/Tegra gets its own list).
  if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
    if(CMAKE_LIBRARY_ARCHITECTURE STREQUAL "aarch64-linux-gnu")
      # Support for Jetson/Tegra ARM devices
      set(CMAKE_CUDA_ARCHITECTURES "53;62;72;87") # TX1/Nano, TX2, Xavier, Orin
    else()
      if(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
        # 37, 50 still work in CUDA 11 but are marked deprecated and will be removed in future CUDA version.
        set(CMAKE_CUDA_ARCHITECTURES "37;50;52;60;70;75;80;86;89")
      elseif(CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12.8)
        set(CMAKE_CUDA_ARCHITECTURES "52;60;70;75;80;86;89;90")
      else()
        set(CMAKE_CUDA_ARCHITECTURES "60;70;75;80;86;89;90;100;120")
      endif()
    endif()
  endif()

  # Normalize the list: drop empty entries, keep only the LAST `-virtual`
  # entry, and strip `-real`/`a` suffixes down to bare numeric architectures.
  unset(CMAKE_CUDA_ARCHITECTURES_CLEAN)
  unset(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL)
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
    if(CUDA_ARCH STREQUAL "")
      continue()
    endif()

    if(CUDA_ARCH MATCHES "^([1-9])([0-9])+a?-virtual$")
      set(CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL ${CUDA_ARCH})
    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?-real$")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
    elseif(CUDA_ARCH MATCHES "^(([1-9])([0-9])+)a?$")
      list(APPEND CMAKE_CUDA_ARCHITECTURES_CLEAN ${CMAKE_MATCH_1})
    else()
      message(FATAL_ERROR "Unrecognized CUDA architecture: ${CUDA_ARCH}")
    endif()
  endforeach()
  list(REMOVE_DUPLICATES CMAKE_CUDA_ARCHITECTURES_CLEAN)
  set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_CLEAN})

  # CMAKE_CUDA_ARCHITECTURES_ORIG contains all architectures enabled, without automatically added -real or -a suffix.
  set(CMAKE_CUDA_ARCHITECTURES_ORIG "${CMAKE_CUDA_ARCHITECTURES}")
  message(STATUS "GPU architectures: ${CMAKE_CUDA_ARCHITECTURES_ORIG}")

  # Define EXCLUDE_SM_xy for every kernel-bearing architecture NOT in the
  # build, so sources can compile out the corresponding kernels.
  set(ARCHITECTURES_WITH_KERNELS "80" "86" "89" "90" "100" "120")
  foreach(CUDA_ARCH IN LISTS ARCHITECTURES_WITH_KERNELS)
    if(NOT "${CUDA_ARCH}" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
      add_definitions("-DEXCLUDE_SM_${CUDA_ARCH}")
      message(STATUS "Excluding SM ${CUDA_ARCH}")
    endif()
  endforeach()

  # Enable accelerated features (like WGMMA, TMA and setmaxnreg) for SM >= 90.
  set(ARCHITECTURES_WITH_ACCEL "90" "100" "101" "120")
  unset(CMAKE_CUDA_ARCHITECTURES_NORMALIZED)
  foreach(CUDA_ARCH IN LISTS CMAKE_CUDA_ARCHITECTURES)
    if("${CUDA_ARCH}" IN_LIST ARCHITECTURES_WITH_ACCEL)
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}a-real")
    else()
      list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CUDA_ARCH}-real")
    endif()
  endforeach()

  # Re-append the single preserved `-virtual` entry, if any, so PTX is still
  # embedded for forward compatibility.
  if(DEFINED CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL)
    list(APPEND CMAKE_CUDA_ARCHITECTURES_NORMALIZED "${CMAKE_CUDA_ARCHITECTURES_LAST_VIRTUAL}")
  endif()

  set(CMAKE_CUDA_ARCHITECTURES ${CMAKE_CUDA_ARCHITECTURES_NORMALIZED})

  message(STATUS "CMAKE_CUDA_ARCHITECTURES: ${CMAKE_CUDA_ARCHITECTURES}")
endmacro()
7 changes: 6 additions & 1 deletion cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@
set(onnxruntime_NVCC_THREADS "1" CACHE STRING "Number of threads that NVCC can use for compilation.")
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--threads \"${onnxruntime_NVCC_THREADS}\">")
endif()

# Since CUDA 12.8, compiling diagnostics become stricter
if (CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8)
target_compile_options(${target} PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:--relocatable-device-code=true>")
Expand Down Expand Up @@ -261,6 +261,11 @@
set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA)
set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime")

if("90" IN_LIST CMAKE_CUDA_ARCHITECTURES_ORIG)
target_compile_options(${target} PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xptxas=-w>)
target_compile_definitions(${target} PRIVATE COMPILE_HOPPER_TMA_GEMMS)
endif()

if (onnxruntime_ENABLE_CUDA_PROFILING) # configure cupti for cuda profiling
target_link_libraries(${target} PRIVATE CUDA::cupti)
endif()
Expand Down
39 changes: 39 additions & 0 deletions cmake/utils/detect_cuda_arch.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
#include <algorithm>
#include <cuda_runtime.h>
#include <iomanip>
#include <iostream>
#include <vector>

int main(int argc, char* argv[])
{
int n_devices = 0;
int rc = cudaGetDeviceCount(&n_devices);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}

std::vector<std::pair<int, int>> arch(n_devices);
for (int cd = 0; cd < n_devices; ++cd)
{
cudaDeviceProp dev;
int rc = cudaGetDeviceProperties(&dev, cd);
if (rc != cudaSuccess)
{
cudaError_t error = cudaGetLastError();
std::cout << "CUDA error: " << cudaGetErrorString(error) << std::endl;
return rc;
}
else
{
arch[cd] = {dev.major, dev.minor};
}
}

std::pair<int, int> best_cc = *std::max_element(begin(arch), end(arch));
std::cout << best_cc.first << best_cc.second;

return 0;
}
46 changes: 46 additions & 0 deletions onnxruntime/contrib_ops/cuda/llm/common/cuda_runtime_utils.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cuda_runtime_api.h>
#include "core/providers/cuda/shared_inc/cuda_call.h"

namespace onnxruntime::llm::common {

// Returns the ordinal of the CUDA device current on the calling thread.
// Throws (via CUDA_CALL_THROW) if the runtime call fails.
inline int getDevice() {
  int device_id{0};
  CUDA_CALL_THROW(cudaGetDevice(&device_id));
  return device_id;
}

// Returns the compute capability of the current device encoded as
// major * 10 + minor (e.g. SM 9.0 -> 90).
inline int getSMVersion() {
  int const device_id = getDevice();
  int major = 0;
  int minor = 0;
  CUDA_CALL_THROW(cudaDeviceGetAttribute(&major, cudaDevAttrComputeCapabilityMajor, device_id));
  CUDA_CALL_THROW(cudaDeviceGetAttribute(&minor, cudaDevAttrComputeCapabilityMinor, device_id));
  return major * 10 + minor;
}

// Returns the number of streaming multiprocessors on the current device.
inline int getMultiProcessorCount() {
  int const device_id = getDevice();
  int sm_count{0};
  CUDA_CALL_THROW(cudaDeviceGetAttribute(&sm_count, cudaDevAttrMultiProcessorCount, device_id));
  return sm_count;
}
}  // namespace onnxruntime::llm::common
18 changes: 18 additions & 0 deletions onnxruntime/contrib_ops/cuda/llm/common/logger.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.

#pragma once

#include "core/providers/shared_library/provider_api.h"

// Logging macros for the LLM contrib ops, layered on ORT's default logger.
// TRACE/DEBUG both map to VERBOSE severity in debug builds and compile away
// entirely (empty expansion) in release builds (NDEBUG defined).
#ifndef NDEBUG
#define ORT_LLM_LOG_TRACE(msg) LOGS_DEFAULT(VERBOSE) << msg
#define ORT_LLM_LOG_DEBUG(msg) LOGS_DEFAULT(VERBOSE) << msg
#else
#define ORT_LLM_LOG_TRACE(msg)
#define ORT_LLM_LOG_DEBUG(msg)
#endif

// INFO/WARNING/ERROR are always enabled, in all build configurations.
#define ORT_LLM_LOG_INFO(msg) LOGS_DEFAULT(INFO) << msg
#define ORT_LLM_LOG_WARNING(msg) LOGS_DEFAULT(WARNING) << msg
#define ORT_LLM_LOG_ERROR(msg) LOGS_DEFAULT(ERROR) << msg
Loading
Loading