diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
index 814ef85e00..543ba3c04b 100644
--- a/.flake/pkgs/legion.nix
+++ b/.flake/pkgs/legion.nix
@@ -1,19 +1,37 @@
 { lib
-, stdenv
 , fetchFromGitLab
 , cmake
+, config
 , python3
 , cudaPackages ? { }
 , cudaCapabilities ? [ "60" "70" "80" "86" ]
+, rocmPackages ? { }
 , maxDim ? 5
+, useCuda ? config.cudaSupport
+, useRocm ? config.rocmSupport
+, stdenv ? if useCuda then cudaPackages.backendStdenv else rocmPackages.llvm.rocmClangStdenv
 }:

 # from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix

 let
   cmakeFlag = x: if x then "1" else "0";

-  inherit (cudaPackages) cudatoolkit;
+  inherit (lib)
+    cmakeBool
+    cmakeFeature
+    optionals
+    ;
+
+  cudaBuildInputs = with cudaPackages; [
+    cudatoolkit
+  ];
+  rocmBuildInputs = with rocmPackages; [
+    clr
+    rocthrust
+    rocprim
+    llvm.clang
+  ];
 in

 stdenv.mkDerivation rec {
@@ -35,19 +53,41 @@ stdenv.mkDerivation rec {
   cmakeFlags = [
     "-DLegion_USE_Python=1"
     "-DLegion_BUILD_BINDINGS=1"
-    "-DLegion_USE_CUDA=1"
-    "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
     "-DLegion_MAX_DIM=${toString maxDim}"
-  ];
+  ]
+  ++ optionals useRocm [
+    # TODO: legacy way of configuring the HIP compiler; switch to the new mechanism once nixpkgs is updated (should be a quick fix)
+    (cmakeFeature "Legion_USE_HIP" "1")
+    (cmakeFeature "HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    (cmakeFeature "HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+    (cmakeFeature "HIP_RUNTIME" "rocclr")
+    (cmakeFeature "HIP_PLATFORM" "amd")
+    (cmakeFeature "HIP_PATH" "${rocmPackages.clr}/hip")
+    (cmakeFeature "HIP_ROOT_DIR" "${rocmPackages.clr}")
+    (cmakeFeature "HIP_THRUST_ROOT_DIR" "${rocmPackages.rocthrust}")
+    (cmakeFeature "ROCM_PATH" "${rocmPackages.clr}")
+
+    (cmakeFeature "HIP_INCLUDE_DIRS" "${rocmPackages.clr}/hip/include")
+
+    (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
+    (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
+  ]
+  ++ optionals useCuda [
+    (cmakeFeature "Legion_USE_CUDA" "1")
+    (cmakeFeature "Legion_CUDA_ARCH" (builtins.concatStringsSep "," cudaCapabilities))
+  ];
+
+
   buildInputs = [
     python3
-    cudatoolkit
-  ];
+  ]
+  ++ optionals useCuda cudaBuildInputs
+  ++ optionals useRocm rocmBuildInputs;

   meta = with lib; {
     description = "Legion is a parallel programming model for distributed, heterogeneous machines";
     homepage = "https://github.com/StanfordLegion/legion";
     license = licenses.asl20;
   };
-}
+}
\ No newline at end of file
diff --git a/.proj.toml b/.proj.toml
index 01ae36eddd..ee2d677c18 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -2,26 +2,34 @@ project_name = "flexflow"
 testsuite_macro = "FF_TEST_SUITE"
 namespace_name = "FlexFlow"
 header_extension = ".h"
+fix_compile_commands = false

 build_targets = [
-  "utils",
-  "op-attrs",
   "kernels",
-  "pcg",
+  # "pcg",
   # "substitutions",
   # "compiler",
-  "substitution-generator",
-  "local-execution",
+  # "substitution-generator",
+  # "local-execution",
 ]
 test_targets = [
-  "utils-tests",
-  "op-attrs-tests",
-  "pcg-tests",
+  # "utils-tests",
   # "substitutions-tests",
   # "compiler-tests",
-  "substitution-generator-tests",
+  # "pcg",
+  # "substitutions",
+  # "compiler",
+  # "substitution-generator",
 ]

 [cmake_flags_extra]
-FF_CUDA_ARCH = "60"
-CMAKE_CUDA_ARCHITECTURES = "60"
+FF_USE_HIP_ROCM = "ON"
+FF_GPU_BACKEND = "hip_rocm"
+# CMAKE_CUDA_ARCHITECTURES = "60"
+CMAKE_HIP_ARCHITECTURES = "gfx900"
+# HIP_PLATFORM = "amd"
+# HIP_RUNTIME = "rocclr"
+CMAKE_CXX_COMPILER = "hipcc"
+CMAKE_C_COMPILER = "hipcc"
+
+# FF_CUDA_ARCH = "60"
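Note (not part of the patch): with `.proj.toml` now driving the whole tree through `hipcc` for `gfx900`, a standalone smoke test is the quickest way to validate the toolchain before attempting a full build. This is a minimal sketch; the file name and kernel are hypothetical, and it only assumes the dev shell's `hipcc` is on `PATH`.

```cpp
// hip_smoke.cpp (hypothetical): verify hipcc + gfx900 codegen end to end.
// Build: hipcc --offload-arch=gfx900 hip_smoke.cpp -o hip_smoke
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void axpy(float a, float const *x, float *y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = a * x[i] + y[i];
  }
}

int main() {
  constexpr int n = 1024;
  float *x = nullptr, *y = nullptr;
  (void)hipMalloc(&x, n * sizeof(float));
  (void)hipMalloc(&y, n * sizeof(float));
  // hipcc accepts the CUDA-style triple-chevron launch syntax
  axpy<<<(n + 255) / 256, 256>>>(2.0f, x, y, n);
  (void)hipDeviceSynchronize();
  std::printf("kernel status: %s\n", hipGetErrorString(hipGetLastError()));
  (void)hipFree(x);
  (void)hipFree(y);
  return 0;
}
```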
"hipcc" +CMAKE_C_COMPILER = "hipcc" + +# FF_CUDA_ARCH = "60" diff --git a/CMakeLists.txt b/CMakeLists.txt index a518931ac5..211f9a867c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,9 +84,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") set(LIBEXT ".so") endif() -include(cuda) -include(cudnn) -include(nccl) +# include(cuda) +# include(cudnn) +# include(nccl) if (FF_USE_CODE_COVERAGE) include(CodeCoverage) append_coverage_compiler_flags() diff --git a/flake.nix b/flake.nix index 1c54b4f025..b718ea05e4 100644 --- a/flake.nix +++ b/flake.nix @@ -6,10 +6,14 @@ extra-substituters = [ "https://ff.cachix.org" "https://cuda-maintainers.cachix.org/" + "https://llama-cpp.cachix.org" + "https://nixos-rocm.cachix.org/" ]; extra-trusted-public-keys = [ "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" "ff.cachix.org-1:/kyZ0w35ToSJBjpiNfPLrL3zTjuPkUiqf2WH0GIShXM=" + "nixos-rocm.cachix.org-1:VEpsf7pRIijjd8csKjFNBGzkBqOmw8H9PRmgAq14LnE=" + "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=" ]; }; @@ -29,11 +33,33 @@ pkgs = import nixpkgs { inherit system; config.allowUnfree = true; + config.rocmSupport = true; }; lib = pkgs.lib; + inherit (pkgs.rocmPackages) clr miopen miopengemm rccl rocm-runtime; + + rocm = pkgs.symlinkJoin { + name = "rocm"; + paths = with pkgs.rocmPackages; [ + rocm-thunk + rocm-runtime + rocm-device-libs + clr + hipcc + rccl + llvm.clang + miopen + miopengemm + miopen-hip + hipblas + rocm-cmake + hip-common + ]; + }; + mkShell = pkgs.mkShell.override { - stdenv = pkgs.cudaPackages.backendStdenv; + stdenv = pkgs.rocmPackages.llvm.rocmClangStdenv; }; in { @@ -61,7 +87,14 @@ devShells = rec { ci = mkShell { shellHook = '' + export HIP_COMPILER="${pkgs.rocmPackages.llvm.clang}/bin/clang" export PATH="$HOME/ff/.scripts/:$PATH" + export ROCM_PATH=${clr} + export HIP_DEVICE_LIB_PATH="${pkgs.rocmPackages.rocm-device-libs}/amdgcn/bitcode" + # export HIP_ROOT_DIR=${clr} + # export HIP_PATH=${clr}/hip + # export HIP_INCLUDE_DIRS=${clr}/hip/include + echo "ROCm path set to: $ROCM_PATH" ''; CMAKE_FLAGS = lib.strings.concatStringsSep " " [ @@ -76,6 +109,14 @@ "-DFF_USE_EXTERNAL_RANGEV3=ON" "-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON" "-DFF_USE_EXTERNAL_TYPE_INDEX=ON" + + # hip related flags + "-DHIP_PLATFORM=amd" + # "-DHIP_RUNTIME=rocclr" + # "-DHIP_COMPILER=${pkgs.rocmPackages.llvm.clang}/bin/clang" + "-DHIP_PATH=${clr}/hip" + "-DHIP_ROOT_DIR=${clr}/hip" + ]; RC_PARAMS = "max_discard_ratio=100"; @@ -92,14 +133,7 @@ ccache pkg-config python3 - cudatoolkit - cudaPackages.cuda_nvcc - cudaPackages.cudnn - cudaPackages.nccl - cudaPackages.libcublas - cudaPackages.cuda_cudart tl-expected - lcov # for code coverage ]) (with self.packages.${system}; [ legion @@ -107,6 +141,21 @@ rapidcheckFull doctest ]) + (with pkgs.rocmPackages; [ + clr + miopen + miopengemm + rccl + rocm-runtime + hipblas + hipcc + hip-common + rocm-cmake + miopen-hip + rocm-thunk + rocm-device-libs + ]) + # [ rocm ] ]; }; @@ -129,7 +178,6 @@ compdb jq gh - lcov # for code coverage ]) (with proj-repo.packages.${system}; [ proj @@ -152,4 +200,4 @@ }; } ); -} +} \ No newline at end of file diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index b2b81c85bd..67ab5c09a0 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -1,13 +1,50 @@ set(project_target kernels) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +message("rocm path: $ENV{ROCM_PATH}") + project(${project_target} - LANGUAGES CXX CUDA) + LANGUAGES CXX HIP) + 
+message("rocm path after: $ENV{ROCM_PATH}") + + +# if (DEFINED ENV{ROCM_PATH}) +# set(ROCM_PATH $ENV{ROCM_PATH}) +# else() + # message(FATAL_ERROR "ROCM_PATH is not set") +# endif() +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) +if(CXX_IS_HIPCC) + if(LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() +else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. + if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) +endif() + + +find_package(hip REQUIRED) +find_package(miopen REQUIRED) +find_package(rccl REQUIRED) file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False - src/*.cc - src/cuda/ops/*.cu + # src/*.cc + src/hip/concat_kernels.cpp ) add_library( @@ -25,9 +62,9 @@ target_include_directories( target_link_libraries( ${project_target} op-attrs - cuda - cudnn - nccl + MIOpen + hip::host + rccl ) define_ff_vars(${project_target}) @@ -35,5 +72,5 @@ define_ff_vars(${project_target}) set_target_properties( ${project_target} PROPERTIES - CUDA_STANDARD 17 -) + HIP_STANDARD 17 +) \ No newline at end of file diff --git a/lib/kernels/include/kernels/device.h b/lib/kernels/include/kernels/device.h index 439937177a..ac44438367 100644 --- a/lib/kernels/include/kernels/device.h +++ b/lib/kernels/include/kernels/device.h @@ -7,7 +7,7 @@ #include #elif defined(FF_USE_HIP_ROCM) #include -#include +#include #include #else #error "Unknown device" @@ -57,21 +57,21 @@ typedef miopenTensorDescriptor_t ffTensorDescriptor_t; typedef miopenActivationDescriptor_t ffActivationDescriptor_t; typedef miopenPoolingDescriptor_t ffPoolingDescriptor_t; typedef miopenBatchNormMode_t ffBatchNormMode_t; -typedef miopenFilterDescriptor_t ffFilterDescriptor_t; +typedef miopenTensorDescriptor_t ffFilterDescriptor_t; typedef miopenConvolutionDescriptor_t ffConvolutionDescriptor_t; -typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t; -typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t; -typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t; +// typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t; //we don't have this one in miopen +// typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t; // don't have this either +// typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t; typedef miopenDropoutDescriptor_t ffDropoutDescriptor_t; -typedef miopenOpTensorDescriptor_t ffOpTensorDescriptor_t; +typedef miopenTensorDescriptor_t ffOpTensorDescriptor_t; //don't have this either but will use miopenTensorDescriptor_t as a placeholder typedef miopenReduceTensorDescriptor_t ffReduceTensorDescriptor_t; -typedef miopenAttnDescriptor_t ffAttnDescriptor_t; -typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t; +// typedef miopenAttnDescriptor_t ffAttnDescriptor_t; +// typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t; typedef miopenHandle_t ffHandle_t; typedef hipEvent_t ffEvent_t; typedef hipblasHandle_t ffblasHandle_t; typedef miopenStatus_t ffStatus_t; -typedef hipblasDataType_t ffDataType_t; +typedef hipblasDatatype_t ffDataType_t; typedef miopenDataType_t ffCudnnDataType_t; typedef hipError_t ffError_t; #else diff --git a/lib/kernels/include/kernels/ff_handle.h 
diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h
index 89df04e3c1..116f04f35a 100644
--- a/lib/kernels/include/kernels/ff_handle.h
+++ b/lib/kernels/include/kernels/ff_handle.h
@@ -1,7 +1,9 @@
 #ifndef _FLEXFLOW_KERNELS_FF_HANDLE_H
 #define _FLEXFLOW_KERNELS_FF_HANDLE_H

-#ifdef FF_USE_NCCL
+#ifdef FF_USE_HIP_ROCM
+#include
+#elif defined(FF_USE_NCCL)
 #include
 #endif

@@ -18,12 +20,12 @@ struct PerDeviceFFHandle {
   size_t workSpaceSize;
   bool allowTensorOpMathConversion;

-#ifdef FF_USE_NCCL
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
   ncclComm_t ncclComm;
 #endif
 };

-#ifdef FF_USE_NCCL
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
 FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle,
                                              dnn,
                                              blas,
diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h
index b8a6784676..5a074bbfad 100644
--- a/lib/kernels/include/kernels/nccl.h
+++ b/lib/kernels/include/kernels/nccl.h
@@ -1,10 +1,15 @@
 #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_NCCL_H
 #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_NCCL_H

-#ifdef FF_USE_NCCL
+#ifdef FF_USE_HIP_ROCM
+#include
+#elif defined(FF_USE_NCCL)
+#include
+#endif
+
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
 #include
 #include
-#include

 #define checkNCCL(cmd)                                                         \
   do {                                                                         \
diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/device.cc
index 0df5e84ee9..7156957df1 100644
--- a/lib/kernels/src/device.cc
+++ b/lib/kernels/src/device.cc
@@ -39,7 +39,7 @@ ffError_t
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   return cudaEventElapsedTime(elapsed, start, stop);
 #elif defined(FF_USE_HIP_ROCM)
-  return cudaEventElapsedTime(elapsed, start, stop);
+  return hipEventElapsedTime(elapsed, start, stop);
 #endif
 }

diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index 173cd14557..cc441dd9a4 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -84,13 +84,13 @@ __host__ void relu_backward_kernel(DataType data_type,
                                    void *output_grad_ptr,
                                    void const *output_ptr,
                                    size_t output_size,
-                                   cudaStream_t stream);
+                                   hipStream_t stream);

 __host__ void sigmoid_backward_kernel(DataType data_type,
                                       void *output_grad_ptr,
                                       void const *output_ptr,
                                       size_t output_size,
-                                      cudaStream_t stream);
+                                      hipStream_t stream);

 template <typename DT>
 __global__ void apply_add_with_scale(DT *data_ptr,
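Note (not part of the patch): the `device.cc` fix above matters because `cudaEventElapsedTime` does not exist in a ROCm-only build. For reference, this is the standard HIP event-timing pattern the corrected branch supports; `time_stream_ms` is a hypothetical helper, the HIP calls are real API.

```cpp
// Sketch (ROCm path): event-based stream timing via hipEventElapsedTime.
#include <hip/hip_runtime.h>

float time_stream_ms(hipStream_t stream) {
  hipEvent_t start, stop;
  (void)hipEventCreate(&start);
  (void)hipEventCreate(&stop);
  (void)hipEventRecord(start, stream);
  // ... enqueue kernels on `stream` here ...
  (void)hipEventRecord(stop, stream);
  (void)hipEventSynchronize(stop);  // wait until `stop` has actually been reached
  float ms = 0.0f;
  (void)hipEventElapsedTime(&ms, start, stop);
  (void)hipEventDestroy(start);
  (void)hipEventDestroy(stop);
  return ms;
}
```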
diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp
index aa38be739b..369563ab9e 100644
--- a/lib/kernels/src/hip/ops/concat_kernels.cpp
+++ b/lib/kernels/src/hip/ops/concat_kernels.cpp
@@ -13,10 +13,11 @@
  * limitations under the License.
  */

 #include "kernels/concat_kernels.h"
 #include "device.h"
 #include
 #include
+#include

 namespace FlexFlow {
 namespace Kernels {
@@ -73,9 +74,10 @@ void backward_kernel(hipStream_t stream,
   coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
   int num_inputs = input_grads.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
+  calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);

   for (int i = 0; i < num_inputs; i++) {
-    shape = input_grads[i].shape;
+    ArrayShape shape = input_grads[i].shape;
     size_t input_num_blocks = 1;
     calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis);
     assert(input_num_blocks == num_blocks);
@@ -99,4 +101,4 @@ void backward_kernel(hipStream_t stream,

 } // namespace Concat
 } // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/utils/CMakeLists.txt b/lib/utils/CMakeLists.txt
index a0d77b9f76..7215684f7e 100644
--- a/lib/utils/CMakeLists.txt
+++ b/lib/utils/CMakeLists.txt
@@ -12,7 +12,7 @@ ff_add_library(
   visit_struct
   fmt
   json
-  cuda
+  # cuda
 )

 add_subdirectory(ffi)
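Note (not part of the patch): the `backward_kernel` hunk above depends on `calc_blk_size` agreeing between the output gradient and every input gradient. The decomposition it performs, modeled on the legacy FlexFlow helper (the flat-array interface below is an assumption, not the real `ArrayShape` API), treats dimensions up to and including the concat `axis` as one contiguous block and the remaining dimensions as the block count:

```cpp
// Sketch of the block decomposition backward_kernel relies on.
// Dimensions d <= axis form one contiguous block; dimensions d > axis
// count how many such blocks the tensor holds, so concatenation along
// `axis` only changes blk_size while num_blocks must match across tensors.
#include <cstdint>
using coord_t = int64_t;

void calc_blk_size_sketch(coord_t &num_blocks, coord_t &blk_size,
                          int num_dims, coord_t const *dims, int axis) {
  num_blocks = 1;
  blk_size = 1;
  for (int d = 0; d < num_dims; d++) {
    if (d <= axis) {
      blk_size *= dims[d];
    } else {
      num_blocks *= dims[d];
    }
  }
}
```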