diff --git a/.flake/pkgs/legion.nix b/.flake/pkgs/legion.nix
index 814ef85e00..543ba3c04b 100644
--- a/.flake/pkgs/legion.nix
+++ b/.flake/pkgs/legion.nix
@@ -1,19 +1,37 @@
 { lib
-, stdenv
 , fetchFromGitLab
 , cmake
+, config
 , python3
 , cudaPackages ? { }
 , cudaCapabilities ? [ "60" "70" "80" "86" ]
+, rocmPackages ? { }
 , maxDim ? 5
+, useCuda ? config.cudaSupport
+, useRocm ? config.rocmSupport
+, stdenv ? if useCuda then cudaPackages.backendStdenv else rocmPackages.llvm.rocmClangStdenv
 }:

 # from https://codeberg.org/Uli/nix-things/src/commit/776519e382c81b136c1d0b10d8c7b52b4acb9192/overlays/cq/python/libclang-python.nix

 let
   cmakeFlag = x: if x then "1" else "0";

-  inherit (cudaPackages) cudatoolkit;
+  inherit (lib)
+    cmakeBool
+    cmakeFeature
+    optionals
+    ;
+
+  cudaBuildInputs = with cudaPackages; [
+    cudatoolkit
+  ];
+  rocmBuildInputs = with rocmPackages; [
+    clr
+    rocthrust
+    rocprim
+    llvm.clang
+  ];
 in

 stdenv.mkDerivation rec {
@@ -35,19 +53,41 @@ stdenv.mkDerivation rec {
   cmakeFlags = [
     "-DLegion_USE_Python=1"
     "-DLegion_BUILD_BINDINGS=1"
-    "-DLegion_USE_CUDA=1"
-    "-DLegion_CUDA_ARCH=${lib.concatStringsSep "," cudaCapabilities}"
     "-DLegion_MAX_DIM=${toString maxDim}"
-  ];
+  ]
+  ++ optionals useRocm [
+    # TODO: legacy way of configuring the HIP compiler; switch to the new mechanism once nixpkgs is updated (should be a quick fix)
+    (cmakeFeature "Legion_USE_HIP" "1")
+    (cmakeFeature "HIP_ARCHITECTURES" (builtins.concatStringsSep ";" rocmPackages.clr.gpuTargets))
+    (cmakeFeature "HIP_COMPILER" "${rocmPackages.llvm.clang}/bin/clang")
+    (cmakeFeature "HIP_RUNTIME" "rocclr")
+    (cmakeFeature "HIP_PLATFORM" "amd")
+    (cmakeFeature "HIP_PATH" "${rocmPackages.clr}/hip")
+    (cmakeFeature "HIP_ROOT_DIR" "${rocmPackages.clr}")
+    (cmakeFeature "HIP_THRUST_ROOT_DIR" "${rocmPackages.rocthrust}")
+    (cmakeFeature "ROCM_PATH" "${rocmPackages.clr}")
+
+    (cmakeFeature "HIP_INCLUDE_DIRS" "${rocmPackages.clr}/hip/include")
+
+    (cmakeFeature "CMAKE_CXX_COMPILER" "hipcc")
+    (cmakeFeature "CMAKE_C_COMPILER" "hipcc")
+  ]
+  ++ optionals useCuda [
+    (cmakeFeature "Legion_USE_CUDA" "1")
+    (cmakeFeature "Legion_CUDA_ARCH" (builtins.concatStringsSep "," cudaCapabilities))
+  ];
+
+
   buildInputs = [
     python3
-    cudatoolkit
-  ];
+  ]
+  ++ optionals useCuda cudaBuildInputs
+  ++ optionals useRocm rocmBuildInputs;

   meta = with lib; {
     description = "Legion is a parallel programming model for distributed, heterogeneous machines";
     homepage = "https://github.com/StanfordLegion/legion";
     license = licenses.asl20;
   };
-}
+}
\ No newline at end of file
diff --git a/.proj.toml b/.proj.toml
index 01ae36eddd..ee2d677c18 100644
--- a/.proj.toml
+++ b/.proj.toml
@@ -2,26 +2,34 @@ project_name = "flexflow"
 testsuite_macro = "FF_TEST_SUITE"
 namespace_name = "FlexFlow"
 header_extension = ".h"
+fix_compile_commands = false

 build_targets = [
-  "utils",
-  "op-attrs",
   "kernels",
-  "pcg",
+  # "pcg",
   # "substitutions",
   # "compiler",
-  "substitution-generator",
-  "local-execution",
+  # "substitution-generator",
+  # "local-execution",
 ]
 test_targets = [
-  "utils-tests",
-  "op-attrs-tests",
-  "pcg-tests",
+  # "utils-tests",
   # "substitutions-tests",
   # "compiler-tests",
-  "substitution-generator-tests",
+  # "pcg",
+  # "substitutions",
+  # "compiler",
+  # "substitution-generator",
 ]

 [cmake_flags_extra]
-FF_CUDA_ARCH = "60"
-CMAKE_CUDA_ARCHITECTURES = "60"
+FF_USE_HIP_ROCM = "ON"
+FF_GPU_BACKEND = "hip_rocm"
+# CMAKE_CUDA_ARCHITECTURES = "60"
+CMAKE_HIP_ARCHITECTURES = "gfx900"
+# HIP_PLATFORM = "amd"
+# HIP_RUNTIME = "rocclr"
+CMAKE_CXX_COMPILER = "hipcc"
+CMAKE_C_COMPILER = "hipcc"
+
+# FF_CUDA_ARCH = "60"
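Note (not part of the patch): with `.proj.toml` now driving the whole tree through `hipcc` for `gfx900`, a standalone smoke test is the quickest way to validate the toolchain before attempting a full build. This is a minimal sketch; the file name and kernel are hypothetical, and it only assumes the dev shell's `hipcc` is on `PATH`.

```cpp
// hip_smoke.cpp (hypothetical): verify hipcc + gfx900 codegen end to end.
// Build: hipcc --offload-arch=gfx900 hip_smoke.cpp -o hip_smoke
#include <hip/hip_runtime.h>
#include <cstdio>

__global__ void axpy(float a, float const *x, float *y, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    y[i] = a * x[i] + y[i];
  }
}

int main() {
  constexpr int n = 1024;
  float *x = nullptr, *y = nullptr;
  (void)hipMalloc(&x, n * sizeof(float));
  (void)hipMalloc(&y, n * sizeof(float));
  // hipcc accepts the CUDA-style triple-chevron launch syntax
  axpy<<<(n + 255) / 256, 256>>>(2.0f, x, y, n);
  (void)hipDeviceSynchronize();
  std::printf("kernel status: %s\n", hipGetErrorString(hipGetLastError()));
  (void)hipFree(x);
  (void)hipFree(y);
  return 0;
}
```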
"hipcc" +CMAKE_C_COMPILER = "hipcc" + +# FF_CUDA_ARCH = "60" diff --git a/CMakeLists.txt b/CMakeLists.txt index a518931ac5..211f9a867c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,9 +84,9 @@ if(${CMAKE_SYSTEM_NAME} MATCHES "Linux") set(LIBEXT ".so") endif() -include(cuda) -include(cudnn) -include(nccl) +# include(cuda) +# include(cudnn) +# include(nccl) if (FF_USE_CODE_COVERAGE) include(CodeCoverage) append_coverage_compiler_flags() diff --git a/flake.nix b/flake.nix index 1c54b4f025..b718ea05e4 100644 --- a/flake.nix +++ b/flake.nix @@ -6,10 +6,14 @@ extra-substituters = [ "https://ff.cachix.org" "https://cuda-maintainers.cachix.org/" + "https://llama-cpp.cachix.org" + "https://nixos-rocm.cachix.org/" ]; extra-trusted-public-keys = [ "cuda-maintainers.cachix.org-1:0dq3bujKpuEPMCX6U4WylrUDZ9JyUG0VpVZa7CNfq5E=" "ff.cachix.org-1:/kyZ0w35ToSJBjpiNfPLrL3zTjuPkUiqf2WH0GIShXM=" + "nixos-rocm.cachix.org-1:VEpsf7pRIijjd8csKjFNBGzkBqOmw8H9PRmgAq14LnE=" + "llama-cpp.cachix.org-1:H75X+w83wUKTIPSO1KWy9ADUrzThyGs8P5tmAbkWhQc=" ]; }; @@ -29,11 +33,33 @@ pkgs = import nixpkgs { inherit system; config.allowUnfree = true; + config.rocmSupport = true; }; lib = pkgs.lib; + inherit (pkgs.rocmPackages) clr miopen miopengemm rccl rocm-runtime; + + rocm = pkgs.symlinkJoin { + name = "rocm"; + paths = with pkgs.rocmPackages; [ + rocm-thunk + rocm-runtime + rocm-device-libs + clr + hipcc + rccl + llvm.clang + miopen + miopengemm + miopen-hip + hipblas + rocm-cmake + hip-common + ]; + }; + mkShell = pkgs.mkShell.override { - stdenv = pkgs.cudaPackages.backendStdenv; + stdenv = pkgs.rocmPackages.llvm.rocmClangStdenv; }; in { @@ -61,7 +87,14 @@ devShells = rec { ci = mkShell { shellHook = '' + export HIP_COMPILER="${pkgs.rocmPackages.llvm.clang}/bin/clang" export PATH="$HOME/ff/.scripts/:$PATH" + export ROCM_PATH=${clr} + export HIP_DEVICE_LIB_PATH="${pkgs.rocmPackages.rocm-device-libs}/amdgcn/bitcode" + # export HIP_ROOT_DIR=${clr} + # export HIP_PATH=${clr}/hip + # export HIP_INCLUDE_DIRS=${clr}/hip/include + echo "ROCm path set to: $ROCM_PATH" ''; CMAKE_FLAGS = lib.strings.concatStringsSep " " [ @@ -76,6 +109,14 @@ "-DFF_USE_EXTERNAL_RANGEV3=ON" "-DFF_USE_EXTERNAL_BOOST_PREPROCESSOR=ON" "-DFF_USE_EXTERNAL_TYPE_INDEX=ON" + + # hip related flags + "-DHIP_PLATFORM=amd" + # "-DHIP_RUNTIME=rocclr" + # "-DHIP_COMPILER=${pkgs.rocmPackages.llvm.clang}/bin/clang" + "-DHIP_PATH=${clr}/hip" + "-DHIP_ROOT_DIR=${clr}/hip" + ]; RC_PARAMS = "max_discard_ratio=100"; @@ -92,14 +133,7 @@ ccache pkg-config python3 - cudatoolkit - cudaPackages.cuda_nvcc - cudaPackages.cudnn - cudaPackages.nccl - cudaPackages.libcublas - cudaPackages.cuda_cudart tl-expected - lcov # for code coverage ]) (with self.packages.${system}; [ legion @@ -107,6 +141,21 @@ rapidcheckFull doctest ]) + (with pkgs.rocmPackages; [ + clr + miopen + miopengemm + rccl + rocm-runtime + hipblas + hipcc + hip-common + rocm-cmake + miopen-hip + rocm-thunk + rocm-device-libs + ]) + # [ rocm ] ]; }; @@ -129,7 +178,6 @@ compdb jq gh - lcov # for code coverage ]) (with proj-repo.packages.${system}; [ proj @@ -152,4 +200,4 @@ }; } ); -} +} \ No newline at end of file diff --git a/lib/kernels/CMakeLists.txt b/lib/kernels/CMakeLists.txt index b2b81c85bd..67ab5c09a0 100644 --- a/lib/kernels/CMakeLists.txt +++ b/lib/kernels/CMakeLists.txt @@ -1,13 +1,50 @@ set(project_target kernels) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) + +message("rocm path: $ENV{ROCM_PATH}") + project(${project_target} - LANGUAGES CXX CUDA) + LANGUAGES CXX HIP) + 
+message("rocm path after: $ENV{ROCM_PATH}") + + +# if (DEFINED ENV{ROCM_PATH}) +# set(ROCM_PATH $ENV{ROCM_PATH}) +# else() + # message(FATAL_ERROR "ROCM_PATH is not set") +# endif() +list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}) +if(CXX_IS_HIPCC) + if(LINUX) + if (NOT ${CMAKE_CXX_COMPILER_ID} MATCHES "Clang") + message(WARNING "Only LLVM is supported for HIP, hint: CXX=/opt/rocm/llvm/bin/clang++") + endif() + + message(WARNING "Setting hipcc as the C++ compiler is legacy behavior." + " Prefer setting the HIP compiler directly. See README for details.") + endif() +else() + # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES. + if(AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES) + set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_ARGETS}) + endif() + cmake_minimum_required(VERSION 3.21) + enable_language(HIP) +endif() + + +find_package(hip REQUIRED) +find_package(miopen REQUIRED) +find_package(rccl REQUIRED) file(GLOB_RECURSE SRC CONFIGURE_DEPENDS LIST_DIRECTORIES False - src/*.cc - src/cuda/ops/*.cu + # src/*.cc + src/hip/concat_kernels.cpp ) add_library( @@ -25,9 +62,9 @@ target_include_directories( target_link_libraries( ${project_target} op-attrs - cuda - cudnn - nccl + MIOpen + hip::host + rccl ) define_ff_vars(${project_target}) @@ -35,5 +72,5 @@ define_ff_vars(${project_target}) set_target_properties( ${project_target} PROPERTIES - CUDA_STANDARD 17 -) + HIP_STANDARD 17 +) \ No newline at end of file diff --git a/lib/kernels/include/kernels/device.h b/lib/kernels/include/kernels/device.h index 439937177a..ac44438367 100644 --- a/lib/kernels/include/kernels/device.h +++ b/lib/kernels/include/kernels/device.h @@ -7,7 +7,7 @@ #include #elif defined(FF_USE_HIP_ROCM) #include -#include +#include #include #else #error "Unknown device" @@ -57,21 +57,21 @@ typedef miopenTensorDescriptor_t ffTensorDescriptor_t; typedef miopenActivationDescriptor_t ffActivationDescriptor_t; typedef miopenPoolingDescriptor_t ffPoolingDescriptor_t; typedef miopenBatchNormMode_t ffBatchNormMode_t; -typedef miopenFilterDescriptor_t ffFilterDescriptor_t; +typedef miopenTensorDescriptor_t ffFilterDescriptor_t; typedef miopenConvolutionDescriptor_t ffConvolutionDescriptor_t; -typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t; -typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t; -typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t; +// typedef miopenConvolutionFwdAlgo_t ffConvolutionFwdAlgo_t; //we don't have this one in miopen +// typedef miopenConvolutionBwdFilterAlgo_t ffConvolutionBwdFilterAlgo_t; // don't have this either +// typedef miopenConvolutionBwdDataAlgo_t ffConvolutionBwdDataAlgo_t; typedef miopenDropoutDescriptor_t ffDropoutDescriptor_t; -typedef miopenOpTensorDescriptor_t ffOpTensorDescriptor_t; +typedef miopenTensorDescriptor_t ffOpTensorDescriptor_t; //don't have this either but will use miopenTensorDescriptor_t as a placeholder typedef miopenReduceTensorDescriptor_t ffReduceTensorDescriptor_t; -typedef miopenAttnDescriptor_t ffAttnDescriptor_t; -typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t; +// typedef miopenAttnDescriptor_t ffAttnDescriptor_t; +// typedef miopenSeqDataDescriptor_t ffSeqDataDescriptor_t; typedef miopenHandle_t ffHandle_t; typedef hipEvent_t ffEvent_t; typedef hipblasHandle_t ffblasHandle_t; typedef miopenStatus_t ffStatus_t; -typedef hipblasDataType_t ffDataType_t; +typedef hipblasDatatype_t ffDataType_t; typedef miopenDataType_t ffCudnnDataType_t; typedef hipError_t ffError_t; #else diff --git a/lib/kernels/include/kernels/ff_handle.h 
diff --git a/lib/kernels/include/kernels/ff_handle.h b/lib/kernels/include/kernels/ff_handle.h
index 89df04e3c1..116f04f35a 100644
--- a/lib/kernels/include/kernels/ff_handle.h
+++ b/lib/kernels/include/kernels/ff_handle.h
@@ -1,7 +1,9 @@
 #ifndef _FLEXFLOW_KERNELS_FF_HANDLE_H
 #define _FLEXFLOW_KERNELS_FF_HANDLE_H

-#ifdef FF_USE_NCCL
+#ifdef FF_USE_HIP_ROCM
+#include
+#elif defined(FF_USE_NCCL)
 #include
 #endif

@@ -18,12 +20,12 @@ struct PerDeviceFFHandle {
   size_t workSpaceSize;
   bool allowTensorOpMathConversion;

-#ifdef FF_USE_NCCL
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
   ncclComm_t ncclComm;
 #endif
 };

-#ifdef FF_USE_NCCL
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
 FF_VISITABLE_STRUCT_NONSTANDARD_CONSTRUCTION(PerDeviceFFHandle,
                                              dnn,
                                              blas,
diff --git a/lib/kernels/include/kernels/nccl.h b/lib/kernels/include/kernels/nccl.h
index b8a6784676..5a074bbfad 100644
--- a/lib/kernels/include/kernels/nccl.h
+++ b/lib/kernels/include/kernels/nccl.h
@@ -1,10 +1,15 @@
 #ifndef _FLEXFLOW_KERNELS_INCLUDE_KERNELS_NCCL_H
 #define _FLEXFLOW_KERNELS_INCLUDE_KERNELS_NCCL_H

-#ifdef FF_USE_NCCL
+#ifdef FF_USE_HIP_ROCM
+#include
+#elif defined(FF_USE_NCCL)
+#include
+#endif
+
+#if defined(FF_USE_HIP_ROCM) || defined(FF_USE_NCCL)
 #include
 #include
-#include

 #define checkNCCL(cmd)                                                         \
   do {                                                                         \
diff --git a/lib/kernels/src/device.cc b/lib/kernels/src/device.cc
index 0df5e84ee9..7156957df1 100644
--- a/lib/kernels/src/device.cc
+++ b/lib/kernels/src/device.cc
@@ -39,7 +39,7 @@ ffError_t
 #if defined(FF_USE_CUDA) || defined(FF_USE_HIP_CUDA)
   return cudaEventElapsedTime(elapsed, start, stop);
 #elif defined(FF_USE_HIP_ROCM)
-  return cudaEventElapsedTime(elapsed, start, stop);
+  return hipEventElapsedTime(elapsed, start, stop);
 #endif
 }

diff --git a/lib/kernels/src/device.h b/lib/kernels/src/device.h
index 173cd14557..cc441dd9a4 100644
--- a/lib/kernels/src/device.h
+++ b/lib/kernels/src/device.h
@@ -84,13 +84,13 @@ __host__ void relu_backward_kernel(DataType data_type,
                                    void *output_grad_ptr,
                                    void const *output_ptr,
                                    size_t output_size,
-                                   cudaStream_t stream);
+                                   hipStream_t stream);

 __host__ void sigmoid_backward_kernel(DataType data_type,
                                       void *output_grad_ptr,
                                       void const *output_ptr,
                                       size_t output_size,
-                                      cudaStream_t stream);
+                                      hipStream_t stream);

 template <typename DT>
 __global__ void apply_add_with_scale(DT *data_ptr,
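Note (not part of the patch): the `device.cc` fix above matters because `cudaEventElapsedTime` does not exist in a ROCm-only build. For reference, this is the standard HIP event-timing pattern the corrected branch supports; `time_stream_ms` is a hypothetical helper, the HIP calls are real API.

```cpp
// Sketch (ROCm path): event-based stream timing via hipEventElapsedTime.
#include <hip/hip_runtime.h>

float time_stream_ms(hipStream_t stream) {
  hipEvent_t start, stop;
  (void)hipEventCreate(&start);
  (void)hipEventCreate(&stop);
  (void)hipEventRecord(start, stream);
  // ... enqueue kernels on `stream` here ...
  (void)hipEventRecord(stop, stream);
  (void)hipEventSynchronize(stop);  // wait until `stop` has actually been reached
  float ms = 0.0f;
  (void)hipEventElapsedTime(&ms, start, stop);
  (void)hipEventDestroy(start);
  (void)hipEventDestroy(stop);
  return ms;
}
```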
diff --git a/lib/kernels/src/hip/ops/concat_kernels.cpp b/lib/kernels/src/hip/ops/concat_kernels.cpp
index aa38be739b..369563ab9e 100644
--- a/lib/kernels/src/hip/ops/concat_kernels.cpp
+++ b/lib/kernels/src/hip/ops/concat_kernels.cpp
@@ -13,10 +13,11 @@
  * limitations under the License.
  */

 #include "kernels/concat_kernels.h"
 #include "device.h"
 #include
 #include
+#include

 namespace FlexFlow {
 namespace Kernels {
@@ -73,9 +74,10 @@ void backward_kernel(hipStream_t stream,
   coord_t num_blocks = 1, output_blk_size = 1, input_blk_sizes[MAX_NUM_INPUTS];
   int num_inputs = input_grads.size();
   assert(num_inputs <= MAX_NUM_INPUTS);
+  calc_blk_size(num_blocks, output_blk_size, output_grad.shape, axis);

   for (int i = 0; i < num_inputs; i++) {
-    shape = input_grads[i].shape;
+    ArrayShape shape = input_grads[i].shape;
     size_t input_num_blocks = 1;
     calc_blk_size(input_num_blocks, input_blk_sizes[i], shape, axis);
     assert(input_num_blocks == num_blocks);
@@ -99,4 +101,4 @@ void backward_kernel(hipStream_t stream,

 } // namespace Concat
 } // namespace Kernels
-} // namespace FlexFlow
+} // namespace FlexFlow
\ No newline at end of file
diff --git a/lib/utils/CMakeLists.txt b/lib/utils/CMakeLists.txt
index a0d77b9f76..7215684f7e 100644
--- a/lib/utils/CMakeLists.txt
+++ b/lib/utils/CMakeLists.txt
@@ -12,7 +12,7 @@ ff_add_library(
   visit_struct
   fmt
   json
-  cuda
+  # cuda
 )

 add_subdirectory(ffi)
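Note (not part of the patch): the `backward_kernel` hunk above depends on `calc_blk_size` agreeing between the output gradient and every input gradient. The decomposition it performs, modeled on the legacy FlexFlow helper (the flat-array interface below is an assumption, not the real `ArrayShape` API), treats dimensions up to and including the concat `axis` as one contiguous block and the remaining dimensions as the block count:

```cpp
// Sketch of the block decomposition backward_kernel relies on.
// Dimensions d <= axis form one contiguous block; dimensions d > axis
// count how many such blocks the tensor holds, so concatenation along
// `axis` only changes blk_size while num_blocks must match across tensors.
#include <cstdint>
using coord_t = int64_t;

void calc_blk_size_sketch(coord_t &num_blocks, coord_t &blk_size,
                          int num_dims, coord_t const *dims, int axis) {
  num_blocks = 1;
  blk_size = 1;
  for (int d = 0; d < num_dims; d++) {
    if (d <= axis) {
      blk_size *= dims[d];
    } else {
      num_blocks *= dims[d];
    }
  }
}
```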